core/slice/sort/shared/smallsort.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872
//! This module contains a variety of sort implementations that are optimized for small lengths.
use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::slice::sort::shared::FreezeMarker;
use crate::{intrinsics, ptr, slice};
// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
// small slices and small-sort performance sorting small sub-slices as part of
// the main quicksort loop. For the former, testing showed that the
// representative benchmarks for real-world performance are cold CPU state and
// not single-size hot benchmarks. For the latter the CPU will call them many
// times, so hot benchmarks are fine and more realistic. And it's worth it to
// optimize sorting small sub-slices with more sophisticated solutions than
// insertion sort.
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
/// For which input length <= return value of this function, is it valid to call `small_sort`.
fn small_sort_threshold() -> usize;
/// Sorts `v` using strategies optimized for small sizes.
fn small_sort<F: FnMut(&Self, &Self) -> bool>(
v: &mut [Self],
scratch: &mut [MaybeUninit<Self>],
is_less: &mut F,
);
}
impl<T> StableSmallSortTypeImpl for T {
#[inline(always)]
default fn small_sort_threshold() -> usize {
// Optimal number of comparisons, and good perf.
SMALL_SORT_FALLBACK_THRESHOLD
}
#[inline(always)]
default fn small_sort<F: FnMut(&T, &T) -> bool>(
v: &mut [T],
_scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
}
}
impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
#[inline(always)]
fn small_sort_threshold() -> usize {
SMALL_SORT_GENERAL_THRESHOLD
}
#[inline(always)]
fn small_sort<F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
small_sort_general_with_scratch(v, scratch, is_less);
}
}
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
/// For which input length <= return value of this function, is it valid to call `small_sort`.
fn small_sort_threshold() -> usize;
/// Sorts `v` using strategies optimized for small sizes.
fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}
impl<T> UnstableSmallSortTypeImpl for T {
#[inline(always)]
default fn small_sort_threshold() -> usize {
SMALL_SORT_FALLBACK_THRESHOLD
}
#[inline(always)]
default fn small_sort<F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
small_sort_fallback(v, is_less);
}
}
impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
#[inline(always)]
fn small_sort_threshold() -> usize {
<T as UnstableSmallSortFreezeTypeImpl>::small_sort_threshold()
}
#[inline(always)]
fn small_sort<F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
<T as UnstableSmallSortFreezeTypeImpl>::small_sort(v, is_less);
}
}
/// FIXME(const_trait_impl) use original ipnsort approach with choose_unstable_small_sort,
/// as found here <https://github.com/Voultapher/sort-research-rs/blob/438fad5d0495f65d4b72aa87f0b62fc96611dff3/ipnsort/src/smallsort.rs#L83C10-L83C36>.
pub(crate) trait UnstableSmallSortFreezeTypeImpl: Sized + FreezeMarker {
fn small_sort_threshold() -> usize;
fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}
impl<T: FreezeMarker> UnstableSmallSortFreezeTypeImpl for T {
#[inline(always)]
default fn small_sort_threshold() -> usize {
if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
SMALL_SORT_GENERAL_THRESHOLD
} else {
SMALL_SORT_FALLBACK_THRESHOLD
}
}
#[inline(always)]
default fn small_sort<F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
small_sort_general(v, is_less);
} else {
small_sort_fallback(v, is_less);
}
}
}
/// SAFETY: Only used for run-time optimization heuristic.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}
impl<T: Copy> CopyMarker for T {}
impl<T: FreezeMarker + CopyMarker> UnstableSmallSortFreezeTypeImpl for T {
#[inline(always)]
fn small_sort_threshold() -> usize {
if has_efficient_in_place_swap::<T>()
&& (mem::size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
{
SMALL_SORT_NETWORK_THRESHOLD
} else if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
SMALL_SORT_GENERAL_THRESHOLD
} else {
SMALL_SORT_FALLBACK_THRESHOLD
}
}
#[inline(always)]
fn small_sort<F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
if has_efficient_in_place_swap::<T>()
&& (mem::size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
{
small_sort_network(v, is_less);
} else if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
small_sort_general(v, is_less);
} else {
small_sort_fallback(v, is_less);
}
}
}
/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;
/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;
/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;
/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] !
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;
/// Using a stack array, could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;
fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}
}
fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();
// SAFETY: The memory is backed by `stack_array`, and the operation is safe as long as the len
// is the same.
let scratch = unsafe {
slice::from_raw_parts_mut(
stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
SMALL_SORT_GENERAL_SCRATCH_LEN,
)
};
small_sort_general_with_scratch(v, scratch, is_less);
}
fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
let len = v.len();
if len < 2 {
return;
}
if scratch.len() < len + 16 {
intrinsics::abort();
}
let v_base = v.as_mut_ptr();
let len_div_2 = len / 2;
// SAFETY: See individual comments.
unsafe {
let scratch_base = scratch.as_mut_ptr() as *mut T;
let presorted_len = if const { mem::size_of::<T>() <= 16 } && len >= 16 {
// SAFETY: scratch_base is valid and has enough space.
sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
sort8_stable(
v_base.add(len_div_2),
scratch_base.add(len_div_2),
scratch_base.add(len + 8),
is_less,
);
8
} else if len >= 8 {
// SAFETY: scratch_base is valid and has enough space.
sort4_stable(v_base, scratch_base, is_less);
sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);
4
} else {
ptr::copy_nonoverlapping(v_base, scratch_base, 1);
ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);
1
};
for offset in [0, len_div_2] {
// SAFETY: at this point dst is initialized with presorted_len elements.
// We extend this to desired_len, src is valid for desired_len elements.
let src = v_base.add(offset);
let dst = scratch_base.add(offset);
let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };
for i in presorted_len..desired_len {
ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
insert_tail(dst, dst.add(i), is_less);
}
}
// SAFETY: see comment in `CopyOnDrop::drop`.
let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };
// SAFETY: at this point scratch_base is fully initialized, allowing us
// to use it as the source of our merge back into the original array.
// If a panic occurs we ensure the original array is restored to a valid
// permutation of the input through drop_guard. This technique is similar
// to ping-pong merging.
bidirectional_merge(
&*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len),
drop_guard.dst,
is_less,
);
mem::forget(drop_guard);
}
}
struct CopyOnDrop<T> {
src: *const T,
dst: *mut T,
len: usize,
}
impl<T> Drop for CopyOnDrop<T> {
fn drop(&mut self) {
// SAFETY: `src` must contain `len` initialized elements, and dst must
// be valid to write `len` elements.
unsafe {
ptr::copy_nonoverlapping(self.src, self.dst, self.len);
}
}
}
fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
T: FreezeMarker,
F: FnMut(&T, &T) -> bool,
{
// This implementation is tuned to be efficient for integer types.
let len = v.len();
if len < 2 {
return;
}
if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
intrinsics::abort();
}
let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();
let len_div_2 = len / 2;
let no_merge = len < 18;
let v_base = v.as_mut_ptr();
let initial_region_len = if no_merge { len } else { len_div_2 };
// SAFETY: Both possible values of `initial_region_len` are in-bounds.
let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) };
// Avoid compiler unrolling, we *really* don't want that to happen here for binary-size reasons.
loop {
let presorted_len = if region.len() >= 13 {
sort13_optimal(region, is_less);
13
} else if region.len() >= 9 {
sort9_optimal(region, is_less);
9
} else {
1
};
insertion_sort_shift_left(region, presorted_len, is_less);
if no_merge {
return;
}
if region.as_ptr() != v_base {
break;
}
// SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
unsafe {
region = &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2)
};
}
// SAFETY: We checked that T is Freeze and thus observation safe.
// Should is_less panic v was not modified in parity_merge and retains it's original input.
// scratch and v must not alias and scratch has v.len() space.
unsafe {
let scratch_base = stack_array.as_mut_ptr() as *mut T;
bidirectional_merge(
&mut *ptr::slice_from_raw_parts_mut(v_base, len),
scratch_base,
is_less,
);
ptr::copy_nonoverlapping(scratch_base, v_base, len);
}
}
/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
///
/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integers like
/// types. `is_less` could be a huge function and we want to give the compiler an option to
/// not inline this function. For the same reasons that this function is very perf critical
/// it should be in the same module as the functions that use it.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid
// pointers into `v_base`, and are properly aligned, and part of the same allocation.
unsafe {
let v_a = v_base.add(a_pos);
let v_b = v_base.add(b_pos);
// PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
// in a well defined state, without duplicates.
// Important to only swap if it is more and not if it is equal. is_less should return false for
// equal, so we don't swap.
let should_swap = is_less(&*v_b, &*v_a);
// This is a branchless version of swap if.
// The equivalent code with a branch would be:
//
// if should_swap {
// ptr::swap(left, right, 1);
// }
// The goal is to generate cmov instructions here.
let left_swap = if should_swap { v_b } else { v_a };
let right_swap = if should_swap { v_a } else { v_b };
let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
ptr::copy(left_swap, v_a, 1);
ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
}
}
/// Sorts the first 9 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
if v.len() < 9 {
intrinsics::abort();
}
let v_base = v.as_mut_ptr();
// Optimal sorting network see:
// https://bertdobbelaere.github.io/sorting_networks.html.
// SAFETY: We checked the len.
unsafe {
swap_if_less(v_base, 0, 3, is_less);
swap_if_less(v_base, 1, 7, is_less);
swap_if_less(v_base, 2, 5, is_less);
swap_if_less(v_base, 4, 8, is_less);
swap_if_less(v_base, 0, 7, is_less);
swap_if_less(v_base, 2, 4, is_less);
swap_if_less(v_base, 3, 8, is_less);
swap_if_less(v_base, 5, 6, is_less);
swap_if_less(v_base, 0, 2, is_less);
swap_if_less(v_base, 1, 3, is_less);
swap_if_less(v_base, 4, 5, is_less);
swap_if_less(v_base, 7, 8, is_less);
swap_if_less(v_base, 1, 4, is_less);
swap_if_less(v_base, 3, 6, is_less);
swap_if_less(v_base, 5, 7, is_less);
swap_if_less(v_base, 0, 1, is_less);
swap_if_less(v_base, 2, 4, is_less);
swap_if_less(v_base, 3, 5, is_less);
swap_if_less(v_base, 6, 8, is_less);
swap_if_less(v_base, 2, 3, is_less);
swap_if_less(v_base, 4, 5, is_less);
swap_if_less(v_base, 6, 7, is_less);
swap_if_less(v_base, 1, 2, is_less);
swap_if_less(v_base, 3, 4, is_less);
swap_if_less(v_base, 5, 6, is_less);
}
}
/// Sorts the first 13 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
if v.len() < 13 {
intrinsics::abort();
}
let v_base = v.as_mut_ptr();
// Optimal sorting network see:
// https://bertdobbelaere.github.io/sorting_networks.html.
// SAFETY: We checked the len.
unsafe {
swap_if_less(v_base, 0, 12, is_less);
swap_if_less(v_base, 1, 10, is_less);
swap_if_less(v_base, 2, 9, is_less);
swap_if_less(v_base, 3, 7, is_less);
swap_if_less(v_base, 5, 11, is_less);
swap_if_less(v_base, 6, 8, is_less);
swap_if_less(v_base, 1, 6, is_less);
swap_if_less(v_base, 2, 3, is_less);
swap_if_less(v_base, 4, 11, is_less);
swap_if_less(v_base, 7, 9, is_less);
swap_if_less(v_base, 8, 10, is_less);
swap_if_less(v_base, 0, 4, is_less);
swap_if_less(v_base, 1, 2, is_less);
swap_if_less(v_base, 3, 6, is_less);
swap_if_less(v_base, 7, 8, is_less);
swap_if_less(v_base, 9, 10, is_less);
swap_if_less(v_base, 11, 12, is_less);
swap_if_less(v_base, 4, 6, is_less);
swap_if_less(v_base, 5, 9, is_less);
swap_if_less(v_base, 8, 11, is_less);
swap_if_less(v_base, 10, 12, is_less);
swap_if_less(v_base, 0, 5, is_less);
swap_if_less(v_base, 3, 8, is_less);
swap_if_less(v_base, 4, 7, is_less);
swap_if_less(v_base, 6, 11, is_less);
swap_if_less(v_base, 9, 10, is_less);
swap_if_less(v_base, 0, 1, is_less);
swap_if_less(v_base, 2, 5, is_less);
swap_if_less(v_base, 6, 9, is_less);
swap_if_less(v_base, 7, 8, is_less);
swap_if_less(v_base, 10, 11, is_less);
swap_if_less(v_base, 1, 3, is_less);
swap_if_less(v_base, 2, 4, is_less);
swap_if_less(v_base, 5, 6, is_less);
swap_if_less(v_base, 9, 10, is_less);
swap_if_less(v_base, 1, 2, is_less);
swap_if_less(v_base, 3, 4, is_less);
swap_if_less(v_base, 5, 7, is_less);
swap_if_less(v_base, 6, 8, is_less);
swap_if_less(v_base, 2, 3, is_less);
swap_if_less(v_base, 4, 5, is_less);
swap_if_less(v_base, 6, 7, is_less);
swap_if_less(v_base, 8, 9, is_less);
swap_if_less(v_base, 3, 4, is_less);
swap_if_less(v_base, 5, 6, is_less);
}
}
/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
// SAFETY: see individual comments.
unsafe {
// SAFETY: in-bounds as tail > begin.
let mut sift = tail.sub(1);
if !is_less(&*tail, &*sift) {
return;
}
// SAFETY: after this read tail is never read from again, as we only ever
// read from sift, sift < tail and we only ever decrease sift. Thus this is
// effectively a move, not a copy. Should a panic occur, or we have found
// the correct insertion position, gap_guard ensures the element is moved
// back into the array.
let tmp = ManuallyDrop::new(tail.read());
let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };
loop {
// SAFETY: we move sift into the gap (which is valid), and point the
// gap guard destination at sift, ensuring that if a panic occurs the
// gap is once again filled.
ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
gap_guard.dst = sift;
if sift == begin {
break;
}
// SAFETY: we checked that sift != begin, thus this is in-bounds.
sift = sift.sub(1);
if !is_less(&tmp, &*sift) {
break;
}
}
}
}
/// Sort `v` assuming `v[..offset]` is already sorted.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
offset: usize,
is_less: &mut F,
) {
let len = v.len();
if offset == 0 || offset > len {
intrinsics::abort();
}
// SAFETY: see individual comments.
unsafe {
// We write this basic loop directly using pointers, as when we use a
// for loop LLVM likes to unroll this loop which we do not want.
// SAFETY: v_end is the one-past-end pointer, and we checked that
// offset <= len, thus tail is also in-bounds.
let v_base = v.as_mut_ptr();
let v_end = v_base.add(len);
let mut tail = v_base.add(offset);
while tail != v_end {
// SAFETY: v_base and tail are both valid pointers to elements, and
// v_base < tail since we checked offset != 0.
insert_tail(v_base, tail, is_less);
// SAFETY: we checked that tail is not yet the one-past-end pointer.
tail = tail.add(1);
}
}
}
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
v_base: *const T,
dst: *mut T,
is_less: &mut F,
) {
// By limiting select to picking pointers, we are guaranteed good cmov code-gen
// regardless of type T's size. Further this only does 5 instead of 6
// comparisons compared to a stable transposition 4 element sorting-network,
// and always copies each element exactly once.
// SAFETY: all pointers have offset at most 3 from v_base and dst, and are
// thus in-bounds by the precondition.
unsafe {
// Stably create two pairs a <= b and c <= d.
let c1 = is_less(&*v_base.add(1), &*v_base);
let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
let a = v_base.add(c1 as usize);
let b = v_base.add(!c1 as usize);
let c = v_base.add(2 + c2 as usize);
let d = v_base.add(2 + (!c2 as usize));
// Compare (a, c) and (b, d) to identify max/min. We're left with two
// unknown elements, but because we are a stable sort we must know which
// one is leftmost and which one is rightmost.
// c3, c4 | min max unknown_left unknown_right
// 0, 0 | a d b c
// 0, 1 | a b c d
// 1, 0 | c d a b
// 1, 1 | c b a d
let c3 = is_less(&*c, &*a);
let c4 = is_less(&*d, &*b);
let min = select(c3, c, a);
let max = select(c4, b, d);
let unknown_left = select(c3, a, select(c4, c, b));
let unknown_right = select(c4, d, select(c3, b, c));
// Sort the last two unknown elements.
let c5 = is_less(&*unknown_right, &*unknown_left);
let lo = select(c5, unknown_right, unknown_left);
let hi = select(c5, unknown_left, unknown_right);
ptr::copy_nonoverlapping(min, dst, 1);
ptr::copy_nonoverlapping(lo, dst.add(1), 1);
ptr::copy_nonoverlapping(hi, dst.add(2), 1);
ptr::copy_nonoverlapping(max, dst.add(3), 1);
}
#[inline(always)]
fn select<T>(cond: bool, if_true: *const T, if_false: *const T) -> *const T {
if cond { if_true } else { if_false }
}
}
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
v_base: *mut T,
dst: *mut T,
scratch_base: *mut T,
is_less: &mut F,
) {
// SAFETY: these pointers are all in-bounds by the precondition of our function.
unsafe {
sort4_stable(v_base, scratch_base, is_less);
sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
}
// SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
// into dst.
unsafe {
bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less);
}
}
#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
mut left_src: *const T,
mut right_src: *const T,
mut dst: *mut T,
is_less: &mut F,
) -> (*const T, *const T, *mut T) {
// This is a branchless merge utility function.
// The equivalent code with a branch would be:
//
// if !is_less(&*right_src, &*left_src) {
// ptr::copy_nonoverlapping(left_src, dst, 1);
// left_src = left_src.add(1);
// } else {
// ptr::copy_nonoverlapping(right_src, dst, 1);
// right_src = right_src.add(1);
// }
// dst = dst.add(1);
// SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
// to read and `dst` is valid to write, while not aliasing.
unsafe {
let is_l = !is_less(&*right_src, &*left_src);
let src = if is_l { left_src } else { right_src };
ptr::copy_nonoverlapping(src, dst, 1);
right_src = right_src.add(!is_l as usize);
left_src = left_src.add(is_l as usize);
dst = dst.add(1);
}
(left_src, right_src, dst)
}
#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
mut left_src: *const T,
mut right_src: *const T,
mut dst: *mut T,
is_less: &mut F,
) -> (*const T, *const T, *mut T) {
// This is a branchless merge utility function.
// The equivalent code with a branch would be:
//
// if !is_less(&*right_src, &*left_src) {
// ptr::copy_nonoverlapping(right_src, dst, 1);
// right_src = right_src.wrapping_sub(1);
// } else {
// ptr::copy_nonoverlapping(left_src, dst, 1);
// left_src = left_src.wrapping_sub(1);
// }
// dst = dst.sub(1);
// SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
// to read and `dst` is valid to write, while not aliasing.
unsafe {
let is_l = !is_less(&*right_src, &*left_src);
let src = if is_l { right_src } else { left_src };
ptr::copy_nonoverlapping(src, dst, 1);
right_src = right_src.wrapping_sub(is_l as usize);
left_src = left_src.wrapping_sub(!is_l as usize);
dst = dst.sub(1);
}
(left_src, right_src, dst)
}
/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
///
/// Note that T must be Freeze, the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
v: &[T],
dst: *mut T,
is_less: &mut F,
) {
// It helps to visualize the merge:
//
// Initial:
//
// |dst (in dst)
// |left |right
// v v
// [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
// ^ ^
// |left_rev |right_rev
// |dst_rev (in dst)
//
// After:
//
// |dst (in dst)
// |left | |right
// v v v
// [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
// ^ ^ ^
// |left_rev | |right_rev
// |dst_rev (in dst)
//
// In each iteration one of left or right moves up one position, and one of
// left_rev or right_rev moves down one position, whereas dst always moves
// up one position and dst_rev always moves down one position. Assuming
// the input was sorted and the comparison function is correctly implemented
// at the end we will have left == left_rev + 1, and right == right_rev + 1,
// fully consuming the input having written it to dst.
let len = v.len();
let src = v.as_ptr();
let len_div_2 = len / 2;
// SAFETY: The caller has to ensure that len >= 2.
unsafe {
intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
}
// SAFETY: no matter what the result of the user-provided comparison function
// is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
// will always be in bounds if the caller guarantees that `dst` is valid for
// `v.len()` writes.
unsafe {
let mut left = src;
let mut right = src.add(len_div_2);
let mut dst = dst;
let mut left_rev = src.add(len_div_2 - 1);
let mut right_rev = src.add(len - 1);
let mut dst_rev = dst.add(len - 1);
for _ in 0..len_div_2 {
(left, right, dst) = merge_up(left, right, dst, is_less);
(left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
}
let left_end = left_rev.wrapping_add(1);
let right_end = right_rev.wrapping_add(1);
// Odd length, so one element is left unconsumed in the input.
if len % 2 != 0 {
let left_nonempty = left < left_end;
let last_src = if left_nonempty { left } else { right };
ptr::copy_nonoverlapping(last_src, dst, 1);
left = left.add(left_nonempty as usize);
right = right.add((!left_nonempty) as usize);
}
// We now should have consumed the full input exactly once. This can only fail if the
// user-provided comparison function fails to implement a strict weak ordering. In that case
// we panic and never access the inconsistent state in dst.
if left != left_end || right != right_end {
panic_on_ord_violation();
}
}
}
#[cfg_attr(not(feature = "panic_immediate_abort"), inline(never), cold)]
#[cfg_attr(feature = "panic_immediate_abort", inline)]
fn panic_on_ord_violation() -> ! {
// This is indicative of a logic bug in the user-provided comparison function or Ord
// implementation. They are expected to implement a total order as explained in the Ord
// documentation.
//
// By panicking we inform the user, that they have a logic bug in their program. If a strict
// weak ordering is not given, the concept of comparison based sorting cannot yield a sorted
// result. E.g.: a < b < c < a
//
// The Ord documentation requires users to implement a total order. Arguably that's
// unnecessarily strict in the context of sorting. Issues only arise if the weaker requirement
// of a strict weak ordering is violated.
//
// The panic message talks about a total order because that's what the Ord documentation talks
// about and requires, so as to not confuse users.
panic!("user-provided comparison function does not correctly implement a total order");
}
#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
// Heuristic that holds true on all tested 64-bit capable architectures.
mem::size_of::<T>() <= 8 // mem::size_of::<u64>()
}