-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathgrailsort.zig
2905 lines (2574 loc) · 123 KB
/
grailsort.zig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//! grailsort.zig
//!
//! This implementation is based on the version maintained by The Holy Grail Sort Project.
//! https://github.com/MusicTheorist/Rewritten-Grailsort
//! It would not have been possible without their patient and detailed explanations of how
//! this ridiculous sort is supposed to work.
//!
//! PLEASE NOTE, THIS IS A WORK IN PROGRESS!
//! This code should not be used in production yet, or really for anything except reference.
//! It is not done yet or fully tested, and some cases are not even implemented.
//!
//! MIT License
//!
//! Copyright (c) 2013 Andrey Astrelin
//! Copyright (c) 2020 The Holy Grail Sort Project
//! Copyright (c) 2021 Martin Wickham
//!
//! Permission is hereby granted, free of charge, to any person obtaining a copy
//! of this software and associated documentation files (the "Software"), to deal
//! in the Software without restriction, including without limitation the rights
//! to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
//! copies of the Software, and to permit persons to whom the Software is
//! furnished to do so, subject to the following conditions:
//!
//! The above copyright notice and this permission notice shall be included in all
//! copies or substantial portions of the Software.
//!
//! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
//! OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
//! SOFTWARE.
//!
const std = @import("std");
const assert = std.debug.assert;
// ----------------------- Public API -------------------------
/// Sorts the array in-place
/// Sorts `array` fully in-place using grailsort.
/// `lessThan` must implement a strict weak ordering over `T`;
/// the `context` value is threaded through to every comparison.
pub fn sort(
    comptime T: type,
    array: []T,
    context: anytype,
    comptime lessThan: fn (context: @TypeOf(context), lhs: T, rhs: T) bool,
) void {
    // Delegate to the shared entry point with a zero-length scratch
    // buffer, which selects the fully in-place strategy.
    sortCommon(T, array, context, lessThan, emptySlice(T));
}
/// Sorts with an allocated buffer. If allocation of the optimal buffer
/// size fails, performs an in-place sort instead (the sort itself
/// never fails; the buffer is purely an accelerator).
pub fn sortWithAllocatedBuffer(
    comptime T: type,
    array: []T,
    context: anytype,
    comptime lessThan: fn (context: @TypeOf(context), lhs: T, rhs: T) bool,
    allocator: *std.mem.Allocator,
) void {
    const len = findOptimalBufferLength(array.len);
    var allocated = true;
    const buffer = allocator.alloc(T, len) catch blk: {
        // Allocation failed: fall back to a zero-length buffer so the
        // in-place code path is used. Nothing needs freeing on exit.
        allocated = false;
        break :blk emptySlice(T);
    };
    // BUGFIX: this previously freed the undeclared identifier `items`,
    // which does not compile. The allocation we own is `buffer`.
    defer if (allocated) allocator.free(buffer);
    sortCommon(T, array, context, lessThan, buffer);
}
/// Returns the optimal buffer length for a given array size.
/// Meant for use with sortExternal.
pub fn findOptimalBufferLength(array_len: usize) usize {
    // We want the smallest power of two whose square covers the array
    // length, i.e. the smallest power of two >= sqrt(array_len).
    // Tracking the square alongside the candidate avoids computing a
    // square root, and the `!= 0` guard stops the loop cleanly if the
    // square overflows (array_len near maxInt(usize)).
    var candidate: usize = 4;
    var candidate_sq: usize = 16;
    while (candidate_sq != 0 and candidate_sq < array_len) {
        candidate <<= 1;
        candidate_sq <<= 2;
    }
    return candidate;
}
/// Sorts using an external buffer to help make things faster.
/// Use findOptimalBufferLength to find the best buffer length for a
/// given size. Buffers longer than that remain optimal; shorter
/// buffers may still provide a partial speedup.
pub fn sortExternal(
    comptime T: type,
    array: []T,
    context: anytype,
    comptime lessThan: fn (context: @TypeOf(context), lhs: T, rhs: T) bool,
    buffer: []T,
) void {
    // The caller-supplied scratch space is forwarded unchanged;
    // sortCommon decides how much of it is actually usable.
    sortCommon(T, array, context, lessThan, buffer);
}
// ---------------------------- Implementation -----------------------------
// TODO-OPT: There are many places where this implementation could specify
// noalias but does not. There may be a perf benefit to doing so, especially
// when keys are passed separately from data, or if the comparator is opaque.
//
// TODO-OPT: There are many places where this implementation divides by a
// power of two. We could consider passing around the log2 value and using
// shifts instead, but these divisions are not usually on critical paths so
// it may not be worth very much.
//
// TODO-ZEN: This implementation is not very consistent between using slices
// and using ptr + len parameters. It may be worth using pointers in more
// places to make the indexing nicer, especially in places where there is
// a junk buffer before the valid data.
/// This sort can handle three different methods for buffer management.
/// The mode may change between merge passes in sortCommon, depending on
/// how large the current block length is relative to the external buffer.
const BufferMode = enum {
    /// There are not enough unique values in the array to make an
    /// inline buffer. Fall back to in place merge sorts with no
    /// buffer.
    no_buffer,
    /// There is an inline buffer (built from unique keys inside the
    /// array itself) which may be used to accelerate
    /// merge sorts, but must be preserved.
    inline_buffer,
    /// An external buffer has been provided, don't worry about
    /// preserving values in the internal buffer.
    external_buffer,
};
/// Arrays smaller than this will do insertion sort instead
/// TODO-OPT: Tune this value.
const TUNE_TOO_SMALL = 16;
/// If the array has fewer than this many unique elements, use a sort that
/// works better on arrays with few actual items.
/// TODO-OPT: Tune this value.
const TUNE_TOO_FEW_KEYS = 4;
/// If the external buffer is smaller than this many items,
/// it's not worth the extra overhead of the out of place merge.
/// This must always be at least 2, otherwise the out of place
/// merge will crash. If the buffer is less than 4 items,
/// the out of place merge degrades to the in place merge plus
/// some checks, so it's not worth it.
/// TODO-OPT: Tune this value.
const TUNE_MIN_BUILD_BLOCKS_BUFFER = 4;
/// If this is true, when running with an external buffer,
/// the junk buffer will be filled with undefined.
/// (Debug aid: makes accidental reads of junk data visible.)
const DEBUG_SET_UNDEFINED = false;
/// Root sorting function, used by all public api points.
/// `buffer` may be empty (fully in-place sort) or any length; larger
/// buffers unlock progressively faster code paths.
fn sortCommon(
    comptime T: type,
    array: []T,
    context: anytype,
    comptime lessThanFn: fn (context: @TypeOf(context), lhs: T, rhs: T) bool,
    buffer: []T,
) void {
    const Comparator = struct {
        //! This struct just cleans up the parameter lists of inner functions.
        context: @TypeOf(context),
        pub inline fn lessThan(self: @This(), lhs: T, rhs: T) bool {
            // TODO-API: do some tests to see if a compare function
            // can actually be performant here
            return lessThanFn(self.context, lhs, rhs);
        }
        pub inline fn compare(self: @This(), lhs: T, rhs: T) std.math.Order {
            // TODO-OPT: use an actual compare function here
            if (lessThanFn(self.context, lhs, rhs)) return .lt;
            if (lessThanFn(self.context, rhs, lhs)) return .gt;
            return .eq;
        }
    };
    const cmp = Comparator{ .context = context };
    // If the array is too small, just do insertion sort.
    if (array.len < TUNE_TOO_SMALL) {
        insertSort(T, array, cmp);
        return;
    }
    // This sort works by building a "junk buffer" at the beginning
    // of the array. The junk buffer will be used as scratch space
    // to accelerate other parts of the sort. We will need to rearrange
    // the items in the junk buffer freely. In order to do that
    // and maintain stability, we require that all items in the junk
    // buffer are unique.
    // The rest of the list will be split up into fixed-size blocks.
    // These blocks must be a power of two in size, so that we can
    // use a fast scrolling merge sort to create them. As shown in
    // the paper, the best size for this list is the square root of
    // the length of the array.
    // This calculates the smallest power of two that is greater
    // than or equal to sqrt(array.len).
    var block_len = findOptimalBufferLength(array.len);
    // We will also need a key item at the beginning of the array
    // for each block, which we will use to maintain stability in
    // the sort.
    // key_len = ceil(array.len / block_len)
    // this is safe because array.len >= 16 (and therefore > 0).
    // This could be a floor division instead if the use is updated.
    // TODO-OPT: Test a bunch of cases with and without this.
    // Does it make a meaningful difference?
    var key_len = ((array.len - 1) / block_len) + 1;
    const ideal_external_buffer = buffer.len >= block_len;
    // So the total number of unique items needed at the start of
    // the list is the sum of the number of keys and the block length.
    // We only need to include the junk buffer in the keys if we
    // don't have a large enough external buffer. Otherwise the
    // junk buffer is stable so it doesn't need to be unique.
    var ideal_keys = if (ideal_external_buffer) key_len else (key_len + block_len);
    // Now we can go find those keys, and move them to the front of
    // the array.
    // TODO-OPT: We can probably use the external buffer to speed this
    // up if present.
    const keys_found = collectKeys(T, array, ideal_keys, cmp);
    // In the event that the array contains very many duplicate elements,
    // we may not be able to find enough unique keys. In that case, use
    // one of two fallback strategies.
    const ideal_buffer = keys_found >= ideal_keys;
    if (!ideal_buffer) {
        // If we didn't find enough keys, use one of two fallback strategies.
        if (keys_found < TUNE_TOO_FEW_KEYS) {
            // There are very few unique items, use a different sort.
            // TODO-OPT: Should this value be a percentage of the array length instead?
            // Or some more complicated calculation?
            // TODO-OPT: We can probably use the external buffer to speed this up if present.
            lazyStableSort(T, array, cmp);
            return;
        } else {
            // Strategy 2: Block swaps with small scrolling buffer and/or lazy merges
            // In this case, we will use the buffer we have to try to optimize
            // things for a bit, and then fall back to the keyed selection +
            // in place merge sort. Start by reducing our initial
            // block length to be the size of our actual buffer.
            // TODO-OPT: key_len is a power of two, consider using @clz instead.
            // TODO-OPT: Without an external buffer, the optimal assignment is
            // to use half of the keys as keys and half as junk, but with an
            // external buffer it is better to use the external buffer as
            // junk and all of the keys as keys. Later code assumes that
            // these two values are related by the in place optimal relation,
            // but we could pass separate parameters for these two values in
            // the external case to speed up this calculation.
            key_len = std.math.floorPowerOfTwo(usize, keys_found);
            block_len = 0;
        }
    }
    // In the normal case, this is the end of the keys buffer and
    // is equal to keys_found. If there aren't enough keys, this
    // is just block_len instead. If we have an external buffer,
    // this includes the junk buffer but we may have only found
    // key_len keys.
    var first_data_idx = if (ideal_external_buffer and ideal_buffer) block_len else (block_len + key_len);
    // Our initial blocks are normally the block length.
    // If we are in strategy 2, we will use our shortened
    // key length instead, to account for our limited buffer.
    // TODO-OPT: When we use all available space with an external
    // buffer in strategy 2, we will also need to update this.
    var subarray_len = if (ideal_buffer) block_len else key_len;
    // First, we need to sort each block. This algorithm uses either
    // the junk buffer or an external buffer to do an in place merge
    // sort, displacing the junk buffer and moving it through the array.
    // When all is said and done, the junk buffer will be back where
    // it started.
    // TODO-OPT: With an ideal buffer, we don't even need to write the
    // junk buffer back to the array, we can keep it external forever.
    // TODO-ZEN: This `- subarray_len` is awkward, maybe use an array
    // pointer and data length instead?
    if (buffer.len >= TUNE_MIN_BUILD_BLOCKS_BUFFER) {
        buildBlocksExternal(T, array[first_data_idx - subarray_len ..], subarray_len, buffer, cmp);
    } else {
        buildBlocksInPlace(T, array[first_data_idx - subarray_len ..], subarray_len, cmp);
    }
    // At this point, pairs of blocks are sorted, so double the length.
    subarray_len *= 2;
    // save off our data length for future use
    const data_len = array.len - first_data_idx;
    // At the start of this loop, the array always looks as follows:
    // the keys and junk buffer are at the start of the array, unsorted.
    // They are followed by subarrays of length subarray_len, which are
    // each sorted. In each iteration, we will combine and sort each
    // pair of subarrays, to yield larger subarrays.
    while (subarray_len < data_len) : (subarray_len *= 2) {
        var curr_block_len = block_len;
        var mode: BufferMode = .inline_buffer;
        if (!ideal_buffer) {
            // Even if we don't have the ideal buffer, we might still
            // be able to use a junk buffer for this pass. We just
            // need to make sure that we have enough keys for both the
            // selection sort and the junk buffer. We need to divvy up
            // our keys for use as block keys vs junk buffer.
            // since the junk buffer is the size of a block, and a
            // subarray is the remaining number of keys times the
            // block size, the maximum subarray size we can support
            // is (junk_keys * block_keys) / 2. The maximum of this
            // function happens when junk_keys == block_keys, so our
            // check is (subarray_len * 2) <= (key_len / 2)^2.
            // TODO-OPT: We can get away with a slightly smaller key
            // buffer for the last iteration, because we don't use
            // the full subarray_len.
            const key_buffer = key_len / 2;
            if (key_buffer * key_buffer >= subarray_len * 2) {
                curr_block_len = key_buffer;
            } else {
                // If we can't use the scrolling buffer, we are going
                // to be doing in-place merging. Since that's expensive,
                // we want to use the smallest block size possible,
                // and put more work on the selection sort.
                curr_block_len = (subarray_len * 2) / key_len;
                mode = .no_buffer;
            }
        }
        // If we have enough external buffer to put the junk buffer
        // in it, (and to hold all needed keys), use that to optimize
        // combineBlocks.
        // TODO-OPT: The second check here can only be different from
        // the first in strategy 2, in all other cases they are equal.
        if (buffer.len >= curr_block_len and buffer.len >= (subarray_len * 2 / curr_block_len)) {
            mode = .external_buffer;
        }
        // Move the junk buffer out into external storage for this pass.
        // TODO-OPT: We don't need to do this repeatedly if we have an ideal buffer.
        if (mode == .external_buffer) {
            copyNoAlias(T, buffer.ptr, array.ptr, curr_block_len);
            setUndefined(T, array[0..curr_block_len]);
            moveBackToClobberFront(T, array[0..first_data_idx], first_data_idx - curr_block_len);
        }
        combineBlocks(T, array, first_data_idx, subarray_len, curr_block_len, mode, buffer, cmp);
        // Restore the junk buffer from external storage after the pass.
        // TODO-OPT: We don't need to do this repeatedly if we have an ideal buffer.
        if (mode == .external_buffer) {
            moveFrontToClobberBack(T, array[0..first_data_idx], first_data_idx - curr_block_len);
            copyNoAlias(T, array.ptr, buffer.ptr, curr_block_len);
            setUndefined(T, buffer[0..curr_block_len]);
        }
    }
    // When all is said and done, we just have the keys at the beginning
    // of the array, and one sorted subarray of everything else that follows
    // it. We'll finish by sorting the keys, and then performing an in-place
    // merge sort of the keys and everything else.
    // TODO-OPT: We could maybe do a better merge sort taking advantage of the
    // external buffer. Even though it's not big enough to hold the entire
    // keys array, it can still help reduce swaps (or let us do them as blocks).
    insertSort(T, array[0..first_data_idx], cmp);
    lazyMerge(T, array, first_data_idx, cmp);
}
/// This function searches the list for the first ideal_keys unique elements.
/// It puts those items at the beginning of the array in sorted order, and
/// returns the number of unique values that it found to put in that buffer.
/// Stability is preserved throughout: the key buffer itself is rotated
/// through the array rather than swapping displaced items.
/// TODO-OPT: The keys at the beginning do not actually need to be in sorted order.
/// There might be a way to optimize this that clobbers order.
/// TODO-OPT: With an external buffer, we don't need to rewind or preserve keys,
/// we just need to move any not-chosen keys up past the key area.
fn collectKeys(comptime T: type, array: []T, ideal_keys: usize, cmp: anytype) usize {
    // The number of keys we have found so far. Since the first
    // item in the array is always unique, we always count it.
    var keys_found: usize = 1;
    // The index of the array of the start of the key buffer.
    // The sorted buffer of keys will move around in the array,
    // but it starts at the very beginning.
    // This rotation is only necessary to preserve stability.
    // Without that, we could keep the list at the beginning
    // and swap displaced elements with keys. But since we
    // need stability, we will move the keys list instead.
    // Moving the small keys list is a lot faster in the worst
    // case than moving the non-keys to preserve order.
    var first_key_idx: usize = 0;
    // The index in the array of the item we are currently considering
    // adding as a key. The key buffer is always somewhere on the left
    // of this, but there may be non-key elements between the buffer and
    // this index.
    var curr_key_idx: usize = 1;
    // We will check every item in the list in order to see if it can be a key.
    while (curr_key_idx < array.len) : (curr_key_idx += 1) {
        // this is the current set of unique keys. It moves over the course of
        // this search, which is why we have first_key_idx. But it is always
        // sorted in ascending order.
        const keys = array[first_key_idx..][0..keys_found];
        // determine the index in the key buffer where we would insert this key.
        // TODO-OPT: If we use a compare function that can recognize equals,
        // this search can early out on equals, because we reject duplicate keys.
        const insert_key_idx = binarySearchLeft(T, keys, array[curr_key_idx], cmp);
        // at this point we know that array[curr_key_idx] <= keys[insert_key_idx].
        // So we can check if the key we are considering is unique by checking
        // if array[curr_key_idx] < keys[insert_key_idx].
        if (insert_key_idx == keys_found or
            cmp.lessThan(array[curr_key_idx], keys[insert_key_idx]))
        {
            // Move the keys list to butt up against the current key.
            // Moving the keys list like this instead of inserting at
            // the beginning of the array saves a ton of copies.
            // Note that this invalidates `keys`.
            rotate(T, array[first_key_idx..curr_key_idx], keys_found);
            first_key_idx = curr_key_idx - keys_found;
            // Now use rotate to insert the new key at the right position.
            const array_insert_idx = first_key_idx + insert_key_idx;
            const keys_after_insert = keys_found - insert_key_idx;
            rotate(T, array[array_insert_idx..][0 .. keys_after_insert + 1], keys_after_insert);
            // If we've found enough keys, we don't need to keep looking. Break immediately.
            keys_found += 1;
            if (keys_found >= ideal_keys) break;
        }
    }
    // Ok, now we need to move the keys buffer back to the beginning of the array.
    rotate(T, array[0 .. first_key_idx + keys_found], first_key_idx);
    return keys_found;
}
/// Sorts every block of block_len elements, starting at index block_len.
/// The first block_len elements are a junk buffer that accelerates the
/// sort; it may be reordered while the sort runs, but it is back at the
/// front of the array by the time this function returns.
/// TODO-OPT: We should probably use a small stack buffer of 8 or 16 elements
/// and always call buildBlocksExternal, to optimize the first few merges.
/// As a bonus, this would let us delete the pairwiseSwaps function.
fn buildBlocksInPlace(comptime T: type, array: []T, block_len: usize, cmp: anytype) void {
    // Step 1: an optimized size-1 merge pass. pairwiseSwaps also
    // relocates two junk-buffer items to the end of the array, which
    // is why it starts two elements before the data.
    const pair_region = array[block_len - 2 ..];
    pairwiseSwaps(T, pair_region, cmp);
    // Step 2: iterative merge passes of doubling size, each of which
    // migrates more junk items to the back; the final (reverse) pass
    // returns the junk buffer to the front of the array.
    buildInPlace(T, array, 2, block_len, cmp);
}
/// Builds merge chunks in-place. This function accepts an array that consists of
/// a set of keys followed by data items. It must start with exactly
/// `block_len - already_merged_size` keys, followed by all the data items, followed by
/// `already_merged_size` more keys. In other words, there are exactly block_len keys,
/// but already_merged_size of them have been moved to the end of the array already.
/// Additionally, before calling this function, every `already_merged_size`
/// items after the starting keys must be sorted. The last partial block must also be sorted.
/// After calling this function, all of the keys will be at the beginning of the array,
/// and every chunk of 2*block_len items after the keys will be sorted.
/// The keys may be reordered during this process. block_len and already_merged_size
/// must both be powers of two.
fn buildInPlace(comptime T: type, array: []T, already_merged_size: usize, block_len: usize, cmp: anytype) void {
    assert(std.math.isPowerOfTwo(block_len));
    assert(std.math.isPowerOfTwo(already_merged_size));
    assert(already_merged_size <= block_len);
    // start_position marks the beginning of the data elements.
    // It starts at block_len, moved back by the number of keys
    // that have already been merged to the back of the array.
    // As we move keys to the back, this will move left.
    var start_position = block_len - already_merged_size;
    const data_len = array.len - block_len;
    // We'll start with a merge size of smallest_merged_size items, and increase that until
    // it contains the whole list. For each pass, we will merge pairs of chunks,
    // and also move some keys to the end.
    var merge_len = already_merged_size;
    while (merge_len < block_len) : (merge_len *= 2) {
        const full_merge_len = 2 * merge_len;
        // Our buffer will be the same size as our merge chunk. That's how many
        // keys we will move in one pass over the array.
        const buffer_len = merge_len;
        // We'll consider each pair of merge chunks, and merge them together into one chunk.
        // While we do that, we'll propagate some keys from the front to the back of the list.
        var merge_index = start_position;
        var remaining_unmerged = data_len;
        while (remaining_unmerged >= full_merge_len) : ({
            merge_index += full_merge_len;
            remaining_unmerged -= full_merge_len;
        }) {
            mergeForwards(T, array.ptr + merge_index, buffer_len, merge_len, merge_len, cmp);
        }
        // We need special handling for partial blocks at the end
        if (remaining_unmerged > merge_len) {
            // If we have more than one block, do a merge with the partial last block.
            mergeForwards(T, array.ptr + merge_index, buffer_len, merge_len, remaining_unmerged - merge_len, cmp);
        } else {
            // Otherwise, rotate the buffer past the sorted chunk of remaining array.
            // TODO-OPT: Rotate preserves the order of both halves, but we only care about
            // the order of the right half. The left half can be completely unordered. So
            // there's room here for a faster implementation.
            rotate(T, array[merge_index - buffer_len ..][0 .. merge_len + remaining_unmerged], merge_len);
        }
        // Finally, move the start position back to get new keys next iteration
        start_position -= merge_len;
    }
    // All keys must now be at the back of the array.
    assert(start_position == 0);
    // Now that we've created sorted chunks of size block_len, we need
    // to do the last chunk. For this one, we'll go backwards through the
    // array, moving the keys back to the beginning.
    const full_merge_len = 2 * block_len;
    // TODO-OPT: Pretty sure this is a power of two, we can use a mask here instead.
    const last_block_len = data_len % full_merge_len;
    var remaining_full_blocks = data_len / full_merge_len;
    const final_offset = data_len - last_block_len;
    // First we have to consider the special case of the partially sorted block
    // at the end of the array. If it's smaller than a block, we can just rotate it.
    // Otherwise, we need to do a merge.
    if (last_block_len <= block_len) {
        rotate(T, array[final_offset..], last_block_len);
    } else {
        mergeBackwards(T, array.ptr + final_offset, block_len, last_block_len - block_len, block_len, cmp);
    }
    // Now continue to merge backwards through the rest of the array, back to the beginning.
    var merge_index = final_offset;
    while (remaining_full_blocks > 0) : (remaining_full_blocks -= 1) {
        merge_index -= full_merge_len;
        mergeBackwards(T, array.ptr + merge_index, block_len, block_len, block_len, cmp);
    }
}
/// This is similar to buildBlocksInPlace, but uses an external storage buffer
/// to reduce swaps. It will copy junk items into the external array, and then
/// use a variant of the merge operation which clobbers the junk buffer. Once
/// the merge size exceeds the size of the external buffer, it falls back to the
/// in-place version.
/// TODO-ZEN: Combine this with buildBlocksInPlace
fn buildBlocksExternal(comptime T: type, array: []T, block_len: usize, buffer: []T, cmp: anytype) void {
    // round the buffer length down to a power of two
    var pow2_buffer_len = if (buffer.len >= block_len) block_len else std.math.floorPowerOfTwo(usize, buffer.len);
    // copy as many keys as we can into the external block
    copyNoAlias(T, buffer.ptr, array.ptr + (block_len - pow2_buffer_len), pow2_buffer_len);
    setUndefined(T, (array.ptr + block_len - pow2_buffer_len)[0..pow2_buffer_len]);
    // Use a separate code path for pairs because it's much faster
    assert(pow2_buffer_len >= 2);
    var start_position = block_len - 2;
    pairwiseWrites(T, array[start_position..], cmp);
    // Then do forward merges just like in the in place version,
    // but clobber the junk buffer instead of preserving it.
    // TODO-ZEN: There is probably a better expression of this indexing,
    // see lazyStableSort for inspiration.
    var merge_len: usize = 2;
    while (merge_len < pow2_buffer_len) : (merge_len *= 2) {
        const full_merge_len = merge_len * 2;
        var remaining = array.len - block_len;
        var merge_position = start_position;
        while (remaining >= full_merge_len) : ({
            remaining -= full_merge_len;
            merge_position += full_merge_len;
        }) {
            mergeForwardExternal(T, array.ptr + merge_position, merge_len, merge_len, merge_len, cmp);
        }
        // Partial chunk at the end: merge it if it spans two blocks,
        // otherwise just slide it down over the clobberable region.
        if (remaining > merge_len) {
            mergeForwardExternal(T, array.ptr + merge_position, merge_len, merge_len, remaining - merge_len, cmp);
        } else {
            copyNoAlias(T, array.ptr + merge_position - merge_len, array.ptr + merge_position, remaining);
            setUndefined(T, (array.ptr + merge_position)[0..remaining]);
        }
        start_position -= merge_len;
    }
    assert(start_position + pow2_buffer_len == block_len);
    // If our buffer can hold the whole block size, we can
    // do an out of place reverse merge as well.
    if (pow2_buffer_len == block_len) {
        const full_merge_len = 2 * block_len;
        const data_len = array.len - block_len;
        // TODO-OPT: Pretty sure this is a power of two, we can use a mask here instead.
        const last_block_len = data_len % full_merge_len;
        var remaining_full_blocks = data_len / full_merge_len;
        const final_offset = data_len - last_block_len;
        // First we have to consider the special case of the partially sorted block
        // at the end of the array. If it's smaller than a block, we can just rotate it.
        // Otherwise, we need to do a merge.
        if (last_block_len <= block_len) {
            rotate(T, array[final_offset..], last_block_len);
        } else {
            mergeBackwardExternal(T, array.ptr + final_offset, block_len, last_block_len - block_len, block_len, cmp);
        }
        // Now continue to merge backwards through the rest of the array, back to the beginning.
        var merge_index = final_offset;
        while (remaining_full_blocks > 0) : (remaining_full_blocks -= 1) {
            merge_index -= full_merge_len;
            mergeBackwardExternal(T, array.ptr + merge_index, block_len, block_len, block_len, cmp);
        }
        // Then restore the buffer.
        // TODO-OPT: This isn't necessary, we can keep using the buffer.
        copyNoAlias(T, array.ptr + (block_len - pow2_buffer_len), buffer.ptr, pow2_buffer_len);
        setUndefined(T, buffer[0..pow2_buffer_len]);
    } else {
        // Buffer is smaller than a block: put the saved keys back at the
        // end of the array and let the in-place routine finish the job.
        copyNoAlias(T, array.ptr + (array.len - pow2_buffer_len), buffer.ptr, pow2_buffer_len);
        setUndefined(T, buffer[0..pow2_buffer_len]);
        // TODO-OPT: If the external buffer is large enough, we could do an
        // out-of-place reverse merge for even more speed.
        buildInPlace(T, array, merge_len, block_len, cmp);
    }
}
/// Sorts each adjacent pair of elements (starting at index 2) and
/// shifts every pair two slots toward the front. The two elements
/// displaced from the front are re-deposited at the back of the array,
/// though not necessarily in order. The caller must guarantee the
/// first two elements are unequal, or stability may be violated.
/// The passed array must contain at least two elements.
/// Example: [1 2 6 5 3 4 8 7 9] becomes [5 6 3 4 7 8 9 2 1].
fn pairwiseSwaps(comptime T: type, array: []T, cmp: anytype) void {
    const last = array.len;
    // Stash the two front elements before the shifting pass clobbers them.
    const displaced_first = array[0];
    const displaced_second = array[1];
    // Sort-and-shift every pair down by two positions.
    pairwiseWrites(T, array, cmp);
    // Deposit the stashed elements into the two freed slots at the end.
    array[last - 2] = displaced_first;
    array[last - 1] = displaced_second;
}
/// Sorts every pair of elements starting with the third and fourth,
/// moving each sorted pair two slots toward the front of the array.
/// The first two elements are clobbered by the shift.
/// Equal elements keep their original order, so the operation is stable.
/// Example:
///   [1 2 6 5 3 4 8 7 9]
/// becomes
///   [5 6 3 4 7 8 9 X X]  (X = clobbered / stale)
fn pairwiseWrites(comptime T: type, array: []T, cmp: anytype) void {
    var cursor: usize = 3;
    while (cursor < array.len) : (cursor += 2) {
        // Read the pair into locals first; the destination slots
        // (cursor-3, cursor-2) never overlap the source slots.
        const lo = array[cursor - 1];
        const hi = array[cursor];
        // Only swap on a strict less-than so that equal items keep
        // their relative order, preserving stability.
        // TODO-OPT: This wouldn't be difficult to write in a branchless way,
        // see if that makes a difference.
        if (cmp.lessThan(hi, lo)) {
            array[cursor - 3] = hi;
            array[cursor - 2] = lo;
        } else {
            array[cursor - 3] = lo;
            array[cursor - 2] = hi;
        }
    }
    // An odd-length tail leaves one unpaired item; shift it down alone.
    if (cursor == array.len) {
        array[cursor - 3] = array[cursor - 1];
    }
}
/// This function iterates pairs of subarrays of length subarray_len,
/// starting at first_block_idx, and including the partial subarray
/// at the end of the list. Each subarray must be sorted, and after
/// this call the pairs will be combined into larger sorted subarrays.
/// There must be at least one item in the array before first_block_idx
/// for each block in the array, excluding the final partial block
/// (these leading items are the movement-imitation keys).
/// If use_junk_buffer is true, there must also be at least block_len
/// more elements for the junk buffer, which will be used to accelerate
/// the merge operation.
/// Parameters:
///   array           - keys, optional junk buffer, then the data to merge.
///   first_block_idx - index of the first data item (keys + junk precede it).
///   subarray_len    - length of each already-sorted run being paired up.
///   block_len       - length of the fixed-size blocks used for merging.
///   mode            - which buffer strategy to use (see BufferMode).
///   external_buffer - key source when mode == .external_buffer.
///   cmp             - comparator providing lessThan/compare.
fn combineBlocks(
    comptime T: type,
    array: []T,
    first_block_idx: usize,
    subarray_len: usize,
    block_len: usize,
    mode: BufferMode,
    external_buffer: []T,
    cmp: anytype,
) void {
    // The total number of data items, excluding keys and the junk buffer
    const data_len = array.len - first_block_idx;
    // The length of a merged subarray
    const full_merge_len = 2 * subarray_len;
    // The number of full merges we will do
    var merge_count = data_len / full_merge_len;
    // The number of items that do not fit into a full merge
    const leftover_count = data_len % full_merge_len;
    // The number of blocks in a full merge
    const block_count = full_merge_len / block_len;
    // The number of blocks in a subarray
    const half_block_count = @divExact(block_count, 2);
    // The block index of the first block of the second subarray
    const initial_median = half_block_count;
    // The length of our inline junk buffer.
    const junk_len = if (mode == .no_buffer) 0 else block_len;
    // The array from which we will pull keys.
    const keys_base = if (mode == .external_buffer) external_buffer
        else array[0..first_block_idx - junk_len];
    // The index of the first subarray to be merged
    // TODO-ZEN: Consider making merge_start a pointer
    var merge_start: usize = first_block_idx;
    // Iterate through pairs of subarrays, merging them.
    // If we are using a junk buffer, this operation also
    // transfers it from the left to the right.
    while (merge_count > 0) : (merge_count -= 1) {
        // The keys are used to preserve stability in blockSelectSort.
        const keys = keys_base[0..block_count];
        // We need to make sure they are in order.
        insertSort(T, keys, cmp);
        // blockSelectSort will do a selection sort on the blocks in the array.
        // It also permutes the keys to match. This puts blocks in closer to the
        // right order, allowing us to combine them with mergeBlocks.
        const median_key = blockSelectSort(T, keys.ptr, array.ptr + merge_start, initial_median, block_count, block_len, cmp);
        // TODO-ZIG: Replace this with inline switch when that's implemented
        // TODO-ZEN: Make the parameters the same for all of these in preparation
        switch (mode) {
            .no_buffer => mergeBlocks(T, keys.ptr, median_key, array.ptr + merge_start, block_count, block_len, 0, 0, .no_buffer, cmp),
            .inline_buffer => mergeBlocks(T, keys.ptr, median_key, array.ptr + merge_start, block_count, block_len, 0, 0, .inline_buffer, cmp),
            .external_buffer => mergeBlocks(T, keys.ptr, median_key, array.ptr + merge_start, block_count, block_len, 0, 0, .external_buffer, cmp),
        }
        merge_start += full_merge_len;
    }
    // If the number of left over items is greater than the size of a subarray,
    // we need to merge them. This is a more difficult merge because the last
    // subarray cannot participate in the selection sort. Because of that,
    // we cannot necessarily merge all full blocks, because some of them are
    // not ordered correctly with regard to the trailer.
    if (leftover_count > subarray_len) {
        const trailer_blocks = leftover_count / block_len;
        const trailer_items = leftover_count % block_len;
        // The +1 here adds an extra key which tracks the trailer items.
        // TODO-OPT: I don't think we actually need a key for the trailer.
        // hmm, we do always need one for the median though.
        const keys = keys_base[0..trailer_blocks+1];
        insertSort(T, keys, cmp);
        // perform our selection sort as usual, including even blocks that
        // may not end up being part of the standard sort. Only do this if
        // we have full blocks on the right side. If it's just a partial block,
        // everything except the partial block is already sorted (because it's
        // all the left side).
        const median_key = if (trailer_blocks <= half_block_count) initial_median
            else blockSelectSort(T, keys.ptr, array.ptr + merge_start, initial_median, trailer_blocks, block_len, cmp);
        // This counts the number of full blocks at the end of the trailer
        // that cannot participate in the standard merge because they come
        // after the start of the trailer in sorted order.
        // TODO-OPT: We know that any unfit blocks must come from the left
        // array. This means we could find the unfit blocks before doing the
        // selection sort, and copy them directly to the end of the array.
        // This allows for a smaller selection sort, and means that the two
        // largest items (which will likely be swapped multiple times during
        // the sort) no longer participate. It also means we need to insertion
        // sort three fewer keys :P
        const unfit_trailer_blocks: usize = if (trailer_items > 0) countLastMergeBlocks(T, array[merge_start..], trailer_blocks, block_len, cmp) else 0;
        // The number of blocks that do participate in the normal merge follows immediately.
        const normal_blocks = trailer_blocks - unfit_trailer_blocks;
        // If there are no normal blocks, the trailer comes first.
        if (normal_blocks == 0) {
            // Note that this can only happen if there are no trailer blocks
            // after the last full subarray. If there were, those blocks
            // would be normal blocks.
            assert(trailer_blocks == half_block_count);
            // In this case, the selection sort did nothing, and the blocks
            // are all in sorted order. So we just need to merge the trailer
            // items with the sorted left half.
            const left_len = trailer_blocks * block_len;
            const right_len = trailer_items;
            // TODO-ZIG: Change mode to an inline parameter once those are
            // implemented, to avoid this switch.
            switch (mode) {
                .no_buffer => mergeIntoPrecedingJunk(T, array.ptr + merge_start, junk_len, left_len, right_len, .no_buffer, cmp),
                .inline_buffer => mergeIntoPrecedingJunk(T, array.ptr + merge_start, junk_len, left_len, right_len, .inline_buffer, cmp),
                .external_buffer => mergeIntoPrecedingJunk(T, array.ptr + merge_start, junk_len, left_len, right_len, .external_buffer, cmp),
            }
        } else {
            const unfit_items = block_len * unfit_trailer_blocks + trailer_items;
            // Common case, some blocks participate in the merge.
            // TODO-ZIG: Replace this with inline switch when that's implemented
            switch (mode) {
                .no_buffer => mergeBlocks(T, keys.ptr, median_key, array.ptr + merge_start, normal_blocks, block_len, unfit_trailer_blocks, unfit_items, .no_buffer, cmp),
                .inline_buffer => mergeBlocks(T, keys.ptr, median_key, array.ptr + merge_start, normal_blocks, block_len, unfit_trailer_blocks, unfit_items, .inline_buffer, cmp),
                .external_buffer => mergeBlocks(T, keys.ptr, median_key, array.ptr + merge_start, normal_blocks, block_len, unfit_trailer_blocks, unfit_items, .external_buffer, cmp),
            }
        }
        merge_start += leftover_count;
        assert(merge_start == array.len);
    }
    // If we have a junk buffer, we need to move it back to the beginning of the list.
    // TODO-ZIG: Replace this with an inline switch or parameter when those are in.
    switch (mode) {
        .no_buffer => moveFrontPastJunk(T, array[first_block_idx - junk_len..merge_start], merge_start - first_block_idx, .no_buffer),
        .inline_buffer => moveFrontPastJunk(T, array[first_block_idx - junk_len..merge_start], merge_start - first_block_idx, .inline_buffer),
        .external_buffer => moveFrontPastJunk(T, array[first_block_idx - junk_len..merge_start], merge_start - first_block_idx, .external_buffer),
    }
}
/// Selection sort over the blocks in the array, comparing blocks by
/// their first item only (each block is individually sorted, so the
/// first item is that block's minimum). When two blocks tie on their
/// first item, the movement-imitation keys break the tie — keys are
/// unique, so this keeps the sort stable. Because ties must be
/// detected, a full three-way comparison (cmp.compare) is required,
/// not just lessThan. Every swap applied to block data is mirrored on
/// the keys.
/// The initial_median parameter names a block by index; that block is
/// tracked through all swaps and its final index is returned.
/// The partial block at the end of the array is NOT handled here; the
/// caller deals with it separately.
/// Note that blocks_ptr points at the first real block — the junk
/// buffer is not included.
fn blockSelectSort(comptime T: type, noalias keys_ptr: [*]T, noalias blocks_ptr: [*]T, initial_median: usize, block_count: usize, block_len: usize, cmp: anytype) usize {
    const keys = keys_ptr[0..block_count];
    const blocks = blocks_ptr[0..block_count * block_len];
    assert(initial_median < block_count);
    // Follow the median block through every swap we make.
    var median_key = initial_median;
    // Invariant: all blocks before sorted_len are in final position.
    var sorted_len: usize = 0;
    while (sorted_len < block_count) : (sorted_len += 1) {
        // Scan the unsorted suffix for the minimal block.
        var min_idx = sorted_len;
        var scan = sorted_len + 1;
        while (scan < block_count) : (scan += 1) {
            // Blocks are compared via their first (smallest) items.
            const order = cmp.compare(blocks[block_len * scan],
                blocks[block_len * min_idx]);
            // On a tie, compare the unique keys instead; this preserves
            // the input order of equal blocks, keeping the sort stable.
            // (The key comparison only runs when order == .eq.)
            const strictly_less = order == .lt;
            const wins_tie = order == .eq and
                cmp.lessThan(keys[scan], keys[min_idx]);
            if (strictly_less or wins_tie) {
                min_idx = scan;
            }
        }
        // Already in place? Then there is nothing to swap.
        if (min_idx == sorted_len) continue;
        // Swap the block contents.
        blockSwap(
            T,
            blocks.ptr + block_len * sorted_len,
            blocks.ptr + block_len * min_idx,
            block_len,
        );
        // Mirror the swap on the keys to preserve stability.
        // TODO-OPT: If we have an external buffer and there is room,
        // we could store block indexes in it instead. Those are faster
        // to compare than keys, and faster to swap. They also don't
        // need to be sorted back afterwards.
        const held = keys[sorted_len];
        keys[sorted_len] = keys[min_idx];
        keys[min_idx] = held;
        // Keep the median index pointing at the same block it named
        // before the swap.
        if (median_key == sorted_len) {
            median_key = min_idx;
        } else if (median_key == min_idx) {
            median_key = sorted_len;
        }
    }
    return median_key;
}
/// When handling the trailing elements at the end of the array,
/// we cannot necessarily sort all complete blocks as normal, and
/// then merge in the trailer. The reason is that the trailer does
/// not undergo the selection sort before merging, so it is not
/// necessarily true that the first element of the trailer would be
/// greater than the last sorted element. Because of this, we
/// can't actually merge all trailer blocks as normal. This function
/// counts how many trailing full blocks must be merged manually for
/// the sort to remain correct.
/// Parameters:
///   trailer        - the leftover region: full blocks then a partial block.
///   trailer_blocks - number of full blocks at the start of `trailer`.
///   block_len      - length of each full block.
///   cmp            - comparator providing lessThan.
/// Returns the count of trailing full blocks to merge manually.
fn countLastMergeBlocks(
    comptime T: type,
    trailer: []T,
    trailer_blocks: usize,
    block_len: usize,
    cmp: anytype,
) usize {
    var blocks_to_merge_manually: usize = 0;
    // Index of the first item after all full blocks, i.e. the start of
    // the partial trailer. Never changes, so it is a const (a `var`
    // here would be rejected by modern Zig compilers as never mutated).
    const first_item_after_blocks = trailer_blocks * block_len;
    var curr_full_block = first_item_after_blocks;
    // We can include a block in the normal sort as long as that
    // block's first item is less than or equal to the trailer.
    // If it's greater than the the first item in the trailer,
    // we can't sort it normally because we may end up with items
    // greater than the start of the trailer in the "sorted" part
    // of the list. In the case that the start of a block is
    // equal to the start of the trailer, the block's starting
    // value is considered to be less than the trailer, because
    // the trailer consists of the items that were at the end of the
    // array when the sort began.
    while (blocks_to_merge_manually < trailer_blocks and
        cmp.lessThan(trailer[first_item_after_blocks],
            trailer[curr_full_block - block_len])) {
        blocks_to_merge_manually += 1;
        curr_full_block -= block_len;
    }
    return blocks_to_merge_manually;
}
/// This function accepts a list of keys and a list of blocks.
/// After calling it, the scratch buffer at the beginning of the
/// array will be at the end (reordered), and all other items
/// will be sorted on the left. If the mode needs a junk buffer,
/// the list of blocks must be preceded by block_len items of junk,
/// and be followed by block_count * block_len items of data.
/// For that data, within each block, the items must be sorted.
/// The blocks themselves must be sorted based on the first item
/// in each block. The data may optionally be followed by a set
/// of "trailer blocks", which must also be sorted in the same way.
/// There may be yet more items in a partial block after the
/// trailer blocks. trailer_len is the total of the items in the
/// trailer blocks and the items afterwards.
/// Trailer blocks must match the criteria in countLastMergeBlocks.
fn mergeBlocks(
comptime T : type,
noalias keys_ptr : [*]const T,
median_key : usize,
noalias blocks_ptr : [*]T,
block_count : usize,
block_len : usize,
trailer_blocks : usize,
trailer_len : usize,
comptime mode : BufferMode,
cmp : anytype,
) void {
const junk_len = if (mode == .no_buffer) 0 else block_len;
// if this is the last segment, we may need one extra item for the median key.
const keys = keys_ptr[0..block_count + trailer_blocks + 1];
const blocks = (blocks_ptr - junk_len)[0..junk_len + block_count * block_len + trailer_len];