rust-seq · imartayan · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025
diff --git a/simd-minimizers-bench/english.200MB b/simd-minimizers-bench/english.200MB
@@ -0,0 +1 @@
+/home/philae/git/eth/data/pizzachili/english.200MB
diff --git a/simd-minimizers-bench/sources.200MB b/simd-minimizers-bench/sources.200MB
@@ -0,0 +1 @@
+/home/philae/git/eth/data/pizzachili/sources.200MB
diff --git a/simd-minimizers-bench/src/bin/paper.rs b/simd-minimizers-bench/src/bin/paper.rs
@@ -73,7 +73,7 @@ fn bench_minimizers(w: usize, k: usize) {
             v2,
         );
         v2.clear();
-        collect::collect_and_dedup_into::<false>(
+        collect::collect_and_dedup_into(
             minimizers::canonical_minimizers_seq_simd::<_, H>(packed_seq, k, w),
             v2,
         );

diff --git a/simd-minimizers/src/collect.rs b/simd-minimizers/src/collect.rs
@@ -72,46 +72,90 @@ pub fn collect_into(
 }
 
 thread_local! {
-    static CACHE: RefCell<[Vec<u32>; 8]> = RefCell::new(array::from_fn(|_| Vec::new()));
+    static CACHE: RefCell<[Vec<u32>; 16]> = RefCell::new(array::from_fn(|_| Vec::new()));
 }
 
 /// Convenience wrapper around `collect_and_dedup_into`.
-pub fn collect_and_dedup<const SUPER: bool>(
+pub fn collect_and_dedup(
     (par_head, tail): (
         impl ExactSizeIterator<Item = S>,
         impl ExactSizeIterator<Item = u32>,
     ),
 ) -> Vec<u32> {
     let mut v = vec![];
-    collect_and_dedup_into::<SUPER>((par_head, tail), &mut v);
+    collect_and_dedup_into((par_head, tail), &mut v);
     v
 }
 
+/// Convenience wrapper around `collect_and_dedup_with_index_into`.
+pub fn collect_and_dedup_with_index(
+    (par_head, tail): (
+        impl ExactSizeIterator<Item = S>,
+        impl ExactSizeIterator<Item = u32>,
+    ),
+) -> (Vec<u32>, Vec<u32>) {
+    let mut v = vec![];
+    let mut v2 = vec![];
+    collect_and_dedup_with_index_into((par_head, tail), &mut v, &mut v2);
+    (v, v2)
+}
+
+/// Collect a SIMD-iterator into a single vector, and duplicate adjacent equal elements.
+/// Works by taking 8 elements from each stream, and then transposing the SIMD-matrix before writing out the results.
+///
+/// The output is simply the deduplicated input values.
+#[inline(always)]
+pub fn collect_and_dedup_into(
+    (par_head, tail): (
+        impl ExactSizeIterator<Item = S>,
+        impl ExactSizeIterator<Item = u32>,
+    ),
+    out_vec: &mut Vec<u32>,
+) {
+    collect_and_dedup_into_impl::<false>((par_head, tail), out_vec, &mut vec![]);
+}
+
 /// Collect a SIMD-iterator into a single vector, and duplicate adjacent equal elements.
 /// Works by taking 8 elements from each stream, and then transposing the SIMD-matrix before writing out the results.
 ///
-/// By default (when `SUPER` is false), the output is simply the deduplicated input values.
-/// When `SUPER` is true, each returned `u32` is a tuple of `(u16,16)` where the low bits are those of the input value,
-/// and the high bits are the index of the stream it first appeared, i.e., the start of its super-k-mer.
-/// These positions are mod 2^16. When the window length is <2^16, this is sufficient to recover full super-k-mers.
+/// The deduplicated input values are written in `out_vec` and the index of the stream it first appeared, i.e., the start of its super-k-mer, is written in `idx_vec`.
 #[inline(always)]
-pub fn collect_and_dedup_into<const SUPER: bool>(
+pub fn collect_and_dedup_with_index_into(
     (par_head, tail): (
         impl ExactSizeIterator<Item = S>,
         impl ExactSizeIterator<Item = u32>,
     ),
     out_vec: &mut Vec<u32>,
+    idx_vec: &mut Vec<u32>,
+) {
+    collect_and_dedup_into_impl::<true>((par_head, tail), out_vec, idx_vec);
+}
+
+/// Collect a SIMD-iterator into a single vector, and duplicate adjacent equal elements.
+/// Works by taking 8 elements from each stream, and then transposing the SIMD-matrix before writing out the results.
+///
+/// By default (when `SUPER` is false), the deduplicated input values are written in `out_vec`.
+/// When `SUPER` is true, the index of the stream in which the input value first appeared, i.e., the start of its super-k-mer, is additionale written in `idx_vec`.
+#[inline(always)]
+fn collect_and_dedup_into_impl<const SUPER: bool>(
+    (par_head, tail): (
+        impl ExactSizeIterator<Item = S>,
+        impl ExactSizeIterator<Item = u32>,
+    ),
+    out_vec: &mut Vec<u32>,
+    idx_vec: &mut Vec<u32>,
 ) {
     CACHE.with(|v| {
         let mut v = v.borrow_mut();
+        let (v, v2) = v.split_at_mut(8);
 
         let mut write_idx = [0; 8];
         // Vec of last pushed elements in each lane.
-        let mut old = [unsafe { transmute([u32::MAX; 8]) }; 8];
+        let mut old = [S::MAX; 8];
 
         let len = par_head.len();
-        let lane_offsets: [u32x8; 8] = from_fn(|i| u32x8::splat(((i * len) << 16) as u32));
-        let offsets: [u32; 8] = from_fn(|i| (i << 16) as u32);
+        let lane_offsets: [u32x8; 8] = from_fn(|i| u32x8::splat((i * len) as u32));
+        let offsets: [u32; 8] = from_fn(|i| i as u32);
         let mut offsets: u32x8 = unsafe { transmute(offsets) };
 
         let mut m = [u32x8::ZERO; 8];
@@ -120,64 +164,94 @@ pub fn collect_and_dedup_into<const SUPER: bool>(
             m[i % 8] = x;
             if i % 8 == 7 {
                 let t = transpose(m);
-                offsets += u32x8::splat(8 << 16);
                 for j in 0..8 {
                     let lane = t[j];
-                    let vals = if SUPER {
-                        // High 16 bits are the index where the minimizer first becomes minimal.
-                        // Low 16 bits are the position of the minimizer itself.
-                        (offsets + lane_offsets[j]) | (lane & u32x8::splat(0xFFFF))
-                    } else {
-                        lane
-                    };
                     if write_idx[j] + 8 > v[j].len() {
                         let new_len = v[j].len() + 1024;
                         v[j].resize(new_len, 0);
+                        if SUPER {
+                            v2[j].resize(new_len, 0);
+                        }
                     }
                     unsafe {
-                        crate::intrinsics::append_unique_vals(
-                            old[j],
-                            transmute(lane),
-                            transmute(vals),
-                            &mut v[j],
-                            &mut write_idx[j],
-                        );
-                        old[j] = transmute(lane);
+                        if SUPER {
+                            crate::intrinsics::append_unique_vals_2(
+                                old[j],
+                                lane,
+                                lane,
+                                offsets + lane_offsets[j],
+                                &mut v[j],
+                                &mut v2[j],
+                                &mut write_idx[j],
+                            );
+                        } else {
+                            crate::intrinsics::append_unique_vals(
+                                old[j],
+                                lane,
+                                lane,
+                                &mut v[j],
+                                &mut write_idx[j],
+                            );
+                        }
+                        old[j] = lane;
                     }
                 }
+                offsets += u32x8::splat(8);
             }
             i += 1;
         });
 
         for j in 0..8 {
             v[j].truncate(write_idx[j]);
+            if SUPER {
+                v2[j].truncate(write_idx[j]);
+            }
         }
 
         // Manually write the unfinished parts of length k=i%8.
         let t = transpose(m);
         let k = i % 8;
         for j in 0..8 {
-            let lane = &unsafe { transmute::<_, [u32; 8]>(t[j]) }[..k];
-            for x in lane {
+            let lane = t[j].as_array_ref();
+            for (p, x) in lane.iter().take(k).enumerate() {
                 if v[j].last() != Some(x) {
                     v[j].push(*x);
+                    if SUPER {
+                        v2[j].push(offsets.as_array_ref()[p] + lane_offsets[j].as_array_ref()[p]);
+                    }
                 }
             }
         }
 
         // Flatten v.
-        for lane in v.iter() {
-            let mut lane = lane.as_slice();
-            while !lane.is_empty() && Some(lane[0]) == out_vec.last().copied() {
-                lane = &lane[1..];
+        if SUPER {
+            for (lane, lane2) in v.iter().zip(v2.iter()) {
+                let mut lane = lane.as_slice();
+                let mut lane2 = lane2.as_slice();
+                while !lane.is_empty() && Some(lane[0]) == out_vec.last().copied() {
+                    lane = &lane[1..];
+                    lane2 = &lane2[1..];
+                }
+                out_vec.extend_from_slice(lane);
+                idx_vec.extend_from_slice(lane2);
+            }
+        } else {
+            for lane in v.iter() {
+                let mut lane = lane.as_slice();
+                while !lane.is_empty() && Some(lane[0]) == out_vec.last().copied() {
+                    lane = &lane[1..];
+                }
+                out_vec.extend_from_slice(lane);
             }
-            out_vec.extend_from_slice(lane);
         }
 
         // Manually write the dedup'ed explicit tail.
-        for x in tail {
+        for (p, x) in tail.enumerate() {
             if out_vec.last() != Some(&x) {
                 out_vec.push(x);
+                if SUPER {
+                    idx_vec.push((8 * len + p) as u32);
+                }
             }
         }
 

diff --git a/simd-minimizers/src/intrinsics/dedup.rs b/simd-minimizers/src/intrinsics/dedup.rs
@@ -23,6 +23,36 @@ pub unsafe fn append_unique_vals(old: S, new: S, vals: S, v: &mut [u32], write_i
     }
 }
 
+/// Dedup adjacent `new` values (starting with the last element of `old`).
+/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]` and `vals2` to `v2[write_idx]`.
+#[inline(always)]
+#[cfg(not(any(target_feature = "avx2", target_feature = "neon")))]
+pub unsafe fn append_unique_vals_2(
+    old: S,
+    new: S,
+    vals: S,
+    vals2: S,
+    v: &mut [u32],
+    v2: &mut [u32],
+    write_idx: &mut usize,
+) {
+    unsafe {
+        let old = old.to_array();
+        let new = new.to_array();
+        let vals = vals.to_array();
+        let vals2 = vals2.to_array();
+        let mut prec = old[7];
+        for (i, &curr) in new.iter().enumerate() {
+            if curr != prec {
+                v.as_mut_ptr().add(*write_idx).write(vals[i]);
+                v2.as_mut_ptr().add(*write_idx).write(vals2[i]);
+                *write_idx += 1;
+                prec = curr;
+            }
+        }
+    }
+}
+
 /// Dedup adjacent `new` values (starting with the last element of `old`).
 /// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
 ///
@@ -52,6 +82,46 @@ pub unsafe fn append_unique_vals(old: S, new: S, vals: S, v: &mut [u32], write_i
     }
 }
 
+/// Dedup adjacent `new` values (starting with the last element of `old`).
+/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]` and `vals2` to `v2[write_idx]`.
+///
+/// Based on Daniel Lemire's blog.
+/// <https://lemire.me/blog/2017/04/10/removing-duplicates-from-lists-quickly/>
+/// <https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/edfd0e8b809d9a57527a7990c4bb44b9d1d05a69/2017/04/10/removeduplicates.cpp>
+#[cfg(target_feature = "avx2")]
+#[inline(always)]
+pub unsafe fn append_unique_vals_2(
+    old: S,
+    new: S,
+    vals: S,
+    vals2: S,
+    v: &mut [u32],
+    v2: &mut [u32],
+    write_idx: &mut usize,
+) {
+    unsafe {
+        use core::arch::x86_64::*;
+
+        let old = transmute(old);
+        let new = transmute(new);
+        let vals = transmute(vals);
+        let vals2 = transmute(vals2);
+
+        let recon = _mm256_blend_epi32(old, new, 0b01111111);
+        let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle
+        let vec_tmp = _mm256_permutevar8x32_epi32(recon, movebyone_mask);
+
+        let m = _mm256_movemask_ps(transmute(_mm256_cmpeq_epi32(vec_tmp, new))) as usize;
+        let numberofnewvalues = L - m.count_ones() as usize;
+        let key = transmute(UNIQSHUF[m]);
+        let val = _mm256_permutevar8x32_epi32(vals, key);
+        _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val);
+        let val2 = _mm256_permutevar8x32_epi32(vals2, key);
+        _mm256_storeu_si256(v2.as_mut_ptr().add(*write_idx) as *mut __m256i, val2);
+        *write_idx += numberofnewvalues;
+    }
+}
+
 /// Dedup adjacent `new` values (starting with the last element of `old`).
 /// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
 ///
@@ -106,6 +176,73 @@ pub unsafe fn append_unique_vals(old: S, new: S, vals: S, v: &mut [u32], write_i
     }
 }
 
+/// Dedup adjacent `new` values (starting with the last element of `old`).
+/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]` and `vals2` to `v2[write_idx]`.
+///
+/// Somewhat based on Daniel Lemire's blog.
+/// <https://lemire.me/blog/2017/04/10/removing-duplicates-from-lists-quickly/>
+/// <https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/edfd0e8b809d9a57527a7990c4bb44b9d1d05a69/2017/04/10/removeduplicates.cpp>
+#[inline(always)]
+#[cfg(target_feature = "neon")]
+pub unsafe fn append_unique_vals_2(
+    old: S,
+    new: S,
+    vals: S,
+    vals2: S,
+    v: &mut [u32],
+    v2: &mut [u32],
+    write_idx: &mut usize,
+) {
+    unsafe {
+        use core::arch::aarch64::{vpaddd_u64, vpaddlq_u32, vqtbl2q_u8, vst1_u32_x4};
+        use wide::u32x4;
+
+        let new_old_mask = S::new([
+            u32::MAX,
+            u32::MAX,
+            u32::MAX,
+            u32::MAX,
+            u32::MAX,
+            u32::MAX,
+            u32::MAX,
+            0,
+        ]);
+        let recon = new_old_mask.blend(new, old);
+
+        let rotate_idx = S::new([7, 0, 1, 2, 3, 4, 5, 6]);
+        let idx = rotate_idx * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
+        let (i1, i2) = transmute(idx);
+        let t = transmute(recon);
+        let r1 = vqtbl2q_u8(t, i1);
+        let r2 = vqtbl2q_u8(t, i2);
+        let prec: S = transmute((r1, r2));
+
+        let dup = prec.cmp_eq(new);
+        let (d1, d2): (u32x4, u32x4) = transmute(dup);
+        let pow1 = u32x4::new([1, 2, 4, 8]);
+        let pow2 = u32x4::new([16, 32, 64, 128]);
+        let m1 = vpaddd_u64(vpaddlq_u32(transmute(d1 & pow1)));
+        let m2 = vpaddd_u64(vpaddlq_u32(transmute(d2 & pow2)));
+        let m = (m1 | m2) as usize;
+
+        let numberofnewvalues = L - m.count_ones() as usize;
+        let key = UNIQSHUF[m];
+        let idx = key * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
+        let (i1, i2) = transmute(idx);
+        let t = transmute(vals);
+        let r1 = vqtbl2q_u8(t, i1);
+        let r2 = vqtbl2q_u8(t, i2);
+        let val: S = transmute((r1, r2));
+        vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val));
+        let t = transmute(vals2);
+        let r1 = vqtbl2q_u8(t, i1);
+        let r2 = vqtbl2q_u8(t, i2);
+        let val2: S = transmute((r1, r2));
+        vst1_u32_x4(v2.as_mut_ptr().add(*write_idx), transmute(val2));
+        *write_idx += numberofnewvalues;
+    }
+}
+
 /// For each of 256 masks of which elements are different than their predecessor,
 /// a shuffle that sends those new elements to the beginning.
 #[rustfmt::skip]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		/home/philae/git/eth/data/pizzachili/english.200MB
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		/home/philae/git/eth/data/pizzachili/sources.200MB