Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Compute super-k-mer positions as u32s, add wrappers and tests #5

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions simd-minimizers-bench/english.200MB
1 change: 1 addition & 0 deletions simd-minimizers-bench/sources.200MB
2 changes: 1 addition & 1 deletion simd-minimizers-bench/src/bin/paper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ fn bench_minimizers(w: usize, k: usize) {
v2,
);
v2.clear();
collect::collect_and_dedup_into::<false>(
collect::collect_and_dedup_into(
minimizers::canonical_minimizers_seq_simd::<_, H>(packed_seq, k, w),
v2,
);
Expand Down
144 changes: 109 additions & 35 deletions simd-minimizers/src/collect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,46 +72,90 @@ pub fn collect_into(
}

thread_local! {
static CACHE: RefCell<[Vec<u32>; 8]> = RefCell::new(array::from_fn(|_| Vec::new()));
static CACHE: RefCell<[Vec<u32>; 16]> = RefCell::new(array::from_fn(|_| Vec::new()));
}

/// Convenience wrapper around `collect_and_dedup_into`.
pub fn collect_and_dedup<const SUPER: bool>(
pub fn collect_and_dedup(
(par_head, tail): (
impl ExactSizeIterator<Item = S>,
impl ExactSizeIterator<Item = u32>,
),
) -> Vec<u32> {
let mut v = vec![];
collect_and_dedup_into::<SUPER>((par_head, tail), &mut v);
collect_and_dedup_into((par_head, tail), &mut v);
v
}

/// Convenience wrapper around `collect_and_dedup_with_index_into`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

document which output component contains what (here and possibly elsewhere)

pub fn collect_and_dedup_with_index(
(par_head, tail): (
impl ExactSizeIterator<Item = S>,
impl ExactSizeIterator<Item = u32>,
),
) -> (Vec<u32>, Vec<u32>) {
let mut v = vec![];
let mut v2 = vec![];
collect_and_dedup_with_index_into((par_head, tail), &mut v, &mut v2);
(v, v2)
}

/// Collect a SIMD-iterator into a single vector, and duplicate adjacent equal elements.
/// Works by taking 8 elements from each stream, and then transposing the SIMD-matrix before writing out the results.
///
/// The output is simply the deduplicated input values.
#[inline(always)]
pub fn collect_and_dedup_into(
(par_head, tail): (
impl ExactSizeIterator<Item = S>,
impl ExactSizeIterator<Item = u32>,
),
out_vec: &mut Vec<u32>,
) {
collect_and_dedup_into_impl::<false>((par_head, tail), out_vec, &mut vec![]);
}

/// Collect a SIMD-iterator into a single vector, and duplicate adjacent equal elements.
/// Works by taking 8 elements from each stream, and then transposing the SIMD-matrix before writing out the results.
///
/// By default (when `SUPER` is false), the output is simply the deduplicated input values.
/// When `SUPER` is true, each returned `u32` is a tuple of `(u16,16)` where the low bits are those of the input value,
/// and the high bits are the index of the stream it first appeared, i.e., the start of its super-k-mer.
/// These positions are mod 2^16. When the window length is <2^16, this is sufficient to recover full super-k-mers.
/// The deduplicated input values are written in `out_vec` and the index of the stream it first appeared, i.e., the start of its super-k-mer, is written in `idx_vec`.
#[inline(always)]
pub fn collect_and_dedup_into<const SUPER: bool>(
pub fn collect_and_dedup_with_index_into(
(par_head, tail): (
impl ExactSizeIterator<Item = S>,
impl ExactSizeIterator<Item = u32>,
),
out_vec: &mut Vec<u32>,
idx_vec: &mut Vec<u32>,
) {
collect_and_dedup_into_impl::<true>((par_head, tail), out_vec, idx_vec);
}

/// Collect a SIMD-iterator into a single vector, and duplicate adjacent equal elements.
/// Works by taking 8 elements from each stream, and then transposing the SIMD-matrix before writing out the results.
///
/// By default (when `SUPER` is false), the deduplicated input values are written in `out_vec`.
/// When `SUPER` is true, the index of the stream in which the input value first appeared, i.e., the start of its super-k-mer, is additionale written in `idx_vec`.
#[inline(always)]
fn collect_and_dedup_into_impl<const SUPER: bool>(
(par_head, tail): (
impl ExactSizeIterator<Item = S>,
impl ExactSizeIterator<Item = u32>,
),
out_vec: &mut Vec<u32>,
idx_vec: &mut Vec<u32>,
) {
CACHE.with(|v| {
let mut v = v.borrow_mut();
let (v, v2) = v.split_at_mut(8);

let mut write_idx = [0; 8];
// Vec of last pushed elements in each lane.
let mut old = [unsafe { transmute([u32::MAX; 8]) }; 8];
let mut old = [S::MAX; 8];

let len = par_head.len();
let lane_offsets: [u32x8; 8] = from_fn(|i| u32x8::splat(((i * len) << 16) as u32));
let offsets: [u32; 8] = from_fn(|i| (i << 16) as u32);
let lane_offsets: [u32x8; 8] = from_fn(|i| u32x8::splat((i * len) as u32));
let offsets: [u32; 8] = from_fn(|i| i as u32);
let mut offsets: u32x8 = unsafe { transmute(offsets) };

let mut m = [u32x8::ZERO; 8];
Expand All @@ -120,64 +164,94 @@ pub fn collect_and_dedup_into<const SUPER: bool>(
m[i % 8] = x;
if i % 8 == 7 {
let t = transpose(m);
offsets += u32x8::splat(8 << 16);
for j in 0..8 {
let lane = t[j];
let vals = if SUPER {
// High 16 bits are the index where the minimizer first becomes minimal.
// Low 16 bits are the position of the minimizer itself.
(offsets + lane_offsets[j]) | (lane & u32x8::splat(0xFFFF))
} else {
lane
};
if write_idx[j] + 8 > v[j].len() {
let new_len = v[j].len() + 1024;
v[j].resize(new_len, 0);
if SUPER {
v2[j].resize(new_len, 0);
}
}
unsafe {
crate::intrinsics::append_unique_vals(
old[j],
transmute(lane),
transmute(vals),
&mut v[j],
&mut write_idx[j],
);
old[j] = transmute(lane);
if SUPER {
crate::intrinsics::append_unique_vals_2(
old[j],
lane,
lane,
offsets + lane_offsets[j],
&mut v[j],
&mut v2[j],
&mut write_idx[j],
);
} else {
crate::intrinsics::append_unique_vals(
old[j],
lane,
lane,
&mut v[j],
&mut write_idx[j],
);
}
old[j] = lane;
}
}
offsets += u32x8::splat(8);
}
i += 1;
});

for j in 0..8 {
v[j].truncate(write_idx[j]);
if SUPER {
v2[j].truncate(write_idx[j]);
}
}

// Manually write the unfinished parts of length k=i%8.
let t = transpose(m);
let k = i % 8;
for j in 0..8 {
let lane = &unsafe { transmute::<_, [u32; 8]>(t[j]) }[..k];
for x in lane {
let lane = t[j].as_array_ref();
for (p, x) in lane.iter().take(k).enumerate() {
if v[j].last() != Some(x) {
v[j].push(*x);
if SUPER {
v2[j].push(offsets.as_array_ref()[p] + lane_offsets[j].as_array_ref()[p]);
}
}
}
}

// Flatten v.
for lane in v.iter() {
let mut lane = lane.as_slice();
while !lane.is_empty() && Some(lane[0]) == out_vec.last().copied() {
lane = &lane[1..];
if SUPER {
for (lane, lane2) in v.iter().zip(v2.iter()) {
let mut lane = lane.as_slice();
let mut lane2 = lane2.as_slice();
while !lane.is_empty() && Some(lane[0]) == out_vec.last().copied() {
lane = &lane[1..];
lane2 = &lane2[1..];
}
out_vec.extend_from_slice(lane);
idx_vec.extend_from_slice(lane2);
}
} else {
for lane in v.iter() {
let mut lane = lane.as_slice();
while !lane.is_empty() && Some(lane[0]) == out_vec.last().copied() {
lane = &lane[1..];
}
out_vec.extend_from_slice(lane);
}
out_vec.extend_from_slice(lane);
}

// Manually write the dedup'ed explicit tail.
for x in tail {
for (p, x) in tail.enumerate() {
if out_vec.last() != Some(&x) {
out_vec.push(x);
if SUPER {
idx_vec.push((8 * len + p) as u32);
}
}
}

Expand Down
137 changes: 137 additions & 0 deletions simd-minimizers/src/intrinsics/dedup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,36 @@ pub unsafe fn append_unique_vals(old: S, new: S, vals: S, v: &mut [u32], write_i
}
}

/// Dedup adjacent `new` values (starting with the last element of `old`).
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]` and `vals2` to `v2[write_idx]`.
#[inline(always)]
#[cfg(not(any(target_feature = "avx2", target_feature = "neon")))]
pub unsafe fn append_unique_vals_2(
old: S,
new: S,
vals: S,
vals2: S,
v: &mut [u32],
v2: &mut [u32],
write_idx: &mut usize,
) {
unsafe {
let old = old.to_array();
let new = new.to_array();
let vals = vals.to_array();
let vals2 = vals2.to_array();
let mut prec = old[7];
for (i, &curr) in new.iter().enumerate() {
if curr != prec {
v.as_mut_ptr().add(*write_idx).write(vals[i]);
v2.as_mut_ptr().add(*write_idx).write(vals2[i]);
*write_idx += 1;
prec = curr;
}
}
}
}

/// Dedup adjacent `new` values (starting with the last element of `old`).
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
///
Expand Down Expand Up @@ -52,6 +82,46 @@ pub unsafe fn append_unique_vals(old: S, new: S, vals: S, v: &mut [u32], write_i
}
}

/// Dedup adjacent `new` values (starting with the last element of `old`).
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]` and `vals2` to `v2[write_idx]`.
///
/// Based on Daniel Lemire's blog.
/// <https://lemire.me/blog/2017/04/10/removing-duplicates-from-lists-quickly/>
/// <https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/edfd0e8b809d9a57527a7990c4bb44b9d1d05a69/2017/04/10/removeduplicates.cpp>
#[cfg(target_feature = "avx2")]
#[inline(always)]
pub unsafe fn append_unique_vals_2(
old: S,
new: S,
vals: S,
vals2: S,
v: &mut [u32],
v2: &mut [u32],
write_idx: &mut usize,
) {
unsafe {
use core::arch::x86_64::*;

let old = transmute(old);
let new = transmute(new);
let vals = transmute(vals);
let vals2 = transmute(vals2);

let recon = _mm256_blend_epi32(old, new, 0b01111111);
let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle
let vec_tmp = _mm256_permutevar8x32_epi32(recon, movebyone_mask);

let m = _mm256_movemask_ps(transmute(_mm256_cmpeq_epi32(vec_tmp, new))) as usize;
let numberofnewvalues = L - m.count_ones() as usize;
let key = transmute(UNIQSHUF[m]);
let val = _mm256_permutevar8x32_epi32(vals, key);
_mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val);
let val2 = _mm256_permutevar8x32_epi32(vals2, key);
_mm256_storeu_si256(v2.as_mut_ptr().add(*write_idx) as *mut __m256i, val2);
*write_idx += numberofnewvalues;
}
}

/// Dedup adjacent `new` values (starting with the last element of `old`).
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
///
Expand Down Expand Up @@ -106,6 +176,73 @@ pub unsafe fn append_unique_vals(old: S, new: S, vals: S, v: &mut [u32], write_i
}
}

/// Dedup adjacent `new` values (starting with the last element of `old`).
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]` and `vals2` to `v2[write_idx]`.
///
/// Somewhat based on Daniel Lemire's blog.
/// <https://lemire.me/blog/2017/04/10/removing-duplicates-from-lists-quickly/>
/// <https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/edfd0e8b809d9a57527a7990c4bb44b9d1d05a69/2017/04/10/removeduplicates.cpp>
#[inline(always)]
#[cfg(target_feature = "neon")]
pub unsafe fn append_unique_vals_2(
old: S,
new: S,
vals: S,
vals2: S,
v: &mut [u32],
v2: &mut [u32],
write_idx: &mut usize,
) {
unsafe {
use core::arch::aarch64::{vpaddd_u64, vpaddlq_u32, vqtbl2q_u8, vst1_u32_x4};
use wide::u32x4;

let new_old_mask = S::new([
u32::MAX,
u32::MAX,
u32::MAX,
u32::MAX,
u32::MAX,
u32::MAX,
u32::MAX,
0,
]);
let recon = new_old_mask.blend(new, old);

let rotate_idx = S::new([7, 0, 1, 2, 3, 4, 5, 6]);
let idx = rotate_idx * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
let (i1, i2) = transmute(idx);
let t = transmute(recon);
let r1 = vqtbl2q_u8(t, i1);
let r2 = vqtbl2q_u8(t, i2);
let prec: S = transmute((r1, r2));

let dup = prec.cmp_eq(new);
let (d1, d2): (u32x4, u32x4) = transmute(dup);
let pow1 = u32x4::new([1, 2, 4, 8]);
let pow2 = u32x4::new([16, 32, 64, 128]);
let m1 = vpaddd_u64(vpaddlq_u32(transmute(d1 & pow1)));
let m2 = vpaddd_u64(vpaddlq_u32(transmute(d2 & pow2)));
let m = (m1 | m2) as usize;

let numberofnewvalues = L - m.count_ones() as usize;
let key = UNIQSHUF[m];
let idx = key * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
let (i1, i2) = transmute(idx);
let t = transmute(vals);
let r1 = vqtbl2q_u8(t, i1);
let r2 = vqtbl2q_u8(t, i2);
let val: S = transmute((r1, r2));
vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val));
let t = transmute(vals2);
let r1 = vqtbl2q_u8(t, i1);
let r2 = vqtbl2q_u8(t, i2);
let val2: S = transmute((r1, r2));
vst1_u32_x4(v2.as_mut_ptr().add(*write_idx), transmute(val2));
*write_idx += numberofnewvalues;
}
}

/// For each of 256 masks of which elements are different than their predecessor,
/// a shuffle that sends those new elements to the beginning.
#[rustfmt::skip]
Expand Down
Loading