diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs
index 895756462f..bddb0a4a59 100644
--- a/src/distributions/utils.rs
+++ b/src/distributions/utils.rs
@@ -31,7 +31,7 @@ macro_rules! wmul_impl {
     };
 
     // simd bulk implementation
-    ($(($ty:ident, $wide:ident),)+, $shift:expr) => {
+    ($(($ty:ident, $wide:ty),)+, $shift:expr) => {
         $(
             impl WideningMultiply for $ty {
                 type Output = ($ty, $ty);
@@ -152,7 +152,8 @@ mod simd_wmul {
         (u8x4, u16x4),
         (u8x8, u16x8),
         (u8x16, u16x16),
-        (u8x32, u16x32),,
+        (u8x32, u16x32),
+        (u8x64, Simd<u16, 64>),,
         8
     }
 
@@ -162,6 +163,8 @@ mod simd_wmul {
     wmul_impl! { (u16x8, u32x8),, 16 }
     #[cfg(not(target_feature = "avx2"))]
     wmul_impl! { (u16x16, u32x16),, 16 }
+    #[cfg(not(target_feature = "avx512bw"))]
+    wmul_impl! { (u16x32, Simd<u32, 32>),, 16 }
 
     // 16-bit lane widths allow use of the x86 `mulhi` instructions, which
     // means `wmul` can be implemented with only two instructions.
@@ -191,15 +194,11 @@ mod simd_wmul {
     wmul_impl! {
         (u32x2, u64x2),
         (u32x4, u64x4),
-        (u32x8, u64x8),,
+        (u32x8, u64x8),
+        (u32x16, Simd<u64, 16>),,
         32
     }
 
-    // TODO: optimize, this seems to seriously slow things down
-    wmul_impl_large! { (u8x64,) u8, 4 }
-    #[cfg(not(target_feature = "avx512bw"))]
-    wmul_impl_large! { (u16x32,) u16, 8 }
-    wmul_impl_large! { (u32x16,) u32, 16 }
     wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 }
 }
 
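
For context, here is a minimal scalar sketch of the widening-multiply pattern that `wmul_impl!` expands to. The `WideningMultiply` trait name and its `(high, low)` output shape come from the diff above; the scalar `u32` impl and the `main` demo are illustrative assumptions, not code from this file. The SIMD arm of the macro does the same thing lane-wise, casting e.g. `u8x64` up to the double-width `Simd<u16, 64>`, multiplying, and splitting the product with a shift.

```rust
/// Widening multiply: the full double-width product of two values,
/// returned as (high half, low half) of the original width.
trait WideningMultiply<Rhs = Self> {
    type Output;
    fn wmul(self, rhs: Rhs) -> Self::Output;
}

// Illustrative scalar impl: cast to a type twice as wide, multiply,
// then split the product by shifting and truncating.
impl WideningMultiply for u32 {
    type Output = (u32, u32);

    #[inline]
    fn wmul(self, x: u32) -> (u32, u32) {
        let tmp = (self as u64) * (x as u64); // full 64-bit product
        ((tmp >> 32) as u32, tmp as u32)      // (high 32 bits, low 32 bits)
    }
}

fn main() {
    // 3 * (2^32 - 1) = 2 * 2^32 + (2^32 - 3)
    let (hi, lo) = u32::MAX.wmul(3);
    assert_eq!((hi, lo), (2, u32::MAX - 2));
}
```

Going through the double-width vector directly, instead of the `wmul_impl_large!` fallback removed above (whose TODO noted it "seems to seriously slow things down"), gives the backend a chance to lower the multiply/shift pair to `mulhi`-style instructions where the target supports them, as the 16-bit-lane comment in the diff points out.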