Auto merge of #27280 - bluss:siphash-perf, r=alexcrichton

bors · bors · commit ff6c6ce917bd · 2015-07-28T05:38:53.000Z
Improve siphash performance for longer data

Use `ptr::copy_nonoverlapping` (aka memcpy) to load an u64 from the
byte stream. This is correct for any alignment, and the compiler will
use the appropriate instruction to load the data.

Also contains small tweaks that should benefit hashing short data too,
both the commit that removes a variable and the autovectorization of
the hash state initialization (in SipHash::reset).

Benchmarks show that hashing longer data benefits for the improved word loading.

Before (using benchmarks from the first commit in the PR):

The before benchmark is a bit noisy.

```
test hash::sip::bench_bytes_4                              ... bench:          41 ns/iter (+/- 0) = 97 MB/s
test hash::sip::bench_bytes_7                              ... bench:          49 ns/iter (+/- 2) = 142 MB/s
test hash::sip::bench_bytes_8                              ... bench:          42 ns/iter (+/- 4) = 190 MB/s
test hash::sip::bench_bytes_a_16                           ... bench:          57 ns/iter (+/- 14) = 280 MB/s
test hash::sip::bench_bytes_b_32                           ... bench:          85 ns/iter (+/- 74) = 376 MB/s
test hash::sip::bench_bytes_c_128                          ... bench:         278 ns/iter (+/- 33) = 460 MB/s
test hash::sip::bench_long_str                             ... bench:         825 ns/iter (+/- 103)
test hash::sip::bench_str_of_8_bytes                       ... bench:         151 ns/iter (+/- 66)
test hash::sip::bench_str_over_8_bytes                     ... bench:          59 ns/iter (+/- 3)
test hash::sip::bench_str_under_8_bytes                    ... bench:          47 ns/iter (+/- 56)
test hash::sip::bench_u32                                  ... bench:          39 ns/iter (+/- 93) = 205 MB/s
test hash::sip::bench_u32_keyed                            ... bench:          40 ns/iter (+/- 88) = 200 MB/s
test hash::sip::bench_u64                                  ... bench:          54 ns/iter (+/- 96) = 148 MB/s
```

After:

```
test hash::sip::bench_bytes_4                              ... bench:          41 ns/iter (+/- 3) = 97 MB/s
test hash::sip::bench_bytes_7                              ... bench:          48 ns/iter (+/- 0) = 145 MB/s
test hash::sip::bench_bytes_8                              ... bench:          35 ns/iter (+/- 1) = 228 MB/s
test hash::sip::bench_bytes_a_16                           ... bench:          45 ns/iter (+/- 1) = 355 MB/s
test hash::sip::bench_bytes_b_32                           ... bench:          60 ns/iter (+/- 0) = 533 MB/s
test hash::sip::bench_bytes_c_128                          ... bench:         161 ns/iter (+/- 5) = 795 MB/s
test hash::sip::bench_long_str                             ... bench:         514 ns/iter (+/- 5)
test hash::sip::bench_str_of_8_bytes                       ... bench:          44 ns/iter (+/- 0)
test hash::sip::bench_str_over_8_bytes                     ... bench:          51 ns/iter (+/- 0)
test hash::sip::bench_str_under_8_bytes                    ... bench:          52 ns/iter (+/- 6)
test hash::sip::bench_u32                                  ... bench:          40 ns/iter (+/- 2) = 200 MB/s
test hash::sip::bench_u32_keyed                            ... bench:          39 ns/iter (+/- 1) = 205 MB/s
test hash::sip::bench_u64                                  ... bench:          36 ns/iter (+/- 1) = 222 MB/s
```
diff --git a/src/libcore/hash/sip.rs b/src/libcore/hash/sip.rs
@@ -10,6 +10,7 @@
 
 //! An implementation of SipHash 2-4.
 
+use ptr;
 use prelude::*;
 use super::Hasher;
 
@@ -31,9 +32,13 @@ pub struct SipHasher {
     k0: u64,
     k1: u64,
     length: usize, // how many bytes we've processed
+    // v0, v2 and v1, v3 show up in pairs in the algorithm,
+    // and simd implementations of SipHash will use vectors
+    // of v02 and v13. By placing them in this order in the struct,
+    // the compiler can pick up on just a few simd optimizations by itself.
     v0: u64,      // hash state
-    v1: u64,
     v2: u64,
+    v1: u64,
     v3: u64,
     tail: u64, // unprocessed bytes le
     ntail: usize,  // how many bytes in tail are valid
@@ -65,6 +70,20 @@ macro_rules! u8to64_le {
     });
 }
 
+/// Load a full u64 word from a byte stream, in LE order. Use
+/// `copy_nonoverlapping` to let the compiler generate the most efficient way
+/// to load u64 from a possibly unaligned address.
+///
+/// Unsafe because: unchecked indexing at i..i+8
+#[inline]
+unsafe fn load_u64_le(buf: &[u8], i: usize) -> u64 {
+    debug_assert!(i + 8 <= buf.len());
+    let mut data = 0u64;
+    ptr::copy_nonoverlapping(buf.get_unchecked(i),
+                             &mut data as *mut _ as *mut u8, 8);
+    data.to_le()
+}
+
 macro_rules! rotl {
     ($x:expr, $b:expr) =>
     (($x << $b) | ($x >> (64_i32.wrapping_sub($b))))
@@ -146,12 +165,11 @@ impl SipHasher {
 
         // Buffered tail is now flushed, process new input.
         let len = length - needed;
-        let end = len & (!0x7);
         let left = len & 0x7;
 
         let mut i = needed;
-        while i < end {
-            let mi = u8to64_le!(msg, i);
+        while i < len - left {
+            let mi = unsafe { load_u64_le(msg, i) };
 
             self.v3 ^= mi;
             compress!(self.v0, self.v1, self.v2, self.v3);
diff --git a/src/libcoretest/hash/sip.rs b/src/libcoretest/hash/sip.rs
@@ -7,7 +7,7 @@
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
-use test::Bencher;
+use test::{Bencher, black_box};
 
 use core::hash::{Hash, Hasher};
 use core::hash::SipHasher;
@@ -57,6 +57,12 @@ fn hash_with_keys<T: Hash>(k1: u64, k2: u64, x: &T) -> u64 {
     st.finish()
 }
 
+fn hash_bytes(x: &[u8]) -> u64 {
+    let mut s = SipHasher::default();
+    Hasher::write(&mut s, x);
+    s.finish()
+}
+
 #[test]
 #[allow(unused_must_use)]
 fn test_siphash() {
@@ -266,10 +272,88 @@ officia deserunt mollit anim id est laborum.";
     })
 }
 
+#[bench]
+fn bench_u32(b: &mut Bencher) {
+    let u = 162629500u32;
+    let u = black_box(u);
+    b.iter(|| {
+        hash(&u)
+    });
+    b.bytes = 8;
+}
+
+#[bench]
+fn bench_u32_keyed(b: &mut Bencher) {
+    let u = 162629500u32;
+    let u = black_box(u);
+    let k1 = black_box(0x1);
+    let k2 = black_box(0x2);
+    b.iter(|| {
+        hash_with_keys(k1, k2, &u)
+    });
+    b.bytes = 8;
+}
+
 #[bench]
 fn bench_u64(b: &mut Bencher) {
     let u = 16262950014981195938u64;
+    let u = black_box(u);
     b.iter(|| {
-        assert_eq!(hash(&u), 5254097107239593357);
-    })
+        hash(&u)
+    });
+    b.bytes = 8;
+}
+
+#[bench]
+fn bench_bytes_4(b: &mut Bencher) {
+    let data = black_box([b' '; 4]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 4;
+}
+
+#[bench]
+fn bench_bytes_7(b: &mut Bencher) {
+    let data = black_box([b' '; 7]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 7;
+}
+
+#[bench]
+fn bench_bytes_8(b: &mut Bencher) {
+    let data = black_box([b' '; 8]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 8;
+}
+
+#[bench]
+fn bench_bytes_a_16(b: &mut Bencher) {
+    let data = black_box([b' '; 16]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 16;
+}
+
+#[bench]
+fn bench_bytes_b_32(b: &mut Bencher) {
+    let data = black_box([b' '; 32]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 32;
+}
+
+#[bench]
+fn bench_bytes_c_128(b: &mut Bencher) {
+    let data = black_box([b' '; 128]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 128;
 }