Skip to content

Commit ff6c6ce

Browse files
committed
Auto merge of #27280 - bluss:siphash-perf, r=alexcrichton
Improve siphash performance for longer data. Use `ptr::copy_nonoverlapping` (aka memcpy) to load a u64 from the byte stream. This is correct for any alignment, and the compiler will use the appropriate instruction to load the data. Also contains small tweaks that should benefit hashing short data too, both the commit that removes a variable and the autovectorization of the hash state initialization (in SipHash::reset). Benchmarks show that hashing longer data benefits from the improved word loading. Before (using benchmarks from the first commit in the PR): The before benchmark is a bit noisy. ``` test hash::sip::bench_bytes_4 ... bench: 41 ns/iter (+/- 0) = 97 MB/s test hash::sip::bench_bytes_7 ... bench: 49 ns/iter (+/- 2) = 142 MB/s test hash::sip::bench_bytes_8 ... bench: 42 ns/iter (+/- 4) = 190 MB/s test hash::sip::bench_bytes_a_16 ... bench: 57 ns/iter (+/- 14) = 280 MB/s test hash::sip::bench_bytes_b_32 ... bench: 85 ns/iter (+/- 74) = 376 MB/s test hash::sip::bench_bytes_c_128 ... bench: 278 ns/iter (+/- 33) = 460 MB/s test hash::sip::bench_long_str ... bench: 825 ns/iter (+/- 103) test hash::sip::bench_str_of_8_bytes ... bench: 151 ns/iter (+/- 66) test hash::sip::bench_str_over_8_bytes ... bench: 59 ns/iter (+/- 3) test hash::sip::bench_str_under_8_bytes ... bench: 47 ns/iter (+/- 56) test hash::sip::bench_u32 ... bench: 39 ns/iter (+/- 93) = 205 MB/s test hash::sip::bench_u32_keyed ... bench: 40 ns/iter (+/- 88) = 200 MB/s test hash::sip::bench_u64 ... bench: 54 ns/iter (+/- 96) = 148 MB/s ``` After: ``` test hash::sip::bench_bytes_4 ... bench: 41 ns/iter (+/- 3) = 97 MB/s test hash::sip::bench_bytes_7 ... bench: 48 ns/iter (+/- 0) = 145 MB/s test hash::sip::bench_bytes_8 ... bench: 35 ns/iter (+/- 1) = 228 MB/s test hash::sip::bench_bytes_a_16 ... bench: 45 ns/iter (+/- 1) = 355 MB/s test hash::sip::bench_bytes_b_32 ... bench: 60 ns/iter (+/- 0) = 533 MB/s test hash::sip::bench_bytes_c_128 ... 
bench: 161 ns/iter (+/- 5) = 795 MB/s test hash::sip::bench_long_str ... bench: 514 ns/iter (+/- 5) test hash::sip::bench_str_of_8_bytes ... bench: 44 ns/iter (+/- 0) test hash::sip::bench_str_over_8_bytes ... bench: 51 ns/iter (+/- 0) test hash::sip::bench_str_under_8_bytes ... bench: 52 ns/iter (+/- 6) test hash::sip::bench_u32 ... bench: 40 ns/iter (+/- 2) = 200 MB/s test hash::sip::bench_u32_keyed ... bench: 39 ns/iter (+/- 1) = 205 MB/s test hash::sip::bench_u64 ... bench: 36 ns/iter (+/- 1) = 222 MB/s ```
2 parents 9ca511c + 27c44ce commit ff6c6ce

File tree

2 files changed

+109
-7
lines changed

2 files changed

+109
-7
lines changed

src/libcore/hash/sip.rs

+22-4
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
//! An implementation of SipHash 2-4.
1212
13+
use ptr;
1314
use prelude::*;
1415
use super::Hasher;
1516

@@ -31,9 +32,13 @@ pub struct SipHasher {
3132
k0: u64,
3233
k1: u64,
3334
length: usize, // how many bytes we've processed
35+
// v0, v2 and v1, v3 show up in pairs in the algorithm,
36+
// and simd implementations of SipHash will use vectors
37+
// of v02 and v13. By placing them in this order in the struct,
38+
// the compiler can pick up on just a few simd optimizations by itself.
3439
v0: u64, // hash state
35-
v1: u64,
3640
v2: u64,
41+
v1: u64,
3742
v3: u64,
3843
tail: u64, // unprocessed bytes le
3944
ntail: usize, // how many bytes in tail are valid
@@ -65,6 +70,20 @@ macro_rules! u8to64_le {
6570
});
6671
}
6772

73+
/// Load a full u64 word from a byte stream, in LE order. Use
74+
/// `copy_nonoverlapping` to let the compiler generate the most efficient way
75+
/// to load u64 from a possibly unaligned address.
76+
///
77+
/// Unsafe because: unchecked indexing at i..i+8
78+
#[inline]
79+
unsafe fn load_u64_le(buf: &[u8], i: usize) -> u64 {
80+
debug_assert!(i + 8 <= buf.len());
81+
let mut data = 0u64;
82+
ptr::copy_nonoverlapping(buf.get_unchecked(i),
83+
&mut data as *mut _ as *mut u8, 8);
84+
data.to_le()
85+
}
86+
6887
macro_rules! rotl {
6988
($x:expr, $b:expr) =>
7089
(($x << $b) | ($x >> (64_i32.wrapping_sub($b))))
@@ -146,12 +165,11 @@ impl SipHasher {
146165

147166
// Buffered tail is now flushed, process new input.
148167
let len = length - needed;
149-
let end = len & (!0x7);
150168
let left = len & 0x7;
151169

152170
let mut i = needed;
153-
while i < end {
154-
let mi = u8to64_le!(msg, i);
171+
while i < len - left {
172+
let mi = unsafe { load_u64_le(msg, i) };
155173

156174
self.v3 ^= mi;
157175
compress!(self.v0, self.v1, self.v2, self.v3);

src/libcoretest/hash/sip.rs

+87-3
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
10-
use test::Bencher;
10+
use test::{Bencher, black_box};
1111

1212
use core::hash::{Hash, Hasher};
1313
use core::hash::SipHasher;
@@ -57,6 +57,12 @@ fn hash_with_keys<T: Hash>(k1: u64, k2: u64, x: &T) -> u64 {
5757
st.finish()
5858
}
5959

60+
fn hash_bytes(x: &[u8]) -> u64 {
61+
let mut s = SipHasher::default();
62+
Hasher::write(&mut s, x);
63+
s.finish()
64+
}
65+
6066
#[test]
6167
#[allow(unused_must_use)]
6268
fn test_siphash() {
@@ -266,10 +272,88 @@ officia deserunt mollit anim id est laborum.";
266272
})
267273
}
268274

275+
#[bench]
276+
fn bench_u32(b: &mut Bencher) {
277+
let u = 162629500u32;
278+
let u = black_box(u);
279+
b.iter(|| {
280+
hash(&u)
281+
});
282+
b.bytes = 8;
283+
}
284+
285+
#[bench]
286+
fn bench_u32_keyed(b: &mut Bencher) {
287+
let u = 162629500u32;
288+
let u = black_box(u);
289+
let k1 = black_box(0x1);
290+
let k2 = black_box(0x2);
291+
b.iter(|| {
292+
hash_with_keys(k1, k2, &u)
293+
});
294+
b.bytes = 8;
295+
}
296+
269297
#[bench]
270298
fn bench_u64(b: &mut Bencher) {
271299
let u = 16262950014981195938u64;
300+
let u = black_box(u);
272301
b.iter(|| {
273-
assert_eq!(hash(&u), 5254097107239593357);
274-
})
302+
hash(&u)
303+
});
304+
b.bytes = 8;
305+
}
306+
307+
#[bench]
308+
fn bench_bytes_4(b: &mut Bencher) {
309+
let data = black_box([b' '; 4]);
310+
b.iter(|| {
311+
hash_bytes(&data)
312+
});
313+
b.bytes = 4;
314+
}
315+
316+
#[bench]
317+
fn bench_bytes_7(b: &mut Bencher) {
318+
let data = black_box([b' '; 7]);
319+
b.iter(|| {
320+
hash_bytes(&data)
321+
});
322+
b.bytes = 7;
323+
}
324+
325+
#[bench]
326+
fn bench_bytes_8(b: &mut Bencher) {
327+
let data = black_box([b' '; 8]);
328+
b.iter(|| {
329+
hash_bytes(&data)
330+
});
331+
b.bytes = 8;
332+
}
333+
334+
#[bench]
335+
fn bench_bytes_a_16(b: &mut Bencher) {
336+
let data = black_box([b' '; 16]);
337+
b.iter(|| {
338+
hash_bytes(&data)
339+
});
340+
b.bytes = 16;
341+
}
342+
343+
#[bench]
344+
fn bench_bytes_b_32(b: &mut Bencher) {
345+
let data = black_box([b' '; 32]);
346+
b.iter(|| {
347+
hash_bytes(&data)
348+
});
349+
b.bytes = 32;
350+
}
351+
352+
#[bench]
353+
fn bench_bytes_c_128(b: &mut Bencher) {
354+
let data = black_box([b' '; 128]);
355+
b.iter(|| {
356+
hash_bytes(&data)
357+
});
358+
b.bytes = 128;
275359
}

0 commit comments

Comments
 (0)