Skip to content

Commit caca212

Browse files
committed
Auto merge of #74024 - Folyd:master, r=m-ou-se
Improve slice.binary_search_by()'s best-case performance to O(1) This PR aims to improve the best-case performance of [slice.binary_search_by()](https://doc.rust-lang.org/std/primitive.slice.html#method.binary_search_by) to O(1). # Noticed I don't know why the docs of `binary_search_by` say `"If there are multiple matches, then any one of the matches could be returned."`, because the implementation does not behave that way: it actually returns the **last one** if multiple matches are found. That leaves two options: ## If returning the last one is the correct or desired result Then I can rectify the docs and revert my changes. ## If the docs describe the correct or desired result Then my changes can be merged after being fully reviewed. However, if my PR gets merged, another issue is raised: this could be a **breaking change**, since if multiple matches are found, the returned match is no longer guaranteed to be the last one; it could be any one of them. For example: ```rust let mut s = vec![0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55]; let num = 1; let idx = s.binary_search(&num); s.insert(idx, 2); // Old implementations assert_eq!(s, [0, 1, 1, 1, 1, 2, 2, 3, 5, 8, 13, 21, 34, 42, 55]); // New implementations assert_eq!(s, [0, 1, 1, 1, 2, 1, 2, 3, 5, 8, 13, 21, 34, 42, 55]); ``` # Benchmarking **Old implementations** ```sh $ ./x.py bench --stage 1 library/libcore test slice::binary_search_l1 ... bench: 59 ns/iter (+/- 4) test slice::binary_search_l1_with_dups ... bench: 59 ns/iter (+/- 3) test slice::binary_search_l2 ... bench: 76 ns/iter (+/- 5) test slice::binary_search_l2_with_dups ... bench: 77 ns/iter (+/- 17) test slice::binary_search_l3 ... bench: 183 ns/iter (+/- 23) test slice::binary_search_l3_with_dups ... bench: 185 ns/iter (+/- 19) ``` **New implementations (1)** Implemented by this PR. ```rust if cmp == Equal { return Ok(mid); } else if cmp == Less { base = mid } ``` ```sh $ ./x.py bench --stage 1 library/libcore test slice::binary_search_l1 ... 
bench: 58 ns/iter (+/- 2) test slice::binary_search_l1_with_dups ... bench: 37 ns/iter (+/- 4) test slice::binary_search_l2 ... bench: 76 ns/iter (+/- 3) test slice::binary_search_l2_with_dups ... bench: 57 ns/iter (+/- 6) test slice::binary_search_l3 ... bench: 200 ns/iter (+/- 30) test slice::binary_search_l3_with_dups ... bench: 157 ns/iter (+/- 6) $ ./x.py bench --stage 1 library/libcore test slice::binary_search_l1 ... bench: 59 ns/iter (+/- 8) test slice::binary_search_l1_with_dups ... bench: 37 ns/iter (+/- 2) test slice::binary_search_l2 ... bench: 77 ns/iter (+/- 2) test slice::binary_search_l2_with_dups ... bench: 57 ns/iter (+/- 2) test slice::binary_search_l3 ... bench: 198 ns/iter (+/- 21) test slice::binary_search_l3_with_dups ... bench: 158 ns/iter (+/- 11) ``` **New implementations (2)** Suggested by `@nbdd0121` in [comment](#74024 (comment)). ```rust base = if cmp == Greater { base } else { mid }; if cmp == Equal { break } ``` ```sh $ ./x.py bench --stage 1 library/libcore test slice::binary_search_l1 ... bench: 59 ns/iter (+/- 7) test slice::binary_search_l1_with_dups ... bench: 37 ns/iter (+/- 5) test slice::binary_search_l2 ... bench: 75 ns/iter (+/- 3) test slice::binary_search_l2_with_dups ... bench: 56 ns/iter (+/- 3) test slice::binary_search_l3 ... bench: 195 ns/iter (+/- 15) test slice::binary_search_l3_with_dups ... bench: 151 ns/iter (+/- 7) $ ./x.py bench --stage 1 library/libcore test slice::binary_search_l1 ... bench: 57 ns/iter (+/- 2) test slice::binary_search_l1_with_dups ... bench: 38 ns/iter (+/- 2) test slice::binary_search_l2 ... bench: 77 ns/iter (+/- 11) test slice::binary_search_l2_with_dups ... bench: 57 ns/iter (+/- 4) test slice::binary_search_l3 ... bench: 194 ns/iter (+/- 15) test slice::binary_search_l3_with_dups ... bench: 151 ns/iter (+/- 18) ``` I ran some benchmarks against the two implementations. 
The new implementation is significantly faster in the cases with duplicates, while in the `binary_search_l3` case it is slightly slower than the old one.
2 parents 8fd946c + 3eb5bee commit caca212

File tree

3 files changed

+80
-29
lines changed

3 files changed

+80
-29
lines changed

Diff for: library/core/benches/slice.rs

+38-6
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,21 @@ enum Cache {
77
L3,
88
}
99

10+
impl Cache {
11+
fn size(&self) -> usize {
12+
match self {
13+
Cache::L1 => 1000, // 8kb
14+
Cache::L2 => 10_000, // 80kb
15+
Cache::L3 => 1_000_000, // 8Mb
16+
}
17+
}
18+
}
19+
1020
fn binary_search<F>(b: &mut Bencher, cache: Cache, mapper: F)
1121
where
1222
F: Fn(usize) -> usize,
1323
{
14-
let size = match cache {
15-
Cache::L1 => 1000, // 8kb
16-
Cache::L2 => 10_000, // 80kb
17-
Cache::L3 => 1_000_000, // 8Mb
18-
};
24+
let size = cache.size();
1925
let v = (0..size).map(&mapper).collect::<Vec<_>>();
2026
let mut r = 0usize;
2127
b.iter(move || {
@@ -24,7 +30,18 @@ where
2430
// Lookup the whole range to get 50% hits and 50% misses.
2531
let i = mapper(r % size);
2632
black_box(v.binary_search(&i).is_ok());
27-
})
33+
});
34+
}
35+
36+
fn binary_search_worst_case(b: &mut Bencher, cache: Cache) {
37+
let size = cache.size();
38+
39+
let mut v = vec![0; size];
40+
let i = 1;
41+
v[size - 1] = i;
42+
b.iter(move || {
43+
black_box(v.binary_search(&i).is_ok());
44+
});
2845
}
2946

3047
#[bench]
@@ -57,6 +74,21 @@ fn binary_search_l3_with_dups(b: &mut Bencher) {
5774
binary_search(b, Cache::L3, |i| i / 16 * 16);
5875
}
5976

77+
#[bench]
78+
fn binary_search_l1_worst_case(b: &mut Bencher) {
79+
binary_search_worst_case(b, Cache::L1);
80+
}
81+
82+
#[bench]
83+
fn binary_search_l2_worst_case(b: &mut Bencher) {
84+
binary_search_worst_case(b, Cache::L2);
85+
}
86+
87+
#[bench]
88+
fn binary_search_l3_worst_case(b: &mut Bencher) {
89+
binary_search_worst_case(b, Cache::L3);
90+
}
91+
6092
macro_rules! rotate {
6193
($fn:ident, $n:expr, $mapper:expr) => {
6294
#[bench]

Diff for: library/core/src/slice/mod.rs

+25-19
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
99
#![stable(feature = "rust1", since = "1.0.0")]
1010

11-
use crate::cmp::Ordering::{self, Equal, Greater, Less};
11+
use crate::cmp::Ordering::{self, Greater, Less};
1212
use crate::marker::Copy;
1313
use crate::mem;
1414
use crate::num::NonZeroUsize;
@@ -2185,25 +2185,31 @@ impl<T> [T] {
21852185
where
21862186
F: FnMut(&'a T) -> Ordering,
21872187
{
2188-
let s = self;
2189-
let mut size = s.len();
2190-
if size == 0 {
2191-
return Err(0);
2192-
}
2193-
let mut base = 0usize;
2194-
while size > 1 {
2195-
let half = size / 2;
2196-
let mid = base + half;
2197-
// SAFETY: the call is made safe by the following inconstants:
2198-
// - `mid >= 0`: by definition
2199-
// - `mid < size`: `mid = size / 2 + size / 4 + size / 8 ...`
2200-
let cmp = f(unsafe { s.get_unchecked(mid) });
2201-
base = if cmp == Greater { base } else { mid };
2202-
size -= half;
2188+
let mut size = self.len();
2189+
let mut left = 0;
2190+
let mut right = size;
2191+
while left < right {
2192+
let mid = left + size / 2;
2193+
2194+
// SAFETY: the call is made safe by the following invariants:
2195+
// - `mid >= 0`
2196+
// - `mid < size`: `mid` is limited by `[left; right)` bound.
2197+
let cmp = f(unsafe { self.get_unchecked(mid) });
2198+
2199+
// The reason why we use if/else control flow rather than match
2200+
// is because match reorders comparison operations, which is perf sensitive.
2201+
// This is x86 asm for u8: https://rust.godbolt.org/z/8Y8Pra.
2202+
if cmp == Less {
2203+
left = mid + 1;
2204+
} else if cmp == Greater {
2205+
right = mid;
2206+
} else {
2207+
return Ok(mid);
2208+
}
2209+
2210+
size = right - left;
22032211
}
2204-
// SAFETY: base is always in [0, size) because base <= mid.
2205-
let cmp = f(unsafe { s.get_unchecked(base) });
2206-
if cmp == Equal { Ok(base) } else { Err(base + (cmp == Less) as usize) }
2212+
Err(left)
22072213
}
22082214

22092215
/// Binary searches this sorted slice with a key extraction function.

Diff for: library/core/tests/slice.rs

+17-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use core::cell::Cell;
2+
use core::cmp::Ordering;
23
use core::result::Result::{Err, Ok};
34

45
#[test]
@@ -64,6 +65,17 @@ fn test_binary_search() {
6465
assert_eq!(b.binary_search(&6), Err(4));
6566
assert_eq!(b.binary_search(&7), Ok(4));
6667
assert_eq!(b.binary_search(&8), Err(5));
68+
69+
let b = [(); usize::MAX];
70+
assert_eq!(b.binary_search(&()), Ok(usize::MAX / 2));
71+
}
72+
73+
#[test]
74+
fn test_binary_search_by_overflow() {
75+
let b = [(); usize::MAX];
76+
assert_eq!(b.binary_search_by(|_| Ordering::Equal), Ok(usize::MAX / 2));
77+
assert_eq!(b.binary_search_by(|_| Ordering::Greater), Err(0));
78+
assert_eq!(b.binary_search_by(|_| Ordering::Less), Err(usize::MAX));
6779
}
6880

6981
#[test]
@@ -73,13 +85,13 @@ fn test_binary_search_implementation_details() {
7385
let b = [1, 1, 2, 2, 3, 3, 3];
7486
assert_eq!(b.binary_search(&1), Ok(1));
7587
assert_eq!(b.binary_search(&2), Ok(3));
76-
assert_eq!(b.binary_search(&3), Ok(6));
88+
assert_eq!(b.binary_search(&3), Ok(5));
7789
let b = [1, 1, 1, 1, 1, 3, 3, 3, 3];
7890
assert_eq!(b.binary_search(&1), Ok(4));
79-
assert_eq!(b.binary_search(&3), Ok(8));
91+
assert_eq!(b.binary_search(&3), Ok(7));
8092
let b = [1, 1, 1, 1, 3, 3, 3, 3, 3];
81-
assert_eq!(b.binary_search(&1), Ok(3));
82-
assert_eq!(b.binary_search(&3), Ok(8));
93+
assert_eq!(b.binary_search(&1), Ok(2));
94+
assert_eq!(b.binary_search(&3), Ok(4));
8395
}
8496

8597
#[test]
@@ -1982,6 +1994,7 @@ fn test_copy_within_panics_dest_too_long() {
19821994
// The length is only 13, so a slice of length 4 starting at index 10 is out of bounds.
19831995
bytes.copy_within(0..4, 10);
19841996
}
1997+
19851998
#[test]
19861999
#[should_panic(expected = "slice index starts at 2 but ends at 1")]
19872000
fn test_copy_within_panics_src_inverted() {

0 commit comments

Comments
 (0)