From d24e56fefb699fa953def653d53efd9d6b611139 Mon Sep 17 00:00:00 2001 From: Naveen Naidu Date: Fri, 10 May 2024 17:07:53 +0530 Subject: [PATCH] Approach 09: Use memchr and loop instead of split (71s) Use memchr instead of split. memchr uses SIMD for faster searching of the needle in the haystack. The split function is slow because it constructs the split iterator every time it is called. A few links to read more: * https://github.com/BurntSushi/memchr * https://blog.burntsushi.net/bstr/#motivation-based-on-performance --- Cargo.toml | 1 + flamegraphs/09-use-memchr/flamegraph.svg | 491 +++++++++++++++++++++++ src/main.rs | 67 ++-- 3 files changed, 527 insertions(+), 32 deletions(-) create mode 100644 flamegraphs/09-use-memchr/flamegraph.svg diff --git a/Cargo.toml b/Cargo.toml index c76e7ac..3eac055 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,3 +15,4 @@ rust_decimal_macros = "1.34" fast-float = "0.2" rustc-hash = { version = "1.0"} memmap2 = {version = "0.9.4"} +memchr = { version = "2", default-features = false } diff --git a/flamegraphs/09-use-memchr/flamegraph.svg b/flamegraphs/09-use-memchr/flamegraph.svg new file mode 100644 index 0000000..d30a6a5 --- /dev/null +++ b/flamegraphs/09-use-memchr/flamegraph.svg @@ -0,0 +1,491 @@ +Flame Graph Reset ZoomSearch [ld-linux-x86-64.so.2] (7 samples, 0.02%)[unknown] (4 samples, 0.01%)[ld-linux-x86-64.so.2] (8 samples, 0.03%)<core::ops::range::Range<usize> as core::slice::index::SliceIndex<[T]>>::index (200 samples, 0.67%)core::slice::index::<impl core::ops::index::Index<I> for [T]>::index (805 samples, 2.70%)co..<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::index (605 samples, 2.03%)<..<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (263 samples, 0.88%)<core::ops::range::Range<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (263 samples, 0.88%)core::ptr::const_ptr::<impl *const T>::add (263 samples, 0.88%)<f32 as core::ops::arith::Div>::div 
(3,889 samples, 13.05%)<f32 as core::ops::a..<f32 as fast_float::float::Float>::from_u64 (41 samples, 0.14%)<f32 as fast_float::float::Float>::pow10_fast_path (121 samples, 0.41%)fast_float::number::Number::try_fast_path (4,816 samples, 16.16%)fast_float::number::Numbe..fast_float::number::Number::is_fast_path (201 samples, 0.67%)fast_float::common::AsciiStr::check_first (121 samples, 0.41%)fast_float::common::AsciiStr::is_empty (107 samples, 0.36%)fast_float::common::AsciiStr::check_first_either (122 samples, 0.41%)fast_float::common::AsciiStr::first (204 samples, 0.68%)fast_float::common::AsciiStr::offset_from (58 samples, 0.19%)core::num::<impl isize>::wrapping_sub (58 samples, 0.19%)fast_float::number::try_parse_8digits_le (319 samples, 1.07%)fast_float::common::AsciiStr::try_read_u64 (319 samples, 1.07%)fast_float::common::AsciiStr::check_len (252 samples, 0.85%)core::ptr::const_ptr::<impl *const T>::add (163 samples, 0.55%)core::num::<impl u8>::is_ascii_digit (431 samples, 1.45%)fast_float::common::AsciiStr::first (4 samples, 0.01%)fast_float::common::AsciiStr::is_empty (407 samples, 1.37%)fast_float::common::AsciiStr::step (57 samples, 0.19%)fast_float::common::AsciiStr::step_by (57 samples, 0.19%)core::ptr::const_ptr::<impl *const T>::add (57 samples, 0.19%)fast_float::parse (8,159 samples, 27.37%)fast_float::parsefast_float::FastFloat::parse_float (8,159 samples, 27.37%)fast_float::FastFloat::parse_floatfast_float::FastFloat::parse_float_partial (8,159 samples, 27.37%)fast_float::FastFloat::parse_float_partialfast_float::parse::parse_float (8,159 samples, 27.37%)fast_float::parse::parse_floatfast_float::number::parse_number (3,139 samples, 10.53%)fast_float::num..fast_float::number::try_parse_digits (1,703 samples, 5.71%)fast_fl..fast_float::common::AsciiStr::parse_digits (1,703 samples, 5.71%)fast_fl..fast_float::number::try_parse_digits::_{{closure}} (201 samples, 0.67%)core::num::<impl u64>::wrapping_add (127 samples, 0.43%)<core::option::Option<T> as 
core::ops::try_trait::Try>::branch (218 samples, 0.73%)<*const T as memchr::ext::Pointer>::distance (7 samples, 0.02%)core::ptr::const_ptr::<impl *const T>::offset_from (7 samples, 0.02%)<memchr::vector::SensibleMoveMask as memchr::vector::MoveMask>::has_non_zero (10 samples, 0.03%)[unknown] (85 samples, 0.29%)[unknown] (53 samples, 0.18%)[unknown] (49 samples, 0.16%)[unknown] (41 samples, 0.14%)[unknown] (24 samples, 0.08%)[unknown] (12 samples, 0.04%)[unknown] (10 samples, 0.03%)[unknown] (7 samples, 0.02%)[unknown] (3 samples, 0.01%)memchr::vector::x86sse2::<impl memchr::vector::Vector for core::core_arch::x86::__m128i>::cmpeq (72 samples, 0.24%)core::core_arch::x86::sse2::_mm_cmpeq_epi8 (72 samples, 0.24%)memchr::arch::generic::memchr::One<V>::search_chunk (1,151 samples, 3.86%)memc..memchr::vector::x86sse2::<impl memchr::vector::Vector for core::core_arch::x86::__m128i>::movemask (241 samples, 0.81%)core::core_arch::x86::sse2::_mm_movemask_epi8 (241 samples, 0.81%)memchr::vector::Vector::movemask_will_have_non_zero (7 samples, 0.02%)memchr::vector::x86sse2::<impl memchr::vector::Vector for core::core_arch::x86::__m128i>::movemask (7 samples, 0.02%)core::core_arch::x86::sse2::_mm_movemask_epi8 (7 samples, 0.02%)core::core_arch::x86::sse2::_mm_cmpeq_epi8 (21 samples, 0.07%)memchr::vector::x86sse2::<impl memchr::vector::Vector for core::core_arch::x86::__m128i>::cmpeq (46 samples, 0.15%)memchr::arch::x86_64::memchr::memchr_raw::find_sse2 (25 samples, 0.08%)[unknown] (6 samples, 0.02%)[unknown] (3 samples, 0.01%)[unknown] (3 samples, 0.01%)[unknown] (3 samples, 0.01%)memchr::memchr::memchr (2,815 samples, 9.44%)memchr::memchr..memchr::arch::generic::memchr::search_slice_with_raw (2,815 samples, 9.44%)memchr::arch::..memchr::memchr::memchr::_{{closure}} (2,408 samples, 8.08%)memchr::mem..memchr::memchr::memchr_raw (2,408 samples, 8.08%)memchr::mem..memchr::arch::x86_64::memchr::memchr_raw (2,408 samples, 
8.08%)memchr::arc..memchr::arch::x86_64::memchr::memchr_raw::find_sse2 (1,961 samples, 6.58%)memchr::a..memchr::arch::x86_64::sse2::memchr::One::find_raw (1,767 samples, 5.93%)memchr::..memchr::arch::x86_64::sse2::memchr::One::find_raw_impl (1,262 samples, 4.23%)memch..memchr::arch::generic::memchr::One<V>::find_raw (1,262 samples, 4.23%)memch..memchr::vector::x86sse2::<impl memchr::vector::Vector for core::core_arch::x86::__m128i>::or (9 samples, 0.03%)core::core_arch::x86::sse2::_mm_or_si128 (9 samples, 0.03%)std::collections::hash::map::Entry<K,V>::and_modify (559 samples, 1.88%)s..rust_1brc::calculate_station_values::_{{closure}} (559 samples, 1.88%)r..core::hash::Hasher::write_length_prefix (11 samples, 0.04%)<rustc_hash::FxHasher as core::hash::Hasher>::write_usize (11 samples, 0.04%)rustc_hash::FxHasher::add_to_hash (11 samples, 0.04%)core::num::<impl usize>::wrapping_mul (11 samples, 0.04%)core::slice::index::<impl core::ops::index::Index<I> for [T]>::index (275 samples, 0.92%)<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::index (275 samples, 0.92%)<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (275 samples, 0.92%)<core::ops::range::Range<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (275 samples, 0.92%)core::ptr::const_ptr::<impl *const T>::add (63 samples, 0.21%)<usize as core::ops::bit::BitXor>::bitxor (55 samples, 0.18%)core::num::<impl usize>::rotate_left (1,220 samples, 4.09%)core..<rustc_hash::FxHasher as core::hash::Hasher>::write (2,458 samples, 8.25%)<rustc_hash..rustc_hash::FxHasher::add_to_hash (1,411 samples, 4.73%)rustc_..core::num::<impl usize>::wrapping_mul (136 samples, 0.46%)hashbrown::map::make_hash (2,527 samples, 8.48%)hashbrown::m..core::hash::BuildHasher::hash_one (2,527 samples, 8.48%)core::hash::..core::hash::impls::<impl core::hash::Hash for &T>::hash (2,527 samples, 8.48%)core::hash::..core::hash::impls::<impl core::hash::Hash for &T>::hash 
(2,527 samples, 8.48%)core::hash::..core::hash::impls::<impl core::hash::Hash for [T]>::hash (2,527 samples, 8.48%)core::hash::..core::hash::impls::<impl core::hash::Hash for u8>::hash_slice (2,516 samples, 8.44%)core::hash::..hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (58 samples, 0.19%)<hashbrown::raw::bitmask::BitMaskIter as core::iter::traits::iterator::Iterator>::next (1,185 samples, 3.98%)<has..hashbrown::raw::bitmask::BitMask::lowest_set_bit (1,185 samples, 3.98%)hash..core::num::nonzero::NonZero<u16>::new (978 samples, 3.28%)cor..hashbrown::raw::RawTable<T,A>::bucket (474 samples, 1.59%)hashbrown::raw::Bucket<T>::from_base_index (474 samples, 1.59%)core::ptr::mut_ptr::<impl *mut T>::sub (474 samples, 1.59%)core::ptr::mut_ptr::<impl *mut T>::offset (474 samples, 1.59%)<[A] as core::slice::cmp::SlicePartialEq<B>>::equal (6,994 samples, 23.46%)<[A] as core::slice::cmp::SlicePartia..[libc.so.6] (4,850 samples, 16.27%)[libc.so.6]hashbrown::raw::RawTable<T,A>::find::_{{closure}} (7,492 samples, 25.13%)hashbrown::raw::RawTable<T,A>::find::_{{..hashbrown::rustc_entry::_<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry::_{{closure}} (7,018 samples, 23.54%)hashbrown::rustc_entry::_<impl hashbr..core::cmp::impls::<impl core::cmp::PartialEq<&B> for &A>::eq (7,018 samples, 23.54%)core::cmp::impls::<impl core::cmp::Pa..core::slice::cmp::<impl core::cmp::PartialEq<[B]> for [A]>::eq (7,018 samples, 23.54%)core::slice::cmp::<impl core::cmp::Pa..hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (24 samples, 0.08%)hashbrown::raw::h2 (1,217 samples, 4.08%)hash..hashbrown::raw::sse2::Group::load (331 samples, 1.11%)core::core_arch::x86::sse2::_mm_loadu_si128 (331 samples, 1.11%)hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (329 samples, 1.10%)hashbrown::raw::sse2::Group::match_byte (1,938 samples, 6.50%)hashbrown..core::core_arch::x86::sse2::_mm_movemask_epi8 (1,938 samples, 
6.50%)core::cor..hashbrown::raw::sse2::Group::match_empty (283 samples, 0.95%)hashbrown::raw::sse2::Group::match_byte (283 samples, 0.95%)core::core_arch::x86::sse2::_mm_movemask_epi8 (283 samples, 0.95%)hashbrown::raw::RawTableInner::find_inner (12,826 samples, 43.03%)hashbrown::raw::RawTableInner::find_innerhashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (32 samples, 0.11%)hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (16,926 samples, 56.78%)hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entryhashbrown::raw::RawTable<T,A>::find (13,603 samples, 45.63%)hashbrown::raw::RawTable<T,A>::findhashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (5 samples, 0.02%)all (29,810 samples, 100%)rust-1brc (29,810 samples, 100.00%)rust-1brc_start (29,802 samples, 99.97%)_start__libc_start_main (29,802 samples, 99.97%)__libc_start_main[libc.so.6] (29,802 samples, 99.97%)[libc.so.6]main (29,802 samples, 99.97%)mainstd::rt::lang_start_internal (29,802 samples, 99.97%)std::rt::lang_start_internalstd::rt::lang_start::_{{closure}} (29,802 samples, 99.97%)std::rt::lang_start::_{{closure}}std::sys_common::backtrace::__rust_begin_short_backtrace (29,802 samples, 99.97%)std::sys_common::backtrace::__rust_begin_short_backtracecore::ops::function::FnOnce::call_once (29,802 samples, 99.97%)core::ops::function::FnOnce::call_oncerust_1brc::main (29,802 samples, 99.97%)rust_1brc::mainrust_1brc::calculate_station_values (29,802 samples, 99.97%)rust_1brc::calculate_station_valuesstd::collections::hash::map::HashMap<K,V,S>::entry (17,262 samples, 57.91%)std::collections::hash::map::HashMap<K,V,S>::entrystd::collections::hash::map::map_entry (97 samples, 0.33%) \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 51cfa99..729b5d3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,7 @@ use std::time::Instant; use fast_float; use rustc_hash::FxHashMap; use memmap2::Mmap; 
+use memchr::memchr; #[derive(Parser, Debug)] #[command( @@ -25,44 +26,46 @@ struct StationValues { count: u32, } -fn read_line(data: &[u8]) -> (&[u8], f32) { - let mut parts = data.rsplit(|&c| c == b';'); - let value_str = parts.next().expect("Failed to parse value string"); - let value = fast_float::parse(value_str).expect("Failed to parse value"); - let station_name = parts.next().expect("Failed to parse station name"); - (station_name, value) -} - // Calculate the station values fn calculate_station_values(data:&[u8]) -> FxHashMap<&[u8], StationValues> { let mut result: FxHashMap<&[u8], StationValues> = FxHashMap::default(); - let lines = data.split(|&c| c == b'\n'); - for line in lines { - if line.is_empty() { - continue; - } + let mut buffer = data; + loop { + match memchr(b';', &buffer) { + None => { + break; + } + Some(comma_seperator) => { + let end = memchr(b'\n', &buffer[comma_seperator..]).unwrap(); + let name = &buffer[..comma_seperator]; + let value = &buffer[comma_seperator+1..comma_seperator+end]; + let value = fast_float::parse(value).expect("Failed to parse value"); + + result + .entry(name) + .and_modify(|e| { + if value < e.min { + e.min = value; + } + if value > e.max { + e.max = value; + } + e.mean = e.mean + value; + e.count += 1; + }) + .or_insert(StationValues { + min: value, + max: value, + mean: value, + count: 1, + }); + buffer = &buffer[comma_seperator+end+1..]; + } - let (station_name, value) = read_line(line); - result - .entry(station_name) - .and_modify(|e| { - if value < e.min { - e.min = value; - } - if value > e.max { - e.max = value; - } - e.mean = e.mean + value; - e.count += 1; - }) - .or_insert(StationValues { - min: value, - max: value, - mean: value, - count: 1, - }); + } } + // Calculate the mean for all entries and round off to 1 decimal place for (_, station_values) in result.iter_mut() { station_values.mean = round_off(station_values.mean / station_values.count as f32);