From 93e4eb25c7a09e3d1eb0210f47639b3c836cd5af Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Tue, 23 Jul 2024 15:04:34 -0400 Subject: [PATCH] Faster `GenericByteView` construction (#6102) * add benchmark to track performance * fast byte view construction * make doc happy * fix clippy * update comments --- .../src/builder/generic_bytes_view_builder.rs | 56 ++++++++++++++----- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 587255cc6b6a..7726ee35240f 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -420,23 +420,49 @@ pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>; /// [`GenericByteViewBuilder::append_null`] as normal. pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>; +/// Creates a view from a fixed length input (the compiler can generate +/// specialized code for this) +fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 { + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes()); + view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]); + u128::from_le_bytes(view_buffer) +} + /// Create a view based on the given data, block id and offset -#[inline(always)] +/// Note that the code below is carefully examined with x86_64 assembly code: +/// The goal is to avoid calling into `ptr::copy_nonoverlapping`, which makes a function call (i.e., not inlined), +/// which slows down things. 
+#[inline(never)] pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { - let len = data.len() as u32; - if len <= 12 { - let mut view_buffer = [0; 16]; - view_buffer[0..4].copy_from_slice(&len.to_le_bytes()); - view_buffer[4..4 + data.len()].copy_from_slice(data); - u128::from_le_bytes(view_buffer) - } else { - let view = ByteView { - length: len, - prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), - buffer_index: block_id, - offset, - }; - view.into() + let len = data.len(); + + // Generate specialized code for each potential small string length + // to improve performance + match len { + 0 => make_inlined_view::<0>(data), + 1 => make_inlined_view::<1>(data), + 2 => make_inlined_view::<2>(data), + 3 => make_inlined_view::<3>(data), + 4 => make_inlined_view::<4>(data), + 5 => make_inlined_view::<5>(data), + 6 => make_inlined_view::<6>(data), + 7 => make_inlined_view::<7>(data), + 8 => make_inlined_view::<8>(data), + 9 => make_inlined_view::<9>(data), + 10 => make_inlined_view::<10>(data), + 11 => make_inlined_view::<11>(data), + 12 => make_inlined_view::<12>(data), + // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead. + _ => { + let view = ByteView { + length: len as u32, + prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), + buffer_index: block_id, + offset, + }; + view.as_u128() + } } }