From 8ec44cd3c4ae8171021a47eb111f277d16dc2a76 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 24 Jan 2024 23:53:10 +0000 Subject: [PATCH 1/3] Add ListBuilder::with_field (#5330) --- .../src/builder/generic_list_builder.rs | 110 +++++++++++------- 1 file changed, 71 insertions(+), 39 deletions(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 116e2553cfb7..1f4f6fbc08f6 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -17,10 +17,9 @@ use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; -use arrow_buffer::Buffer; use arrow_buffer::NullBufferBuilder; -use arrow_data::ArrayData; -use arrow_schema::Field; +use arrow_buffer::{Buffer, OffsetBuffer}; +use arrow_schema::{Field, FieldRef}; use std::any::Any; use std::sync::Arc; @@ -92,6 +91,7 @@ pub struct GenericListBuilder { offsets_builder: BufferBuilder, null_buffer_builder: NullBufferBuilder, values_builder: T, + field: Option, } impl Default for GenericListBuilder { @@ -116,6 +116,19 @@ impl GenericListBuilder) -> Self { + Self { + field: Some(field.into()), + ..self } } } @@ -275,53 +288,37 @@ where /// Builds the [`GenericListArray`] and reset this builder. pub fn finish(&mut self) -> GenericListArray { - let len = self.len(); - let values_arr = self.values_builder.finish(); - let values_data = values_arr.to_data(); + let values = self.values_builder.finish(); + let nulls = self.null_buffer_builder.finish(); - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.null_buffer_builder.finish(); + let offsets = self.offsets_builder.finish(); + // Safety: Safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; self.offsets_builder.append(OffsetSize::zero()); - let field = Arc::new(Field::new( - "item", - values_data.data_type().clone(), - true, // TODO: find a consistent way of getting this - )); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(field); - let array_data_builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data) - .nulls(null_bit_buffer); - let array_data = unsafe { array_data_builder.build_unchecked() }; + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; - GenericListArray::::from(array_data) + GenericListArray::new(field, offsets, values, nulls) } /// Builds the [`GenericListArray`] without resetting the builder. pub fn finish_cloned(&self) -> GenericListArray { - let len = self.len(); - let values_arr = self.values_builder.finish_cloned(); - let values_data = values_arr.to_data(); - - let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + let values = self.values_builder.finish_cloned(); let nulls = self.null_buffer_builder.finish_cloned(); - let field = Arc::new(Field::new( - "item", - values_data.data_type().clone(), - true, // TODO: find a consistent way of getting this - )); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(field); - let array_data_builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data) - .nulls(nulls); - let array_data = unsafe { array_data_builder.build_unchecked() }; + let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + // Safety: safe by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; - GenericListArray::::from(array_data) + GenericListArray::new(field, offsets, values, nulls) } /// Returns the current offsets buffer as a slice @@ -765,4 +762,39 @@ mod tests { assert_eq!(0, i1.null_count()); assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); } + + #[test] + fn test_non_nullable_list() { + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_null(); // This is fine as nullability refers to nullability of values + builder.append_value([Some(4)]); + let array = builder.finish(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::List(field.clone())); + + builder.append_value([Some(4), Some(5)]); + let array = builder.finish(); + assert_eq!(array.data_type(), &DataType::List(field)); + assert_eq!(array.len(), 1); + } + + #[test] + #[should_panic(expected = "Non-nullable field of ListArray \\\"item\\\" cannot contain nulls")] + fn test_checks_nullability() { + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), None]); + builder.finish(); + } + + #[test] + #[should_panic(expected = "ListArray expected data type Int64 got Int32")] + fn test_checks_data_type() { + let field = Arc::new(Field::new("item", DataType::Int64, false)); + let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1)]); + builder.finish(); + } } From 2b8b37e79b1700391860232737af2a66e388f4a9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 24 Jan 2024 23:57:09 +0000 Subject: [PATCH 2/3] Tweak docs --- arrow-array/src/builder/generic_list_builder.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 1f4f6fbc08f6..570d45876c0c 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -124,7 +124,8 @@ impl GenericListBuilder) -> Self { Self { field: Some(field.into()), From c873c8f49290ea97dd3af1fcff9b0b4c9d713827 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 25 Jan 2024 12:00:08 +0000 Subject: [PATCH 3/3] Review feedback --- arrow-array/src/builder/generic_list_builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 570d45876c0c..b857224c5da6 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -765,8 +765,8 @@ mod tests { } #[test] - fn test_non_nullable_list() { - let field = Arc::new(Field::new("item", DataType::Int32, false)); + fn test_with_field() { + let field = Arc::new(Field::new("bar", DataType::Int32, false)); let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); builder.append_value([Some(1), Some(2), Some(3)]); builder.append_null(); // This is fine as nullability refers to nullability of values