diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 6ccd2f4766e67..5380debe3b696 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -60,6 +60,8 @@ struct ARROW_EXPORT ArrayStatistics { case Type::FIXED_SIZE_BINARY: case Type::LARGE_STRING: case Type::LARGE_BINARY: + case Type::BINARY_VIEW: + case Type::STRING_VIEW: return array_type; default: return utf8(); diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 8bffa808a4ed5..700e1bb2c9af1 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -32,6 +32,7 @@ #include "arrow/array/builder_nested.h" #include "arrow/array/builder_union.h" #include "arrow/array/concatenate.h" +#include "arrow/array/statistics.h" #include "arrow/array/validate.h" #include "arrow/c/abi.h" #include "arrow/pretty_print.h" @@ -39,6 +40,7 @@ #include "arrow/table.h" #include "arrow/tensor.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/iterator.h" #include "arrow/util/logging_internal.h" #include "arrow/util/vector.h" @@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } return Status::OK(); } +struct StringBuilderVisitor { + template + enable_if_has_string_view Visit(const DataType&, + ArrayBuilder* raw_builder, + const std::string& value) { + using Builder = typename TypeTraits::BuilderType; + auto builder = static_cast(raw_builder); + return builder->Append(value); + } + + Status Visit(const DataType& type, ArrayBuilder*, const std::string&) { + return Status::Invalid("Only string types are supported and the current type is ", + type.ToString()); + } +}; } // namespace Result> RecordBatch::MakeStatisticsArray( @@ -580,7 +597,7 @@ Result> RecordBatch::MakeStatisticsArray( RETURN_NOT_OK(EnumerateStatistics(*this, [&](const EnumeratedStatistics& statistics) { int8_t i = 0; for (const auto& field : values_types) { - if (field->type()->id() == statistics.type->id()) { + if (field->type()->Equals(statistics.type)) { break; } i++; @@ -680,8 +697,8 @@ Result> RecordBatch::MakeStatisticsArray( return static_cast(builder)->Append(value); } Status operator()(const std::string& value) { - return static_cast(builder)->Append( - value.data(), static_cast(value.size())); + StringBuilderVisitor visitor; + return VisitTypeInline(*builder->type(), &visitor, builder, value); } } visitor; visitor.builder = values_builders[values_type_index].get(); diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index a659f8798e5c2..3dc847bb96cd7 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -26,13 +26,17 @@ #include #include #include +#include #include #include #include "arrow/array/array_base.h" #include "arrow/array/array_dict.h" #include "arrow/array/array_nested.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" #include "arrow/array/data.h" +#include "arrow/array/statistics.h" #include "arrow/array/util.h" #include "arrow/c/abi.h" #include "arrow/chunked_array.h" @@ -45,9 +49,11 @@ #include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/type_traits.h" #include "arrow/util/float16.h" #include "arrow/util/iterator.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/visit_type_inline.h" namespace arrow { @@ -1033,15 +1039,32 @@ Result> BuildArray( } return builder.Finish(); } +struct StringBuilderVisitor { + template + enable_if_t::value, Status> Visit( + const DataType&, ArrayBuilder* raw_builder, + const std::vector& values) { + using Builder = typename TypeTraits::BuilderType; + auto builder = static_cast(raw_builder); + for (const auto& value : values) { + ARROW_RETURN_NOT_OK(builder->Append(value)); + } + return Status::OK(); + } -template > -Result> BuildArray(const std::vector& values) { - using BuilderType = typename TypeTraits::BuilderType; - BuilderType builder; - for (const auto& value : values) { - ARROW_RETURN_NOT_OK(builder.Append(value)); + Status Visit(const DataType& type, ArrayBuilder*, const std::vector&) { + return Status::Invalid("Only string types are supported and the current type is", + type.ToString()); } - return builder.Finish(); +}; +Result> BuildArray(const std::shared_ptr& string_type, + const std::vector& values) { + std::unique_ptr array_builder; + ARROW_RETURN_NOT_OK(MakeBuilder(default_memory_pool(), string_type, &array_builder)); + StringBuilderVisitor visitor; + ARROW_RETURN_NOT_OK( + VisitTypeInline(*string_type, &visitor, array_builder.get(), values)); + return array_builder->Finish(); } template @@ -1056,41 +1079,44 @@ std::vector StatisticsValuesToRawValues( template ::value>> -Result> BuildArray(const std::vector& values) { +Result> BuildArray(const std::vector& values, + const std::shared_ptr& array_type) { struct Builder { - const std::vector& values_; - explicit Builder(const std::vector& values) - : values_(values) {} + const std::vector& values; + const std::shared_ptr& array_type; + explicit Builder(const std::vector& values, + const std::shared_ptr& array_type) + : values(values), array_type(array_type) {} Result> operator()(const bool&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const int64_t&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const uint64_t&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const double&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const std::string&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(array_type, raw_values); } - } builder(values); + } builder(values, array_type); return std::visit(builder, values[0]); } Result> MakeStatisticsArray( const std::string& columns_json, const std::vector>& nested_statistics_keys, - const std::vector>& - nested_statistics_values) { + const std::vector>& nested_statistics_values, + const std::vector>& array_types = {}) { auto columns_type = int32(); auto columns_array = ArrayFromJSON(columns_type, columns_json); const auto n_columns = columns_array->length(); @@ -1137,6 +1163,7 @@ Result> MakeStatisticsArray( for (size_t i = 0; i < nested_statistics_keys.size(); ++i) { const auto& statistics_keys = nested_statistics_keys[i]; const auto& statistics_values = nested_statistics_values[i]; + const auto& array_type = (i < array_types.size()) ? array_types[i] : null(); statistics_offsets.push_back(offset); for (size_t j = 0; j < statistics_keys.size(); ++j) { const auto& key = statistics_keys[j]; @@ -1154,11 +1181,11 @@ Result> MakeStatisticsArray( } keys_indices.push_back(key_index); - auto values_type = ArrayStatistics::ValueToArrowType(value, arrow::null()); + auto values_type = ArrayStatistics::ValueToArrowType(value, array_type); int8_t values_type_code = 0; for (; values_type_code < static_cast(values_types.size()); ++values_type_code) { - if (values_types[values_type_code] == values_type) { + if (values_types[values_type_code]->Equals(values_type)) { break; } } @@ -1186,16 +1213,18 @@ Result> MakeStatisticsArray( struct_({field("column", columns_type), field("statistics", statistics_type)}); ARROW_ASSIGN_OR_RAISE(auto keys_indices_array, BuildArray(keys_indices)); - ARROW_ASSIGN_OR_RAISE(auto keys_dictionary_array, - BuildArray(keys_dictionary)); + // The statistics schema specifies the type of dictionary key is utf8(StringType) + ARROW_ASSIGN_OR_RAISE(auto keys_dictionary_array, BuildArray(utf8(), keys_dictionary)); ARROW_ASSIGN_OR_RAISE( auto keys_array, DictionaryArray::FromArrays(keys_type, keys_indices_array, keys_dictionary_array)); std::vector> values_arrays; - for (const auto& values : values_values) { + for (size_t i = 0; i < values_values.size(); ++i) { + const auto& values = values_values[i]; + const auto& array_type = (i < array_types.size()) ? array_types[i] : null(); ARROW_ASSIGN_OR_RAISE(auto values_array, - BuildArray(values)); + BuildArray(values, array_type)); values_arrays.push_back(values_array); } ARROW_ASSIGN_OR_RAISE(auto values_value_type_ids_array, @@ -1215,6 +1244,21 @@ Result> MakeStatisticsArray( std::move(statistics_array)}; return std::make_shared(struct_type, n_columns, struct_arrays); } + +std::shared_ptr GenerateString(const std::shared_ptr& data_type) { + if (data_type->id() == Type::FIXED_SIZE_BINARY) { + auto byte_width = data_type->byte_width(); + std::string a(byte_width, 'a'); + std::string b(byte_width, 'b'); + std::string c(byte_width, 'c'); + std::stringstream ss; + ss << R"([")" << a << R"(",")" << b << R"(",")" << c << R"("])"; + return ArrayFromJSON(data_type, ss.str()); + } else { + return ArrayFromJSON(data_type, R"(["a","b","c"])"); + } +} + }; // namespace TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) { @@ -1423,34 +1467,148 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template +class TestRecordBatchMakeStatisticsArrayBinary : public ::testing::Test { + public: + void TestMaxApproximation() { + ArrayStatistics::ValueType max("c"); + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("string", type())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array = GenerateString(type()); + string_array->data()->statistics = std::make_shared(); + string_array->data()->statistics->max = max; + + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + max, + }}, + {null(), type()})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + } + + std::shared_ptr<::arrow::DataType> type() { + if constexpr (std::is_same_v) { + return fixed_size_binary(1); + } else { + return TypeTraits::type_singleton(); + } + } +}; +TYPED_TEST_SUITE(TestRecordBatchMakeStatisticsArrayBinary, + AllBinaryOrBinrayViewLikeArrowTypes); + +TYPED_TEST(TestRecordBatchMakeStatisticsArrayBinary, MaxApproximation) { + this->TestMaxApproximation(); +} + +// Validates that the union array creates two distinct child arrays for two +// FixedSizeBinaryArrays with unequal byte widths. +TEST_F(TestRecordBatch, MakeStatisticsArrayDifferentSizeFixedSizeBinary) { + auto fixed_size_type1 = fixed_size_binary(1); + auto fixed_size_type2 = fixed_size_binary(2); + + auto fixed_size_array1 = GenerateString(fixed_size_type1); + fixed_size_array1->data()->statistics = std::make_shared(); + fixed_size_array1->data()->statistics->max = + std::string(fixed_size_type1->byte_width(), 'c'); + + auto fixed_size_array2 = GenerateString(fixed_size_type2); + fixed_size_array2->data()->statistics = std::make_shared(); + fixed_size_array2->data()->statistics->max = + std::string(fixed_size_type2->byte_width(), 'c'); + + auto schema = ::arrow::schema( + {field("fixed_size1", fixed_size_type1), field("fixed_size2", fixed_size_type2)}); + auto batch = RecordBatch::Make(schema, fixed_size_array1->length(), + {fixed_size_array1, fixed_size_array2}); ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 0, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{ + std::string(fixed_size_type1->byte_width(), 'c')}, + }, + { + ArrayStatistics::ValueType{ + std::string(fixed_size_type2->byte_width(), 'c')}, + }}, + {null(), fixed_size_type1, fixed_size_type2})); + + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +// Validates that the union array creates a single child array for two +// FixedSizeBinaryArrays with equal byte widths. +TEST_F(TestRecordBatch, MakeStatisticsArraySameSizeFixedSizeBinary) { + auto fixed_size_type = fixed_size_binary(2); + ArrayStatistics::ValueType max(std::string(fixed_size_type->byte_width(), 'c')); + + auto fixed_size_array1 = GenerateString(fixed_size_type); + fixed_size_array1->data()->statistics = std::make_shared(); + fixed_size_array1->data()->statistics->max = max; + + ASSERT_OK_AND_ASSIGN(auto fixed_size_array2, + fixed_size_array1->CopyTo(default_cpu_memory_manager())); + + auto schema = ::arrow::schema( + {field("fixed_size1", fixed_size_type), field("fixed_size2", fixed_size_type)}); + auto batch = RecordBatch::Make(schema, fixed_size_array1->length(), + {fixed_size_array1, fixed_size_array2}); + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 0, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + max, + }, + { + max, + }}, + {null(), fixed_size_type, fixed_size_type})); - ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, - MakeStatisticsArray("[null, 1]", - {{ - ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, - }, - { - ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, - }}, - {{ - ArrayStatistics::ValueType{int64_t{3}}, - }, - { - ArrayStatistics::ValueType{"c"}, - }})); AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 89a986097f878..2ff4b72af402d 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -182,6 +182,9 @@ using BaseBinaryArrowTypes = using BaseBinaryOrBinaryViewLikeArrowTypes = ::testing::Types; +using AllBinaryOrBinrayViewLikeArrowTypes = + ::testing::Types; using BinaryArrowTypes = ::testing::Types;