Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

ORC writer API changes for granular statistics #10058

Merged
merged 34 commits into from
Jan 20, 2022
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
b8ae756
separate stats by level
vuule Jan 11, 2022
d6c18c8
encode rg stats
vuule Jan 13, 2022
cb2b972
rename putb
vuule Jan 13, 2022
95f018c
add put_bytes
vuule Jan 13, 2022
c5b62b9
include rg stats in rg index entries
vuule Jan 13, 2022
f1f6958
docs
vuule Jan 13, 2022
c078879
fix; don't use optional
vuule Jan 14, 2022
735e195
ORC writer API changes to accommodate stats granularity:
mythrocks Jan 14, 2022
61733f9
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Jan 14, 2022
4bfe814
return written size from put_byte and put_bytes
vuule Jan 14, 2022
1be86c2
Optional serialization of rowgroup statistics.
mythrocks Jan 14, 2022
2535754
reuse encode_field_number in protobuf writer
vuule Jan 15, 2022
683a016
copyright year
vuule Jan 15, 2022
61a8bec
comment
vuule Jan 15, 2022
961d989
Python support for ORC Writer Statistics API changes.
mythrocks Jan 15, 2022
8804cde
Removed unused is_statistics_enabled() method.
mythrocks Jan 15, 2022
8e47d45
Merge remote-tracking branch 'vuule/fea-rowgroup-stats' into fea-orc-…
mythrocks Jan 18, 2022
79b1dbd
Fixups after review:
mythrocks Jan 18, 2022
5cd27bb
Fix range of stats-frequency in ORC writer benchmark.
mythrocks Jan 18, 2022
841238d
ORC Statistics Tests
mythrocks Jan 19, 2022
6161673
Update Copyright year.
mythrocks Jan 19, 2022
b234961
Merge remote-tracking branch 'origin/branch-22.02' into fea-orc-stats…
mythrocks Jan 19, 2022
dd3b9b2
Removed redundant conversion between ORC and Parquet terms.
mythrocks Jan 19, 2022
65fdf83
Merge remote-tracking branch 'origin/branch-22.02' into fea-orc-stats…
mythrocks Jan 19, 2022
493df76
Correct parameter name for statistics in to_orc():
mythrocks Jan 19, 2022
9769010
Fix skip-offset for orc_encode_statistics.
mythrocks Jan 19, 2022
2748ab8
Remove unused header.
mythrocks Jan 19, 2022
a02c505
Fix logic for skipping rowgroup stats serialization.
mythrocks Jan 19, 2022
b480b8d
Replace magic numbers for ORC statistics constants
mythrocks Jan 19, 2022
6465028
Fix formatting for ORC Writer Python bindings
mythrocks Jan 19, 2022
edaade3
Test stats serialization for non-default stats frequency
mythrocks Jan 19, 2022
c4dead7
Switch `test_chunked_orc_writer_statistics_frequency` to run on small…
mythrocks Jan 19, 2022
5fb0464
Merge remote-tracking branch 'origin/branch-22.02' into fea-orc-stats…
mythrocks Jan 20, 2022
a7cb148
Formatting changes.
mythrocks Jan 20, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "cudf/io/types.hpp"
#include <benchmark/benchmark.h>

#include <benchmarks/common/generate_benchmark_input.hpp>
Expand Down Expand Up @@ -65,8 +66,14 @@ void BM_orc_write_varying_inout(benchmark::State& state)

void BM_orc_write_varying_options(benchmark::State& state)
{
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = state.range(1) != 0;
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const stats_freq = [&] {
switch (state.range(2)) {
case 0: return cudf::io::STATISTICS_NONE;
case 1: return cudf::io::ORC_STATISTICS_STRIPE;
default: return cudf::io::ORC_STATISTICS_ROW_GROUP;
}
}();

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
Expand All @@ -85,7 +92,7 @@ void BM_orc_write_varying_options(benchmark::State& state)
cudf_io::orc_writer_options const options =
cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression)
.enable_statistics(enable_stats);
.enable_statistics(stats_freq);
cudf_io::write_orc(options);
}

Expand Down
79 changes: 58 additions & 21 deletions cpp/include/cudf/io/orc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,18 @@ table_with_metadata read_orc(
*/
class orc_writer_options_builder;

/**
* @brief Constants to disambiguate statistics terminology for ORC.
*
* ORC refers to its finest granularity of row-grouping as "row group",
* which corresponds to Parquet "pages".
* Similarly, ORC's "stripe" corresponds to a Parquet "row group".
* The following constants disambiguate the terminology for the statistics
* collected at each level.
*/
static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;

/**
* @brief Settings to use for `write_orc()`.
*/
Expand All @@ -442,8 +454,8 @@ class orc_writer_options {
sink_info _sink;
// Specify the compression format to use
compression_type _compression = compression_type::AUTO;
// Enable writing column statistics
bool _enable_statistics = true;
// Specify frequency of statistics collection
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
// Maximum size of each stripe (unless smaller than a single row group)
size_t _stripe_size_bytes = default_stripe_size_bytes;
// Maximum number of rows in stripe (unless smaller than a single row group)
Expand Down Expand Up @@ -501,7 +513,12 @@ class orc_writer_options {
/**
* @brief Whether writing column statistics is enabled/disabled.
*/
bool is_enabled_statistics() const { return _enable_statistics; }
bool is_enabled_statistics() const { return _stats_freq != statistics_freq::STATISTICS_NONE; }

/**
* @brief Returns frequency of statistics collection.
*/
statistics_freq get_statistics_freq() const { return _stats_freq; }

/**
* @brief Returns maximum stripe size, in bytes.
Expand Down Expand Up @@ -547,11 +564,16 @@ class orc_writer_options {
void set_compression(compression_type comp) { _compression = comp; }

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of statistics collection.
*
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
*
* @param val Boolean value to enable/disable statistics.
* @param val Frequency of statistics collection.
*/
void enable_statistics(bool val) { _enable_statistics = val; }
void enable_statistics(statistics_freq val) { _stats_freq = val; }

/**
* @brief Sets the maximum stripe size, in bytes.
Expand Down Expand Up @@ -644,14 +666,19 @@ class orc_writer_options_builder {
}

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of column statistics to be written
*
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
*
* @param val Boolean value to enable/disable.
* @param val Level of statistics collection.
* @return this for chaining.
*/
orc_writer_options_builder& enable_statistics(bool val)
orc_writer_options_builder& enable_statistics(statistics_freq val)
{
options._enable_statistics = val;
options._stats_freq = val;
return *this;
}

Expand Down Expand Up @@ -772,8 +799,8 @@ class chunked_orc_writer_options {
sink_info _sink;
// Specify the compression format to use
compression_type _compression = compression_type::AUTO;
// Enable writing column statistics
bool _enable_statistics = true;
// Specify granularity of statistics collection
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
// Maximum size of each stripe (unless smaller than a single row group)
size_t _stripe_size_bytes = default_stripe_size_bytes;
// Maximum number of rows in stripe (unless smaller than a single row group)
Expand Down Expand Up @@ -822,9 +849,9 @@ class chunked_orc_writer_options {
compression_type get_compression() const { return _compression; }

/**
* @brief Whether writing column statistics is enabled/disabled.
* @brief Returns granularity of statistics collection.
*/
bool is_enabled_statistics() const { return _enable_statistics; }
statistics_freq get_statistics_freq() const { return _stats_freq; }

/**
* @brief Returns maximum stripe size, in bytes.
Expand Down Expand Up @@ -865,11 +892,16 @@ class chunked_orc_writer_options {
void set_compression(compression_type comp) { _compression = comp; }

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of statistics collection
*
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
*
* @param val Boolean value to enable/disable.
* @param val Frequency of statistics collection.
*/
void enable_statistics(bool val) { _enable_statistics = val; }
void enable_statistics(statistics_freq val) { _stats_freq = val; }

/**
* @brief Sets the maximum stripe size, in bytes.
Expand Down Expand Up @@ -952,14 +984,19 @@ class chunked_orc_writer_options_builder {
}

/**
* @brief Enable/Disable writing column statistics.
* @brief Choose granularity of statistics collection
*
* The granularity can be set to:
* - cudf::io::STATISTICS_NONE: No statistics are collected.
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
*
* @param val Boolean value to enable/disable.
* @param val Frequency of statistics collection.
* @return this for chaining.
*/
chunked_orc_writer_options_builder& enable_statistics(bool val)
chunked_orc_writer_options_builder& enable_statistics(statistics_freq val)
{
options._enable_statistics = val;
options._stats_freq = val;
return *this;
}

Expand Down
57 changes: 34 additions & 23 deletions cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -209,43 +209,54 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk,
int32_t data_ofs,
int32_t data2_blk,
int32_t data2_ofs,
TypeKind kind)
TypeKind kind,
ColStatsBlob const* stats)
{
size_t sz = 0, lpos;
putb(1 * 8 + PB_TYPE_FIXEDLEN); // 1:RowIndex.entry
put_uint(encode_field_number(1, PB_TYPE_FIXEDLEN)); // 1:RowIndex.entry
lpos = m_buf->size();
putb(0xcd); // sz+2
putb(1 * 8 + PB_TYPE_FIXEDLEN); // 1:positions[packed=true]
putb(0xcd); // sz
put_byte(0xcd); // sz+2
put_uint(encode_field_number(1, PB_TYPE_FIXEDLEN)); // 1:positions[packed=true]
put_byte(0xcd); // sz
if (present_blk >= 0) sz += put_uint(present_blk);
if (present_ofs >= 0) {
sz += put_uint(present_ofs) + 2;
putb(0); // run pos = 0
putb(0); // bit pos = 0
sz += put_uint(present_ofs);
sz += put_byte(0); // run pos = 0
sz += put_byte(0); // bit pos = 0
}
if (data_blk >= 0) { sz += put_uint(data_blk); }
if (data_ofs >= 0) {
sz += put_uint(data_ofs);
if (kind != STRING && kind != FLOAT && kind != DOUBLE && kind != DECIMAL) {
putb(0); // RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz++;
// RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz += put_byte(0);
if (kind == BOOLEAN) {
putb(0); // bit position in byte, always zero
sz++;
// bit position in byte, always zero
sz += put_byte(0);
}
}
}
if (kind !=
INT) // INT kind can be passed in to bypass 2nd stream index (dictionary length streams)
{
// INT kind can be passed in to bypass 2nd stream index (dictionary length streams)
if (kind != INT) {
if (data2_blk >= 0) { sz += put_uint(data2_blk); }
if (data2_ofs >= 0) {
sz += put_uint(data2_ofs) + 1;
putb(0); // RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz += put_uint(data2_ofs);
// RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz += put_byte(0);
}
}
m_buf->data()[lpos] = (uint8_t)(sz + 2);
// size of the field 1
m_buf->data()[lpos + 2] = (uint8_t)(sz);

if (stats != nullptr) {
sz += put_uint(encode_field_number<decltype(*stats)>(2)); // 2: statistics
// Statistics field contains its length as varint and dtype specific data (encoded on the GPU)
sz += put_uint(stats->size());
sz += put_bytes(*stats);
}

// size of the whole row index entry
m_buf->data()[lpos] = (uint8_t)(sz + 2);
}

size_t ProtobufWriter::write(const PostScript& s)
Expand All @@ -256,7 +267,7 @@ size_t ProtobufWriter::write(const PostScript& s)
if (s.compression != NONE) { w.field_uint(3, s.compressionBlockSize); }
w.field_packed_uint(4, s.version);
w.field_uint(5, s.metadataLength);
w.field_string(8000, s.magic);
w.field_blob(8000, s.magic);
return w.value();
}

Expand Down Expand Up @@ -300,8 +311,8 @@ size_t ProtobufWriter::write(const SchemaType& s)
size_t ProtobufWriter::write(const UserMetadataItem& s)
{
ProtobufFieldWriter w(this);
w.field_string(1, s.name);
w.field_string(2, s.value);
w.field_blob(1, s.name);
w.field_blob(2, s.value);
return w.value();
}

Expand All @@ -310,7 +321,7 @@ size_t ProtobufWriter::write(const StripeFooter& s)
ProtobufFieldWriter w(this);
w.field_repeated_struct(1, s.streams);
w.field_repeated_struct(2, s.columns);
if (s.writerTimezone != "") { w.field_string(3, s.writerTimezone); }
if (s.writerTimezone != "") { w.field_blob(3, s.writerTimezone); }
return w.value();
}

Expand Down
Loading