Skip to content

Commit 01ffb64

Browse files
dhegbergalamb
andauthored
Support Null regex override in csv parser options. (#13228)
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 1fc7769 commit 01ffb64

File tree

14 files changed

+292
-62
lines changed

14 files changed

+292
-62
lines changed

datafusion-cli/Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/src/config.rs

+3
Original file line numberDiff line numberDiff line change
@@ -1655,7 +1655,10 @@ config_namespace! {
16551655
pub timestamp_format: Option<String>, default = None
16561656
pub timestamp_tz_format: Option<String>, default = None
16571657
pub time_format: Option<String>, default = None
1658+
// The output format for Nulls in the CSV writer.
16581659
pub null_value: Option<String>, default = None
1660+
// The input regex for Nulls when loading CSVs.
1661+
pub null_regex: Option<String>, default = None
16591662
pub comment: Option<u8>, default = None
16601663
}
16611664
}

datafusion/core/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ object_store = { workspace = true }
121121
parking_lot = { workspace = true }
122122
parquet = { workspace = true, optional = true, default-features = true }
123123
rand = { workspace = true }
124+
regex = { workspace = true }
124125
sqlparser = { workspace = true }
125126
tempfile = { workspace = true }
126127
tokio = { workspace = true }

datafusion/core/benches/csv_load.rs

+10
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,16 @@ fn criterion_benchmark(c: &mut Criterion) {
7575
)
7676
})
7777
});
78+
79+
group.bench_function("null regex override", |b| {
80+
b.iter(|| {
81+
load_csv(
82+
ctx.clone(),
83+
test_file.path().to_str().unwrap(),
84+
CsvReadOptions::default().null_regex(Some("^NULL$|^$".to_string())),
85+
)
86+
})
87+
});
7888
}
7989

8090
criterion_group!(benches, criterion_benchmark);

datafusion/core/src/datasource/file_format/csv.rs

+78-3
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement;
5959
use futures::stream::BoxStream;
6060
use futures::{pin_mut, Stream, StreamExt, TryStreamExt};
6161
use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore};
62+
use regex::Regex;
6263

6364
#[derive(Default)]
6465
/// Factory struct used to create [CsvFormatFactory]
@@ -218,6 +219,13 @@ impl CsvFormat {
218219
self
219220
}
220221

222+
/// Set the regex to use for null values in the CSV reader.
223+
/// - default to treat empty values as null.
224+
pub fn with_null_regex(mut self, null_regex: Option<String>) -> Self {
225+
self.options.null_regex = null_regex;
226+
self
227+
}
228+
221229
/// Returns `Some(true)` if the first line is a header, `Some(false)` if
222230
/// it is not, and `None` if it is not specified.
223231
pub fn has_header(&self) -> Option<bool> {
@@ -502,6 +510,12 @@ impl CsvFormat {
502510
.with_delimiter(self.options.delimiter)
503511
.with_quote(self.options.quote);
504512

513+
if let Some(null_regex) = &self.options.null_regex {
514+
let regex = Regex::new(null_regex.as_str())
515+
.expect("Unable to parse CSV null regex.");
516+
format = format.with_null_regex(regex);
517+
}
518+
505519
if let Some(escape) = self.options.escape {
506520
format = format.with_escape(escape);
507521
}
@@ -813,8 +827,67 @@ mod tests {
813827
let state = session_ctx.state();
814828

815829
let projection = None;
816-
let exec =
817-
get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?;
830+
let root = "./tests/data/csv";
831+
let format = CsvFormat::default().with_has_header(true);
832+
let exec = scan_format(
833+
&state,
834+
&format,
835+
root,
836+
"aggregate_test_100_with_nulls.csv",
837+
projection,
838+
None,
839+
)
840+
.await?;
841+
842+
let x: Vec<String> = exec
843+
.schema()
844+
.fields()
845+
.iter()
846+
.map(|f| format!("{}: {:?}", f.name(), f.data_type()))
847+
.collect();
848+
assert_eq!(
849+
vec![
850+
"c1: Utf8",
851+
"c2: Int64",
852+
"c3: Int64",
853+
"c4: Int64",
854+
"c5: Int64",
855+
"c6: Int64",
856+
"c7: Int64",
857+
"c8: Int64",
858+
"c9: Int64",
859+
"c10: Utf8",
860+
"c11: Float64",
861+
"c12: Float64",
862+
"c13: Utf8",
863+
"c14: Null",
864+
"c15: Utf8"
865+
],
866+
x
867+
);
868+
869+
Ok(())
870+
}
871+
872+
#[tokio::test]
873+
async fn infer_schema_with_null_regex() -> Result<()> {
874+
let session_ctx = SessionContext::new();
875+
let state = session_ctx.state();
876+
877+
let projection = None;
878+
let root = "./tests/data/csv";
879+
let format = CsvFormat::default()
880+
.with_has_header(true)
881+
.with_null_regex(Some("^NULL$|^$".to_string()));
882+
let exec = scan_format(
883+
&state,
884+
&format,
885+
root,
886+
"aggregate_test_100_with_nulls.csv",
887+
projection,
888+
None,
889+
)
890+
.await?;
818891

819892
let x: Vec<String> = exec
820893
.schema()
@@ -836,7 +909,9 @@ mod tests {
836909
"c10: Utf8",
837910
"c11: Float64",
838911
"c12: Float64",
839-
"c13: Utf8"
912+
"c13: Utf8",
913+
"c14: Null",
914+
"c15: Null"
840915
],
841916
x
842917
);

datafusion/core/src/datasource/file_format/options.rs

+11-1
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ pub struct CsvReadOptions<'a> {
8787
pub file_compression_type: FileCompressionType,
8888
/// Indicates how the file is sorted
8989
pub file_sort_order: Vec<Vec<SortExpr>>,
90+
/// Optional regex to match null values
91+
pub null_regex: Option<String>,
9092
}
9193

9294
impl Default for CsvReadOptions<'_> {
@@ -112,6 +114,7 @@ impl<'a> CsvReadOptions<'a> {
112114
file_compression_type: FileCompressionType::UNCOMPRESSED,
113115
file_sort_order: vec![],
114116
comment: None,
117+
null_regex: None,
115118
}
116119
}
117120

@@ -212,6 +215,12 @@ impl<'a> CsvReadOptions<'a> {
212215
self.file_sort_order = file_sort_order;
213216
self
214217
}
218+
219+
/// Configure the null parsing regex.
220+
pub fn null_regex(mut self, null_regex: Option<String>) -> Self {
221+
self.null_regex = null_regex;
222+
self
223+
}
215224
}
216225

217226
/// Options that control the reading of Parquet files.
@@ -534,7 +543,8 @@ impl ReadOptions<'_> for CsvReadOptions<'_> {
534543
.with_terminator(self.terminator)
535544
.with_newlines_in_values(self.newlines_in_values)
536545
.with_schema_infer_max_rec(self.schema_infer_max_records)
537-
.with_file_compression_type(self.file_compression_type.to_owned());
546+
.with_file_compression_type(self.file_compression_type.to_owned())
547+
.with_null_regex(self.null_regex.clone());
538548

539549
ListingOptions::new(Arc::new(file_format))
540550
.with_file_extension(self.file_extension)

0 commit comments

Comments
 (0)