
Commit 1fc7769

fix: Ignore empty files in ListingTable when listing files with or without partition filters, as well as when inferring schema (#13750)
* fix: Ignore empty files in ListingTable when listing files with or without partition filters, as well as when inferring schema
* clippy
* fix csv and json tests
* add testing for parquet
* cleanup
* fix parquet tests
* document describe_partition, add back repartition options to one of the csv empty files tests
1 parent 82a40f3 commit 1fc7769
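
In user-facing terms, a table whose listing contains 0-byte files no longer fails to scan: the empty files are dropped during listing, so such queries simply return no rows. Below is a minimal sketch of that behavior, built only from the public APIs the tests in this commit exercise (the table name and file path are illustrative):

use datafusion::error::Result;
use datafusion::prelude::{CsvReadOptions, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Register a 0-byte CSV file, like tests/data/empty_0_byte.csv in the tests below.
    ctx.register_csv(
        "empty",
        "tests/data/empty_0_byte.csv",
        CsvReadOptions::new().has_header(false),
    )
    .await?;

    // With this commit the empty file is filtered out while listing,
    // so the query yields an empty result instead of an error.
    let batches = ctx.sql("select * from empty").await?.collect().await?;
    assert!(batches.iter().all(|batch| batch.num_rows() == 0));
    Ok(())
}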


5 files changed: +180 −37 lines


datafusion/core/src/datasource/file_format/csv.rs

+10 −28
@@ -1259,73 +1259,57 @@ mod tests {
         Ok(())
     }
 
-    /// Read a single empty csv file in parallel
+    /// Read a single empty csv file
     ///
     /// empty_0_byte.csv:
    /// (file is empty)
-    #[rstest(n_partitions, case(1), case(2), case(3), case(4))]
     #[tokio::test]
-    async fn test_csv_parallel_empty_file(n_partitions: usize) -> Result<()> {
-        let config = SessionConfig::new()
-            .with_repartition_file_scans(true)
-            .with_repartition_file_min_size(0)
-            .with_target_partitions(n_partitions);
-        let ctx = SessionContext::new_with_config(config);
+    async fn test_csv_empty_file() -> Result<()> {
+        let ctx = SessionContext::new();
         ctx.register_csv(
             "empty",
             "tests/data/empty_0_byte.csv",
             CsvReadOptions::new().has_header(false),
         )
         .await?;
 
-        // Require a predicate to enable repartition for the optimizer
         let query = "select * from empty where random() > 0.5;";
         let query_result = ctx.sql(query).await?.collect().await?;
-        let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
         #[rustfmt::skip]
         let expected = ["++",
                         "++"];
         assert_batches_eq!(expected, &query_result);
-        assert_eq!(1, actual_partitions); // Won't get partitioned if all files are empty
 
         Ok(())
     }
 
-    /// Read a single empty csv file with header in parallel
+    /// Read a single empty csv file with header
     ///
     /// empty.csv:
     /// c1,c2,c3
-    #[rstest(n_partitions, case(1), case(2), case(3))]
     #[tokio::test]
-    async fn test_csv_parallel_empty_with_header(n_partitions: usize) -> Result<()> {
-        let config = SessionConfig::new()
-            .with_repartition_file_scans(true)
-            .with_repartition_file_min_size(0)
-            .with_target_partitions(n_partitions);
-        let ctx = SessionContext::new_with_config(config);
+    async fn test_csv_empty_with_header() -> Result<()> {
+        let ctx = SessionContext::new();
         ctx.register_csv(
             "empty",
             "tests/data/empty.csv",
             CsvReadOptions::new().has_header(true),
         )
         .await?;
 
-        // Require a predicate to enable repartition for the optimizer
         let query = "select * from empty where random() > 0.5;";
         let query_result = ctx.sql(query).await?.collect().await?;
-        let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
         #[rustfmt::skip]
         let expected = ["++",
                         "++"];
         assert_batches_eq!(expected, &query_result);
-        assert_eq!(n_partitions, actual_partitions);
 
         Ok(())
     }
 
-    /// Read multiple empty csv files in parallel
+    /// Read multiple empty csv files
     ///
     /// all_empty
     /// ├── empty0.csv
@@ -1334,13 +1318,13 @@ mod tests {
     ///
     /// empty0.csv/empty1.csv/empty2.csv:
     /// (file is empty)
-    #[rstest(n_partitions, case(1), case(2), case(3), case(4))]
     #[tokio::test]
-    async fn test_csv_parallel_multiple_empty_files(n_partitions: usize) -> Result<()> {
+    async fn test_csv_multiple_empty_files() -> Result<()> {
+        // Testing that partitioning doesn't break with empty files
         let config = SessionConfig::new()
             .with_repartition_file_scans(true)
             .with_repartition_file_min_size(0)
-            .with_target_partitions(n_partitions);
+            .with_target_partitions(4);
         let ctx = SessionContext::new_with_config(config);
         let file_format = Arc::new(CsvFormat::default().with_has_header(false));
         let listing_options = ListingOptions::new(file_format.clone())
@@ -1358,13 +1342,11 @@ mod tests {
         // Require a predicate to enable repartition for the optimizer
         let query = "select * from empty where random() > 0.5;";
         let query_result = ctx.sql(query).await?.collect().await?;
-        let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
         #[rustfmt::skip]
         let expected = ["++",
                         "++"];
         assert_batches_eq!(expected, &query_result);
-        assert_eq!(1, actual_partitions); // Won't get partitioned if all files are empty
 
         Ok(())
     }
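
For context on the assertions removed above: the tests derived partition counts from the query's physical plan. Below is a hypothetical helper in the same spirit as the tests' count_query_csv_partitions, assuming the plan exposes its partitioning through ExecutionPlan::properties():

use datafusion::error::Result;
use datafusion::prelude::SessionContext;

/// Count the output partitions a query is planned with
/// (hypothetical stand-in for the tests' count_query_csv_partitions).
async fn count_query_partitions(ctx: &SessionContext, query: &str) -> Result<usize> {
    let df = ctx.sql(query).await?;
    let plan = df.create_physical_plan().await?;
    // Partitioning is part of the plan's cached properties.
    Ok(plan.properties().output_partitioning().partition_count())
}

When every input file is empty, the scan collapses to a single partition, which is what the removed assert_eq!(1, actual_partitions) checks verified.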

datafusion/core/src/datasource/file_format/json.rs

+2 −6
@@ -619,13 +619,11 @@ mod tests {
         Ok(())
     }
 
-    #[rstest(n_partitions, case(1), case(2), case(3), case(4))]
     #[tokio::test]
-    async fn it_can_read_empty_ndjson_in_parallel(n_partitions: usize) -> Result<()> {
+    async fn it_can_read_empty_ndjson() -> Result<()> {
         let config = SessionConfig::new()
             .with_repartition_file_scans(true)
-            .with_repartition_file_min_size(0)
-            .with_target_partitions(n_partitions);
+            .with_repartition_file_min_size(0);
 
         let ctx = SessionContext::new_with_config(config);
 
@@ -638,7 +636,6 @@ mod tests {
         let query = "SELECT * FROM json_parallel_empty WHERE random() > 0.5;";
 
         let result = ctx.sql(query).await?.collect().await?;
-        let actual_partitions = count_num_partitions(&ctx, query).await?;
 
         #[rustfmt::skip]
         let expected = [
@@ -647,7 +644,6 @@ mod tests {
         ];
 
         assert_batches_eq!(expected, &result);
-        assert_eq!(1, actual_partitions);
 
         Ok(())
     }

datafusion/core/src/datasource/file_format/parquet.rs

+55 −2
@@ -1312,7 +1312,7 @@ mod tests {
 
     use crate::datasource::file_format::parquet::test_util::store_parquet;
     use crate::physical_plan::metrics::MetricValue;
-    use crate::prelude::{SessionConfig, SessionContext};
+    use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
     use arrow::array::{Array, ArrayRef, StringArray};
     use arrow_array::types::Int32Type;
     use arrow_array::{DictionaryArray, Int32Array, Int64Array};
@@ -1323,8 +1323,8 @@ mod tests {
         as_float64_array, as_int32_array, as_timestamp_nanosecond_array,
     };
     use datafusion_common::config::ParquetOptions;
-    use datafusion_common::ScalarValue;
     use datafusion_common::ScalarValue::Utf8;
+    use datafusion_common::{assert_batches_eq, ScalarValue};
     use datafusion_execution::object_store::ObjectStoreUrl;
     use datafusion_execution::runtime_env::RuntimeEnv;
     use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
@@ -2251,6 +2251,59 @@ mod tests {
         scan_format(state, &*format, &testdata, file_name, projection, limit).await
     }
 
+    /// Test that 0-byte files don't break while reading
+    #[tokio::test]
+    async fn test_read_empty_parquet() -> Result<()> {
+        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let path = format!("{}/empty.parquet", tmp_dir.path().to_string_lossy());
+        File::create(&path).await?;
+
+        let ctx = SessionContext::new();
+
+        let df = ctx
+            .read_parquet(&path, ParquetReadOptions::default())
+            .await
+            .expect("read_parquet should succeed");
+
+        let result = df.collect().await?;
+        #[rustfmt::skip]
+        let expected = ["++",
+                        "++"];
+        assert_batches_eq!(expected, &result);
+
+        Ok(())
+    }
+
+    /// Test that 0-byte files don't break while reading
+    #[tokio::test]
+    async fn test_read_partitioned_empty_parquet() -> Result<()> {
+        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let partition_dir = tmp_dir.path().join("col1=a");
+        std::fs::create_dir(&partition_dir).unwrap();
+        File::create(partition_dir.join("empty.parquet"))
+            .await
+            .unwrap();
+
+        let ctx = SessionContext::new();
+
+        let df = ctx
+            .read_parquet(
+                tmp_dir.path().to_str().unwrap(),
+                ParquetReadOptions::new()
+                    .table_partition_cols(vec![("col1".to_string(), DataType::Utf8)]),
+            )
+            .await
+            .expect("read_parquet should succeed");
+
+        let result = df.collect().await?;
+        #[rustfmt::skip]
+        let expected = ["++",
+                        "++"];
+        assert_batches_eq!(expected, &result);
+
+        Ok(())
+    }
+
     fn build_ctx(store_url: &url::Url) -> Arc<TaskContext> {
         let tmp_dir = tempfile::TempDir::new().unwrap();
         let local = Arc::new(

datafusion/core/src/datasource/listing/helpers.rs

+111 −1
@@ -171,7 +171,13 @@ impl Partition {
         trace!("Listing partition {}", self.path);
         let prefix = Some(&self.path).filter(|p| !p.as_ref().is_empty());
         let result = store.list_with_delimiter(prefix).await?;
-        self.files = Some(result.objects);
+        self.files = Some(
+            result
+                .objects
+                .into_iter()
+                .filter(|object_meta| object_meta.size > 0)
+                .collect(),
+        );
         Ok((self, result.common_prefixes))
     }
 }
@@ -418,6 +424,7 @@ pub async fn pruned_partition_list<'a>(
             table_path
                 .list_all_files(ctx, store, file_extension)
                 .await?
+                .try_filter(|object_meta| futures::future::ready(object_meta.size > 0))
                 .map_ok(|object_meta| object_meta.into()),
        ));
    }
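
The try_filter/future::ready pairing added above is the stock futures idiom for applying a synchronous predicate to a fallible stream: Ok items are kept or dropped by the predicate, while errors pass through unchanged. A self-contained sketch with a stand-in stream (plain sizes rather than the real ObjectMeta items):

use futures::{stream, TryStreamExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Stand-in for the object store listing; any item could also be
    // an error from the store, which try_filter would propagate.
    let listing = stream::iter(vec![
        Ok::<u64, std::io::Error>(100),
        Ok(0), // a 0-byte file, to be dropped
        Ok(42),
    ]);

    // future::ready lifts the synchronous size check into the
    // immediately-ready future that try_filter expects.
    let non_empty: Vec<u64> = listing
        .try_filter(|size| futures::future::ready(*size > 0))
        .try_collect()
        .await?;

    assert_eq!(non_empty, vec![100, 42]);
    Ok(())
}

The same predicate appears twice in this commit: eagerly over the collected objects in Partition::list above, and lazily via try_filter over the listing stream in pruned_partition_list.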
@@ -566,6 +573,7 @@ mod tests {
     async fn test_pruned_partition_list_empty() {
         let (store, state) = make_test_store_and_state(&[
             ("tablepath/mypartition=val1/notparquetfile", 100),
+            ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
             ("tablepath/file.parquet", 100),
         ]);
         let filter = Expr::eq(col("mypartition"), lit("val1"));
@@ -590,6 +598,7 @@ mod tests {
         let (store, state) = make_test_store_and_state(&[
             ("tablepath/mypartition=val1/file.parquet", 100),
             ("tablepath/mypartition=val2/file.parquet", 100),
+            ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
             ("tablepath/mypartition=val1/other=val3/file.parquet", 100),
         ]);
         let filter = Expr::eq(col("mypartition"), lit("val1"));
@@ -671,6 +680,107 @@ mod tests {
         );
     }
 
+    /// Describe a partition as a (path, depth, files) tuple for easier assertions
+    fn describe_partition(partition: &Partition) -> (&str, usize, Vec<&str>) {
+        (
+            partition.path.as_ref(),
+            partition.depth,
+            partition
+                .files
+                .as_ref()
+                .map(|f| f.iter().map(|f| f.location.filename().unwrap()).collect())
+                .unwrap_or_default(),
+        )
+    }
+
+    #[tokio::test]
+    async fn test_list_partition() {
+        let (store, _) = make_test_store_and_state(&[
+            ("tablepath/part1=p1v1/file.parquet", 100),
+            ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100),
+            ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100),
+            ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100),
+            ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100),
+            ("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0),
+        ]);
+
+        let partitions = list_partitions(
+            store.as_ref(),
+            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+            0,
+            None,
+        )
+        .await
+        .expect("listing partitions failed");
+
+        assert_eq!(
+            &partitions
+                .iter()
+                .map(describe_partition)
+                .collect::<Vec<_>>(),
+            &vec![
+                ("tablepath", 0, vec![]),
+                ("tablepath/part1=p1v1", 1, vec![]),
+                ("tablepath/part1=p1v2", 1, vec![]),
+                ("tablepath/part1=p1v3", 1, vec![]),
+            ]
+        );
+
+        let partitions = list_partitions(
+            store.as_ref(),
+            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+            1,
+            None,
+        )
+        .await
+        .expect("listing partitions failed");
+
+        assert_eq!(
+            &partitions
+                .iter()
+                .map(describe_partition)
+                .collect::<Vec<_>>(),
+            &vec![
+                ("tablepath", 0, vec![]),
+                ("tablepath/part1=p1v1", 1, vec!["file.parquet"]),
+                ("tablepath/part1=p1v2", 1, vec![]),
+                ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]),
+                ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]),
+                ("tablepath/part1=p1v3", 1, vec![]),
+                ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]),
+            ]
+        );
+
+        let partitions = list_partitions(
+            store.as_ref(),
+            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+            2,
+            None,
+        )
+        .await
+        .expect("listing partitions failed");
+
+        assert_eq!(
+            &partitions
+                .iter()
+                .map(describe_partition)
+                .collect::<Vec<_>>(),
+            &vec![
+                ("tablepath", 0, vec![]),
+                ("tablepath/part1=p1v1", 1, vec!["file.parquet"]),
+                ("tablepath/part1=p1v2", 1, vec![]),
+                ("tablepath/part1=p1v3", 1, vec![]),
+                (
+                    "tablepath/part1=p1v2/part2=p2v1",
+                    2,
+                    vec!["file1.parquet", "file2.parquet"]
+                ),
+                ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]),
+                ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]),
+            ]
+        );
+    }
+
     #[test]
     fn test_parse_partitions_for_path() {
         assert_eq!(

datafusion/core/src/datasource/listing/table.rs

+2 −0
@@ -470,6 +470,8 @@ impl ListingOptions {
         let files: Vec<_> = table_path
             .list_all_files(state, store.as_ref(), &self.file_extension)
             .await?
+            // Empty files cannot affect schema but may throw when trying to read for it
+            .try_filter(|object_meta| future::ready(object_meta.size > 0))
             .try_collect()
             .await?;
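
Since this hunk guards schema inference, here is a rough sketch of the path it protects, assuming the surrounding ListingOptions::infer_schema API and a hypothetical directory holding one populated CSV next to a 0-byte one:

use std::sync::Arc;

use datafusion::datasource::file_format::csv::CsvFormat;
use datafusion::datasource::listing::{ListingOptions, ListingTableUrl};
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    // Assumed layout: /tmp/table/data.csv has rows, /tmp/table/empty.csv is 0 bytes.
    let ctx = SessionContext::new();
    let state = ctx.state();

    let options = ListingOptions::new(Arc::new(CsvFormat::default()));
    let table_path = ListingTableUrl::parse("file:///tmp/table/")?;

    // With the size filter above, the 0-byte file is skipped during
    // inference instead of making the CSV reader error out.
    let schema = options.infer_schema(&state, &table_path).await?;
    println!("{schema:?}");
    Ok(())
}

As the code comment notes, an empty file contributes nothing to the schema, so skipping it loses no information.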
