Skip to content

Commit 9ca09cf

Browse files
authored
Allow FileSource-specific repartitioning (#14754)
* FileSource specific repartitioning * fix doc typo * remove * Avro doesn't support repartitioning
1 parent faace2c commit 9ca09cf

File tree

7 files changed

+50
-42
lines changed

7 files changed

+50
-42
lines changed

datafusion/core/src/datasource/data_source.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ use crate::datasource::physical_plan::{FileOpener, FileScanConfig};
2626

2727
use arrow::datatypes::SchemaRef;
2828
use datafusion_common::Statistics;
29+
use datafusion_datasource::file_groups::FileGroupPartitioner;
30+
use datafusion_physical_expr::LexOrdering;
2931
use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
3032
use datafusion_physical_plan::DisplayFormatType;
3133

@@ -62,9 +64,33 @@ pub trait FileSource: Send + Sync {
6264
fn fmt_extra(&self, _t: DisplayFormatType, _f: &mut Formatter) -> fmt::Result {
6365
Ok(())
6466
}
65-
/// Return true if the file format supports repartition
67+
68+
/// If supported by the [`FileSource`], redistribute files across partitions according to their size.
69+
/// Allows custom file formats to implement their own repartitioning logic.
6670
///
67-
/// If this returns true, the DataSourceExec may repartition the data
68-
/// by breaking up the input files into multiple smaller groups.
69-
fn supports_repartition(&self, config: &FileScanConfig) -> bool;
71+
/// Provides a default repartitioning behavior; see the comments on [`FileGroupPartitioner`] for more detail.
72+
fn repartitioned(
73+
&self,
74+
target_partitions: usize,
75+
repartition_file_min_size: usize,
76+
output_ordering: Option<LexOrdering>,
77+
config: &FileScanConfig,
78+
) -> datafusion_common::Result<Option<FileScanConfig>> {
79+
if config.file_compression_type.is_compressed() || config.new_lines_in_values {
80+
return Ok(None);
81+
}
82+
83+
let repartitioned_file_groups_option = FileGroupPartitioner::new()
84+
.with_target_partitions(target_partitions)
85+
.with_repartition_file_min_size(repartition_file_min_size)
86+
.with_preserve_order_within_groups(output_ordering.is_some())
87+
.repartition_file_groups(&config.file_groups);
88+
89+
if let Some(repartitioned_file_groups) = repartitioned_file_groups_option {
90+
let mut source = config.clone();
91+
source.file_groups = repartitioned_file_groups;
92+
return Ok(Some(source));
93+
}
94+
Ok(None)
95+
}
7096
}

datafusion/core/src/datasource/physical_plan/arrow_file.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,6 @@ impl FileSource for ArrowSource {
256256
fn file_type(&self) -> &str {
257257
"arrow"
258258
}
259-
260-
fn supports_repartition(&self, config: &FileScanConfig) -> bool {
261-
!(config.file_compression_type.is_compressed() || config.new_lines_in_values)
262-
}
263259
}
264260

265261
/// The struct arrow that implements the [`FileOpener`] trait

datafusion/core/src/datasource/physical_plan/avro.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -255,10 +255,15 @@ impl FileSource for AvroSource {
255255
fn file_type(&self) -> &str {
256256
"avro"
257257
}
258-
fn supports_repartition(&self, config: &FileScanConfig) -> bool {
259-
!(config.file_compression_type.is_compressed()
260-
|| config.new_lines_in_values
261-
|| self.as_any().downcast_ref::<AvroSource>().is_some())
258+
259+
fn repartitioned(
260+
&self,
261+
_target_partitions: usize,
262+
_repartition_file_min_size: usize,
263+
_output_ordering: Option<LexOrdering>,
264+
_config: &FileScanConfig,
265+
) -> Result<Option<FileScanConfig>> {
266+
Ok(None)
262267
}
263268
}
264269

datafusion/core/src/datasource/physical_plan/csv.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -618,9 +618,6 @@ impl FileSource for CsvSource {
618618
fn fmt_extra(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
619619
write!(f, ", has_header={}", self.has_header)
620620
}
621-
fn supports_repartition(&self, config: &FileScanConfig) -> bool {
622-
!(config.file_compression_type.is_compressed() || config.new_lines_in_values)
623-
}
624621
}
625622

626623
impl FileOpener for CsvOpener {

datafusion/core/src/datasource/physical_plan/file_scan_config.rs

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
//! file sources.
2020
2121
use super::{
22-
get_projected_output_ordering, statistics::MinMaxStatistics, FileGroupPartitioner,
23-
FileGroupsDisplay, FileStream,
22+
get_projected_output_ordering, statistics::MinMaxStatistics, FileGroupsDisplay,
23+
FileStream,
2424
};
2525
use crate::datasource::file_format::file_compression_type::FileCompressionType;
2626
use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl};
@@ -203,30 +203,21 @@ impl DataSource for FileScanConfig {
203203
self.fmt_file_source(t, f)
204204
}
205205

206-
/// Redistribute files across partitions according to their size
207-
/// See comments on [`FileGroupPartitioner`] for more detail.
206+
/// If supported by the underlying [`FileSource`], redistribute files across partitions according to their size.
208207
fn repartitioned(
209208
&self,
210209
target_partitions: usize,
211210
repartition_file_min_size: usize,
212211
output_ordering: Option<LexOrdering>,
213212
) -> Result<Option<Arc<dyn DataSource>>> {
214-
if !self.source.supports_repartition(self) {
215-
return Ok(None);
216-
}
217-
218-
let repartitioned_file_groups_option = FileGroupPartitioner::new()
219-
.with_target_partitions(target_partitions)
220-
.with_repartition_file_min_size(repartition_file_min_size)
221-
.with_preserve_order_within_groups(output_ordering.is_some())
222-
.repartition_file_groups(&self.file_groups);
223-
224-
if let Some(repartitioned_file_groups) = repartitioned_file_groups_option {
225-
let mut source = self.clone();
226-
source.file_groups = repartitioned_file_groups;
227-
return Ok(Some(Arc::new(source)));
228-
}
229-
Ok(None)
213+
let source = self.source.repartitioned(
214+
target_partitions,
215+
repartition_file_min_size,
216+
output_ordering,
217+
self,
218+
)?;
219+
220+
Ok(source.map(|s| Arc::new(s) as _))
230221
}
231222

232223
fn output_partitioning(&self) -> Partitioning {

datafusion/core/src/datasource/physical_plan/json.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -313,10 +313,6 @@ impl FileSource for JsonSource {
313313
fn file_type(&self) -> &str {
314314
"json"
315315
}
316-
317-
fn supports_repartition(&self, config: &FileScanConfig) -> bool {
318-
!(config.file_compression_type.is_compressed() || config.new_lines_in_values)
319-
}
320316
}
321317

322318
impl FileOpener for JsonOpener {

datafusion/core/src/datasource/physical_plan/parquet/source.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,4 @@ impl FileSource for ParquetSource {
586586
}
587587
}
588588
}
589-
fn supports_repartition(&self, _config: &FileScanConfig) -> bool {
590-
true
591-
}
592589
}

0 commit comments

Comments
 (0)