Skip to content

Commit a76f95a

Browse files
committed
Update documentation to explain the relationship between scan/skip/selection
1 parent 8d44ed2 commit a76f95a

File tree

1 file changed

+39
-11
lines changed

1 file changed

+39
-11
lines changed

datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs

+39-11
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ use parquet::file::metadata::RowGroupMetaData;
2020

2121
/// A selection of rows and row groups within a ParquetFile to decode.
2222
///
23-
/// A `ParquetAccessPlan` is used to limits the row groups and data pages a `ParquetExec`
24-
/// will read and decode and this improve performance.
23+
/// A `ParquetAccessPlan` is used to limit the row groups and data pages a `ParquetExec`
24+
/// will read and decode to improve performance.
2525
///
2626
/// Note that page level pruning based on ArrowPredicate is applied after all of
2727
/// these selections
@@ -39,11 +39,13 @@ use parquet::file::metadata::RowGroupMetaData;
3939
/// let mut access_plan = ParquetAccessPlan::new_all(4);
4040
/// access_plan.skip(0); // skip row group 0
4141
/// // Use parquet reader RowSelector to specify scanning rows 100-200 and 350-400
42+
/// // in a row group that has 1000 rows
4243
/// let row_selection = RowSelection::from(vec![
4344
/// RowSelector::skip(100),
4445
/// RowSelector::select(100),
4546
/// RowSelector::skip(150),
4647
/// RowSelector::select(50),
48+
/// RowSelector::skip(600), // skip last 600 rows
4749
/// ]);
4850
/// access_plan.scan_selection(1, row_selection);
4951
/// access_plan.skip(2); // skip row group 2
@@ -158,7 +160,7 @@ impl ParquetAccessPlan {
158160
}
159161
}
160162

161-
/// Return the overall `RowSelection` for all scanned row groups
163+
/// Return an overall `RowSelection`, if needed
162164
///
163165
/// This is used to compute the row selection for the parquet reader. See
164166
/// [`ArrowReaderBuilder::with_row_selection`] for more details.
@@ -174,28 +176,54 @@ impl ParquetAccessPlan {
174176
///
175177
/// If there are no [`RowGroupAccess::Selection`]s, the overall row
176178
/// selection is `None` because each row group is either entirely skipped or
177-
/// scanned, as specified by [`Self::row_group_indexes`].
179+
/// scanned, which is covered by [`Self::row_group_indexes`].
178180
///
179-
/// # Example
181+
/// If there are any [`RowGroupAccess::Selection`], an overall row selection
182+
/// is returned for *all* the rows in the row groups that are not skipped.
183+
/// Thus it includes a `Select` selection for any [`RowGroupAccess::Scan`].
184+
///
185+
/// # Example: No Selections
186+
///
187+
/// Given an access plan like this
188+
///
189+
/// ```text
190+
/// RowGroupAccess::Scan (scan all row group 0)
191+
/// RowGroupAccess::Skip (skip row group 1)
192+
/// RowGroupAccess::Scan (scan all row group 2)
193+
/// RowGroupAccess::Scan (scan all row group 3)
194+
/// ```
195+
///
196+
/// The overall row selection would be `None` because there are no
197+
/// [`RowGroupAccess::Selection`]s. The row group indexes
198+
/// returned by [`Self::row_group_indexes`] would be `0, 2, 3`.
199+
///
200+
/// # Example: With Selections
180201
///
181202
/// Given an access plan like this:
182203
///
183204
/// ```text
184-
/// Scan (scan all row group 0)
185-
/// Skip (skip row group 1)
186-
/// Select 50-100 (scan rows 50-100 in row group 2)
205+
/// RowGroupAccess::Scan (scan all row group 0)
206+
/// RowGroupAccess::Skip (skip row group 1)
207+
/// RowGroupAccess::Selection (skip 50, scan 50, skip 900) (scan rows 50-100 in row group 2)
208+
/// RowGroupAccess::Scan (scan all row group 3)
187209
/// ```
188210
///
189211
/// Assuming each row group has 1000 rows, the resulting row selection would
190-
/// be the rows to scan in row group 0 and 2:
212+
/// be the rows to scan in row group 0, 2 and 3:
191213
///
192214
/// ```text
193-
/// Select 1000 (scan all rows in row group 0)
194-
/// Select 50-100 (scan rows 50-100 in row group 2)
215+
/// RowSelector::select(1000) (scan all rows in row group 0)
216+
/// RowSelector::skip(50) (skip first 50 rows in row group 2)
217+
/// RowSelector::select(50) (scan rows 50-100 in row group 2)
218+
/// RowSelector::skip(900) (skip last 900 rows in row group 2)
219+
/// RowSelector::select(1000) (scan all rows in row group 3)
195220
/// ```
196221
///
197222
/// Note there is no entry for the (entirely) skipped row group 1.
198223
///
224+
/// The row group indexes returned by [`Self::row_group_indexes`] would
225+
/// still be `0, 2, 3`.
226+
///
199227
/// [`ArrowReaderBuilder::with_row_selection`]: parquet::arrow::arrow_reader::ArrowReaderBuilder::with_row_selection
200228
pub fn into_overall_row_selection(
201229
self,

0 commit comments

Comments
 (0)