@@ -20,8 +20,8 @@ use parquet::file::metadata::RowGroupMetaData;
20
20
21
21
/// A selection of rows and row groups within a ParquetFile to decode.
22
22
///
23
- /// A `ParquetAccessPlan` is used to limits the row groups and data pages a `ParquetExec`
24
- /// will read and decode and this improve performance.
23
+ /// A `ParquetAccessPlan` is used to limit the row groups and data pages a `ParquetExec`
24
+ /// will read and decode to improve performance.
25
25
///
26
26
/// Note that page level pruning based on ArrowPredicate is applied after all of
27
27
/// these selections
@@ -39,11 +39,13 @@ use parquet::file::metadata::RowGroupMetaData;
39
39
/// let mut access_plan = ParquetAccessPlan::new_all(4);
40
40
/// access_plan.skip(0); // skip row group 0
41
41
/// // Use parquet reader RowSelector to specify scanning rows 100-200 and 350-400
42
+ /// // in a row group that has 1000 rows
42
43
/// let row_selection = RowSelection::from(vec![
43
44
/// RowSelector::skip(100),
44
45
/// RowSelector::select(100),
45
46
/// RowSelector::skip(150),
46
47
/// RowSelector::select(50),
48
+ /// RowSelector::skip(600), // skip last 600 rows
47
49
/// ]);
48
50
/// access_plan.scan_selection(1, row_selection);
49
51
/// access_plan.skip(2); // skip row group 2
@@ -158,7 +160,7 @@ impl ParquetAccessPlan {
158
160
}
159
161
}
160
162
161
- /// Return the overall `RowSelection` for all scanned row groups
163
+ /// Return an overall `RowSelection`, if needed
162
164
///
163
165
/// This is used to compute the row selection for the parquet reader. See
164
166
/// [`ArrowReaderBuilder::with_row_selection`] for more details.
@@ -174,28 +176,54 @@ impl ParquetAccessPlan {
174
176
///
175
177
/// If there are no [`RowGroupAccess::Selection`]s, the overall row
176
178
/// selection is `None` because each row group is either entirely skipped or
177
- /// scanned, as specified by [`Self::row_group_indexes`].
179
+ /// scanned, which is covered by [`Self::row_group_indexes`].
178
180
///
179
- /// # Example
181
+ /// If there are any [`RowGroupAccess::Selection`]s, an overall row selection
182
+ /// is returned for *all* the rows in the row groups that are not skipped.
183
+ /// Thus it includes a `Select` selection for any [`RowGroupAccess::Scan`].
184
+ ///
185
+ /// # Example: No Selections
186
+ ///
187
+ /// Given an access plan like this:
188
+ ///
189
+ /// ```text
190
+ /// RowGroupAccess::Scan (scan all row group 0)
191
+ /// RowGroupAccess::Skip (skip row group 1)
192
+ /// RowGroupAccess::Scan (scan all row group 2)
193
+ /// RowGroupAccess::Scan (scan all row group 3)
194
+ /// ```
195
+ ///
196
+ /// The overall row selection would be `None` because there are no
197
+ /// [`RowGroupAccess::Selection`]s. The row group indexes
198
+ /// returned by [`Self::row_group_indexes`] would be `0, 2, 3`.
199
+ ///
200
+ /// # Example: With Selections
180
201
///
181
202
/// Given an access plan like this:
182
203
///
183
204
/// ```text
184
- /// Scan (scan all row group 0)
185
- /// Skip (skip row group 1)
186
- /// Select 50-100 (scan rows 50-100 in row group 2)
205
+ /// RowGroupAccess::Scan (scan all row group 0)
206
+ /// RowGroupAccess::Skip (skip row group 1)
207
+ /// RowGroupAccess::Selection (skip 50, scan 50, skip 900) (scan rows 50-100 in row group 2)
208
+ /// RowGroupAccess::Scan (scan all row group 3)
187
209
/// ```
188
210
///
189
211
/// Assuming each row group has 1000 rows, the resulting row selection would
190
- /// be the rows to scan in row group 0 and 2 :
212
+ /// be the rows to scan in row groups 0, 2 and 3:
191
213
///
192
214
/// ```text
193
- /// Select 1000 (scan all rows in row group 0)
194
- /// Select 50-100 (scan rows 50-100 in row group 2)
215
+ /// RowSelection::Select(1000) (scan all rows in row group 0)
216
+ /// RowSelection::Skip(50) (skip first 50 rows in row group 2)
217
+ /// RowSelection::Select(50) (scan rows 50-100 in row group 2)
218
+ /// RowSelection::Skip(900) (skip last 900 rows in row group 2)
219
+ /// RowSelection::Select(1000) (scan all rows in row group 3)
195
220
/// ```
196
221
///
197
222
/// Note there is no entry for the (entirely) skipped row group 1.
198
223
///
224
+ /// The row group indexes returned by [`Self::row_group_indexes`] would
225
+ /// still be `0, 2, 3`.
226
+ ///
199
227
/// [`ArrowReaderBuilder::with_row_selection`]: parquet::arrow::arrow_reader::ArrowReaderBuilder::with_row_selection
200
228
pub fn into_overall_row_selection (
201
229
self ,
0 commit comments