@@ -55,6 +55,7 @@ use super::ParquetFileMetrics;
55
55
/// Note: This method currently ignores ColumnOrder
56
56
/// <https://github.com/apache/arrow-datafusion/issues/8335>
57
57
pub ( crate ) fn prune_row_groups_by_statistics (
58
+ arrow_schema : & Schema ,
58
59
parquet_schema : & SchemaDescriptor ,
59
60
groups : & [ RowGroupMetaData ] ,
60
61
range : Option < FileRange > ,
@@ -80,7 +81,7 @@ pub(crate) fn prune_row_groups_by_statistics(
80
81
let pruning_stats = RowGroupPruningStatistics {
81
82
parquet_schema,
82
83
row_group_metadata : metadata,
83
- arrow_schema : predicate . schema ( ) . as_ref ( ) ,
84
+ arrow_schema,
84
85
} ;
85
86
match predicate. prune ( & pruning_stats) {
86
87
Ok ( values) => {
@@ -415,11 +416,11 @@ mod tests {
415
416
fn row_group_pruning_predicate_simple_expr ( ) {
416
417
use datafusion_expr:: { col, lit} ;
417
418
// int > 1 => c1_max > 1
418
- let schema = Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ;
419
+ let schema =
420
+ Arc :: new ( Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ) ;
419
421
let expr = col ( "c1" ) . gt ( lit ( 15 ) ) ;
420
422
let expr = logical2physical ( & expr, & schema) ;
421
- let pruning_predicate =
422
- PruningPredicate :: try_new ( expr, Arc :: new ( schema) ) . unwrap ( ) ;
423
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
423
424
424
425
let field = PrimitiveTypeField :: new ( "c1" , PhysicalType :: INT32 ) ;
425
426
let schema_descr = get_test_schema_descr ( vec ! [ field] ) ;
@@ -435,6 +436,7 @@ mod tests {
435
436
let metrics = parquet_file_metrics ( ) ;
436
437
assert_eq ! (
437
438
prune_row_groups_by_statistics(
439
+ & schema,
438
440
& schema_descr,
439
441
& [ rgm1, rgm2] ,
440
442
None ,
@@ -449,11 +451,11 @@ mod tests {
449
451
fn row_group_pruning_predicate_missing_stats ( ) {
450
452
use datafusion_expr:: { col, lit} ;
451
453
// int > 1 => c1_max > 1
452
- let schema = Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ;
454
+ let schema =
455
+ Arc :: new ( Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ) ;
453
456
let expr = col ( "c1" ) . gt ( lit ( 15 ) ) ;
454
457
let expr = logical2physical ( & expr, & schema) ;
455
- let pruning_predicate =
456
- PruningPredicate :: try_new ( expr, Arc :: new ( schema) ) . unwrap ( ) ;
458
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
457
459
458
460
let field = PrimitiveTypeField :: new ( "c1" , PhysicalType :: INT32 ) ;
459
461
let schema_descr = get_test_schema_descr ( vec ! [ field] ) ;
@@ -470,6 +472,7 @@ mod tests {
470
472
// is null / undefined so the first row group can't be filtered out
471
473
assert_eq ! (
472
474
prune_row_groups_by_statistics(
475
+ & schema,
473
476
& schema_descr,
474
477
& [ rgm1, rgm2] ,
475
478
None ,
@@ -518,6 +521,7 @@ mod tests {
518
521
// when conditions are joined using AND
519
522
assert_eq ! (
520
523
prune_row_groups_by_statistics(
524
+ & schema,
521
525
& schema_descr,
522
526
groups,
523
527
None ,
@@ -531,12 +535,13 @@ mod tests {
531
535
// this bypasses the entire predicate expression and no row groups are filtered out
532
536
let expr = col ( "c1" ) . gt ( lit ( 15 ) ) . or ( col ( "c2" ) . rem ( lit ( 2 ) ) . eq ( lit ( 0 ) ) ) ;
533
537
let expr = logical2physical ( & expr, & schema) ;
534
- let pruning_predicate = PruningPredicate :: try_new ( expr, schema) . unwrap ( ) ;
538
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
535
539
536
540
// if conditions in predicate are joined with OR and an unsupported expression is used
537
541
// this bypasses the entire predicate expression and no row groups are filtered out
538
542
assert_eq ! (
539
543
prune_row_groups_by_statistics(
544
+ & schema,
540
545
& schema_descr,
541
546
groups,
542
547
None ,
@@ -547,6 +552,64 @@ mod tests {
547
552
) ;
548
553
}
549
554
555
+ #[ test]
556
+ fn row_group_pruning_predicate_file_schema ( ) {
557
+ use datafusion_expr:: { col, lit} ;
558
+ // test row group predicate when file schema is different than table schema
559
+ // c1 > 0
560
+ let table_schema = Arc :: new ( Schema :: new ( vec ! [
561
+ Field :: new( "c1" , DataType :: Int32 , false ) ,
562
+ Field :: new( "c2" , DataType :: Int32 , false ) ,
563
+ ] ) ) ;
564
+ let expr = col ( "c1" ) . gt ( lit ( 0 ) ) ;
565
+ let expr = logical2physical ( & expr, & table_schema) ;
566
+ let pruning_predicate =
567
+ PruningPredicate :: try_new ( expr, table_schema. clone ( ) ) . unwrap ( ) ;
568
+
569
+ // Model a file schema's column order c2 then c1, which is the opposite
570
+ // of the table schema
571
+ let file_schema = Arc :: new ( Schema :: new ( vec ! [
572
+ Field :: new( "c2" , DataType :: Int32 , false ) ,
573
+ Field :: new( "c1" , DataType :: Int32 , false ) ,
574
+ ] ) ) ;
575
+ let schema_descr = get_test_schema_descr ( vec ! [
576
+ PrimitiveTypeField :: new( "c2" , PhysicalType :: INT32 ) ,
577
+ PrimitiveTypeField :: new( "c1" , PhysicalType :: INT32 ) ,
578
+ ] ) ;
579
+ // rg1 has c2 less than zero, c1 greater than zero
580
+ let rgm1 = get_row_group_meta_data (
581
+ & schema_descr,
582
+ vec ! [
583
+ ParquetStatistics :: int32( Some ( -10 ) , Some ( -1 ) , None , 0 , false ) , // c2
584
+ ParquetStatistics :: int32( Some ( 1 ) , Some ( 10 ) , None , 0 , false ) ,
585
+ ] ,
586
+ ) ;
587
+ // rg1 has c2 greater than zero, c1 less than zero
588
+ let rgm2 = get_row_group_meta_data (
589
+ & schema_descr,
590
+ vec ! [
591
+ ParquetStatistics :: int32( Some ( 1 ) , Some ( 10 ) , None , 0 , false ) ,
592
+ ParquetStatistics :: int32( Some ( -10 ) , Some ( -1 ) , None , 0 , false ) ,
593
+ ] ,
594
+ ) ;
595
+
596
+ let metrics = parquet_file_metrics ( ) ;
597
+ let groups = & [ rgm1, rgm2] ;
598
+ // the first row group should be left because c1 is greater than zero
599
+ // the second should be filtered out because c1 is less than zero
600
+ assert_eq ! (
601
+ prune_row_groups_by_statistics(
602
+ & file_schema, // NB must be file schema, not table_schema
603
+ & schema_descr,
604
+ groups,
605
+ None ,
606
+ Some ( & pruning_predicate) ,
607
+ & metrics
608
+ ) ,
609
+ vec![ 0 ]
610
+ ) ;
611
+ }
612
+
550
613
fn gen_row_group_meta_data_for_pruning_predicate ( ) -> Vec < RowGroupMetaData > {
551
614
let schema_descr = get_test_schema_descr ( vec ! [
552
615
PrimitiveTypeField :: new( "c1" , PhysicalType :: INT32 ) ,
@@ -580,13 +643,14 @@ mod tests {
580
643
let schema_descr = arrow_to_parquet_schema ( & schema) . unwrap ( ) ;
581
644
let expr = col ( "c1" ) . gt ( lit ( 15 ) ) . and ( col ( "c2" ) . is_null ( ) ) ;
582
645
let expr = logical2physical ( & expr, & schema) ;
583
- let pruning_predicate = PruningPredicate :: try_new ( expr, schema) . unwrap ( ) ;
646
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
584
647
let groups = gen_row_group_meta_data_for_pruning_predicate ( ) ;
585
648
586
649
let metrics = parquet_file_metrics ( ) ;
587
650
// First row group was filtered out because it contains no null value on "c2".
588
651
assert_eq ! (
589
652
prune_row_groups_by_statistics(
653
+ & schema,
590
654
& schema_descr,
591
655
& groups,
592
656
None ,
@@ -612,14 +676,15 @@ mod tests {
612
676
. gt ( lit ( 15 ) )
613
677
. and ( col ( "c2" ) . eq ( lit ( ScalarValue :: Boolean ( None ) ) ) ) ;
614
678
let expr = logical2physical ( & expr, & schema) ;
615
- let pruning_predicate = PruningPredicate :: try_new ( expr, schema) . unwrap ( ) ;
679
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
616
680
let groups = gen_row_group_meta_data_for_pruning_predicate ( ) ;
617
681
618
682
let metrics = parquet_file_metrics ( ) ;
619
683
// bool = NULL always evaluates to NULL (and thus will not
620
684
// pass predicates. Ideally these should both be false
621
685
assert_eq ! (
622
686
prune_row_groups_by_statistics(
687
+ & schema,
623
688
& schema_descr,
624
689
& groups,
625
690
None ,
@@ -638,8 +703,11 @@ mod tests {
638
703
639
704
// INT32: c1 > 5, the c1 is decimal(9,2)
640
705
// The type of scalar value if decimal(9,2), don't need to do cast
641
- let schema =
642
- Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Decimal128 ( 9 , 2 ) , false ) ] ) ;
706
+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
707
+ "c1" ,
708
+ DataType :: Decimal128 ( 9 , 2 ) ,
709
+ false ,
710
+ ) ] ) ) ;
643
711
let field = PrimitiveTypeField :: new ( "c1" , PhysicalType :: INT32 )
644
712
. with_logical_type ( LogicalType :: Decimal {
645
713
scale : 2 ,
@@ -650,8 +718,7 @@ mod tests {
650
718
let schema_descr = get_test_schema_descr ( vec ! [ field] ) ;
651
719
let expr = col ( "c1" ) . gt ( lit ( ScalarValue :: Decimal128 ( Some ( 500 ) , 9 , 2 ) ) ) ;
652
720
let expr = logical2physical ( & expr, & schema) ;
653
- let pruning_predicate =
654
- PruningPredicate :: try_new ( expr, Arc :: new ( schema) ) . unwrap ( ) ;
721
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
655
722
let rgm1 = get_row_group_meta_data (
656
723
& schema_descr,
657
724
// [1.00, 6.00]
@@ -679,6 +746,7 @@ mod tests {
679
746
let metrics = parquet_file_metrics ( ) ;
680
747
assert_eq ! (
681
748
prune_row_groups_by_statistics(
749
+ & schema,
682
750
& schema_descr,
683
751
& [ rgm1, rgm2, rgm3] ,
684
752
None ,
@@ -692,8 +760,11 @@ mod tests {
692
760
// The c1 type is decimal(9,0) in the parquet file, and the type of scalar is decimal(5,2).
693
761
// We should convert all type to the coercion type, which is decimal(11,2)
694
762
// The decimal of arrow is decimal(5,2), the decimal of parquet is decimal(9,0)
695
- let schema =
696
- Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Decimal128 ( 9 , 0 ) , false ) ] ) ;
763
+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
764
+ "c1" ,
765
+ DataType :: Decimal128 ( 9 , 0 ) ,
766
+ false ,
767
+ ) ] ) ) ;
697
768
698
769
let field = PrimitiveTypeField :: new ( "c1" , PhysicalType :: INT32 )
699
770
. with_logical_type ( LogicalType :: Decimal {
@@ -708,8 +779,7 @@ mod tests {
708
779
Decimal128 ( 11 , 2 ) ,
709
780
) ) ;
710
781
let expr = logical2physical ( & expr, & schema) ;
711
- let pruning_predicate =
712
- PruningPredicate :: try_new ( expr, Arc :: new ( schema) ) . unwrap ( ) ;
782
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
713
783
let rgm1 = get_row_group_meta_data (
714
784
& schema_descr,
715
785
// [100, 600]
@@ -743,6 +813,7 @@ mod tests {
743
813
let metrics = parquet_file_metrics ( ) ;
744
814
assert_eq ! (
745
815
prune_row_groups_by_statistics(
816
+ & schema,
746
817
& schema_descr,
747
818
& [ rgm1, rgm2, rgm3, rgm4] ,
748
819
None ,
@@ -753,8 +824,11 @@ mod tests {
753
824
) ;
754
825
755
826
// INT64: c1 < 5, the c1 is decimal(18,2)
756
- let schema =
757
- Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Decimal128 ( 18 , 2 ) , false ) ] ) ;
827
+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
828
+ "c1" ,
829
+ DataType :: Decimal128 ( 18 , 2 ) ,
830
+ false ,
831
+ ) ] ) ) ;
758
832
let field = PrimitiveTypeField :: new ( "c1" , PhysicalType :: INT64 )
759
833
. with_logical_type ( LogicalType :: Decimal {
760
834
scale : 2 ,
@@ -765,8 +839,7 @@ mod tests {
765
839
let schema_descr = get_test_schema_descr ( vec ! [ field] ) ;
766
840
let expr = col ( "c1" ) . lt ( lit ( ScalarValue :: Decimal128 ( Some ( 500 ) , 18 , 2 ) ) ) ;
767
841
let expr = logical2physical ( & expr, & schema) ;
768
- let pruning_predicate =
769
- PruningPredicate :: try_new ( expr, Arc :: new ( schema) ) . unwrap ( ) ;
842
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
770
843
let rgm1 = get_row_group_meta_data (
771
844
& schema_descr,
772
845
// [6.00, 8.00]
@@ -791,6 +864,7 @@ mod tests {
791
864
let metrics = parquet_file_metrics ( ) ;
792
865
assert_eq ! (
793
866
prune_row_groups_by_statistics(
867
+ & schema,
794
868
& schema_descr,
795
869
& [ rgm1, rgm2, rgm3] ,
796
870
None ,
@@ -802,8 +876,11 @@ mod tests {
802
876
803
877
// FIXED_LENGTH_BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
804
878
// the type of parquet is decimal(18,2)
805
- let schema =
806
- Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Decimal128 ( 18 , 2 ) , false ) ] ) ;
879
+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
880
+ "c1" ,
881
+ DataType :: Decimal128 ( 18 , 2 ) ,
882
+ false ,
883
+ ) ] ) ) ;
807
884
let field = PrimitiveTypeField :: new ( "c1" , PhysicalType :: FIXED_LEN_BYTE_ARRAY )
808
885
. with_logical_type ( LogicalType :: Decimal {
809
886
scale : 2 ,
@@ -817,8 +894,7 @@ mod tests {
817
894
let left = cast ( col ( "c1" ) , DataType :: Decimal128 ( 28 , 3 ) ) ;
818
895
let expr = left. eq ( lit ( ScalarValue :: Decimal128 ( Some ( 100000 ) , 28 , 3 ) ) ) ;
819
896
let expr = logical2physical ( & expr, & schema) ;
820
- let pruning_predicate =
821
- PruningPredicate :: try_new ( expr, Arc :: new ( schema) ) . unwrap ( ) ;
897
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
822
898
// we must use the big-endian when encode the i128 to bytes or vec[u8].
823
899
let rgm1 = get_row_group_meta_data (
824
900
& schema_descr,
@@ -862,6 +938,7 @@ mod tests {
862
938
let metrics = parquet_file_metrics ( ) ;
863
939
assert_eq ! (
864
940
prune_row_groups_by_statistics(
941
+ & schema,
865
942
& schema_descr,
866
943
& [ rgm1, rgm2, rgm3] ,
867
944
None ,
@@ -873,8 +950,11 @@ mod tests {
873
950
874
951
// BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
875
952
// the type of parquet is decimal(18,2)
876
- let schema =
877
- Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Decimal128 ( 18 , 2 ) , false ) ] ) ;
953
+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
954
+ "c1" ,
955
+ DataType :: Decimal128 ( 18 , 2 ) ,
956
+ false ,
957
+ ) ] ) ) ;
878
958
let field = PrimitiveTypeField :: new ( "c1" , PhysicalType :: BYTE_ARRAY )
879
959
. with_logical_type ( LogicalType :: Decimal {
880
960
scale : 2 ,
@@ -888,8 +968,7 @@ mod tests {
888
968
let left = cast ( col ( "c1" ) , DataType :: Decimal128 ( 28 , 3 ) ) ;
889
969
let expr = left. eq ( lit ( ScalarValue :: Decimal128 ( Some ( 100000 ) , 28 , 3 ) ) ) ;
890
970
let expr = logical2physical ( & expr, & schema) ;
891
- let pruning_predicate =
892
- PruningPredicate :: try_new ( expr, Arc :: new ( schema) ) . unwrap ( ) ;
971
+ let pruning_predicate = PruningPredicate :: try_new ( expr, schema. clone ( ) ) . unwrap ( ) ;
893
972
// we must use the big-endian when encode the i128 to bytes or vec[u8].
894
973
let rgm1 = get_row_group_meta_data (
895
974
& schema_descr,
@@ -922,6 +1001,7 @@ mod tests {
922
1001
let metrics = parquet_file_metrics ( ) ;
923
1002
assert_eq ! (
924
1003
prune_row_groups_by_statistics(
1004
+ & schema,
925
1005
& schema_descr,
926
1006
& [ rgm1, rgm2, rgm3] ,
927
1007
None ,
0 commit comments