@@ -2425,89 +2425,179 @@ mod tests {
2425
2425
2426
2426
fn test_invalid_utf8_string_array_inner < O : OffsetSizeTrait > ( ) {
2427
2427
let cases = [
2428
- (
2429
- invalid_utf8_first_char :: < O > ( ) ,
2430
- "Parquet argument error: Parquet error: encountered non UTF-8 data" ,
2431
- ) ,
2432
- (
2433
- invalid_utf8_later_char :: < O > ( ) ,
2434
- "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6" ,
2435
- ) ,
2428
+ invalid_utf8_first_char :: < O > ( ) ,
2429
+ invalid_utf8_first_char_long_strings :: < O > ( ) ,
2430
+ invalid_utf8_later_char :: < O > ( ) ,
2431
+ invalid_utf8_later_char_long_strings :: < O > ( ) ,
2432
+ invalid_utf8_later_char_really_long_strings :: < O > ( ) ,
2433
+ invalid_utf8_later_char_really_long_strings2 :: < O > ( ) ,
2436
2434
] ;
2437
- for ( array, expected_error) in cases {
2438
- // data is not valid utf8 we can not construct a correct StringArray
2439
- // safely, so purposely create an invalid StringArray
2440
- let array = unsafe {
2441
- GenericStringArray :: < O > :: new_unchecked (
2442
- array. offsets ( ) . clone ( ) ,
2443
- array. values ( ) . clone ( ) ,
2444
- array. nulls ( ) . cloned ( ) ,
2445
- )
2446
- } ;
2447
- let data_type = array. data_type ( ) . clone ( ) ;
2448
- let data = write_to_parquet ( Arc :: new ( array) ) ;
2449
- let err = read_from_parquet ( data) . unwrap_err ( ) ;
2450
- assert_eq ! ( err. to_string( ) , expected_error, "data type: {data_type:?}" )
2435
+ for array in & cases {
2436
+ for encoding in STRING_ENCODINGS {
2437
+ // data is not valid utf8 we can not construct a correct StringArray
2438
+ // safely, so purposely create an invalid StringArray
2439
+ let array = unsafe {
2440
+ GenericStringArray :: < O > :: new_unchecked (
2441
+ array. offsets ( ) . clone ( ) ,
2442
+ array. values ( ) . clone ( ) ,
2443
+ array. nulls ( ) . cloned ( ) ,
2444
+ )
2445
+ } ;
2446
+ let data_type = array. data_type ( ) . clone ( ) ;
2447
+ let data = write_to_parquet_with_encoding ( Arc :: new ( array) , * encoding) ;
2448
+ let err = read_from_parquet ( data) . unwrap_err ( ) ;
2449
+ let expected_err =
2450
+ "Parquet argument error: Parquet error: encountered non UTF-8 data" ;
2451
+ assert ! (
2452
+ err. to_string( ) . contains( expected_err) ,
2453
+ "data type: {data_type:?}, expected: {expected_err}, got: {err}"
2454
+ ) ;
2455
+ }
2451
2456
}
2452
2457
}
2453
2458
2454
2459
/// Writes every invalid UTF-8 case as a `StringViewArray` using each entry of
/// `STRING_ENCODINGS` and checks that reading the file back fails with the
/// "encountered non UTF-8 data" error.
///
/// Only the common prefix of the error is checked (via `contains`), so any
/// extra detail appended after it is accepted.
#[test]
fn test_invalid_utf8_string_view_array() {
    let expected_err = "Parquet argument error: Parquet error: encountered non UTF-8 data";
    let cases = [
        invalid_utf8_first_char::<i32>(),
        invalid_utf8_first_char_long_strings::<i32>(),
        invalid_utf8_later_char::<i32>(),
        invalid_utf8_later_char_long_strings::<i32>(),
        invalid_utf8_later_char_really_long_strings::<i32>(),
        invalid_utf8_later_char_really_long_strings2::<i32>(),
    ];

    for binary in &cases {
        // The cast does not depend on the encoding, so do it once per case.
        let binary_view = arrow_cast::cast(binary, &ArrowDataType::BinaryView).unwrap();
        let binary_view = binary_view.as_binary_view();

        for encoding in STRING_ENCODINGS {
            // The data is not valid UTF-8, so a correct StringViewArray cannot
            // be constructed safely; purposely create an invalid one instead.
            //
            // NOTE: this deliberately breaks the UTF-8 invariant expected by
            // `new_unchecked`. Views, buffers and nulls are taken unchanged
            // from the binary view array, and the result is only handed to
            // the parquet writer below.
            let array = unsafe {
                StringViewArray::new_unchecked(
                    binary_view.views().clone(),
                    binary_view.data_buffers().to_vec(),
                    binary_view.nulls().cloned(),
                )
            };

            let data_type = array.data_type().clone();
            let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
            let err = read_from_parquet(data).unwrap_err();
            // Include the encoding so a failure identifies the exact combination.
            assert!(
                err.to_string().contains(expected_err),
                "data type: {data_type:?}, encoding: {encoding:?}, expected: {expected_err}, got: {err}"
            );
        }
    }
}
2485
2497
2498
+ /// Encodings the invalid UTF-8 tests write string data with; `None` is passed
+ /// through to `write_to_parquet_with_encoding` as "no encoding specified"
2499
+ const STRING_ENCODINGS : & [ Option < Encoding > ] = & [
2500
+ None ,
2501
+ Some ( Encoding :: PLAIN ) ,
2502
+ Some ( Encoding :: DELTA_LENGTH_BYTE_ARRAY ) ,
2503
+ Some ( Encoding :: DELTA_BYTE_ARRAY ) ,
2504
+ ] ;
2505
+
2506
+ /// Byte string whose invalid UTF-8 sequence starts at the first byte
2507
+ /// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
2508
+ const INVALID_UTF8_FIRST_CHAR : & [ u8 ] = & [ 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2509
+
2510
+ /// Byte string whose invalid UTF-8 sequence does NOT start at the first byte
2511
+ /// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
2512
+ const INVALID_UTF8_LATER_CHAR : & [ u8 ] = & [ 0x20 , 0x20 , 0x20 , 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2513
+
2486
2514
/// returns a BinaryArray with invalid UTF8 data in the first character
2487
2515
fn invalid_utf8_first_char < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2488
- // invalid sequence in the first character
2489
- // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
2490
2516
let valid: & [ u8 ] = b" " ;
2491
- let invalid: & [ u8 ] = & [ 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2517
+ let invalid = INVALID_UTF8_FIRST_CHAR ;
2492
2518
GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( invalid) ] )
2493
2519
}
2494
2520
2521
+ /// Returns a BinaryArray with invalid UTF8 data in the first character of a
2522
+ /// string larger than 12 bytes which is handled specially when reading
2523
+ /// `ByteViewArray`s
2524
+ fn invalid_utf8_first_char_long_strings < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2525
+ let valid: & [ u8 ] = b" " ;
2526
+ let mut invalid = vec ! [ ] ;
2527
+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2528
+ invalid. extend_from_slice ( INVALID_UTF8_FIRST_CHAR ) ;
2529
+ GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] )
2530
+ }
2531
+
2495
2532
/// returns a BinaryArray with invalid UTF8 data in a character other than
2496
2533
/// the first (this is checked in a special codepath)
2497
2534
fn invalid_utf8_later_char < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2498
- // invalid sequence in NOT the first character
2499
- // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
2500
2535
let valid: & [ u8 ] = b" " ;
2501
- let invalid: & [ u8 ] = & [ 0x20 , 0x20 , 0x20 , 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2536
+ let invalid: & [ u8 ] = INVALID_UTF8_LATER_CHAR ;
2502
2537
GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( invalid) ] )
2503
2538
}
2504
2539
2505
- // writes the array into a single column parquet file
2506
- fn write_to_parquet ( array : ArrayRef ) -> Vec < u8 > {
2540
+ /// returns a BinaryArray with invalid UTF8 data in a character other than
2541
+ /// the first in a string larger than 12 bytes which is handled specially
2542
+ /// when reading `ByteViewArray`s (this is checked in a special codepath)
2543
+ fn invalid_utf8_later_char_long_strings < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2544
+ let valid: & [ u8 ] = b" " ;
2545
+ let mut invalid = vec ! [ ] ;
2546
+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2547
+ invalid. extend_from_slice ( INVALID_UTF8_LATER_CHAR ) ;
2548
+ GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] )
2549
+ }
2550
+
2551
+ /// returns a BinaryArray with invalid UTF8 data in a character other than
2552
+ /// the first in a string larger than 128 bytes which is handled specially
2553
+ /// when reading `ByteViewArray`s (this is checked in a special codepath)
2554
+ fn invalid_utf8_later_char_really_long_strings < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2555
+ let valid: & [ u8 ] = b" " ;
2556
+ let mut invalid = vec ! [ ] ;
2557
+ for _ in 0 ..10 {
2558
+ // each instance is 38 bytes
2559
+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2560
+ }
2561
+ invalid. extend_from_slice ( INVALID_UTF8_LATER_CHAR ) ;
2562
+ GenericBinaryArray :: < O > :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] )
2563
+ }
2564
+
2565
+ /// returns a BinaryArray with small invalid UTF8 data followed by a large
2566
+ /// invalid UTF8 data in a character other than the first in a string larger
2567
+ fn invalid_utf8_later_char_really_long_strings2 < O : OffsetSizeTrait > ( ) -> GenericBinaryArray < O > {
2568
+ let valid: & [ u8 ] = b" " ;
2569
+ let mut valid_long = vec ! [ ] ;
2570
+ for _ in 0 ..10 {
2571
+ // each instance is 38 bytes
2572
+ valid_long. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2573
+ }
2574
+ let invalid = INVALID_UTF8_LATER_CHAR ;
2575
+ GenericBinaryArray :: < O > :: from_iter ( vec ! [
2576
+ None ,
2577
+ Some ( valid) ,
2578
+ Some ( invalid) ,
2579
+ None ,
2580
+ Some ( & valid_long) ,
2581
+ Some ( valid) ,
2582
+ ] )
2583
+ }
2584
+
2585
+ /// writes the array into a single column parquet file with the specified
2586
+ /// encoding.
2587
+ ///
2588
+ /// If no encoding is specified, use default (dictionary) encoding
2589
+ fn write_to_parquet_with_encoding ( array : ArrayRef , encoding : Option < Encoding > ) -> Vec < u8 > {
2507
2590
let batch = RecordBatch :: try_from_iter ( vec ! [ ( "c" , array) ] ) . unwrap ( ) ;
2508
2591
let mut data = vec ! [ ] ;
2509
2592
let schema = batch. schema ( ) ;
2510
- let props = None ;
2593
+ let props = encoding. map ( |encoding| {
2594
+ WriterProperties :: builder ( )
2595
+ // must disable dictionary encoding to actually use encoding
2596
+ . set_dictionary_enabled ( false )
2597
+ . set_encoding ( encoding)
2598
+ . build ( )
2599
+ } ) ;
2600
+
2511
2601
{
2512
2602
let mut writer = ArrowWriter :: try_new ( & mut data, schema, props) . unwrap ( ) ;
2513
2603
writer. write ( & batch) . unwrap ( ) ;
0 commit comments