Skip to content

Commit a64f806

Browse files
committed
Additional tests for parquet reader utf8 validation
1 parent 8355823 commit a64f806

File tree

1 file changed

+146
-56
lines changed
  • parquet/src/arrow/arrow_reader

1 file changed

+146
-56
lines changed

parquet/src/arrow/arrow_reader/mod.rs

+146-56
Original file line numberDiff line numberDiff line change
@@ -2425,89 +2425,179 @@ mod tests {
24252425

24262426
fn test_invalid_utf8_string_array_inner<O: OffsetSizeTrait>() {
24272427
let cases = [
2428-
(
2429-
invalid_utf8_first_char::<O>(),
2430-
"Parquet argument error: Parquet error: encountered non UTF-8 data",
2431-
),
2432-
(
2433-
invalid_utf8_later_char::<O>(),
2434-
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6",
2435-
),
2428+
invalid_utf8_first_char::<O>(),
2429+
invalid_utf8_first_char_long_strings::<O>(),
2430+
invalid_utf8_later_char::<O>(),
2431+
invalid_utf8_later_char_long_strings::<O>(),
2432+
invalid_utf8_later_char_really_long_strings::<O>(),
2433+
invalid_utf8_later_char_really_long_strings2::<O>(),
24362434
];
2437-
for (array, expected_error) in cases {
2438-
// data is not valid utf8 we can not construct a correct StringArray
2439-
// safely, so purposely create an invalid StringArray
2440-
let array = unsafe {
2441-
GenericStringArray::<O>::new_unchecked(
2442-
array.offsets().clone(),
2443-
array.values().clone(),
2444-
array.nulls().cloned(),
2445-
)
2446-
};
2447-
let data_type = array.data_type().clone();
2448-
let data = write_to_parquet(Arc::new(array));
2449-
let err = read_from_parquet(data).unwrap_err();
2450-
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")
2435+
for array in &cases {
2436+
for encoding in STRING_ENCODINGS {
2437+
// data is not valid utf8; we cannot construct a correct StringArray
2438+
// safely, so purposely create an invalid StringArray
2439+
let array = unsafe {
2440+
GenericStringArray::<O>::new_unchecked(
2441+
array.offsets().clone(),
2442+
array.values().clone(),
2443+
array.nulls().cloned(),
2444+
)
2445+
};
2446+
let data_type = array.data_type().clone();
2447+
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
2448+
let err = read_from_parquet(data).unwrap_err();
2449+
let expected_err =
2450+
"Parquet argument error: Parquet error: encountered non UTF-8 data";
2451+
assert!(
2452+
err.to_string().contains(expected_err),
2453+
"data type: {data_type:?}, expected: {expected_err}, got: {err}"
2454+
);
2455+
}
24512456
}
24522457
}
24532458

24542459
#[test]
24552460
fn test_invalid_utf8_string_view_array() {
24562461
let cases = [
2457-
(
2458-
invalid_utf8_first_char::<i32>(),
2459-
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 11",
2460-
),
2461-
(
2462-
invalid_utf8_later_char::<i32>(),
2463-
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 14",
2464-
),
2462+
invalid_utf8_first_char::<i32>(),
2463+
invalid_utf8_first_char_long_strings::<i32>(),
2464+
invalid_utf8_later_char::<i32>(),
2465+
invalid_utf8_later_char_long_strings::<i32>(),
2466+
invalid_utf8_later_char_really_long_strings::<i32>(),
2467+
invalid_utf8_later_char_really_long_strings2::<i32>(),
24652468
];
2466-
for (array, expected_error) in cases {
2467-
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
2468-
let array = array.as_binary_view();
2469-
2470-
// data is not valid utf8 we can not construct a correct StringArray
2471-
// safely, so purposely create an invalid StringArray
2472-
let array = unsafe {
2473-
StringViewArray::new_unchecked(
2474-
array.views().clone(),
2475-
array.data_buffers().to_vec(),
2476-
array.nulls().cloned(),
2477-
)
2478-
};
2479-
let data_type = array.data_type().clone();
2480-
let data = write_to_parquet(Arc::new(array));
2481-
let err = read_from_parquet(data).unwrap_err();
2482-
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")
2469+
2470+
for encoding in STRING_ENCODINGS {
2471+
for array in &cases {
2472+
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
2473+
let array = array.as_binary_view();
2474+
2475+
// data is not valid utf8; we cannot construct a correct StringViewArray
2476+
// safely, so purposely create an invalid StringViewArray
2477+
let array = unsafe {
2478+
StringViewArray::new_unchecked(
2479+
array.views().clone(),
2480+
array.data_buffers().to_vec(),
2481+
array.nulls().cloned(),
2482+
)
2483+
};
2484+
2485+
let data_type = array.data_type().clone();
2486+
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
2487+
let err = read_from_parquet(data).unwrap_err();
2488+
let expected_err =
2489+
"Parquet argument error: Parquet error: encountered non UTF-8 data";
2490+
assert!(
2491+
err.to_string().contains(expected_err),
2492+
"data type: {data_type:?}, expected: {expected_err}, got: {err}"
2493+
);
2494+
}
24832495
}
24842496
}
24852497

2498+
/// Encodings suitable for string data
2499+
const STRING_ENCODINGS: &[Option<Encoding>] = &[
2500+
None,
2501+
Some(Encoding::PLAIN),
2502+
Some(Encoding::DELTA_LENGTH_BYTE_ARRAY),
2503+
Some(Encoding::DELTA_BYTE_ARRAY),
2504+
];
2505+
2506+
/// Invalid Utf-8 sequence in the first character
2507+
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
2508+
const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
2509+
2510+
/// Invalid Utf-8 sequence NOT in the first character
2511+
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
2512+
const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];
2513+
24862514
/// returns a BinaryArray with invalid UTF8 data in the first character
24872515
fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
2488-
// invalid sequence in the first character
2489-
// https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
24902516
let valid: &[u8] = b" ";
2491-
let invalid: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
2517+
let invalid = INVALID_UTF8_FIRST_CHAR;
24922518
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
24932519
}
24942520

2521+
/// Returns a BinaryArray whose invalid value is a valid prefix followed by the
2522+
/// `INVALID_UTF8_FIRST_CHAR` bytes, making a string larger than 12 bytes, which
2523+
/// is handled specially when reading `ByteViewArray`s
2524+
fn invalid_utf8_first_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
2525+
let valid: &[u8] = b" ";
2526+
let mut invalid = vec![];
2527+
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
2528+
invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
2529+
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
2530+
}
2531+
24952532
/// returns a BinaryArray with invalid UTF8 data in a character other than
24962533
/// the first (this is checked in a special codepath)
24972534
fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
2498-
// invalid sequence in NOT the first character
2499-
// https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
25002535
let valid: &[u8] = b" ";
2501-
let invalid: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];
2536+
let invalid: &[u8] = INVALID_UTF8_LATER_CHAR;
25022537
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
25032538
}
25042539

2505-
// writes the array into a single column parquet file
2506-
fn write_to_parquet(array: ArrayRef) -> Vec<u8> {
2540+
/// returns a BinaryArray with invalid UTF8 data in a character other than
2541+
/// the first in a string larger than 12 bytes which is handled specially
2542+
/// when reading `ByteViewArray`s (this is checked in a special codepath)
2543+
fn invalid_utf8_later_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
2544+
let valid: &[u8] = b" ";
2545+
let mut invalid = vec![];
2546+
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
2547+
invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
2548+
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
2549+
}
2550+
2551+
/// returns a BinaryArray with invalid UTF8 data in a character other than
2552+
/// the first in a string larger than 128 bytes which is handled specially
2553+
/// when reading `ByteViewArray`s (this is checked in a special codepath)
2554+
fn invalid_utf8_later_char_really_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
2555+
let valid: &[u8] = b" ";
2556+
let mut invalid = vec![];
2557+
for _ in 0..10 {
2558+
// each instance is 38 bytes
2559+
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
2560+
}
2561+
invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
2562+
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
2563+
}
2564+
2565+
/// returns a BinaryArray with a small string containing invalid UTF8 data (in a
2566+
/// character other than the first) followed by a valid string larger than 128 bytes
2567+
fn invalid_utf8_later_char_really_long_strings2<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
2568+
let valid: &[u8] = b" ";
2569+
let mut valid_long = vec![];
2570+
for _ in 0..10 {
2571+
// each instance is 38 bytes
2572+
valid_long.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
2573+
}
2574+
let invalid = INVALID_UTF8_LATER_CHAR;
2575+
GenericBinaryArray::<O>::from_iter(vec![
2576+
None,
2577+
Some(valid),
2578+
Some(invalid),
2579+
None,
2580+
Some(&valid_long),
2581+
Some(valid),
2582+
])
2583+
}
2584+
2585+
/// writes the array into a single column parquet file with the specified
2586+
/// encoding.
2587+
///
2588+
/// If no encoding is specified, use default (dictionary) encoding
2589+
fn write_to_parquet_with_encoding(array: ArrayRef, encoding: Option<Encoding>) -> Vec<u8> {
25072590
let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
25082591
let mut data = vec![];
25092592
let schema = batch.schema();
2510-
let props = None;
2593+
let props = encoding.map(|encoding| {
2594+
WriterProperties::builder()
2595+
// must disable dictionary encoding to actually use encoding
2596+
.set_dictionary_enabled(false)
2597+
.set_encoding(encoding)
2598+
.build()
2599+
});
2600+
25112601
{
25122602
let mut writer = ArrowWriter::try_new(&mut data, schema, props).unwrap();
25132603
writer.write(&batch).unwrap();

0 commit comments

Comments
 (0)