@@ -314,6 +314,8 @@ impl ByteViewArrayDecoderPlain {
314
314
let buf = self . buf . as_ref ( ) ;
315
315
let mut read = 0 ;
316
316
output. views . reserve ( to_read) ;
317
+
318
+ let mut utf8_validation_begin = self . offset ;
317
319
while self . offset < self . buf . len ( ) && read != to_read {
318
320
if self . offset + 4 > self . buf . len ( ) {
319
321
return Err ( ParquetError :: EOF ( "eof decoding byte array" . into ( ) ) ) ;
@@ -332,7 +334,38 @@ impl ByteViewArrayDecoderPlain {
332
334
}
333
335
334
336
if self . validate_utf8 {
335
- check_valid_utf8 ( unsafe { buf. get_unchecked ( start_offset..end_offset) } ) ?;
337
+ // It seems you are trying to understand what's going on here, take a breath and be patient.
338
+ // Utf-8 validation is a non-trivial task, here are some background facts:
339
+ // (1) Validating one 2048-byte string is much faster than validating 128 16-byte strings.
340
+ // As shown in https://github.com/apache/arrow-rs/pull/6009#issuecomment-2211174229
341
+ // Potentially because the SIMD operations favor longer strings.
342
+ // (2) Practical strings are short, 99% of strings are smaller than 100 bytes, as shown in paper:
343
+ // https://www.vldb.org/pvldb/vol17/p148-zeng.pdf, Figure 5f.
344
+ // (3) Parquet plain encoding makes utf-8 validation harder,
345
+ // because it stores the length of each string right before the string.
346
+ // This means naive utf-8 validation will be slow, because the validation needs to skip the length bytes.
347
+ // I.e., the validation cannot validate the buffer in one pass, but instead, validate strings chunk by chunk.
348
+ //
349
+ // Given the above observations, the goal is to do batch validation as much as possible.
350
+ // The key idea is that if the length is smaller than 128 (99% of the case), then the length bytes are valid utf-8, as reasoned below:
351
+ // If the length is smaller than 128, its 4-byte encoding are [0, 0, 0, len].
352
+ // Each of the byte is a valid ASCII character, so they are valid utf-8.
353
+ // Since they are all smaller than 128, they won't break a utf-8 code point (won't mess with later bytes).
354
+ //
355
+ // The implementation keeps a water mark `utf8_validation_begin` to track the beginning of the buffer that is not validated.
356
+ // If the length is smaller than 128, then we continue to next string.
357
+ // If the length is 128 or larger, then we validate the buffer before the length bytes, and move the water mark to the beginning of the next string.
358
+ if len < 128 {
359
+ // fast path, move to next string.
360
+ // the len bytes are valid utf8.
361
+ } else {
362
+ // unfortunately, the len bytes may not be valid utf8, we need to wrap up and validate everything before it.
363
+ check_valid_utf8 ( unsafe {
364
+ buf. get_unchecked ( utf8_validation_begin..self . offset )
365
+ } ) ?;
366
+ // move the cursor to skip the len bytes.
367
+ utf8_validation_begin = start_offset;
368
+ }
336
369
}
337
370
338
371
unsafe {
@@ -342,6 +375,11 @@ impl ByteViewArrayDecoderPlain {
342
375
read += 1 ;
343
376
}
344
377
378
+ // validate the last part of the buffer
379
+ if self . validate_utf8 {
380
+ check_valid_utf8 ( unsafe { buf. get_unchecked ( utf8_validation_begin..self . offset ) } ) ?;
381
+ }
382
+
345
383
self . max_remaining_values -= to_read;
346
384
Ok ( to_read)
347
385
}
0 commit comments