Skip to content

Commit

Permalink
[Kernel][Defaults] Handle legacy map types in Parquet files (#3097)
Browse files Browse the repository at this point in the history
## Description
Currently, Kernel's Parquet reader explicitly looks for the `key_value`
repeated group under the Parquet map type, but the older versions of
Parquet writers wrote any name for the repeated group. Instead of
looking for the explicit `key_value` element, fetch the first element in
the list. See
[here](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps)
for more details.

## How was this patch tested?
The
[test](https://github.com/apache/spark/blob/master/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetThriftCompatibilitySuite.scala#L29)
and sample file written by legacy writers are taken from Apache Spark™.

Some columns (arrays with 2-level encoding, another legacy format) from
the test file are currently not supported. I will follow up with a
separate PR. It involves bit refactoring on the ArrayColumnReader.
  • Loading branch information
vkorukanti authored May 14, 2024
1 parent b1b84d5 commit b9fe0e1
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ private static Converter createElementConverter(
ArrayType typeFromClient,
GroupType typeFromFile) {

checkArgument(
typeFromFile.getFieldCount() == 1, "Expected exactly one field in the array type");
checkArgument(typeFromFile.getFieldCount() == 1,
"Expected exactly one field in the array type, but got: " + typeFromFile);
GroupType repeatedGroup = typeFromFile.getType(0).asGroupType();

// TODO: handle the legacy 2-level list physical format
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.types.MapType;

import static io.delta.kernel.internal.util.Preconditions.checkArgument;

import io.delta.kernel.defaults.internal.data.vector.DefaultMapVector;
import static io.delta.kernel.defaults.internal.parquet.ParquetColumnReaders.createConverter;

Expand Down Expand Up @@ -57,10 +59,16 @@ public ColumnVector getDataColumnVector(int batchSize) {
}

private static Converter[] createElementConverters(
int initialBatchSize,
MapType typeFromClient,
GroupType typeFromFile) {
final GroupType innerMapType = (GroupType) typeFromFile.getType("key_value");
int initialBatchSize,
MapType typeFromClient,
GroupType typeFromFile) {
// Repeated element can be any name. Latest Parquet versions use "key_value" as the name,
// but legacy versions can use any arbitrary name for the repeated group.
// See https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps for details
checkArgument(typeFromFile.getFieldCount() == 1,
"Expected exactly one repeated field in the map type, but got: " + typeFromFile);

GroupType innerMapType = typeFromFile.getType(0).asGroupType();
Converter[] elemConverters = new Converter[2];
elemConverters[0] = createConverter(
initialBatchSize,
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,66 @@ class ParquetFileReaderSuite extends AnyFunSuite
checkAnswer(actResult1, expResult1)

// File with multiple row-groups [0, 20000) where rowIndex = id
val filePath = getTestResourceFilePath("parquet/")
val filePath = getTestResourceFilePath("parquet/row_index_multiple_row_groups.parquet")
val actResult2 = readParquetFilesUsingKernel(filePath, readSchema)
val expResult2 = (0L until 20000L).map(i => TestRow(i, i))

checkAnswer(actResult2, expResult2)
}

/////////////////////////////////////////////////////////////////////////////////////////////////
// Test compatibility with Parquet legacy format files //
/////////////////////////////////////////////////////////////////////////////////////////////////

// Test and the test file are copied from Spark's `ParquetThriftCompatibilitySuite`
test("read parquet file generated by parquet-thrift") {
val parquetFilePath = getTestResourceFilePath("parquet/parquet-thrift-compat.snappy.parquet")

val readSchema = new StructType()
.add("boolColumn", BooleanType.BOOLEAN)
.add("byteColumn", ByteType.BYTE)
.add("shortColumn", ShortType.SHORT)
.add("intColumn", IntegerType.INTEGER)
.add("longColumn", LongType.LONG)
.add("doubleColumn", DoubleType.DOUBLE)
// Thrift `BINARY` values are actually unencoded `STRING` values, and thus are always
// treated as `BINARY (UTF8)` in parquet-thrift, since parquet-thrift always assume
// Thrift `STRING`s are encoded using UTF-8.
.add("binaryColumn", StringType.STRING)
.add("stringColumn", StringType.STRING)
.add("enumColumn", StringType.STRING)
// maybe indicates nullable columns, above ones are non-nullable
.add("maybeBoolColumn", BooleanType.BOOLEAN)
.add("maybeByteColumn", ByteType.BYTE)
.add("maybeShortColumn", ShortType.SHORT)
.add("maybeIntColumn", IntegerType.INTEGER)
.add("maybeLongColumn", LongType.LONG)
.add("maybeDoubleColumn", DoubleType.DOUBLE)
// Thrift `BINARY` values are actually unencoded `STRING` values, and thus are always
// treated as `BINARY (UTF8)` in parquet-thrift, since parquet-thrift always assume
// Thrift `STRING`s are encoded using UTF-8.
.add("maybeBinaryColumn", StringType.STRING)
.add("maybeStringColumn", StringType.STRING)
.add("maybeEnumColumn", StringType.STRING)
// TODO: not working - separate PR to handle 2-level legacy lists
// .add("stringsColumn", new ArrayType(StringType.STRING, true /* containsNull */))
// .add("intSetColumn", new ArrayType(IntegerType.INTEGER, true /* containsNull */))
.add("intToStringColumn",
new MapType(IntegerType.INTEGER, StringType.STRING, true /* valueContainsNull */))
// TODO: not working - separate PR to handle 2-level legacy lists
// .add("complexColumn", new MapType(
// IntegerType.INTEGER,
// new ArrayType(
// new StructType()
// .add("nestedIntsColumn", new ArrayType(IntegerType.INTEGER, true /* containsNull */))
// .add("nestedStringColumn", StringType.STRING)
// .add("stringColumn", StringType.STRING),
// true /* containsNull */),
// true /* valueContainsNull */))

assert(parquetFileRowCount(parquetFilePath) === 10)
checkAnswer(
readParquetFilesUsingKernel(parquetFilePath, readSchema), /* actual */
readParquetFilesUsingSpark(parquetFilePath, readSchema) /* expected */)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -207,27 +207,24 @@ trait ParquetSuiteBase extends TestUtils {
}

def readParquetUsingKernelAsColumnarBatches(
actualFileDir: String,
inputFileOrDir: String,
readSchema: StructType,
predicate: Optional[Predicate] = Optional.empty()): Seq[ColumnarBatch] = {
val parquetFiles = Files.list(Paths.get(actualFileDir))
.iterator().asScala
.map(_.toString)
.filter(path => path.endsWith(".parquet"))
.map(path => FileStatus.of(path, 0L, 0L))
val parquetFileList = parquetFiles(inputFileOrDir)
.map(FileStatus.of(_, 0, 0))

val data = defaultEngine.getParquetHandler.readParquetFiles(
toCloseableIterator(parquetFiles.asJava),
toCloseableIterator(parquetFileList.asJava.iterator()),
readSchema,
predicate)

data.asScala.toSeq
}

def parquetFileCount(path: String): Long = parquetFiles(path).size
def parquetFileCount(fileOrDir: String): Long = parquetFiles(fileOrDir).size

def parquetFileRowCount(path: String): Long = {
val files = parquetFiles(path)
def parquetFileRowCount(fileOrDir: String): Long = {
val files = parquetFiles(fileOrDir)

var rowCount = 0L
files.foreach { file =>
Expand All @@ -238,12 +235,17 @@ trait ParquetSuiteBase extends TestUtils {
rowCount
}

def parquetFiles(path: String): Seq[String] = {
Files.list(Paths.get(path))
.iterator().asScala
.map(_.toString)
.filter(path => path.endsWith(".parquet"))
.toSeq
def parquetFiles(fileOrDir: String): Seq[String] = {
val fileOrDirPath = Paths.get(fileOrDir)
if (Files.isDirectory(fileOrDirPath)) {
Files.list(fileOrDirPath)
.iterator().asScala
.map(_.toString)
.filter(path => path.endsWith(".parquet"))
.toSeq
} else {
Seq(fileOrDir)
}
}

def footer(path: String): ParquetMetadata = {
Expand Down

0 comments on commit b9fe0e1

Please # to comment.