[Hudi] Support list/map data type conversion (#3320)

#### Which Delta project/connector is this regarding?  - [ ] Spark - [ ] Standalone - [ ] Flink - [ ] Kernel - [x] Other (Hudi) ## Description  This PR adds support for converting Delta tables with list or map type columns so that they are Hudi-readable. ## How was this patch tested?  Added unit tests in ConvertToHudiSuite and tested manually with an external Hudi Spark reader. ## Does this PR introduce _any_ user-facing changes?  Yes. Previously, users could not enable the Delta table property for Hudi conversion on tables containing list/map columns; attempting to do so raised an unsupportedType error. Now they can.
delta-io · Jul 1, 2024 · dd39415 · dd39415
1 parent 207d8d2
commit dd39415
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 20 deletions.
diff --git a/hudi/src/main/scala/org/apache/spark/sql/delta/hudi/HudiSchemaUtils.scala b/hudi/src/main/scala/org/apache/spark/sql/delta/hudi/HudiSchemaUtils.scala
@@ -45,12 +45,16 @@ object HudiSchemaUtils extends DeltaLogging {
         finalizeSchema(
           Schema.createRecord(currentPath, null, null, false, avroFields),
           isNullable)
-      // TODO: Add List and Map support: https://github.com/delta-io/delta/issues/2738
+
       case ArrayType(elementType, containsNull) =>
-        throw new UnsupportedOperationException("UniForm Hudi doesn't support Array columns")
+        finalizeSchema(
+          Schema.createArray(transform(elementType, containsNull, currentPath)),
+          isNullable)
 
       case MapType(keyType, valueType, valueContainsNull) =>
-        throw new UnsupportedOperationException("UniForm Hudi doesn't support Map columns")
+        finalizeSchema(
+          Schema.createMap(transform(valueType, valueContainsNull, currentPath)),
+          isNullable)
 
       case atomicType: AtomicType => convertAtomic(atomicType, isNullable)
 

diff --git a/hudi/src/test/scala/org/apache/spark/sql/delta/hudi/ConvertToHudiSuite.scala b/hudi/src/test/scala/org/apache/spark/sql/delta/hudi/ConvertToHudiSuite.scala
@@ -150,18 +150,55 @@ class ConvertToHudiSuite extends QueryTest with Eventually {
     }
   }
 
-  for (invalidFieldDef <- Seq("col3 ARRAY<STRING>", "col3 MAP<STRING, STRING>")) {
-    test(s"Table Throws Exception for Unsupported Type ($invalidFieldDef)") {
-      intercept[DeltaUnsupportedOperationException] {
-        _sparkSession.sql(
-          s"""CREATE TABLE `$testTableName` (col1 INT, col2 STRING, $invalidFieldDef) USING DELTA
-             |LOCATION '$testTablePath'
-             |TBLPROPERTIES (
-             |  'delta.universalFormat.enabledFormats' = 'hudi',
-             |  'delta.enableDeletionVectors' = false
-             |)""".stripMargin)
-      }
-    }
+  test(s"Conversion behavior for lists") {
+    _sparkSession.sql(
+      s"""CREATE TABLE `$testTableName` (col1 ARRAY<INT>) USING DELTA
+         |LOCATION '$testTablePath'
+         |TBLPROPERTIES (
+         |  'delta.universalFormat.enabledFormats' = 'hudi'
+         |)""".stripMargin)
+    _sparkSession.sql(s"INSERT INTO `$testTableName` VALUES (array(1, 2, 3))")
+    verifyFilesAndSchemaMatch()
+  }
+
+  test(s"Conversion behavior for lists of structs") {
+    _sparkSession.sql(
+      s"""CREATE TABLE `$testTableName`
+         |(col1 ARRAY<STRUCT<field1: INT, field2: STRING>>) USING DELTA
+         |LOCATION '$testTablePath'
+         |TBLPROPERTIES (
+         |  'delta.universalFormat.enabledFormats' = 'hudi'
+         |)""".stripMargin)
+    _sparkSession.sql(s"INSERT INTO `$testTableName` " +
+      s"VALUES (array(named_struct('field1', 1, 'field2', 'hello'), " +
+      s"named_struct('field1', 2, 'field2', 'world')))")
+    verifyFilesAndSchemaMatch()
+  }
+
+  test(s"Conversion behavior for lists of lists") {
+    _sparkSession.sql(
+      s"""CREATE TABLE `$testTableName`
+         |(col1 ARRAY<ARRAY<INT>>) USING DELTA
+         |LOCATION '$testTablePath'
+         |TBLPROPERTIES (
+         |  'delta.universalFormat.enabledFormats' = 'hudi'
+         |)""".stripMargin)
+    _sparkSession.sql(s"INSERT INTO `$testTableName` " +
+      s"VALUES (array(array(1, 2, 3), array(4, 5, 6)))")
+    verifyFilesAndSchemaMatch()
+  }
+
+  test(s"Conversion behavior for maps") {
+    _sparkSession.sql(
+      s"""CREATE TABLE `$testTableName` (col1 MAP<STRING, INT>) USING DELTA
+         |LOCATION '$testTablePath'
+         |TBLPROPERTIES (
+         |  'delta.universalFormat.enabledFormats' = 'hudi'
+         |)""".stripMargin)
+    _sparkSession.sql(
+      s"INSERT INTO `$testTableName` VALUES (map('a', 1, 'b', 2, 'c', 3))"
+    )
+    verifyFilesAndSchemaMatch()
   }
 
   test("validate Hudi timeline archival and cleaning") {

diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/UniversalFormat.scala b/spark/src/main/scala/org/apache/spark/sql/delta/UniversalFormat.scala
@@ -25,16 +25,17 @@ import org.apache.spark.sql.delta.schema.SchemaUtils
 import org.apache.spark.internal.MDC
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.catalog.CatalogTable
-import org.apache.spark.sql.types.{ArrayType, MapType, NullType}
+import org.apache.spark.sql.types.NullType
 
 /**
  * Utils to validate the Universal Format (UniForm) Delta feature (NOT a table feature).
  *
  * The UniForm Delta feature governs and implements the actual conversion of Delta metadata into
  * other formats.
  *
- * Currently, UniForm only supports Iceberg. When `delta.universalFormat.enabledFormats` contains
- * "iceberg", we say that Universal Format (Iceberg) is enabled.
+ * UniForm supports both Iceberg and Hudi. When `delta.universalFormat.enabledFormats` contains
+ * "iceberg", we say that Universal Format (Iceberg) is enabled. When it contains "hudi", we say
+ * that Universal Format (Hudi) is enabled.
  *
  * [[enforceInvariantsAndDependencies]] ensures that all of UniForm's requirements for the
  * specified format are met (e.g. for 'iceberg' that IcebergCompatV1 or V2 is enabled).
@@ -101,9 +102,8 @@ object UniversalFormat extends DeltaLogging {
       if (DeltaConfigs.ENABLE_DELETION_VECTORS_CREATION.fromMetaData(newestMetadata)) {
         throw DeltaErrors.uniFormHudiDeleteVectorCompat()
       }
-      // TODO: remove once map/list support is added https://github.com/delta-io/delta/issues/2738
       SchemaUtils.findAnyTypeRecursively(newestMetadata.schema) { f =>
-        f.isInstanceOf[MapType] || f.isInstanceOf[ArrayType] || f.isInstanceOf[NullType]
+        f.isInstanceOf[NullType]
       } match {
         case Some(unsupportedType) =>
           throw DeltaErrors.uniFormHudiSchemaCompat(unsupportedType)