From e15132b021f3d4ec5b2f359aaf682fa29fadbb46 Mon Sep 17 00:00:00 2001 From: Jiaheng Tang Date: Fri, 17 May 2024 07:55:48 -0700 Subject: [PATCH] [Spark] Fall back to zordering when clustering on a single column (#3109) ## Description Fall back to zorder when clustering on a single column, because hilbert clustering doesn't support 1 column. Resolves #3087 ## How was this patch tested? New unit test. --- .../delta/skipping/MultiDimClustering.scala | 1 + .../ClusteredTableClusteringSuite.scala | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/skipping/MultiDimClustering.scala b/spark/src/main/scala/org/apache/spark/sql/delta/skipping/MultiDimClustering.scala index 46c3e0132bc..d2bb56a499a 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/skipping/MultiDimClustering.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/skipping/MultiDimClustering.scala @@ -50,6 +50,7 @@ object MultiDimClustering { curve: String): DataFrame = { assert(colNames.nonEmpty, "Cannot cluster by zero columns!") val clusteringImpl = curve match { + case "hilbert" if colNames.size == 1 => ZOrderClustering case "hilbert" => HilbertClustering case "zorder" => ZOrderClustering case unknownCurve => diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/clustering/ClusteredTableClusteringSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/clustering/ClusteredTableClusteringSuite.scala index d1b2e0da346..362dd40b539 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/clustering/ClusteredTableClusteringSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/clustering/ClusteredTableClusteringSuite.scala @@ -79,4 +79,28 @@ class ClusteredTableClusteringSuite extends SparkFunSuite } } } + + test("cluster by 1 column") { + withSQLConf(SQLConf.MAX_RECORDS_PER_FILE.key -> "2") { + withClusteredTable( + table = table, + schema = "col1 int, col2 int", + clusterBy = "col1") { + addFiles(table, numFiles = 4) + val files0 = getFiles(table) + assert(files0.size === 4) + assertNotClustered(files0) + + // Optimize should cluster the data into two 2 files since MAX_RECORDS_PER_FILE is 2. + runOptimize(table) { metrics => + assert(metrics.numFilesRemoved == 4) + assert(metrics.numFilesAdded == 2) + } + + val files1 = getFiles(table) + assert(files1.size == 2) + assertClustered(files1) + } + } + } }