[Spark] Drop Type widening feature: read Parquet footers to collect files to rewrite (#3155)

## What changes were proposed in this pull request?

The initial approach to identifying files that contain a type differing from the table schema, and that must therefore be rewritten before dropping the type widening table feature, was convoluted and turned out to be more brittle than intended. This change instead reads the file schema directly from the Parquet footer and rewrites every file that has a mismatching type.

### Additional Context

Under the initial approach, files are identified by their default row commit version (part of the row tracking feature) and matched against the type changes previously applied to the table and recorded in the table metadata: any file written before the latest type change should use a different type and must be rewritten. This requires multiple pieces of information to be tracked accurately:
- Default row commit versions must be correctly assigned to all files. For example, files that are copied over without modification must never be assigned a new default row commit version. On the other hand, default row commit versions are preserved across CLONE, but these versions don't match anything in the new cloned table.
- The type change history must be reliably recorded and preserved across schema changes, e.g. column mapping.

Any bug here will likely lead to files not being correctly rewritten before the table feature is removed, potentially leaving the table in an unreadable state.

## How was this patch tested?

Tests added in a previous PR cover CLONE and RESTORE: #3053
Tests added and updated in this PR cover rewriting files with different column types when removing the table feature.
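To make the footer-based approach above concrete, here is a minimal sketch of the rewrite decision: a file must be rewritten if its Parquet footer still records a type that differs from the current table schema. This is not the PR's code; it only handles flat schemas with name-based matching (the real helper added below recurses into nested fields and reads footers on the executors), and the schemas used in the example are made up for illustration.

```scala
import org.apache.spark.sql.types._

// Sketch only: flat schemas, matched by column name.
def hasMismatchedType(fileSchema: StructType, tableSchema: StructType): Boolean =
  fileSchema.fields.exists { fileField =>
    tableSchema.fields.find(_.name == fileField.name)
      .exists(tableField => fileField.dataType != tableField.dataType)
  }

// A file written before an INT -> LONG widening still stores `id` as INT:
val fileSchema  = StructType(Seq(StructField("id", IntegerType)))
val tableSchema = StructType(Seq(StructField("id", LongType)))
assert(hasMismatchedType(fileSchema, tableSchema)) // true: this file gets rewritten
```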
Showing 6 changed files with 174 additions and 45 deletions.
spark/src/main/scala/org/apache/spark/sql/delta/commands/ReorgTableHelper.scala (111 additions, 0 deletions)
@@ -0,0 +1,111 @@
/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.commands

import org.apache.spark.sql.delta.Snapshot
import org.apache.spark.sql.delta.actions.AddFile
import org.apache.spark.sql.delta.commands.VacuumCommand.generateCandidateFileMap
import org.apache.spark.sql.delta.schema.SchemaMergingUtils
import org.apache.spark.sql.delta.util.DeltaFileOperations
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetToSparkSchemaConverter}
import org.apache.spark.sql.types.{AtomicType, StructField, StructType}
import org.apache.spark.util.SerializableConfiguration

trait ReorgTableHelper extends Serializable {
  /**
   * Determine whether `fileSchema` has any column whose type differs from the corresponding
   * column in `tablePhysicalSchema`.
   */
  protected def fileHasDifferentTypes(
      fileSchema: StructType,
      tablePhysicalSchema: StructType): Boolean = {
    SchemaMergingUtils.transformColumns(fileSchema, tablePhysicalSchema) {
      case (_, StructField(_, fileType: AtomicType, _, _),
          Some(StructField(_, tableType: AtomicType, _, _)), _) if fileType != tableType =>
        return true
      case (_, field, _, _) => field
    }
    false
  }

  /**
   * Apply a filter on the list of AddFile to only keep the files whose physical Parquet schema
   * satisfies the given filter function.
   *
   * Note: filtering happens on the executors, so **any variable captured by `filterFileFn` must
   * be Serializable**.
   */
  protected def filterParquetFilesOnExecutors(
      spark: SparkSession,
      files: Seq[AddFile],
      snapshot: Snapshot,
      ignoreCorruptFiles: Boolean)(
      filterFileFn: StructType => Boolean): Seq[AddFile] = {

    val serializedConf = new SerializableConfiguration(snapshot.deltaLog.newDeltaHadoopConf())
    val assumeBinaryIsString = spark.sessionState.conf.isParquetBinaryAsString
    val assumeInt96IsTimestamp = spark.sessionState.conf.isParquetINT96AsTimestamp
    val dataPath = new Path(snapshot.deltaLog.dataPath.toString)

    import org.apache.spark.sql.delta.implicits._

    files.toDF(spark).as[AddFile].mapPartitions { iter =>
      filterParquetFiles(iter.toList, dataPath, serializedConf.value, ignoreCorruptFiles,
        assumeBinaryIsString, assumeInt96IsTimestamp)(filterFileFn).toIterator
    }.collect()
  }

  /**
   * Read the Parquet footer of each given file and keep only the files whose schema satisfies
   * the given filter function.
   */
  protected def filterParquetFiles(
      files: Seq[AddFile],
      dataPath: Path,
      configuration: Configuration,
      ignoreCorruptFiles: Boolean,
      assumeBinaryIsString: Boolean,
      assumeInt96IsTimestamp: Boolean)(
      filterFileFn: StructType => Boolean): Seq[AddFile] = {
    val nameToAddFileMap = generateCandidateFileMap(dataPath, files)

    val fileStatuses = nameToAddFileMap.map { case (absPath, addFile) =>
      new FileStatus(
        /* length */ addFile.size,
        /* isDir */ false,
        /* blockReplication */ 0,
        /* blockSize */ 1,
        /* modificationTime */ addFile.modificationTime,
        new Path(absPath)
      )
    }

    val footers = DeltaFileOperations.readParquetFootersInParallel(
      configuration,
      fileStatuses.toList,
      ignoreCorruptFiles)

    val converter =
      new ParquetToSparkSchemaConverter(assumeBinaryIsString, assumeInt96IsTimestamp)

    val filesNeedToRewrite = footers.filter { footer =>
      val fileSchema = ParquetFileFormat.readSchemaFromFooter(footer, converter)
      filterFileFn(fileSchema)
    }.map(_.getFile.toString)
    filesNeedToRewrite.map(absPath => nameToAddFileMap(absPath))
  }
}
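A hypothetical usage sketch of how a command could mix in this trait to select the files to rewrite. The object name, `candidateFiles`, and `tablePhysicalSchema` below are placeholders for illustration and are not part of this commit; only the two helper methods it calls come from the file above.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.delta.Snapshot
import org.apache.spark.sql.delta.actions.AddFile
import org.apache.spark.sql.types.StructType

// Placeholder command, not part of this commit: shows how the helper's two methods
// compose to find files whose on-disk type differs from the current table schema.
object RewriteMismatchedFiles extends ReorgTableHelper {
  def selectFilesToRewrite(
      spark: SparkSession,
      snapshot: Snapshot,
      candidateFiles: Seq[AddFile],
      tablePhysicalSchema: StructType): Seq[AddFile] = {
    // Footers are read on the executors; the filter only captures `tablePhysicalSchema`
    // (and this Serializable object), as filterParquetFilesOnExecutors requires.
    filterParquetFilesOnExecutors(spark, candidateFiles, snapshot, ignoreCorruptFiles = false)(
      fileHasDifferentTypes(_, tablePhysicalSchema))
  }
}
```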