From 75f094eb4cb499abb3aea3495fb667b1981d0a41 Mon Sep 17 00:00:00 2001 From: Aaron Weeden Date: Thu, 14 Mar 2024 18:11:56 -0400 Subject: [PATCH] Allow multiple columns and multiple files in `RegexTransformIngestor`. --- .../ETL/Ingestor/RegexTransformIngestor.php | 139 ++++++++++++------ 1 file changed, 96 insertions(+), 43 deletions(-) diff --git a/classes/ETL/Ingestor/RegexTransformIngestor.php b/classes/ETL/Ingestor/RegexTransformIngestor.php index 972a7a5e48..3848cf1f0e 100644 --- a/classes/ETL/Ingestor/RegexTransformIngestor.php +++ b/classes/ETL/Ingestor/RegexTransformIngestor.php @@ -2,25 +2,48 @@ /* ========================================================================================== * RegexTransform. This ingestor transforms values using the preg_filter() function. * - * The regular expressions, name of the source to split and the destination column + * The regular expressions, names of the sources to split and the destination columns * to populate are specified using configuration properties. All other source - * columns are passed unmodified. If the regular expression does not match then + * columns are passed unmodified. If no regular expressions match then * the row is not passed by the ingestor. * * Configuration properties: * - * - regex_column: defines the column in the source table to read and column name in the + * - regex_column: defines the columns in the source table to read and column names in the * output to use for the transformed data. For example: - * { "source": "dest" } would read the data in column named "source" and - * the transformed content of "source" would be written to "dest". - * - regex_config: a json formatted string that contains regular expression and output - * patterns. The the regex format is the one used by preg_filter(). 
- * For example: + * { "dest1": "source1", "dest2": "source2" } would read the data in the column named + * "source1" and the transformed content of "source1" would be written to "dest1", + * then the transformed content of "source2" would be written to "dest2". If "dest1" + * and "source2" are the same, "dest2" will transform the old value of "source2" + * (old meaning before it was transformed from "source1" to "dest1"). + * - regex_config: a mapping whose keys are destination columns and whose values are arrays of file paths + * relative to the configuration directory. For example: + * { + * "destination_column_1": [ + * "path/to/config1.json", + * "path/to/config1.d/file1.json", + * "path/to/config1.d/file2.json" + * ], + * "destination_column_2": [ + * "path/to/config2.json", + * "path/to/config2.d/file1.json", + * "path/to/config2.d/file2.json" + * ] + * } + * The files at those paths must each contain JSON-formatted strings that contain regular expression + * and output patterns. The regex format is the one used by preg_filter(). For example, the file could + * contain: * { * "#foo_([a-z]+)$#": "bar_$1" * } - * defines a regex that matches foo_ and any lowercase letters and then transforms + * This would define a regex that matches foo_ and any lowercase letters and then transforms * it to bar_ with the same letters. + * In each array, the first path in the array must point to an existing file. If any of the + * subsequent paths point to files that do not exist, those files will be ignored. If a file + * does exist, and if the JSON string in it contains a key that matches a key that has already + * been encountered in a previous file in the same array, then the value of that key in the file + * will override all previous values encountered. The final key/value pairs will be used to do + * the preg_filter() and the result will be written to the corresponding column. 
* */ namespace ETL\Ingestor; @@ -33,19 +56,8 @@ class RegexTransformIngestor extends pdoIngestor implements iAction { - /** - * The name of the column in the source table to explode(). - */ - private $srcKey; - /** - * The name of the column in the destination table populate. - */ - private $destKey; - - /* - * Array of regular expressions to test. - */ - private $regexconf; + private $regex_column; + private $regex_config; /** * @see ETL\Ingestor\pdoIngestor::__construct() @@ -54,38 +66,79 @@ public function __construct(aOptions $options, EtlConfiguration $etlConfig, Logg { parent::__construct($options, $etlConfig, $logger); - $this->verifyRequiredConfigKeys(array('regex_config', 'regex_column'), $options); - - foreach($options->regex_column as $key => $value) { - $this->srcKey = $key; - $this->destKey = $value; - break; - } - - $rconf = json_decode($options->regex_config, true); + $this->verifyRequiredConfigKeys(array('regex_column', 'regex_config'), $options); - $this->patterns = array_keys($rconf); - $this->replacements = array_values($rconf); + $this->regex_column = $options->regex_column; + $this->regex_config = $options->regex_config; } /** * @see ETL\Ingestor\pdoIngestor::transform() */ - protected function transform(array $srcRecord, $orderId) + protected function transform(array $srcRecord, &$orderId) { $transformedRecord = array(); + $outdata = $srcRecord; + $ignored = true; + foreach ($this->regex_config as $dest => $configFilePaths) { + $mainConfig = $this->getConfigFromPath($configFilePaths[0], true); + for ($i = 1; $i < count($configFilePaths); $i++) { + $config = $this->getConfigFromPath($configFilePaths[$i], false); + if (!is_null($config)) { + foreach ($config as $key => $value) { + $mainConfig[$key] = $value; + } + } + } + $srcColumn = $srcRecord[$this->regex_column->{$dest}]; + $res = preg_filter( + array_keys($mainConfig), + array_values($mainConfig), + $srcColumn + ); - $res = preg_filter($this->patterns, $this->replacements, 
$srcRecord[$this->srcKey]); - - if ($res !== null) { - $outdata = $srcRecord; - $outdata[$this->destKey] = $res; - + if (!is_null($res)) { + $outdata[$dest] = $res; + $ignored = false; + } else { + $this->logger->debug("$dest: Ignore $srcColumn"); + } + } + if (!$ignored) { $transformedRecord[] = $outdata; - } else { - $this->logger->debug("Ignore " . $srcRecord[$this->srcKey]); } - return $transformedRecord; } + + /** + * Given a path to a file relative to the configuration directory, return + * the JSON config in that file, decoded as an associative array. + * + * @param string $path the path of the file relative to the configuration + * directory. + * @param bool $failOnError if true, throw an exception if there is an + * error loading the contents of the file. + * Otherwise, return null. + * @return array|null the JSON config in the file, decoded as an + * associative array. + * @throws Exception if $failOnError is true and there is an error loading + * the contents of the file. + */ + private function getConfigFromPath(string $path, $failOnError) + { + $fullPath = \xd_utilities\qualify_path( + $path, + $this->etlConfig->getBaseDir() + ); + $fileContents = @file_get_contents($fullPath); + if (false === $fileContents) { + if ($failOnError) { + $this->logAndThrowException( + "Error loading contents of file '" . $path . "'" + ); + } + return null; + } + return json_decode($fileContents, true); + } }