Skip to content

Commit

Permalink
Allow multiple columns and multiple files in RegexTransformIngestor.
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronweeden committed May 13, 2024
1 parent fb64507 commit 75f094e
Showing 1 changed file with 96 additions and 43 deletions.
139 changes: 96 additions & 43 deletions classes/ETL/Ingestor/RegexTransformIngestor.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,48 @@
/* ==========================================================================================
* RegexTransform. This ingestor transforms values using the preg_filter() function.
*
* The regular expressions, name of the source to split and the destination column
* The regular expressions, names of the sources to split and the destination columns
* to populate are specified using configuration properties. All other source
* columns are passed unmodified. If the regular expression does not match then
* columns are passed unmodified. If no regular expressions match then
* the row is not passed by the ingestor.
*
* Configuration properties:
*
* - regex_column: defines the column in the source table to read and column name in the
* - regex_column: defines the columns in the source table to read and column names in the
* output to use for the transformed data. For example:
* { "source": "dest" } would read the data in column named "source" and
* the transformed content of "source" would be written to "dest".
* - regex_config: a json formatted string that contains regular expression and output
* patterns. The regex format is the one used by preg_filter().
* For example:
* { "dest1": "source1", "dest2": "source2" } would read the data in column named
* "source1" and the transformed content of "source1" would be written to "dest1",
* then the transformed content of "source2" would be written to "dest2". If "dest1"
* and "source2" are the same, "dest2" will transform the old value of "source2"
* (old meaning before it was transformed from "source1" to "dest1").
* - regex_config: a mapping whose keys are destination columns and whose values are arrays of file paths
* relative to the configuration directory. For example:
* {
* "destination_column_1": [
* "path/to/config1.json",
* "path/to/config1.d/file1.json",
* "path/to/config1.d/file2.json",
* ],
* "destination_column_2": [
* "path/to/config2.json",
* "path/to/config2.d/file1.json",
* "path/to/config2.d/file2.json"
* ]
* }
* The files at those paths must each contain JSON formatted strings that contain regular expression
* and output patterns. The regex format is the one used by preg_filter(). For example, the file could
* contain:
* {
* "#foo_([a-z]+)$#": "bar_$1"
* }
* defines a regex that matches foo_ and any lowercase letters and then transforms
* This would define a regex that matches foo_ and any lowercase letters and then transforms
* it to bar_ with the same letters.
* In each array, the first path in the array must point to an existing file. If any of the
* subsequent paths point to files that do not exist, those files will be ignored. If a file
* does exist, and if the JSON string in it contains a key that matches a key that has already
* been encountered in a previous file in the same array, then the value of that key in the file
* will override all previous values encountered. The final key/value pairs will be used to do
* the preg_filter() and the result will be written to the corresponding column.
*
*/
namespace ETL\Ingestor;
Expand All @@ -33,19 +56,8 @@

class RegexTransformIngestor extends pdoIngestor implements iAction
{
/**
* The name of the column in the source table to explode().
*/
private $srcKey;
/**
* The name of the column in the destination table populate.
*/
private $destKey;

/*
* Array of regular expressions to test.
*/
private $regexconf;
private $regex_column;
private $regex_config;

/**
* @see ETL\Ingestor\pdoIngestor::__construct()
Expand All @@ -54,38 +66,79 @@ public function __construct(aOptions $options, EtlConfiguration $etlConfig, Logg
{
parent::__construct($options, $etlConfig, $logger);

$this->verifyRequiredConfigKeys(array('regex_config', 'regex_column'), $options);

foreach($options->regex_column as $key => $value) {
$this->srcKey = $key;
$this->destKey = $value;
break;
}

$rconf = json_decode($options->regex_config, true);
$this->verifyRequiredConfigKeys(array('regex_column', 'regex_config'), $options);

$this->patterns = array_keys($rconf);
$this->replacements = array_values($rconf);
$this->regex_column = $options->regex_column;
$this->regex_config = $options->regex_config;
}

/**
 * Merged pattern => replacement rules, keyed by destination column name.
 * Filled lazily by getMergedRegexConfig() so the configuration files are read
 * and decoded once per ingestor instance instead of once per source record.
 *
 * @var array
 */
private $merged_regex_config = array();

/**
 * Apply the configured regular expressions to a single source record.
 *
 * For every destination column listed in regex_config, the value of the
 * source column mapped to it in regex_column is run through preg_filter().
 * Matching results are written to the destination column of a copy of the
 * source record. Values are always read from the untouched $srcRecord, so a
 * destination written earlier in the loop never feeds a later one.
 *
 * @see ETL\Ingestor\pdoIngestor::transform()
 *
 * @param array $srcRecord the source record, keyed by column name.
 * @param mixed $orderId   passed by reference to match the signature this
 *                         method overrides; not used here.
 * @return array a list holding the transformed record, or an empty list when
 *               none of the destination columns produced a match.
 */
protected function transform(array $srcRecord, &$orderId)
{
    $outdata = $srcRecord;
    $ignored = true;

    foreach ($this->regex_config as $dest => $configFilePaths) {
        $rules = $this->getMergedRegexConfig($dest, $configFilePaths);

        // A destination without a source mapping is a configuration error;
        // fail with a clear message instead of an "undefined property" notice.
        if (!isset($this->regex_column->{$dest})) {
            $this->logAndThrowException(
                "No source column is defined in regex_column for destination '" . $dest . "'"
            );
        }
        $srcName = $this->regex_column->{$dest};

        if (!array_key_exists($srcName, $srcRecord)) {
            $this->logAndThrowException(
                "Source column '" . $srcName . "' for destination '" . $dest
                . "' is not present in the source record"
            );
        }

        // preg_filter() treats a null subject as an empty string but emits a
        // deprecation notice for it on PHP >= 8.1, so normalize it up front.
        $subject = $srcRecord[$srcName];
        if (is_null($subject)) {
            $subject = '';
        }

        $res = preg_filter(array_keys($rules), array_values($rules), $subject);

        if (!is_null($res)) {
            $outdata[$dest] = $res;
            $ignored = false;
        } else {
            $this->logger->debug("$dest: Ignore $subject");
        }
    }

    // The record is only passed on when at least one destination matched.
    return $ignored ? array() : array($outdata);
}

/**
 * Build (or fetch from the per-instance cache) the merged regex rules for one
 * destination column.
 *
 * The first path must yield a decodable configuration. Later paths are
 * optional: when getConfigFromPath() returns an array for one of them, its
 * entries override entries with the same key collected so far.
 *
 * @param string $dest            the destination column name.
 * @param mixed  $configFilePaths the list of configuration file paths
 *                                configured for $dest.
 * @return array the merged regex pattern => replacement pairs.
 */
private function getMergedRegexConfig($dest, $configFilePaths)
{
    if (array_key_exists($dest, $this->merged_regex_config)) {
        return $this->merged_regex_config[$dest];
    }

    if (!is_array($configFilePaths) || 0 === count($configFilePaths)) {
        $this->logAndThrowException(
            "regex_config for destination '" . $dest . "' must be a non-empty array of file paths"
        );
    }

    $paths = array_values($configFilePaths);

    $merged = $this->getConfigFromPath($paths[0], true);
    if (!is_array($merged)) {
        // Guards array_keys()/array_values() against an undecodable primary file.
        $this->logAndThrowException(
            "Error decoding contents of file '" . $paths[0] . "'"
        );
    }

    foreach (array_slice($paths, 1) as $path) {
        $overrides = $this->getConfigFromPath($path, false);
        if (is_array($overrides)) {
            // array_replace() keeps existing keys in place and lets the later
            // file win, exactly like assigning key by key.
            $merged = array_replace($merged, $overrides);
        }
    }

    $this->merged_regex_config[$dest] = $merged;

    return $merged;
}

/**
 * Load one regex configuration file and decode its JSON contents into an
 * associative array.
 *
 * The given path is qualified against the ETL configuration base directory
 * with \xd_utilities\qualify_path() before it is read.
 *
 * @param string $path the path of the file relative to the configuration
 *                     directory.
 * @param bool $failOnError if true, an unreadable file or contents that do
 *                          not decode to a JSON object/array are reported
 *                          through logAndThrowException(). Otherwise null is
 *                          returned in both cases.
 * @return array|null the JSON config in the file, decoded as an associative
 *                    array, or null when it could not be loaded and
 *                    $failOnError is false.
 * @throws Exception if $failOnError is true and there is an error loading
 *                   or decoding the contents of the file.
 */
private function getConfigFromPath(string $path, $failOnError)
{
    $fullPath = \xd_utilities\qualify_path(
        $path,
        $this->etlConfig->getBaseDir()
    );

    // The "@" is deliberate: optional files may legitimately be absent, and
    // the false return value is checked right below.
    $fileContents = @file_get_contents($fullPath);
    if (false === $fileContents) {
        if ($failOnError) {
            $this->logAndThrowException(
                "Error loading contents of file '" . $path . "'"
            );
        }
        return null;
    }

    $config = json_decode($fileContents, true);
    if (!is_array($config)) {
        // json_decode() returns null for malformed JSON and a scalar for a
        // bare JSON value; neither can be used as a pattern => replacement map.
        if ($failOnError) {
            $reason = (JSON_ERROR_NONE !== json_last_error())
                ? json_last_error_msg()
                : 'top-level value is not a JSON object';
            $this->logAndThrowException(
                "Error decoding contents of file '" . $path . "': " . $reason
            );
        }
        return null;
    }

    return $config;
}
}

0 comments on commit 75f094e

Please sign in to comment.