From f896cf9dd0a9b014daff726d4d3b4639db0810e8 Mon Sep 17 00:00:00 2001 From: balch027 Date: Thu, 26 Oct 2023 14:43:18 +0000 Subject: [PATCH 1/8] Added checks for duplicates of comparison_features, feature_selections, and column_mappings --- hlink/scripts/lib/conf_validations.py | 16 ++++++++++++++++ hlink/spark/session.py | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py index 3e9598a..e4cd8d7 100644 --- a/hlink/scripts/lib/conf_validations.py +++ b/hlink/scripts/lib/conf_validations.py @@ -162,8 +162,11 @@ def check_comparison_features(config, columns_available): raise ValueError( "No [[comparison_features]] exist. Please add [[comparison_features]]." ) + duplicates = [] for c in comparison_features: alias = c.get("alias") + if alias in comps: + duplicates.append(alias) if alias is None: raise ValueError( f"No alias exists for a [[comparison_features]]: {c}. Please add an 'alias'." @@ -181,7 +184,10 @@ def check_comparison_features(config, columns_available): raise ValueError( f"Within [[comparison_features]] the 'column_name' {cname} is not available from a previous [[column_mappings]] or [[feature_selections]]: {c}" ) + comps.append(alias) + if duplicates != []: + raise ValueError(f"Alias names are not unique: {duplicates}") return comps @@ -203,6 +209,7 @@ def check_feature_selections(config, columns_available): feature_selections = config.get("feature_selections") if feature_selections is None: return + duplicates = [] for f in feature_selections: input_column = f.get("input_column") output_column = f.get("output_column") or f.get("output_col") @@ -219,7 +226,11 @@ def check_feature_selections(config, columns_available): raise ValueError( f"No 'output_column' or 'output_col' value for [[feature_selections]]: {f}" ) + if output_column in columns_available: + duplicates.append(output_column) columns_available.append(output_column) + if duplicates != []: + raise ValueError(f"Output column are not unique: {duplicates}") def check_substitution_columns(config, columns_available): @@ -258,6 +269,7 @@ def check_column_mappings(config, df_a, df_b): if not column_mappings: raise ValueError("No [[column_mappings]] exist in the conf file.") columns_available = [] + duplicates = [] for c in column_mappings: alias = c.get("alias") column_name = c.get("column_name") @@ -279,10 +291,14 @@ def check_column_mappings(config, df_a, df_b): raise ValueError( f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_b and no previous [[column_mapping]] alias exists for it. Column mapping: {c}. Available columns: \n {df_b.columns}" ) + if column_name in columns_available: + duplicates.append(column_name) if alias: columns_available.append(alias) else: columns_available.append(column_name) + if duplicates != []: + raise ValueError(f"Column names are not unique: {duplicates}") return columns_available diff --git a/hlink/spark/session.py b/hlink/spark/session.py index 75e82c2..542718a 100644 --- a/hlink/spark/session.py +++ b/hlink/spark/session.py @@ -29,7 +29,8 @@ def __init__(self, derby_dir, warehouse_dir, tmp_dir, python, db_name): self.python = python def spark_conf(self, executor_cores, executor_memory, driver_memory, cores): - spark_package_path = os.path.dirname(hlink.spark.__file__) + ## Was getting an error where hlink.spark.__file__ return None and everything breaks + spark_package_path = os.path.dirname(hlink.spark.session.__file__) jar_path = os.path.join( spark_package_path, "jars", "hlink_lib-assembly-1.0.jar" ) From 726371c07b79e49cded3664fd80417b6d77a986c Mon Sep 17 00:00:00 2001 From: balch027 Date: Thu, 26 Oct 2023 15:34:33 +0000 Subject: [PATCH 2/8] Added tests to this and blacked --- hlink/scripts/lib/conf_validations.py | 2 +- hlink/tests/conf/duplicate_col_maps.toml | 14 ++++++++ hlink/tests/conf/duplicate_comp_features.toml | 36 +++++++++++++++++++ hlink/tests/conf/duplicate_feature_sel.toml | 24 +++++++++++++ hlink/tests/conf_validations_test.py | 3 ++ 5 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 hlink/tests/conf/duplicate_col_maps.toml create mode 100644 hlink/tests/conf/duplicate_comp_features.toml create mode 100644 hlink/tests/conf/duplicate_feature_sel.toml diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py index e4cd8d7..5f975ee 100644 --- a/hlink/scripts/lib/conf_validations.py +++ b/hlink/scripts/lib/conf_validations.py @@ -230,7 +230,7 @@ def check_feature_selections(config, columns_available): duplicates.append(output_column) columns_available.append(output_column) if duplicates != []: - raise ValueError(f"Output column are not unique: {duplicates}") + raise ValueError(f"Output columns are not unique: {duplicates}") def check_substitution_columns(config, columns_available): diff --git a/hlink/tests/conf/duplicate_col_maps.toml b/hlink/tests/conf/duplicate_col_maps.toml new file mode 100644 index 0000000..a60a959 --- /dev/null +++ b/hlink/tests/conf/duplicate_col_maps.toml @@ -0,0 +1,14 @@ +id_column = "ID" + +[datasource_a] +file = "hlink/tests/input_data/conf_validation/a.csv" + +[datasource_b] +file = "hlink/tests/input_data/conf_validation/b.csv" + +[[column_mappings]] +column_name = "NAMEFRST" + +## Duplicate column name, should throw error +[[column_mappings]] +column_name = "NAMEFRST" diff --git a/hlink/tests/conf/duplicate_comp_features.toml b/hlink/tests/conf/duplicate_comp_features.toml new file mode 100644 index 0000000..03aed6c --- /dev/null +++ b/hlink/tests/conf/duplicate_comp_features.toml @@ -0,0 +1,36 @@ +id_column = "ID" + +[datasource_a] +file = "hlink/tests/input_data/conf_validation/a.csv" + +[datasource_b] +file = "hlink/tests/input_data/conf_validation/b.csv" + +[[column_mappings]] +column_name = "NAMEFRST" + +[[column_mappings]] +column_name = "NAMELAST" + +[[feature_selections]] +input_column = "NAMEFRST" +output_col = "namefrst_clean_bigrams" +transform = "bigrams" + +[[feature_selections]] +input_column = "NAMELAST" +output_col = "namelast_clean_bigrams" +transform = "bigrams" + +[[blocking]] +column_name = "NAMELAST" + +[[comparison_features]] +alias = "bg_namefrst" +column_name = "namefrst_clean_bigrams" + +## Duplicate alias, should throw error +[[comparison_features]] +alias = "bg_namefrst" +column_name = "namelast_clean_bigrams" + diff --git a/hlink/tests/conf/duplicate_feature_sel.toml b/hlink/tests/conf/duplicate_feature_sel.toml new file mode 100644 index 0000000..d621de0 --- /dev/null +++ b/hlink/tests/conf/duplicate_feature_sel.toml @@ -0,0 +1,24 @@ +id_column = "ID" + +[datasource_a] +file = "hlink/tests/input_data/conf_validation/a.csv" + +[datasource_b] +file = "hlink/tests/input_data/conf_validation/b.csv" + +[[column_mappings]] +column_name = "NAMEFRST" + +[[column_mappings]] +column_name = "NAMELAST" + +[[feature_selections]] +input_column = "NAMEFRST" +output_col = "namefrst_clean_bigrams" +transform = "bigrams" + +## Duplicate output_col, should throw error +[[feature_selections]] +input_column = "NAMELAST" +output_col = "namefrst_clean_bigrams" +transform = "bigrams" diff --git a/hlink/tests/conf_validations_test.py b/hlink/tests/conf_validations_test.py index 211d0a6..ffa85ff 100644 --- a/hlink/tests/conf_validations_test.py +++ b/hlink/tests/conf_validations_test.py @@ -13,6 +13,9 @@ ("missing_datasource_b", r"Section \[datasource_b\] does not exist in config"), ("no_id_column_a", "Datasource A is missing the id column 'ID'"), ("no_id_column_b", "Datasource B is missing the id column 'ID'"), + ("duplicate_comp_features", "Alias names are not unique"), + ("duplicate_feature_sel", "Output columns are not unique"), + ("duplicate_col_maps", "Column names are not unique"), ], ) def test_invalid_conf(conf_dir_path, spark, conf_name, error_msg): From 902cf07b0434d40a3cdd837903b3a48ce6222452 Mon Sep 17 00:00:00 2001 From: balch027 Date: Thu, 26 Oct 2023 15:40:25 +0000 Subject: [PATCH 3/8] Fixed the block comment thing --- hlink/spark/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hlink/spark/session.py b/hlink/spark/session.py index 542718a..e9ea882 100644 --- a/hlink/spark/session.py +++ b/hlink/spark/session.py @@ -29,7 +29,7 @@ def __init__(self, derby_dir, warehouse_dir, tmp_dir, python, db_name): self.python = python def spark_conf(self, executor_cores, executor_memory, driver_memory, cores): - ## Was getting an error where hlink.spark.__file__ return None and everything breaks + # Was getting an error where hlink.spark.__file__ return None and everything breaks spark_package_path = os.path.dirname(hlink.spark.session.__file__) jar_path = os.path.join( spark_package_path, "jars", "hlink_lib-assembly-1.0.jar" From 2489123bb4e3850fefb5576e13db3ba9e253050a Mon Sep 17 00:00:00 2001 From: balch027 Date: Thu, 26 Oct 2023 16:58:40 +0000 Subject: [PATCH 4/8] Addressed Riley's comments --- hlink/scripts/lib/conf_validations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py index 5f975ee..f393b4c 100644 --- a/hlink/scripts/lib/conf_validations.py +++ b/hlink/scripts/lib/conf_validations.py @@ -165,7 +165,7 @@ def check_comparison_features(config, columns_available): duplicates = [] for c in comparison_features: alias = c.get("alias") - if alias in comps: + if alias in comps or alias in columns_available: duplicates.append(alias) if alias is None: raise ValueError( @@ -291,8 +291,8 @@ def check_column_mappings(config, df_a, df_b): raise ValueError( f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_b and no previous [[column_mapping]] alias exists for it. Column mapping: {c}. Available columns: \n {df_b.columns}" ) - if column_name in columns_available: - duplicates.append(column_name) + if column_name in columns_available or alias in columns_available: + duplicates.append(alias if alias else column_name) if alias: columns_available.append(alias) else: From e49284547a2c58a23cbf6db037574616d4b93c06 Mon Sep 17 00:00:00 2001 From: balch027 Date: Thu, 26 Oct 2023 17:14:40 +0000 Subject: [PATCH 5/8] Removing the session thingy --- hlink/spark/session.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hlink/spark/session.py b/hlink/spark/session.py index e9ea882..75e82c2 100644 --- a/hlink/spark/session.py +++ b/hlink/spark/session.py @@ -29,8 +29,7 @@ def __init__(self, derby_dir, warehouse_dir, tmp_dir, python, db_name): self.python = python def spark_conf(self, executor_cores, executor_memory, driver_memory, cores): - # Was getting an error where hlink.spark.__file__ return None and everything breaks - spark_package_path = os.path.dirname(hlink.spark.session.__file__) + spark_package_path = os.path.dirname(hlink.spark.__file__) jar_path = os.path.join( spark_package_path, "jars", "hlink_lib-assembly-1.0.jar" ) From 3704687bc59533ac0e9cdf5ef885d01bb02773db Mon Sep 17 00:00:00 2001 From: balch027 Date: Thu, 26 Oct 2023 17:28:17 +0000 Subject: [PATCH 6/8] Added extra error message and also allowed for the same column to be read multiple times with different aliases --- hlink/scripts/lib/conf_validations.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py index f393b4c..6c76727 100644 --- a/hlink/scripts/lib/conf_validations.py +++ b/hlink/scripts/lib/conf_validations.py @@ -187,7 +187,7 @@ def check_comparison_features(config, columns_available): comps.append(alias) if duplicates != []: - raise ValueError(f"Alias names are not unique: {duplicates}") + raise ValueError(f"Alias names are not unique. Check comparison features section to use unique aliases for each feature: {', '.join(duplicates)}") return comps @@ -230,7 +230,7 @@ def check_feature_selections(config, columns_available): duplicates.append(output_column) columns_available.append(output_column) if duplicates != []: - raise ValueError(f"Output columns are not unique: {duplicates}") + raise ValueError(f"Output columns are not unique. Check feature selectionss to ensure output columns are unique: {', '.join(duplicates)}") def check_substitution_columns(config, columns_available): @@ -291,14 +291,17 @@ def check_column_mappings(config, df_a, df_b): raise ValueError( f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_b and no previous [[column_mapping]] alias exists for it. Column mapping: {c}. Available columns: \n {df_b.columns}" ) - if column_name in columns_available or alias in columns_available: - duplicates.append(alias if alias else column_name) + if alias in columns_available: + duplicates.append(alias) + elif not alias and column_name in columns_available: + duplicates.append(column_name) + if alias: columns_available.append(alias) else: columns_available.append(column_name) if duplicates != []: - raise ValueError(f"Column names are not unique: {duplicates}") + raise ValueError(f"Column names are not unique. Check column mappings to ensure unique aliases or column names are used: {', '.join(duplicates)}") return columns_available From e840c8d1761d84a32d3f8db8d35fe006d5bf507a Mon Sep 17 00:00:00 2001 From: balch027 Date: Thu, 26 Oct 2023 17:31:32 +0000 Subject: [PATCH 7/8] blacked --- hlink/scripts/lib/conf_validations.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py index 6c76727..f9fb72b 100644 --- a/hlink/scripts/lib/conf_validations.py +++ b/hlink/scripts/lib/conf_validations.py @@ -187,7 +187,9 @@ def check_comparison_features(config, columns_available): comps.append(alias) if duplicates != []: - raise ValueError(f"Alias names are not unique. Check comparison features section to use unique aliases for each feature: {', '.join(duplicates)}") + raise ValueError( + f"Alias names are not unique. Check comparison features section to use unique aliases for each feature: {', '.join(duplicates)}" + ) return comps @@ -230,7 +232,9 @@ def check_feature_selections(config, columns_available): duplicates.append(output_column) columns_available.append(output_column) if duplicates != []: - raise ValueError(f"Output columns are not unique. Check feature selectionss to ensure output columns are unique: {', '.join(duplicates)}") + raise ValueError( + f"Output columns are not unique. Check feature selectionss to ensure output columns are unique: {', '.join(duplicates)}" + ) def check_substitution_columns(config, columns_available): @@ -301,7 +305,9 @@ def check_column_mappings(config, df_a, df_b): else: columns_available.append(column_name) if duplicates != []: - raise ValueError(f"Column names are not unique. Check column mappings to ensure unique aliases or column names are used: {', '.join(duplicates)}") + raise ValueError( + f"Column names are not unique. Check column mappings to ensure unique aliases or column names are used: {', '.join(duplicates)}" + ) return columns_available From 77ace6dc48a1f0a7e93f56e79100d65564c6041f Mon Sep 17 00:00:00 2001 From: balch027 Date: Mon, 30 Oct 2023 13:23:10 +0000 Subject: [PATCH 8/8] updated to address Riley --- hlink/scripts/lib/conf_validations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py index f9fb72b..e30a398 100644 --- a/hlink/scripts/lib/conf_validations.py +++ b/hlink/scripts/lib/conf_validations.py @@ -165,12 +165,12 @@ def check_comparison_features(config, columns_available): duplicates = [] for c in comparison_features: alias = c.get("alias") - if alias in comps or alias in columns_available: - duplicates.append(alias) if alias is None: raise ValueError( f"No alias exists for a [[comparison_features]]: {c}. Please add an 'alias'." ) + if alias in comps: + duplicates.append(alias) column_name = c.get("column_name") or c.get("first_init_col") column_names = c.get("column_names") or c.get("mid_init_cols") if column_name is not None: @@ -188,7 +188,7 @@ def check_comparison_features(config, columns_available): comps.append(alias) if duplicates != []: raise ValueError( - f"Alias names are not unique. Check comparison features section to use unique aliases for each feature: {', '.join(duplicates)}" + f"Alias names are not unique. Check comparison features section to use unique aliases for each feature: {', '.join(set(duplicates))}" ) return comps