diff --git a/hlink/scripts/lib/conf_validations.py b/hlink/scripts/lib/conf_validations.py index 3e9598a..e30a398 100644 --- a/hlink/scripts/lib/conf_validations.py +++ b/hlink/scripts/lib/conf_validations.py @@ -162,12 +162,15 @@ def check_comparison_features(config, columns_available): raise ValueError( "No [[comparison_features]] exist. Please add [[comparison_features]]." ) + duplicates = [] for c in comparison_features: alias = c.get("alias") if alias is None: raise ValueError( f"No alias exists for a [[comparison_features]]: {c}. Please add an 'alias'." ) + if alias in comps: + duplicates.append(alias) column_name = c.get("column_name") or c.get("first_init_col") column_names = c.get("column_names") or c.get("mid_init_cols") if column_name is not None: @@ -181,7 +184,12 @@ def check_comparison_features(config, columns_available): raise ValueError( f"Within [[comparison_features]] the 'column_name' {cname} is not available from a previous [[column_mappings]] or [[feature_selections]]: {c}" ) + comps.append(alias) + if duplicates != []: + raise ValueError( + f"Alias names are not unique. Check comparison features section to use unique aliases for each feature: {', '.join(set(duplicates))}" + ) return comps @@ -203,6 +211,7 @@ def check_feature_selections(config, columns_available): feature_selections = config.get("feature_selections") if feature_selections is None: return + duplicates = [] for f in feature_selections: input_column = f.get("input_column") output_column = f.get("output_column") or f.get("output_col") @@ -219,7 +228,13 @@ def check_feature_selections(config, columns_available): raise ValueError( f"No 'output_column' or 'output_col' value for [[feature_selections]]: {f}" ) + if output_column in columns_available: + duplicates.append(output_column) columns_available.append(output_column) + if duplicates != []: + raise ValueError( + f"Output columns are not unique. 
Check feature selections to ensure output columns are unique: {', '.join(duplicates)}" + ) def check_substitution_columns(config, columns_available): @@ -258,6 +273,7 @@ def check_column_mappings(config, df_a, df_b): if not column_mappings: raise ValueError("No [[column_mappings]] exist in the conf file.") columns_available = [] + duplicates = [] for c in column_mappings: alias = c.get("alias") column_name = c.get("column_name") @@ -279,10 +295,19 @@ def check_column_mappings(config, df_a, df_b): raise ValueError( f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_b and no previous [[column_mapping]] alias exists for it. Column mapping: {c}. Available columns: \n {df_b.columns}" ) + if alias in columns_available: + duplicates.append(alias) + elif not alias and column_name in columns_available: + duplicates.append(column_name) + if alias: columns_available.append(alias) else: columns_available.append(column_name) + if duplicates != []: + raise ValueError( + f"Column names are not unique. 
Check column mappings to ensure unique aliases or column names are used: {', '.join(duplicates)}" + ) return columns_available diff --git a/hlink/tests/conf/duplicate_col_maps.toml b/hlink/tests/conf/duplicate_col_maps.toml new file mode 100644 index 0000000..a60a959 --- /dev/null +++ b/hlink/tests/conf/duplicate_col_maps.toml @@ -0,0 +1,14 @@ +id_column = "ID" + +[datasource_a] +file = "hlink/tests/input_data/conf_validation/a.csv" + +[datasource_b] +file = "hlink/tests/input_data/conf_validation/b.csv" + +[[column_mappings]] +column_name = "NAMEFRST" + +## Duplicate column name, should throw error +[[column_mappings]] +column_name = "NAMEFRST" diff --git a/hlink/tests/conf/duplicate_comp_features.toml b/hlink/tests/conf/duplicate_comp_features.toml new file mode 100644 index 0000000..03aed6c --- /dev/null +++ b/hlink/tests/conf/duplicate_comp_features.toml @@ -0,0 +1,36 @@ +id_column = "ID" + +[datasource_a] +file = "hlink/tests/input_data/conf_validation/a.csv" + +[datasource_b] +file = "hlink/tests/input_data/conf_validation/b.csv" + +[[column_mappings]] +column_name = "NAMEFRST" + +[[column_mappings]] +column_name = "NAMELAST" + +[[feature_selections]] +input_column = "NAMEFRST" +output_col = "namefrst_clean_bigrams" +transform = "bigrams" + +[[feature_selections]] +input_column = "NAMELAST" +output_col = "namelast_clean_bigrams" +transform = "bigrams" + +[[blocking]] +column_name = "NAMELAST" + +[[comparison_features]] +alias = "bg_namefrst" +column_name = "namefrst_clean_bigrams" + +## Duplicate alias, should throw error +[[comparison_features]] +alias = "bg_namefrst" +column_name = "namelast_clean_bigrams" + diff --git a/hlink/tests/conf/duplicate_feature_sel.toml b/hlink/tests/conf/duplicate_feature_sel.toml new file mode 100644 index 0000000..d621de0 --- /dev/null +++ b/hlink/tests/conf/duplicate_feature_sel.toml @@ -0,0 +1,24 @@ +id_column = "ID" + +[datasource_a] +file = "hlink/tests/input_data/conf_validation/a.csv" + +[datasource_b] +file = 
"hlink/tests/input_data/conf_validation/b.csv" + +[[column_mappings]] +column_name = "NAMEFRST" + +[[column_mappings]] +column_name = "NAMELAST" + +[[feature_selections]] +input_column = "NAMEFRST" +output_col = "namefrst_clean_bigrams" +transform = "bigrams" + +## Duplicate output_col, should throw error +[[feature_selections]] +input_column = "NAMELAST" +output_col = "namefrst_clean_bigrams" +transform = "bigrams" diff --git a/hlink/tests/conf_validations_test.py b/hlink/tests/conf_validations_test.py index 211d0a6..ffa85ff 100644 --- a/hlink/tests/conf_validations_test.py +++ b/hlink/tests/conf_validations_test.py @@ -13,6 +13,9 @@ ("missing_datasource_b", r"Section \[datasource_b\] does not exist in config"), ("no_id_column_a", "Datasource A is missing the id column 'ID'"), ("no_id_column_b", "Datasource B is missing the id column 'ID'"), + ("duplicate_comp_features", "Alias names are not unique"), + ("duplicate_feature_sel", "Output columns are not unique"), + ("duplicate_col_maps", "Column names are not unique"), ], ) def test_invalid_conf(conf_dir_path, spark, conf_name, error_msg):