Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Working to add checks for duplicated comparison features, feature selection, and column mappings #113

Merged
merged 8 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions hlink/scripts/lib/conf_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,15 @@ def check_comparison_features(config, columns_available):
raise ValueError(
"No [[comparison_features]] exist. Please add [[comparison_features]]."
)
duplicates = []
for c in comparison_features:
alias = c.get("alias")
if alias is None:
raise ValueError(
f"No alias exists for a [[comparison_features]]: {c}. Please add an 'alias'."
)
if alias in comps:
duplicates.append(alias)
column_name = c.get("column_name") or c.get("first_init_col")
column_names = c.get("column_names") or c.get("mid_init_cols")
if column_name is not None:
Expand All @@ -181,7 +184,12 @@ def check_comparison_features(config, columns_available):
raise ValueError(
f"Within [[comparison_features]] the 'column_name' {cname} is not available from a previous [[column_mappings]] or [[feature_selections]]: {c}"
)

comps.append(alias)
if duplicates != []:
raise ValueError(
f"Alias names are not unique. Check comparison features section to use unique aliases for each feature: {', '.join(set(duplicates))}"
)
return comps


Expand All @@ -203,6 +211,7 @@ def check_feature_selections(config, columns_available):
feature_selections = config.get("feature_selections")
if feature_selections is None:
return
duplicates = []
for f in feature_selections:
input_column = f.get("input_column")
output_column = f.get("output_column") or f.get("output_col")
Expand All @@ -219,7 +228,13 @@ def check_feature_selections(config, columns_available):
raise ValueError(
f"No 'output_column' or 'output_col' value for [[feature_selections]]: {f}"
)
if output_column in columns_available:
duplicates.append(output_column)
columns_available.append(output_column)
if duplicates != []:
raise ValueError(
f"Output columns are not unique. Check feature selectionss to ensure output columns are unique: {', '.join(duplicates)}"
)


def check_substitution_columns(config, columns_available):
Expand Down Expand Up @@ -258,6 +273,7 @@ def check_column_mappings(config, df_a, df_b):
if not column_mappings:
raise ValueError("No [[column_mappings]] exist in the conf file.")
columns_available = []
duplicates = []
for c in column_mappings:
alias = c.get("alias")
column_name = c.get("column_name")
Expand All @@ -279,10 +295,19 @@ def check_column_mappings(config, df_a, df_b):
raise ValueError(
f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_b and no previous [[column_mapping]] alias exists for it. Column mapping: {c}. Available columns: \n {df_b.columns}"
)
if alias in columns_available:
duplicates.append(alias)
elif not alias and column_name in columns_available:
duplicates.append(column_name)

if alias:
columns_available.append(alias)
else:
columns_available.append(column_name)
if duplicates != []:
raise ValueError(
f"Column names are not unique. Check column mappings to ensure unique aliases or column names are used: {', '.join(duplicates)}"
)
return columns_available


Expand Down
14 changes: 14 additions & 0 deletions hlink/tests/conf/duplicate_col_maps.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
id_column = "ID"

[datasource_a]
file = "hlink/tests/input_data/conf_validation/a.csv"

[datasource_b]
file = "hlink/tests/input_data/conf_validation/b.csv"

[[column_mappings]]
column_name = "NAMEFRST"

## Duplicate column_name with no alias: validation should raise a ValueError ("Column names are not unique")
[[column_mappings]]
column_name = "NAMEFRST"
36 changes: 36 additions & 0 deletions hlink/tests/conf/duplicate_comp_features.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
id_column = "ID"

[datasource_a]
file = "hlink/tests/input_data/conf_validation/a.csv"

[datasource_b]
file = "hlink/tests/input_data/conf_validation/b.csv"

[[column_mappings]]
column_name = "NAMEFRST"

[[column_mappings]]
column_name = "NAMELAST"

[[feature_selections]]
input_column = "NAMEFRST"
output_col = "namefrst_clean_bigrams"
transform = "bigrams"

[[feature_selections]]
input_column = "NAMELAST"
output_col = "namelast_clean_bigrams"
transform = "bigrams"

[[blocking]]
column_name = "NAMELAST"

[[comparison_features]]
alias = "bg_namefrst"
column_name = "namefrst_clean_bigrams"

## Duplicate alias: validation should raise a ValueError ("Alias names are not unique")
[[comparison_features]]
alias = "bg_namefrst"
column_name = "namelast_clean_bigrams"

24 changes: 24 additions & 0 deletions hlink/tests/conf/duplicate_feature_sel.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
id_column = "ID"

[datasource_a]
file = "hlink/tests/input_data/conf_validation/a.csv"

[datasource_b]
file = "hlink/tests/input_data/conf_validation/b.csv"

[[column_mappings]]
column_name = "NAMEFRST"

[[column_mappings]]
column_name = "NAMELAST"

[[feature_selections]]
input_column = "NAMEFRST"
output_col = "namefrst_clean_bigrams"
transform = "bigrams"

## Duplicate output_col: validation should raise a ValueError ("Output columns are not unique")
[[feature_selections]]
input_column = "NAMELAST"
output_col = "namefrst_clean_bigrams"
transform = "bigrams"
3 changes: 3 additions & 0 deletions hlink/tests/conf_validations_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
("missing_datasource_b", r"Section \[datasource_b\] does not exist in config"),
("no_id_column_a", "Datasource A is missing the id column 'ID'"),
("no_id_column_b", "Datasource B is missing the id column 'ID'"),
("duplicate_comp_features", "Alias names are not unique"),
("duplicate_feature_sel", "Output columns are not unique"),
("duplicate_col_maps", "Column names are not unique"),
riley-harper marked this conversation as resolved.
Show resolved Hide resolved
],
)
def test_invalid_conf(conf_dir_path, spark, conf_name, error_msg):
Expand Down