Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Working to add checks for duplicated comparison features, feature selection, and column mappings #113

Merged
merged 8 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions hlink/scripts/lib/conf_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,11 @@ def check_comparison_features(config, columns_available):
raise ValueError(
"No [[comparison_features]] exist. Please add [[comparison_features]]."
)
duplicates = []
for c in comparison_features:
alias = c.get("alias")
if alias in comps or alias in columns_available:
duplicates.append(alias)
if alias is None:
raise ValueError(
f"No alias exists for a [[comparison_features]]: {c}. Please add an 'alias'."
Expand All @@ -181,7 +184,10 @@ def check_comparison_features(config, columns_available):
raise ValueError(
f"Within [[comparison_features]] the 'column_name' {cname} is not available from a previous [[column_mappings]] or [[feature_selections]]: {c}"
)

comps.append(alias)
if duplicates != []:
raise ValueError(f"Alias names are not unique: {duplicates}")
riley-harper marked this conversation as resolved.
Show resolved Hide resolved
return comps


Expand All @@ -203,6 +209,7 @@ def check_feature_selections(config, columns_available):
feature_selections = config.get("feature_selections")
if feature_selections is None:
return
duplicates = []
for f in feature_selections:
input_column = f.get("input_column")
output_column = f.get("output_column") or f.get("output_col")
Expand All @@ -219,7 +226,11 @@ def check_feature_selections(config, columns_available):
raise ValueError(
f"No 'output_column' or 'output_col' value for [[feature_selections]]: {f}"
)
if output_column in columns_available:
duplicates.append(output_column)
columns_available.append(output_column)
if duplicates != []:
raise ValueError(f"Output columns are not unique: {duplicates}")


def check_substitution_columns(config, columns_available):
Expand Down Expand Up @@ -258,6 +269,7 @@ def check_column_mappings(config, df_a, df_b):
if not column_mappings:
raise ValueError("No [[column_mappings]] exist in the conf file.")
columns_available = []
duplicates = []
for c in column_mappings:
alias = c.get("alias")
column_name = c.get("column_name")
Expand All @@ -279,10 +291,14 @@ def check_column_mappings(config, df_a, df_b):
raise ValueError(
f"Within a [[column_mappings]] the column_name: '{column_name}' does not exist in datasource_b and no previous [[column_mapping]] alias exists for it. Column mapping: {c}. Available columns: \n {df_b.columns}"
)
if column_name in columns_available or alias in columns_available:
duplicates.append(alias if alias else column_name)
riley-harper marked this conversation as resolved.
Show resolved Hide resolved
if alias:
columns_available.append(alias)
else:
columns_available.append(column_name)
if duplicates != []:
raise ValueError(f"Column names are not unique: {duplicates}")
return columns_available


Expand Down
14 changes: 14 additions & 0 deletions hlink/tests/conf/duplicate_col_maps.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
id_column = "ID"

[datasource_a]
file = "hlink/tests/input_data/conf_validation/a.csv"

[datasource_b]
file = "hlink/tests/input_data/conf_validation/b.csv"

[[column_mappings]]
column_name = "NAMEFRST"

## Duplicate column name, should throw error
[[column_mappings]]
column_name = "NAMEFRST"
36 changes: 36 additions & 0 deletions hlink/tests/conf/duplicate_comp_features.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
id_column = "ID"

[datasource_a]
file = "hlink/tests/input_data/conf_validation/a.csv"

[datasource_b]
file = "hlink/tests/input_data/conf_validation/b.csv"

[[column_mappings]]
column_name = "NAMEFRST"

[[column_mappings]]
column_name = "NAMELAST"

[[feature_selections]]
input_column = "NAMEFRST"
output_col = "namefrst_clean_bigrams"
transform = "bigrams"

[[feature_selections]]
input_column = "NAMELAST"
output_col = "namelast_clean_bigrams"
transform = "bigrams"

[[blocking]]
column_name = "NAMELAST"

[[comparison_features]]
alias = "bg_namefrst"
column_name = "namefrst_clean_bigrams"

## Duplicate alias, should throw error
[[comparison_features]]
alias = "bg_namefrst"
column_name = "namelast_clean_bigrams"

24 changes: 24 additions & 0 deletions hlink/tests/conf/duplicate_feature_sel.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
id_column = "ID"

[datasource_a]
file = "hlink/tests/input_data/conf_validation/a.csv"

[datasource_b]
file = "hlink/tests/input_data/conf_validation/b.csv"

[[column_mappings]]
column_name = "NAMEFRST"

[[column_mappings]]
column_name = "NAMELAST"

[[feature_selections]]
input_column = "NAMEFRST"
output_col = "namefrst_clean_bigrams"
transform = "bigrams"

## Duplicate output_col, should throw error
[[feature_selections]]
input_column = "NAMELAST"
output_col = "namefrst_clean_bigrams"
transform = "bigrams"
3 changes: 3 additions & 0 deletions hlink/tests/conf_validations_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
("missing_datasource_b", r"Section \[datasource_b\] does not exist in config"),
("no_id_column_a", "Datasource A is missing the id column 'ID'"),
("no_id_column_b", "Datasource B is missing the id column 'ID'"),
("duplicate_comp_features", "Alias names are not unique"),
("duplicate_feature_sel", "Output columns are not unique"),
("duplicate_col_maps", "Column names are not unique"),
riley-harper marked this conversation as resolved.
Show resolved Hide resolved
],
)
def test_invalid_conf(conf_dir_path, spark, conf_name, error_msg):
Expand Down