From 458376b2e4975ae361eacffdbd7ff388ca310d34 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Fri, 24 Jan 2025 02:08:48 -0800
Subject: [PATCH] add casing support for status col

---
 genie/database_to_staging.py      | 16 +++++++++++-----
 tests/test_database_to_staging.py | 11 ++++++++++-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
index 1e817d6a..4c1d3576 100644
--- a/genie/database_to_staging.py
+++ b/genie/database_to_staging.py
@@ -683,18 +683,24 @@ def store_gene_panel_files(
 
 
 def filter_out_germline_variants(
-    input_data: pd.DataFrame, status_col: str
+    input_data: pd.DataFrame, status_col_str: str
 ) -> pd.DataFrame:
-    """Filters out germline variants given a status col. Genie pipeline
-    cannot have any of these variants.
+    """Filters out germline variants given a status col str. Genie pipeline
+    cannot have any of these variants. NOTE: We have to search for the
+    status column because there's no column name validation in the release
+    steps so the status column may have different casing.
 
     Args:
         input_data (pd.DataFrame): input data with germline variants to filter out
-        status_col (str): status column for the data
+        status_col_str (str): search string for the status column for the data
 
     Returns:
         pd.DataFrame: filtered out germline variant data
     """
+    # find the status column case-insensitively (e.g. SV_Status vs SV_STATUS)
+    status_col = [
+        col for col in input_data.columns if col.lower() == status_col_str.lower()
+    ][0]
     return input_data[input_data[status_col] != "GERMLINE"].reset_index(drop=True)
 
 
@@ -750,7 +756,7 @@ def store_sv_files(
     )
 
     sv_df = sv_df[sv_df["SAMPLE_ID"].isin(keep_for_merged_consortium_samples)]
-    sv_df = filter_out_germline_variants(input_data=sv_df, status_col="SV_STATUS")
+    sv_df = filter_out_germline_variants(input_data=sv_df, status_col_str="SV_STATUS")
     sv_df.rename(columns=transform._col_name_to_titlecase, inplace=True)
     sv_text = process_functions.removePandasDfFloat(sv_df)
     sv_path = os.path.join(GENIE_RELEASE_DIR, "data_sv.txt")
diff --git a/tests/test_database_to_staging.py b/tests/test_database_to_staging.py
index 70d3c0c8..f1c22f7c 100644
--- a/tests/test_database_to_staging.py
+++ b/tests/test_database_to_staging.py
@@ -140,8 +140,17 @@ def test_store_assay_info_files(syn):
                 dict(SV_STATUS=["SOMATIC", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
             ),
         ),
+        (
+            pd.DataFrame(
+                dict(SV_Status=["GERMLINE", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
+            ),
+            "SV_STATUS",
+            pd.DataFrame(
+                dict(SV_Status=["SOMATIC"], Sample_ID=["GENIE-2"])
+            ),
+        ),
     ],
-    ids=["all_germline", "some_germline", "no_germline"],
+    ids=["all_germline", "some_germline", "no_germline", "diff_status_col_case"],
 )
 def test_that_filter_out_germline_variants_returns_expected(
     input_data, filter_col, expected_result