Skip to content

Commit

Permalink
add casing support for status col
Browse files Browse the repository at this point in the history
  • Loading branch information
rxu17 committed Jan 24, 2025
1 parent 0cd8f8c commit 458376b
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 6 deletions.
16 changes: 11 additions & 5 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,18 +683,24 @@ def store_gene_panel_files(


def filter_out_germline_variants(
    input_data: pd.DataFrame, status_col_str: str = "SV_STATUS"
) -> pd.DataFrame:
    """Filters out germline variants given a status col str. Genie pipeline
    cannot have any of these variants.

    NOTE: We have to search for the status column because there's no column
    name validation in the release steps so the status column may have
    different casing (e.g. ``SV_Status`` vs ``SV_STATUS``).

    Args:
        input_data (pd.DataFrame): input data with germline variants to
            filter out
        status_col_str (str): case-insensitive search string for the status
            column of the data. Defaults to "SV_STATUS".

    Returns:
        pd.DataFrame: copy of the input without rows whose status is exactly
            "GERMLINE", re-indexed from 0

    Raises:
        KeyError: if no column matches ``status_col_str`` (ignoring case)
    """
    target = status_col_str.lower()
    # Take the first column whose name matches ignoring case. str() guards
    # against non-string column labels, which have no .lower().
    status_col = next(
        (col for col in input_data.columns if str(col).lower() == target),
        None,
    )
    if status_col is None:
        # Fail with a descriptive error instead of a bare IndexError from
        # indexing an empty list of matches.
        raise KeyError(
            f"No column matching '{status_col_str}' (case-insensitive) found "
            f"in input data. Columns: {list(input_data.columns)}"
        )
    # Only the status column NAME is matched case-insensitively; the value
    # comparison against "GERMLINE" stays exact, as before.
    return input_data[input_data[status_col] != "GERMLINE"].reset_index(drop=True)


Expand Down Expand Up @@ -750,7 +756,7 @@ def store_sv_files(
)

sv_df = sv_df[sv_df["SAMPLE_ID"].isin(keep_for_merged_consortium_samples)]
sv_df = filter_out_germline_variants(input_data=sv_df, status_col="SV_STATUS")
sv_df = filter_out_germline_variants(input_data=sv_df)
sv_df.rename(columns=transform._col_name_to_titlecase, inplace=True)
sv_text = process_functions.removePandasDfFloat(sv_df)
sv_path = os.path.join(GENIE_RELEASE_DIR, "data_sv.txt")
Expand Down
11 changes: 10 additions & 1 deletion tests/test_database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,17 @@ def test_store_assay_info_files(syn):
dict(SV_STATUS=["SOMATIC", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
),
),
(
pd.DataFrame(
dict(SV_Status=["GERMLINE", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
),
"SV_STATUS",
pd.DataFrame(
dict(SV_Status=["SOMATIC"], Sample_ID=["GENIE-2"])
),
),
],
ids=["all_germline", "some_germline", "no_germline"],
ids=["all_germline", "some_germline", "no_germline", "diff_status_col_case"],
)
def test_that_filter_out_germline_variants_returns_expected(
input_data, filter_col, expected_result
Expand Down

0 comments on commit 458376b

Please sign in to comment.