diff --git a/pythonCode/modules/extraction_text/BioBERT_extraction.py b/pythonCode/modules/extraction_text/BioBERT_extraction.py index 6ec58a07..e0458689 100644 --- a/pythonCode/modules/extraction_text/BioBERT_extraction.py +++ b/pythonCode/modules/extraction_text/BioBERT_extraction.py @@ -191,14 +191,12 @@ def generate_biobert_notes_embeddings(self, dataframe, identifiers_list, frequen df_row = pd.DataFrame(row).transpose() df_row_embeddings = pd.DataFrame( [self.get_biobert_embeddings_from_event_list(df_row[column_text])]) - # Insert time in the dataframe - df_row_embeddings.insert(0, column_time, df_row[column_time].item()) # Insert patient_id in the dataframe df_row_embeddings.insert(0, column_id, df_row[column_id].item()) df_notes_embeddings = pd.concat([df_notes_embeddings, df_row_embeddings], ignore_index=True) # Rename columns - col_number = len(df_notes_embeddings.columns) - 2 - df_notes_embeddings.columns = [column_id, column_time] + [column_prefix + str(i) for i in range(col_number)] + col_number = len(df_notes_embeddings.columns) - 1 + df_notes_embeddings.columns = [column_id] + [column_prefix + str(i) for i in range(col_number)] elif column_time != "": # Iterate over patients diff --git a/pythonCode/modules/extraction_text/to_master_BioBERT_extraction.py b/pythonCode/modules/extraction_text/to_master_BioBERT_extraction.py index 81c513e5..ceeccfcc 100644 --- a/pythonCode/modules/extraction_text/to_master_BioBERT_extraction.py +++ b/pythonCode/modules/extraction_text/to_master_BioBERT_extraction.py @@ -35,10 +35,8 @@ def _custom_process(self, json_config: dict) -> dict: """ go_print(json.dumps(json_config, indent=4)) - # Check if the process is necessary + # Get frequency frequency = json_config["relativeToExtractionType"]["frequency"] - if frequency != "Patient" and frequency != "Admission" and frequency != "HourRange": - return self.results # Initialize data extracted_data_file = json_config["csvResultsPath"] @@ -46,12 +44,13 @@ def _custom_process(self, json_config: dict) -> dict: selected_columns = json_config["relativeToExtractionType"]["selectedColumns"] # Set master table format depending on frequency (for notes there is nothing to do) - if frequency == "Patient": + if frequency == "Patient" or frequency == "Note": df_notes = pd.read_csv(json_config["csvPath"]) df_notes[selected_columns["time"]] = pd.to_datetime(df_notes[selected_columns["time"]]) df_notes = df_notes[[selected_columns["patientIdentifier"], selected_columns["time"]]] - idx_min_date = df_notes.groupby(selected_columns["patientIdentifier"])[selected_columns["time"]].idxmin() - df_notes = df_notes.loc[idx_min_date] + if frequency == "Patient": + idx_min_date = df_notes.groupby(selected_columns["patientIdentifier"])[selected_columns["time"]].idxmin() + df_notes = df_notes.loc[idx_min_date] df_tmp = extracted_data.merge(df_notes, on=[selected_columns["patientIdentifier"]]) extracted_data.insert(1, selected_columns["time"], df_tmp[selected_columns["time"]]) elif frequency == "Admission": diff --git a/pythonCode/modules/extraction_ts/TSfresh_extraction.py b/pythonCode/modules/extraction_ts/TSfresh_extraction.py index 8a8d6818..9dc6ed06 100644 --- a/pythonCode/modules/extraction_ts/TSfresh_extraction.py +++ b/pythonCode/modules/extraction_ts/TSfresh_extraction.py @@ -70,8 +70,6 @@ def generate_TSfresh_embeddings(self, dataframe, identifiers_list, frequency, co columns = list(df_patient_embeddings.columns) new_columns = [column_prefix + col for col in columns] df_patient_embeddings.columns = new_columns - # Insert time in the dataframe - df_patient_embeddings.insert(0, column_time, df_patient[column_time].iloc[0]) # Insert patient_id in the dataframe df_patient_embeddings.insert(0, column_id, patient_id) df_ts_embeddings = pd.concat([df_ts_embeddings, df_patient_embeddings], ignore_index=True) @@ -119,7 +117,7 @@ def generate_TSfresh_embeddings(self, dataframe, identifiers_list, frequency, co columns = list(df_time_embeddings.columns) new_columns = [column_prefix + col for col in columns] df_time_embeddings.columns = new_columns - # Insert time in the dataframe (only start_date if the dataframe must respect submaster table format) + # Insert time in the dataframe df_time_embeddings.insert(0, "end_date", end_date) df_time_embeddings.insert(0, "start_date", start_date) # Insert patient_id in the dataframe diff --git a/pythonCode/modules/extraction_ts/to_master_TSfresh_extraction.py b/pythonCode/modules/extraction_ts/to_master_TSfresh_extraction.py index a81aaed0..daa84502 100644 --- a/pythonCode/modules/extraction_ts/to_master_TSfresh_extraction.py +++ b/pythonCode/modules/extraction_ts/to_master_TSfresh_extraction.py @@ -1,3 +1,4 @@ +import dask.dataframe as dd import json import os import pandas as pd @@ -35,10 +36,8 @@ def _custom_process(self, json_config: dict) -> dict: """ go_print(json.dumps(json_config, indent=4)) - # Check if the process is necessary + # Get frequency frequency = json_config["relativeToExtractionType"]["frequency"] - if frequency != "Admission" and frequency != "HourRange": - return self.results # Initialize data extracted_data_file = json_config["csvResultsPath"] @@ -48,6 +47,14 @@ def _custom_process(self, json_config: dict) -> dict: # Set master table format depending on frequency (for notes there is nothing to do) if frequency == "Admission": extracted_data.drop(columns=[selected_columns["admissionIdentifier"]], inplace=True) + elif frequency == "Patient": + df_ts = pd.read_csv(json_config["csvPath"]) + df_ts[selected_columns["time"]] = pd.to_datetime(df_ts[selected_columns["time"]]) + df_ts = df_ts[[selected_columns["patientIdentifier"], selected_columns["time"]]] + idx_min_date = df_ts.groupby(selected_columns["patientIdentifier"])[selected_columns["time"]].idxmin() + df_ts = df_ts.loc[idx_min_date] + df_tmp = extracted_data.merge(df_ts, on=[selected_columns["patientIdentifier"]]) + extracted_data.insert(1, selected_columns["time"], df_tmp[selected_columns["time"]]) elif frequency == "HourRange": extracted_data.drop(columns=["end_date"], inplace=True) diff --git a/renderer/components/extractionTabular/extractionTypes/extractionBioBERT.jsx b/renderer/components/extractionTabular/extractionTypes/extractionBioBERT.jsx index 15e93444..ec5ce7da 100644 --- a/renderer/components/extractionTabular/extractionTypes/extractionBioBERT.jsx +++ b/renderer/components/extractionTabular/extractionTypes/extractionBioBERT.jsx @@ -126,7 +126,10 @@ const ExtractionBioBERT = ({ dataframe, setExtractionJsonData, setMayProceed }) * */ useEffect(() => { - if (frequency == "Patient") { + if (frequency == "Patient" && masterTableCompatible) { + setMayProceed(biobertPath && selectedColumns.patientIdentifier !== "" && selectedColumns.notes !== "" && selectedColumns.time !== "") + setExtractionJsonData({ biobertPath: biobertPath, selectedColumns: selectedColumns, columnPrefix: columnPrefix, frequency: frequency, masterTableCompatible: masterTableCompatible }) + } else if (frequency == "Patient") { setMayProceed(biobertPath && selectedColumns.patientIdentifier !== "" && selectedColumns.notes !== "") setExtractionJsonData({ biobertPath: biobertPath, selectedColumns: selectedColumns, columnPrefix: columnPrefix, frequency: frequency, masterTableCompatible: masterTableCompatible }) } else if (frequency == "Admission") { @@ -135,9 +138,12 @@ const ExtractionBioBERT = ({ dataframe, setExtractionJsonData, setMayProceed }) } else if (frequency == "HourRange") { setMayProceed(biobertPath && selectedColumns.patientIdentifier !== "" && selectedColumns.notes !== "" && selectedColumns.time !== "") setExtractionJsonData({ biobertPath: biobertPath, selectedColumns: selectedColumns, columnPrefix: columnPrefix, frequency: frequency, hourRange: hourRange, masterTableCompatible: masterTableCompatible }) - } else if (frequency == "Note") { + } else if (frequency == "Note" && masterTableCompatible) { setMayProceed(biobertPath && selectedColumns.patientIdentifier !== "" && selectedColumns.notes !== "" && selectedColumns.time !== "") setExtractionJsonData({ biobertPath: biobertPath, selectedColumns: selectedColumns, columnPrefix: columnPrefix, frequency: frequency, masterTableCompatible: masterTableCompatible }) + } else if (frequency == "Note") { + setMayProceed(biobertPath && selectedColumns.patientIdentifier !== "" && selectedColumns.notes !== "") + setExtractionJsonData({ biobertPath: biobertPath, selectedColumns: selectedColumns, columnPrefix: columnPrefix, frequency: frequency, masterTableCompatible: masterTableCompatible }) } }, [selectedColumns, frequency, hourRange, masterTableCompatible, columnPrefix]) @@ -199,7 +205,7 @@ const ExtractionBioBERT = ({ dataframe, setExtractionJsonData, setMayProceed })