diff --git a/CHANGELOG.md b/CHANGELOG.md index c533eb00a..61e46ce3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ created. ## Upcoming ### Added + +- ([#446](https://github.com/microsoft/InnerEye-DeepLearning/pull/446)) Guarding `save_outliers` so that it works when +institution id and series id columns are missing. - ([#441](https://github.com/microsoft/InnerEye-DeepLearning/pull/441)) Add script to move models from one AzureML workspace to another: `python InnerEye/Scripts/move_model.py` - ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Added a generic way of adding PyTorch Lightning models to the toolbox. It is now possible to train almost any Lightning model with the InnerEye toolbox in AzureML, diff --git a/InnerEye/ML/visualizers/plot_cross_validation.py b/InnerEye/ML/visualizers/plot_cross_validation.py index ab83d5fb0..320aa2817 100644 --- a/InnerEye/ML/visualizers/plot_cross_validation.py +++ b/InnerEye/ML/visualizers/plot_cross_validation.py @@ -676,14 +676,17 @@ def save_outliers(config: PlotCrossValidationConfig, f.write(f"\n\n=== METRIC: {metric_type} ===\n\n") if len(outliers) > 0: - outliers_summary = str(outliers.groupby( - [MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value, - CSV_SERIES_HEADER, CSV_INSTITUTION_HEADER]) + # If running inside institution there may be no CSV_SERIES_HEADER and CSV_INSTITUTION_HEADER columns + groupby_columns = [MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value] + if CSV_SERIES_HEADER in outliers.columns and CSV_INSTITUTION_HEADER in outliers.columns: + groupby_columns += [CSV_SERIES_HEADER, CSV_INSTITUTION_HEADER] + outliers_summary = str(outliers.groupby(groupby_columns) .describe()[metric_type][stats_columns] .sort_values(stats_columns, ascending=False)) f.write(outliers_summary) - f.write("\n\n") - f.write(create_portal_query_for_outliers(outliers)) + if CSV_INSTITUTION_HEADER in outliers.columns and CSV_SERIES_HEADER in outliers.columns: + 
f.write("\n\n") + f.write(create_portal_query_for_outliers(outliers)) else: f.write("No outliers found") @@ -693,7 +696,11 @@ def save_outliers(config: PlotCrossValidationConfig, def create_portal_query_for_outliers(df: pd.DataFrame) -> str: """ Create a portal query string as a conjunction of the disjunctions of the unique InstitutionId and seriesId values. + + The passed data frame must have CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER columns """ + if CSV_INSTITUTION_HEADER not in df.columns or CSV_SERIES_HEADER not in df.columns: + raise ValueError(f"Data frame must have columns {CSV_INSTITUTION_HEADER} and {CSV_SERIES_HEADER}") return PORTAL_QUERY_TEMPLATE.format( " OR ".join(map(lambda x: 'r.InstitutionId = "{}"'.format(x), df[CSV_INSTITUTION_HEADER].unique())), " OR ".join(map(lambda x: 'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"{}")'.format(x), diff --git a/Tests/ML/test_data/Val_outliers_pruned.txt b/Tests/ML/test_data/Val_outliers_pruned.txt new file mode 100644 index 000000000..2a28a5220 --- /dev/null +++ b/Tests/ML/test_data/Val_outliers_pruned.txt @@ -0,0 +1,56 @@ + + +=== METRIC: Dice === + + count mean min max +Patient Structure +341 rectum 1.0 1.1 1.1 1.1 +366 rectum 1.0 0.9 0.9 0.9 +411 rectum 1.0 0.9 0.9 0.9 +306 rectum 1.0 0.8 0.8 0.8 +344 rectum 1.0 0.8 0.8 0.8 +365 rectum 1.0 0.7 0.7 0.7 +343 rectum 1.0 0.4 0.4 0.4 +409 rectum 1.0 0.4 0.4 0.4 +320 rectum 1.0 0.3 0.3 0.3 +374 rectum 1.0 0.2 0.2 0.2 +341 femur_r 1.0 0.1 0.1 0.1 +366 seminalvesicles 1.0 0.1 0.1 0.1 +306 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +320 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +341 bladder 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +343 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +344 bladder 1.0 0.0 0.0 0.0 
+ femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +365 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +366 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 +374 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +409 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 +411 bladder 1.0 0.0 0.0 0.0 + femur_r 1.0 0.0 0.0 0.0 + prostate 1.0 0.0 0.0 0.0 + seminalvesicles 1.0 0.0 0.0 0.0 \ No newline at end of file diff --git a/Tests/ML/visualizers/test_plot_cross_validation.py b/Tests/ML/visualizers/test_plot_cross_validation.py index 0f0b27f8b..42654ba7b 100644 --- a/Tests/ML/visualizers/test_plot_cross_validation.py +++ b/Tests/ML/visualizers/test_plot_cross_validation.py @@ -7,6 +7,7 @@ import pandas as pd import pytest +from pytest import raises from azureml.core import Run from pandas.core.dtypes.common import is_string_dtype @@ -275,9 +276,15 @@ def test_save_outliers(test_config: PlotCrossValidationConfig, assert test_config.run_recovery_id dataset_split_metrics = {x: _get_metrics_df(test_config.run_recovery_id, x) for x in [ModelExecutionMode.VAL]} save_outliers(test_config, dataset_split_metrics, test_config.outputs_directory) - f = f"{ModelExecutionMode.VAL.value}_outliers.txt" - assert_text_files_match(full_file=test_config.outputs_directory / f, - expected_file=full_ml_test_data_path(f)) + filename = f"{ModelExecutionMode.VAL.value}_outliers.txt" + assert_text_files_match(full_file=test_config.outputs_directory / filename, expected_file=full_ml_test_data_path(filename)) + # Now test without the CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER columns, which will be missing in institutions' environments + dataset_split_metrics_pruned = { + x: _get_metrics_df(test_config.run_recovery_id, 
x).drop(columns=[CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER], errors="ignore") + for x in [ModelExecutionMode.VAL]} + save_outliers(test_config, dataset_split_metrics_pruned, test_config.outputs_directory) + test_data_filename = f"{ModelExecutionMode.VAL.value}_outliers_pruned.txt" + assert_text_files_match(full_file=test_config.outputs_directory / filename, expected_file=full_ml_test_data_path(test_data_filename)) def test_create_portal_query_for_outliers() -> None: @@ -289,8 +296,17 @@ def test_create_portal_query_for_outliers() -> None: expected = PORTAL_QUERY_TEMPLATE.format('r.InstitutionId = "0" OR r.InstitutionId = "1"', 'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"3") OR ' 'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"4")') - assert expected == create_portal_query_for_outliers(test_df) + with raises(ValueError) as institution_column_missing_error: + test_df_pruned = test_df.drop(columns=[CSV_INSTITUTION_HEADER]) + create_portal_query_for_outliers(test_df_pruned) + error_message = str(institution_column_missing_error.value) + assert CSV_INSTITUTION_HEADER in error_message + with raises(ValueError) as series_column_missing_error: + test_df_pruned = test_df.drop(columns=[CSV_SERIES_HEADER]) + create_portal_query_for_outliers(test_df_pruned) + error_message = str(series_column_missing_error.value) + assert CSV_SERIES_HEADER in error_message def test_create_summary(test_output_dirs: OutputFolderForTests) -> None: