Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Guard plot_cross_validation.save_outliers against missing institutionId and seriesId columns; confirm with unit test #446

Merged
merged 7 commits into from
Apr 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ created.
## Upcoming

### Added

- ([#446](https://github.com/microsoft/InnerEye-DeepLearning/pull/446)) Guard `save_outliers` so that it works when
the institution id and series id columns are missing.
- ([#441](https://github.com/microsoft/InnerEye-DeepLearning/pull/441)) Add script to move models from one AzureML workspace to another: `python InnerEye/Scripts/move_model.py`
- ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Added a generic way of adding PyTorch Lightning
models to the toolbox. It is now possible to train almost any Lightning model with the InnerEye toolbox in AzureML,
Expand Down
17 changes: 12 additions & 5 deletions InnerEye/ML/visualizers/plot_cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,14 +676,17 @@ def save_outliers(config: PlotCrossValidationConfig,

f.write(f"\n\n=== METRIC: {metric_type} ===\n\n")
if len(outliers) > 0:
outliers_summary = str(outliers.groupby(
[MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value,
CSV_SERIES_HEADER, CSV_INSTITUTION_HEADER])
# When running inside an institution's environment, the CSV_SERIES_HEADER and CSV_INSTITUTION_HEADER columns may be missing
groupby_columns = [MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value]
if CSV_SERIES_HEADER in outliers.columns and CSV_INSTITUTION_HEADER in outliers.columns:
groupby_columns += [CSV_SERIES_HEADER, CSV_INSTITUTION_HEADER]
outliers_summary = str(outliers.groupby(groupby_columns)
.describe()[metric_type][stats_columns]
.sort_values(stats_columns, ascending=False))
f.write(outliers_summary)
f.write("\n\n")
f.write(create_portal_query_for_outliers(outliers))
if CSV_INSTITUTION_HEADER in outliers.columns and CSV_SERIES_HEADER in outliers.columns:
f.write("\n\n")
f.write(create_portal_query_for_outliers(outliers))
else:
f.write("No outliers found")

Expand All @@ -693,7 +696,11 @@ def save_outliers(config: PlotCrossValidationConfig,
def create_portal_query_for_outliers(df: pd.DataFrame) -> str:
"""
Create a portal query string as a conjunction of the disjunctions of the unique InstitutionId and seriesId values.

The passed data frame must have both CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER columns; otherwise a ValueError is raised.
"""
if CSV_INSTITUTION_HEADER not in df.columns or CSV_SERIES_HEADER not in df.columns:
raise ValueError(f"Data frame must have columns {CSV_INSTITUTION_HEADER} and {CSV_SERIES_HEADER}")
return PORTAL_QUERY_TEMPLATE.format(
" OR ".join(map(lambda x: 'r.InstitutionId = "{}"'.format(x), df[CSV_INSTITUTION_HEADER].unique())),
" OR ".join(map(lambda x: 'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"{}")'.format(x),
Expand Down
56 changes: 56 additions & 0 deletions Tests/ML/test_data/Val_outliers_pruned.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@


=== METRIC: Dice ===

count mean min max
Patient Structure
341 rectum 1.0 1.1 1.1 1.1
366 rectum 1.0 0.9 0.9 0.9
411 rectum 1.0 0.9 0.9 0.9
306 rectum 1.0 0.8 0.8 0.8
344 rectum 1.0 0.8 0.8 0.8
365 rectum 1.0 0.7 0.7 0.7
343 rectum 1.0 0.4 0.4 0.4
409 rectum 1.0 0.4 0.4 0.4
320 rectum 1.0 0.3 0.3 0.3
374 rectum 1.0 0.2 0.2 0.2
341 femur_r 1.0 0.1 0.1 0.1
366 seminalvesicles 1.0 0.1 0.1 0.1
306 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
320 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
341 bladder 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
343 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
344 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
365 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
366 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
374 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
409 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
411 bladder 1.0 0.0 0.0 0.0
femur_r 1.0 0.0 0.0 0.0
prostate 1.0 0.0 0.0 0.0
seminalvesicles 1.0 0.0 0.0 0.0
24 changes: 20 additions & 4 deletions Tests/ML/visualizers/test_plot_cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pandas as pd
import pytest
from pytest import raises
from azureml.core import Run
from pandas.core.dtypes.common import is_string_dtype

Expand Down Expand Up @@ -275,9 +276,15 @@ def test_save_outliers(test_config: PlotCrossValidationConfig,
assert test_config.run_recovery_id
dataset_split_metrics = {x: _get_metrics_df(test_config.run_recovery_id, x) for x in [ModelExecutionMode.VAL]}
save_outliers(test_config, dataset_split_metrics, test_config.outputs_directory)
f = f"{ModelExecutionMode.VAL.value}_outliers.txt"
assert_text_files_match(full_file=test_config.outputs_directory / f,
expected_file=full_ml_test_data_path(f))
filename = f"{ModelExecutionMode.VAL.value}_outliers.txt"
assert_text_files_match(full_file=test_config.outputs_directory / filename, expected_file=full_ml_test_data_path(filename))
# Now test without the CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER columns, which will be missing in institutions' environments
dataset_split_metrics_pruned = {
x: _get_metrics_df(test_config.run_recovery_id, x).drop(columns=[CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER], errors="ignore")
for x in [ModelExecutionMode.VAL]}
save_outliers(test_config, dataset_split_metrics_pruned, test_config.outputs_directory)
test_data_filename = f"{ModelExecutionMode.VAL.value}_outliers_pruned.txt"
assert_text_files_match(full_file=test_config.outputs_directory / filename, expected_file=full_ml_test_data_path(test_data_filename))


def test_create_portal_query_for_outliers() -> None:
Expand All @@ -289,8 +296,17 @@ def test_create_portal_query_for_outliers() -> None:
expected = PORTAL_QUERY_TEMPLATE.format('r.InstitutionId = "0" OR r.InstitutionId = "1"',
'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"3") OR '
'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"4")')

assert expected == create_portal_query_for_outliers(test_df)
with raises(ValueError) as institution_column_missing_error:
test_df_pruned = test_df.drop(columns=[CSV_INSTITUTION_HEADER])
create_portal_query_for_outliers(test_df_pruned)
error_message = str(institution_column_missing_error.value)
assert CSV_INSTITUTION_HEADER in error_message
with raises(ValueError) as series_column_missing_error:
test_df_pruned = test_df.drop(columns=[CSV_SERIES_HEADER])
create_portal_query_for_outliers(test_df_pruned)
error_message = str(series_column_missing_error.value)
assert CSV_SERIES_HEADER in error_message


def test_create_summary(test_output_dirs: OutputFolderForTests) -> None:
Expand Down