From 40cd369757b398170e2b0083df0855bbe7d39fa5 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Fri, 30 Apr 2021 13:19:00 +0100
Subject: [PATCH 1/5] Initial

---
 InnerEye/ML/model_config_base.py |  7 ++-----
 InnerEye/ML/model_testing.py     | 17 +++++++++++++++--
 InnerEye/ML/run_ml.py            |  5 +----
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/InnerEye/ML/model_config_base.py b/InnerEye/ML/model_config_base.py
index 745d248ba..47c660fc2 100644
--- a/InnerEye/ML/model_config_base.py
+++ b/InnerEye/ML/model_config_base.py
@@ -13,6 +13,7 @@
 from pandas import DataFrame
 
 from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY
+from InnerEye.Common.common_util import ModelProcessing
 from InnerEye.Common.metrics_constants import TrackedMetrics
 from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode, STORED_CSV_FILE_NAMES
 from InnerEye.ML.deep_learning_config import DeepLearningConfig
@@ -248,16 +249,12 @@ def set_derived_model_properties(self, model: Any) -> None:
         """
         pass
 
-    def generate_custom_report(self, report_dir: Path, train_metrics: Path, val_metrics: Path,
-                               test_metrics: Path) -> Path:
+    def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path:
         """
         Enables creating a custom results report, given the metrics files written during model training and inference.
         By default, this method is a no-op.
 
         :param report_dir: The output directory where the generated report should be saved.
-        :param train_metrics: The CSV file with training metrics.
-        :param val_metrics: The CSV file with validation metrics.
-        :param test_metrics: The CSV file with test metrics.
         :return: The path to the generated report file.
         """
         pass
diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py
index 7b7421f4b..f5b5f1223 100644
--- a/InnerEye/ML/model_testing.py
+++ b/InnerEye/ML/model_testing.py
@@ -17,7 +17,7 @@
 from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME, METRICS_AGGREGATES_FILE, ModelProcessing, \
     SUBJECT_METRICS_FILE_NAME, get_best_epoch_results_path, is_linux, logging_section
 from InnerEye.Common.fixed_paths import DEFAULT_RESULT_IMAGE_NAME
-from InnerEye.Common.metrics_constants import MetricType, MetricsFileColumns
+from InnerEye.Common.metrics_constants import MetricType, MetricsFileColumns, LoggingColumns
 from InnerEye.ML import metrics, plotting
 from InnerEye.ML.common import ModelExecutionMode, STORED_CSV_FILE_NAMES
 from InnerEye.ML.config import DATASET_ID_FILE, GROUND_TRUTH_IDS_FILE, IMAGE_CHANNEL_IDS_FILE, SegmentationModelBase
@@ -43,6 +43,7 @@ BOXPLOT_FILE = "metrics_boxplot.png"
 THUMBNAILS_FOLDER = "thumbnails"
 
+MODEL_OUTPUT_CSV = "model_outputs.csv"
 
 def model_test(config: ModelConfigBase,
                data_split: ModelExecutionMode,
                checkpoint_handler: CheckpointHandler,
@@ -428,12 +429,24 @@ def test_epoch(checkpoint_paths: List[Path]) -> Optional[MetricsDict]:
 
         logging.info(f"Starting to evaluate model on {data_split.value} set.")
         metrics_dict = create_metrics_dict_for_scalar_models(config)
+
+        results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc)
+        csv_file = results_folder / MODEL_OUTPUT_CSV
+        os.makedirs(str(results_folder), exist_ok=True)
+        with open(csv_file, "w") as f:
+            f.write(f"{LoggingColumns.Patient.value},{LoggingColumns.Hue.value},{LoggingColumns.Label.value},"
+                    f"{LoggingColumns.ModelOutput.value},{LoggingColumns.CrossValidationSplitIndex.value}\n")
+
         for sample in ds:
             result = pipeline.predict(sample)
             model_output = result.posteriors
             label = result.labels.to(device=model_output.device)
             label = posthoc_label_transform(label)
             sample_id = result.subject_ids[0]
+            with open(csv_file, "a") as f:
+                for i in range(len(config.target_names)):
+                    f.write(f"{sample_id},{config.target_names[i]},{label[0][i].item()},{model_output[0][i].item()},"
+                            f"{cross_val_split_index}\n")
             compute_scalar_metrics(metrics_dict,
                                    subject_ids=[sample_id],
                                    model_output=model_output,
@@ -465,7 +478,7 @@ def test_epoch(checkpoint_paths: List[Path]) -> Optional[MetricsDict]:
         # during train time. If this is not the case, or we are running on the test set, create the metrics
         # file.
         if not csv_file.exists():
-            os.makedirs(str(results_folder), exist_ok=False)
+            os.makedirs(str(results_folder), exist_ok=True)
             df_logger = DataframeLogger(csv_file)
             # For test if ensemble split should be default, else record which fold produced this prediction
             cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
                 else cross_val_split_index
diff --git a/InnerEye/ML/run_ml.py b/InnerEye/ML/run_ml.py
index 48ee4131c..efe747244 100644
--- a/InnerEye/ML/run_ml.py
+++ b/InnerEye/ML/run_ml.py
@@ -889,10 +889,7 @@ def get_epoch_path(mode: ModelExecutionMode) -> Path:
             else:
                 logging.info(f"Cannot create report for config of type {type(config)}.")
 
-            config.generate_custom_report(report_dir=reports_dir,
-                                          train_metrics=path_to_best_epoch_train,
-                                          val_metrics=path_to_best_epoch_val,
-                                          test_metrics=path_to_best_epoch_test)
+            config.generate_custom_report(report_dir=reports_dir, model_proc=model_proc)
         except Exception as ex:
             print_exception(ex, "Failed to generated reporting notebook.")
             raise

From 35e3a918e1259bb7a0a0a59a0fcac66bfd9908e6 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Tue, 4 May 2021 12:47:55 +0100
Subject: [PATCH 2/5] Refactor

---
 InnerEye/ML/model_config_base.py |   1 +
 InnerEye/ML/model_testing.py     | 151 +++++++++++++++----------------
 2 files changed, 74 insertions(+), 78 deletions(-)

diff --git a/InnerEye/ML/model_config_base.py b/InnerEye/ML/model_config_base.py
index 47c660fc2..ed483444e 100644
--- a/InnerEye/ML/model_config_base.py
+++ b/InnerEye/ML/model_config_base.py
@@ -255,6 +255,7 @@ def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing)
         By default, this method is a no-op.
 
         :param report_dir: The output directory where the generated report should be saved.
+        :param model_proc: The type of model that is registered (single or ensemble).
         :return: The path to the generated report file.
""" pass diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index f5b5f1223..e043f6fe9 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -42,9 +42,9 @@ BOXPLOT_FILE = "metrics_boxplot.png" THUMBNAILS_FOLDER = "thumbnails" - MODEL_OUTPUT_CSV = "model_outputs.csv" + def model_test(config: ModelConfigBase, data_split: ModelExecutionMode, checkpoint_handler: CheckpointHandler, @@ -410,84 +410,79 @@ def classification_model_test(config: ScalarModelBase, """ posthoc_label_transform = config.get_posthoc_label_transform() - def test_epoch(checkpoint_paths: List[Path]) -> Optional[MetricsDict]: - pipeline = create_inference_pipeline(config=config, - checkpoint_paths=checkpoint_paths) - - if pipeline is None: - return None - - # for mypy - assert isinstance(pipeline, ScalarInferencePipelineBase) - - ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing") - ds = config.get_torch_dataset_for_inference(data_split).as_data_loader( - shuffle=False, - batch_size=1, - num_dataload_workers=0 - ) - - logging.info(f"Starting to evaluate model on {data_split.value} set.") - metrics_dict = create_metrics_dict_for_scalar_models(config) - - results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc) - csv_file = results_folder / MODEL_OUTPUT_CSV - os.makedirs(str(results_folder), exist_ok=True) - with open(csv_file, "w") as f: - f.write(f"{LoggingColumns.Patient.value},{LoggingColumns.Hue.value},{LoggingColumns.Label.value}," - f"{LoggingColumns.ModelOutput.value},{LoggingColumns.CrossValidationSplitIndex.value}\n") - - for sample in ds: - result = pipeline.predict(sample) - model_output = result.posteriors - label = result.labels.to(device=model_output.device) - label = posthoc_label_transform(label) - sample_id = result.subject_ids[0] - with open(csv_file, "a") as f: - for i in range(len(config.target_names)): - f.write(f"{sample_id},{config.target_names[i]},{label[0][i].item()},{model_output[0][i].item()}," - f"{cross_val_split_index}\n") - compute_scalar_metrics(metrics_dict, - subject_ids=[sample_id], - model_output=model_output, - labels=label, - loss_type=config.loss_type) - logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}") - - average = metrics_dict.average(across_hues=False) - logging.info(average.to_string()) - - return metrics_dict - - checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test() - - if not checkpoints_to_test: + checkpoint_paths = checkpoint_handler.get_checkpoints_to_test() + if not checkpoint_paths: raise ValueError("There were no checkpoints available for model testing.") - result = test_epoch(checkpoint_paths=checkpoints_to_test) - if result is None: + pipeline = create_inference_pipeline(config=config, + checkpoint_paths=checkpoint_paths) + if pipeline is None: raise ValueError("There was no single checkpoint file available for model testing.") + + # for mypy + assert isinstance(pipeline, ScalarInferencePipelineBase) + + ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing") + ds = config.get_torch_dataset_for_inference(data_split).as_data_loader( + shuffle=False, + batch_size=1, + num_dataload_workers=0 + ) + + logging.info(f"Starting to evaluate model on {data_split.value} set.") + results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc) + os.makedirs(str(results_folder), exist_ok=True) + metrics_dict = create_metrics_dict_for_scalar_models(config) + if not isinstance(config, SequenceModelBase): 
+        output_logger: Optional[DataframeLogger] = DataframeLogger(csv_path=results_folder / MODEL_OUTPUT_CSV)
     else:
-        if isinstance(result, ScalarMetricsDict):
-            results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc)
-            csv_file = results_folder / SUBJECT_METRICS_FILE_NAME
-
-            logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")
-
-            # If we are running inference after a training run, the validation set metrics may have been written
-            # during train time. If this is not the case, or we are running on the test set, create the metrics
-            # file.
-            if not csv_file.exists():
-                os.makedirs(str(results_folder), exist_ok=True)
-                df_logger = DataframeLogger(csv_file)
-                # For test if ensemble split should be default, else record which fold produced this prediction
-                cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
-                    else cross_val_split_index
-                result.store_metrics_per_subject(df_logger=df_logger,
-                                                 mode=data_split,
-                                                 cross_validation_split_index=cv_index,
-                                                 epoch=BEST_EPOCH_FOLDER_NAME)
-                # write to disk
-                df_logger.flush()
-
-        return InferenceMetricsForClassification(metrics=result)
+        output_logger = None
+
+    for sample in ds:
+        result = pipeline.predict(sample)
+        model_output = result.posteriors
+        label = result.labels.to(device=model_output.device)
+        label = posthoc_label_transform(label)
+        sample_id = result.subject_ids[0]
+        if output_logger:
+            for i in range(len(config.target_names)):
+                output_logger.add_record({LoggingColumns.Patient.value: sample_id,
+                                          LoggingColumns.Hue.value: config.target_names[i],
+                                          LoggingColumns.Label.value: label[0][i].item(),
+                                          LoggingColumns.ModelOutput.value: model_output[0][i].item(),
+                                          LoggingColumns.CrossValidationSplitIndex.value: cross_val_split_index})
+
+        compute_scalar_metrics(metrics_dict,
+                               subject_ids=[sample_id],
+                               model_output=model_output,
+                               labels=label,
+                               loss_type=config.loss_type)
+        logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")
+
+    average = metrics_dict.average(across_hues=False)
+    logging.info(average.to_string())
+
+    if isinstance(metrics_dict, ScalarMetricsDict):
+        csv_file = results_folder / SUBJECT_METRICS_FILE_NAME
+
+        logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")
+
+        # If we are running inference after a training run, the validation set metrics may have been written
+        # during train time. If this is not the case, or we are running on the test set, create the metrics
+        # file.
+        if not csv_file.exists():
+            df_logger = DataframeLogger(csv_file)
+            # For test if ensemble split should be default, else record which fold produced this prediction
+            cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
+                else cross_val_split_index
+            metrics_dict.store_metrics_per_subject(df_logger=df_logger,
+                                                   mode=data_split,
+                                                   cross_validation_split_index=cv_index,
+                                                   epoch=BEST_EPOCH_FOLDER_NAME)
+            # write to disk
+            df_logger.flush()
+
+    if output_logger:
+        output_logger.flush()
+
+    return InferenceMetricsForClassification(metrics=metrics_dict)

From 7f41dffebdf3657db0d0535b35a53ea9bce65db2 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Tue, 4 May 2021 13:23:26 +0100
Subject: [PATCH 3/5] Add test

---
 Tests/ML/models/test_scalar_model.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/Tests/ML/models/test_scalar_model.py b/Tests/ML/models/test_scalar_model.py
index 2f709cd84..6194aa7ad 100644
--- a/Tests/ML/models/test_scalar_model.py
+++ b/Tests/ML/models/test_scalar_model.py
@@ -139,7 +139,7 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
     3,S4,{class_name},0.521128,0,Train,-1
     """
     check_log_file(metrics_path, metrics_expected, ignore_columns=[])
-    # Check log METRICS_FILE_NAME inside of the folder epoch_004/Train, which is written when we run model_test.
+    # Check log METRICS_FILE_NAME inside of the folder best_validation_epoch/Train, which is written when we run model_test.
     # Normally, we would run it on the Test and Val splits, but for convenience we test on the train split here.
     inference_metrics_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
                              SUBJECT_METRICS_FILE_NAME
@@ -150,6 +150,14 @@
     """
     check_log_file(inference_metrics_path, inference_metrics_expected, ignore_columns=[])
 
+    inference_model_output_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
+                                  model_testing.MODEL_OUTPUT_CSV
+    inference_model_output_expected = \
+        f"""subject,prediction_target,label,model_output,cross_validation_split_index
+S2,{class_name},1.000000,0.529399,-1
+S4,{class_name},0.000000,0.521128,-1"""
+    check_log_file(inference_model_output_path, inference_model_output_expected, ignore_columns=[])
+
 
 @pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
 @pytest.mark.cpu_and_gpu

From 2d09a4c78e0e57619bae4205f7a77c7b05039f6e Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Tue, 4 May 2021 13:26:53 +0100
Subject: [PATCH 4/5] Update CHANGELOG.md

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1e6807bd4..000215458 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -61,6 +61,8 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
 - ([#445](https://github.com/microsoft/InnerEye-DeepLearning/pull/445)) Adding test coverage for the `HelloContainer`
   model with multiple GPUs
 - ([#450](https://github.com/microsoft/InnerEye-DeepLearning/pull/450)) Adds the metric "Accuracy at threshold 0.5" to the classification report (`classification_crossval_report.ipynb`).
+- ([#451](https://github.com/microsoft/InnerEye-DeepLearning/pull/451)) Write a file `model_outputs.csv` with columns
+  `subject`, `prediction_target`, `label`, `model_output` and `cross_validation_split_index`. This file is not written out for sequence models.
 
 ### Changed
 

From 27a16ee046a15a365bf56f34cfa7a3f4ffe006a0 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Tue, 4 May 2021 13:49:26 +0100
Subject: [PATCH 5/5] Address PR comments

---
 CHANGELOG.md                 | 2 ++
 InnerEye/ML/model_testing.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 000215458..d97088508 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -79,6 +79,8 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
 - ([#437])(https://github.com/microsoft/InnerEye-DeepLearning/pull/437)) Upgrade to PyTorch-Lightning 1.2.8.
 - ([#439](https://github.com/microsoft/InnerEye-DeepLearning/pull/439)) Recovery checkpoints are now named
   `recovery_epoch=x.ckpt` instead of `recovery.ckpt` or `recovery-v0.ckpt`.
+- ([#451](https://github.com/microsoft/InnerEye-DeepLearning/pull/451)) Change the signature for function `generate_custom_report`
+  in `ModelConfigBase` to take only the path to the reports folder and a `ModelProcessing` object.
 
 ### Fixed
 
diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py
index e043f6fe9..f9f709321 100644
--- a/InnerEye/ML/model_testing.py
+++ b/InnerEye/ML/model_testing.py
@@ -417,7 +417,7 @@ def classification_model_test(config: ScalarModelBase,
     pipeline = create_inference_pipeline(config=config,
                                          checkpoint_paths=checkpoint_paths)
     if pipeline is None:
-        raise ValueError("There was no single checkpoint file available for model testing.")
+        raise ValueError("Inference pipeline could not be created.")
 
     # for mypy
     assert isinstance(pipeline, ScalarInferencePipelineBase)
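
Usage sketch (illustrative; not part of the patch series above): the new
`model_outputs.csv` written by `classification_model_test` contains one row per
subject and prediction target, with the columns pinned down by the test in
PATCH 3/5: `subject`, `prediction_target`, `label`, `model_output` and
`cross_validation_split_index`. Below is a minimal sketch of how such a file
could be consumed downstream, assuming pandas is installed; the function name,
the file path, and the 0.5 threshold (chosen to mirror the "Accuracy at
threshold 0.5" metric mentioned in the CHANGELOG) are hypothetical, not part
of the PR:

    import pandas as pd

    def accuracy_per_target(csv_path: str, threshold: float = 0.5) -> pd.Series:
        # Columns as written by model_testing: subject, prediction_target,
        # label, model_output, cross_validation_split_index.
        df = pd.read_csv(csv_path)
        # Binarize the posterior at the threshold and compare with the label.
        correct = (df["model_output"] >= threshold) == (df["label"] >= 0.5)
        # Average the per-row correctness within each prediction target.
        return correct.groupby(df["prediction_target"]).mean()

    # Path layout follows the test in PATCH 3/5 (best-epoch folder for the
    # TRAIN split); adjust to the actual outputs folder of the run.
    print(accuracy_per_target("outputs/best_validation_epoch/Train/model_outputs.csv"))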