This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Write classification model output to a file for later use in reports. #451

Merged · 6 commits · May 4, 2021
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -61,6 +61,8 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
- ([#445](https://github.com/microsoft/InnerEye-DeepLearning/pull/445)) Adding test coverage for the `HelloContainer`
model with multiple GPUs
- ([#450](https://github.com/microsoft/InnerEye-DeepLearning/pull/450)) Adds the metric "Accuracy at threshold 0.5" to the classification report (`classification_crossval_report.ipynb`).
- ([#451](https://github.com/microsoft/InnerEye-DeepLearning/pull/451)) Write a file `model_outputs.csv` with columns
`subject`, `prediction_target`, `label`, `model_output` and `cross_validation_split_index`. This file is not written out for sequence models.

### Changed

@@ -77,6 +79,8 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
- ([#437](https://github.com/microsoft/InnerEye-DeepLearning/pull/437)) Upgrade to PyTorch-Lightning 1.2.8.
- ([#439](https://github.com/microsoft/InnerEye-DeepLearning/pull/439)) Recovery checkpoints are now
named `recovery_epoch=x.ckpt` instead of `recovery.ckpt` or `recovery-v0.ckpt`.
- ([#451](https://github.com/microsoft/InnerEye-DeepLearning/pull/451)) Change the signature for function `generate_custom_report`
in `ModelConfigBase` to take only the path to the reports folder and a `ModelProcessing` object.

### Fixed

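The changelog entry above introduces a per-subject `model_outputs.csv` for later use in reports. Below is a minimal sketch of how such a file could be consumed, assuming only the column names listed in the entry; the file path and the per-target AUC summary are illustrative and not part of this PR.

```python
from pathlib import Path

import pandas as pd
from sklearn.metrics import roc_auc_score

# Hypothetical location of the per-subject outputs written during model testing.
model_outputs_path = Path("outputs/best_validation_epoch/Test/model_outputs.csv")

df = pd.read_csv(model_outputs_path)

# One row per subject and prediction target; summarise each target separately.
for target, per_target in df.groupby("prediction_target"):
    auc = roc_auc_score(per_target["label"], per_target["model_output"])
    print(f"{target}: n={len(per_target)}, AUC={auc:.3f}")
```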
8 changes: 3 additions & 5 deletions InnerEye/ML/model_config_base.py
@@ -13,6 +13,7 @@
from pandas import DataFrame

from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY
from InnerEye.Common.common_util import ModelProcessing
from InnerEye.Common.metrics_constants import TrackedMetrics
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode, STORED_CSV_FILE_NAMES
from InnerEye.ML.deep_learning_config import DeepLearningConfig
@@ -248,16 +249,13 @@ def set_derived_model_properties(self, model: Any) -> None:
"""
pass

def generate_custom_report(self, report_dir: Path, train_metrics: Path, val_metrics: Path,
test_metrics: Path) -> Path:
def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path:
"""
Enables creating a custom results report, given the metrics files written during model training and inference.
By default, this method is a no-op.

:param report_dir: The output directory where the generated report should be saved.
:param train_metrics: The CSV file with training metrics.
:param val_metrics: The CSV file with validation metrics.
:param test_metrics: The CSV file with test metrics.
    :param model_proc: The type of model that is registered (single or ensemble).
:return: The path to the generated report file.
"""
pass
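With the signature change above, a model configuration that wants a custom report now receives only the report directory and a `ModelProcessing` value. The following is a rough sketch of what an override could look like under that assumption; `MyClassificationConfig`, the report file name, and the `ScalarModelBase` import path are illustrative and not taken from this PR.

```python
from pathlib import Path

from InnerEye.Common.common_util import ModelProcessing
from InnerEye.ML.scalar_config import ScalarModelBase


class MyClassificationConfig(ScalarModelBase):
    """Hypothetical config that writes a trivial text report instead of a notebook."""

    def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path:
        report_file = report_dir / "custom_report.txt"
        # A real override would typically read metrics files from the outputs folder and
        # render a notebook; here we only record which kind of model was processed.
        report_file.write_text(f"Custom report for {type(self).__name__}, model processing: {model_proc}\n")
        return report_file
```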
142 changes: 75 additions & 67 deletions InnerEye/ML/model_testing.py
@@ -17,7 +17,7 @@
from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME, METRICS_AGGREGATES_FILE, ModelProcessing, \
SUBJECT_METRICS_FILE_NAME, get_best_epoch_results_path, is_linux, logging_section
from InnerEye.Common.fixed_paths import DEFAULT_RESULT_IMAGE_NAME
from InnerEye.Common.metrics_constants import MetricType, MetricsFileColumns
from InnerEye.Common.metrics_constants import MetricType, MetricsFileColumns, LoggingColumns
from InnerEye.ML import metrics, plotting
from InnerEye.ML.common import ModelExecutionMode, STORED_CSV_FILE_NAMES
from InnerEye.ML.config import DATASET_ID_FILE, GROUND_TRUTH_IDS_FILE, IMAGE_CHANNEL_IDS_FILE, SegmentationModelBase
@@ -42,6 +42,7 @@

BOXPLOT_FILE = "metrics_boxplot.png"
THUMBNAILS_FOLDER = "thumbnails"
MODEL_OUTPUT_CSV = "model_outputs.csv"


def model_test(config: ModelConfigBase,
@@ -409,72 +410,79 @@ def classification_model_test(config: ScalarModelBase,
"""
posthoc_label_transform = config.get_posthoc_label_transform()

def test_epoch(checkpoint_paths: List[Path]) -> Optional[MetricsDict]:
pipeline = create_inference_pipeline(config=config,
checkpoint_paths=checkpoint_paths)

if pipeline is None:
return None

# for mypy
assert isinstance(pipeline, ScalarInferencePipelineBase)

ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(
shuffle=False,
batch_size=1,
num_dataload_workers=0
)

logging.info(f"Starting to evaluate model on {data_split.value} set.")
metrics_dict = create_metrics_dict_for_scalar_models(config)
for sample in ds:
result = pipeline.predict(sample)
model_output = result.posteriors
label = result.labels.to(device=model_output.device)
label = posthoc_label_transform(label)
sample_id = result.subject_ids[0]
compute_scalar_metrics(metrics_dict,
subject_ids=[sample_id],
model_output=model_output,
labels=label,
loss_type=config.loss_type)
logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")

average = metrics_dict.average(across_hues=False)
logging.info(average.to_string())

return metrics_dict

checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test()

if not checkpoints_to_test:
checkpoint_paths = checkpoint_handler.get_checkpoints_to_test()
if not checkpoint_paths:
raise ValueError("There were no checkpoints available for model testing.")

result = test_epoch(checkpoint_paths=checkpoints_to_test)
if result is None:
raise ValueError("There was no single checkpoint file available for model testing.")
pipeline = create_inference_pipeline(config=config,
checkpoint_paths=checkpoint_paths)
if pipeline is None:
raise ValueError("Inference pipeline could not be created.")

# for mypy
assert isinstance(pipeline, ScalarInferencePipelineBase)

ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(
shuffle=False,
batch_size=1,
Contributor review comment on the `batch_size=1` line above: I think the batch size was set to 1 in the past and it is not related to this PR. However, if you think that inference metric computation takes long, we could do the inference, metric computation, and logging in batches as well.

num_dataload_workers=0
)

logging.info(f"Starting to evaluate model on {data_split.value} set.")
results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc)
os.makedirs(str(results_folder), exist_ok=True)
metrics_dict = create_metrics_dict_for_scalar_models(config)
if not isinstance(config, SequenceModelBase):
output_logger: Optional[DataframeLogger] = DataframeLogger(csv_path=results_folder / MODEL_OUTPUT_CSV)
else:
if isinstance(result, ScalarMetricsDict):
results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc)
csv_file = results_folder / SUBJECT_METRICS_FILE_NAME

logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")

# If we are running inference after a training run, the validation set metrics may have been written
# during train time. If this is not the case, or we are running on the test set, create the metrics
# file.
if not csv_file.exists():
os.makedirs(str(results_folder), exist_ok=False)
df_logger = DataframeLogger(csv_file)
            # For an ensemble model, use the default split index; otherwise record which fold produced this prediction
cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
else cross_val_split_index
result.store_metrics_per_subject(df_logger=df_logger,
mode=data_split,
cross_validation_split_index=cv_index,
epoch=BEST_EPOCH_FOLDER_NAME)
# write to disk
df_logger.flush()

return InferenceMetricsForClassification(metrics=result)
output_logger = None

for sample in ds:
result = pipeline.predict(sample)
model_output = result.posteriors
label = result.labels.to(device=model_output.device)
label = posthoc_label_transform(label)
sample_id = result.subject_ids[0]
if output_logger:
for i in range(len(config.target_names)):
output_logger.add_record({LoggingColumns.Patient.value: sample_id,
LoggingColumns.Hue.value: config.target_names[i],
LoggingColumns.Label.value: label[0][i].item(),
LoggingColumns.ModelOutput.value: model_output[0][i].item(),
LoggingColumns.CrossValidationSplitIndex.value: cross_val_split_index})

compute_scalar_metrics(metrics_dict,
subject_ids=[sample_id],
model_output=model_output,
labels=label,
loss_type=config.loss_type)
logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")

average = metrics_dict.average(across_hues=False)
logging.info(average.to_string())

if isinstance(metrics_dict, ScalarMetricsDict):
csv_file = results_folder / SUBJECT_METRICS_FILE_NAME

logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")

# If we are running inference after a training run, the validation set metrics may have been written
# during train time. If this is not the case, or we are running on the test set, create the metrics
# file.
if not csv_file.exists():
df_logger = DataframeLogger(csv_file)
            # For an ensemble model, use the default split index; otherwise record which fold produced this prediction
cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
else cross_val_split_index
metrics_dict.store_metrics_per_subject(df_logger=df_logger,
mode=data_split,
cross_validation_split_index=cv_index,
epoch=BEST_EPOCH_FOLDER_NAME)
# write to disk
df_logger.flush()

if output_logger:
output_logger.flush()

return InferenceMetricsForClassification(metrics=metrics_dict)
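The loop above adds one record per subject and per prediction target to a `DataframeLogger` and writes everything to `model_outputs.csv` in a single `flush()` at the end. The following is a simplified, self-contained sketch of that accumulate-then-flush pattern; `SimpleDataframeLogger` is a stand-in written for illustration, not the repository's `DataframeLogger`, and the subject IDs and values loosely mirror the test expectations further down, with an illustrative prediction target name.

```python
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd


class SimpleDataframeLogger:
    """Illustrative stand-in for DataframeLogger: collect dict records, write one CSV on flush."""

    def __init__(self, csv_path: Path) -> None:
        self.csv_path = csv_path
        self.records: List[Dict[str, Any]] = []

    def add_record(self, record: Dict[str, Any]) -> None:
        self.records.append(record)

    def flush(self) -> None:
        self.csv_path.parent.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(self.records).to_csv(self.csv_path, index=False)


# Hypothetical usage mirroring the inference loop: one row per subject and prediction target.
logger = SimpleDataframeLogger(Path("outputs/model_outputs.csv"))
for subject, target, label, model_output in [("S2", "my_target", 1.0, 0.529399),
                                             ("S4", "my_target", 0.0, 0.521128)]:
    logger.add_record({"subject": subject,
                       "prediction_target": target,
                       "label": label,
                       "model_output": model_output,
                       "cross_validation_split_index": -1})
logger.flush()
```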
5 changes: 1 addition & 4 deletions InnerEye/ML/run_ml.py
@@ -889,10 +889,7 @@ def get_epoch_path(mode: ModelExecutionMode) -> Path:
else:
logging.info(f"Cannot create report for config of type {type(config)}.")

config.generate_custom_report(report_dir=reports_dir,
train_metrics=path_to_best_epoch_train,
val_metrics=path_to_best_epoch_val,
test_metrics=path_to_best_epoch_test)
config.generate_custom_report(report_dir=reports_dir, model_proc=model_proc)
except Exception as ex:
print_exception(ex, "Failed to generate reporting notebook.")
raise
10 changes: 9 additions & 1 deletion Tests/ML/models/test_scalar_model.py
@@ -139,7 +139,7 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
3,S4,{class_name},0.521128,0,Train,-1
"""
check_log_file(metrics_path, metrics_expected, ignore_columns=[])
# Check log METRICS_FILE_NAME inside of the folder epoch_004/Train, which is written when we run model_test.
# Check log METRICS_FILE_NAME inside of the folder best_validation_epoch/Train, which is written when we run model_test.
# Normally, we would run it on the Test and Val splits, but for convenience we test on the train split here.
inference_metrics_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
SUBJECT_METRICS_FILE_NAME
@@ -150,6 +150,14 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
"""
check_log_file(inference_metrics_path, inference_metrics_expected, ignore_columns=[])

inference_model_output_path = config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TRAIN) / \
model_testing.MODEL_OUTPUT_CSV
inference_model_output_expected = \
f"""subject,prediction_target,label,model_output,cross_validation_split_index
S2,{class_name},1.000000,0.529399,-1
S4,{class_name},0.000000,0.521128,-1"""
check_log_file(inference_model_output_path, inference_model_output_expected, ignore_columns=[])
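
The implementation of `check_log_file` is not part of this diff. As a rough illustration of the kind of comparison performed here, the helper below (a sketch, not the repository's implementation) parses both the written file and the expected text with pandas and compares them, rounding floats so that formatting differences do not fail the check.

```python
from io import StringIO
from pathlib import Path

import pandas as pd


def assert_csv_matches(csv_path: Path, expected_csv: str) -> None:
    # Illustrative helper, not check_log_file itself: compare the written CSV against an
    # expected CSV given as a string, tolerating dtype and float formatting differences.
    actual = pd.read_csv(csv_path).round(6)
    expected = pd.read_csv(StringIO(expected_csv)).round(6)
    pd.testing.assert_frame_equal(actual, expected, check_dtype=False)
```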


@pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
@pytest.mark.cpu_and_gpu