Bug fix for regression test (#496)
- The regression test code in #492 missed files stored in AzureML, because the wrong function was called in the main workflow.
- Cleaner folder structure for regression test results (see the layout sketch below).
ant0nsc authored Jun 21, 2021
1 parent 01c31ed commit be36e39
Showing 49 changed files with 86 additions and 108 deletions.
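
For orientation only (this sketch is not part of the commit), the folder layout implied by the new constants in InnerEye/ML/baselines_util.py is roughly the following; the file under OUTPUT is a hypothetical example:

    <regression_test_folder>/
        OUTPUT/                   # compared against the run's own outputs folder
            foo/bar/contents.txt  # hypothetical example file
        AZUREML_OUTPUT/           # compared against files stored in the AzureML run context
        AZUREML_PARENT_OUTPUT/    # compared against the parent (cross-validation) run context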
2 changes: 2 additions & 0 deletions .amlignore
@@ -3,6 +3,8 @@
.pytest_cache
.mypy_cache
__pycache__
azure-pipelines
.github
datasets
modelweights
outputs
2 changes: 1 addition & 1 deletion .idea/runConfigurations/Template__Run_ML_on_AzureML.xml

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -17,6 +17,7 @@ created.
jobs that run in AzureML.

### Changed
- ([#496](https://github.com/microsoft/InnerEye-DeepLearning/pull/496)) All plots are now saved as PNG, rather than JPG.

### Fixed

2 changes: 1 addition & 1 deletion InnerEye/Common/Statistics/wilcoxon_signed_rank_test.py
@@ -309,7 +309,7 @@ def main() -> None:
for line in lines:
print(line)
for basename, fig in plots.items():
fig.savefig(f"{basename}.jpg")
fig.savefig(f"{basename}.png")


if __name__ == "__main__":
68 changes: 33 additions & 35 deletions InnerEye/ML/baselines_util.py
@@ -29,10 +29,11 @@
from InnerEye.ML.visualizers.metrics_scatterplot import write_to_scatterplot_directory
from InnerEye.ML.visualizers.plot_cross_validation import convert_rows_for_comparisons, may_write_lines_to_file

REGRESSION_TEST_OUTPUT_FOLDER = "OUTPUT"
REGRESSION_TEST_AZUREML_FOLDER = "AZUREML_OUTPUT"
REGRESSION_TEST_AZUREML_PARENT_FOLDER = "AZUREML_PARENT_OUTPUT"
CONTENTS_MISMATCH = "Contents mismatch"
MISSING_FILE = "Missing file"
MISSING_FILE = "Missing"
TEXT_FILE_SUFFIXES = [".txt", ".csv", ".json", ".html", ".md"]


@@ -222,56 +223,46 @@ def print_lines(prefix: str, lines: List[str]) -> None:

def compare_folder_contents(expected_folder: Path,
actual_folder: Optional[Path] = None,
run: Optional[Run] = None) -> None:
run: Optional[Run] = None) -> List[str]:
"""
Compares a set of files in a folder, against files in either the other folder or files stored in the given
AzureML run. Each file that is present in the "expected" folder must be also present in the "actual" folder
(or the AzureML run), with exactly the same contents, in the same folder structure.
For example, if there is a file "<expected>/foo/bar/contents.txt", then there must also be a file
"<actual>/foo/bar/contents.txt"
If a file is missing, or does not have the expected contents, an exception is raised.
:param expected_folder: A folder with files that are expected to be present.
:param actual_folder: The output folder with the actually produced files.
:param run: An AzureML run
:return: A list of human readable error messages, with message and file path. If no errors are found, the list is
empty.
"""
logging.debug(f"Checking job output against expected files in folder {expected_folder}")
logging.debug(f"Current working directory: {Path.cwd()}")
messages = []
if not expected_folder.is_dir():
raise ValueError(f"Folder with expected files does not exist: {expected_folder}")
if run and is_offline_run_context(run):
logging.warning("Skipping file comparison because the given run context is an AzureML offline run.")
return
return []
files_in_run: List[str] = run.get_file_names() if run else []
temp_folder = Path(tempfile.mkdtemp()) if run else None
for file in expected_folder.rglob("*"):
# rglob also returns folders, skip those
if file.is_dir():
continue
logging.debug(f"Checking file {file}")
# All files stored in AzureML runs use Linux-style path
file_relative = file.relative_to(expected_folder).as_posix()
if str(file_relative).startswith(REGRESSION_TEST_AZUREML_FOLDER) or \
str(file_relative).startswith(REGRESSION_TEST_AZUREML_PARENT_FOLDER):
continue
actual_file: Optional[Path] = None
if actual_folder:
actual_file = actual_folder / file_relative
if not actual_file.is_file():
actual_file = None
elif temp_folder is not None and run is not None:
actual_file = temp_folder / file_relative
if file_relative in files_in_run:
actual_file = temp_folder / file_relative
run.download_file(name=str(file_relative), output_file_path=str(actual_file))
message = compare_files(expected=file, actual=actual_file) if actual_file else "Missing file"
else:
raise ValueError("One of the two arguments run, actual_folder must be provided.")
message = compare_files(expected=file, actual=actual_file) if actual_file.exists() else MISSING_FILE
if message:
logging.debug(f"Error: {message}")
messages.append(f"{message}: {file_relative}")
logging.info(f"File {file_relative}: {message or 'OK'}")
if temp_folder:
shutil.rmtree(temp_folder)
if messages:
raise ValueError(f"Some expected files were missing or did not have the expected contents:{os.linesep}"
f"{os.linesep.join(messages)}")
return messages
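
As a rough usage sketch (not part of the diff), a caller could consume the new return value like this; the folder paths are hypothetical:

    import os
    from pathlib import Path
    from InnerEye.ML.baselines_util import compare_folder_contents

    # Hypothetical locations of the expected and actual result folders
    expected = Path("RegressionTestResults/OUTPUT")
    actual = Path("outputs")
    messages = compare_folder_contents(expected, actual_folder=actual)
    if messages:
        # Mirrors how compare_folders_and_run_outputs reports the collected problems
        raise ValueError(os.linesep.join(messages))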


def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
@@ -286,17 +277,24 @@ def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
"""
if not expected.is_dir():
raise ValueError(f"Folder with expected files does not exist: {expected}")
# First compare the normal output files that the run produces
compare_folder_contents(expected, actual)
# Compare the set of files in the magic folder with the outputs stored in the run context
azureml_folder = expected / REGRESSION_TEST_AZUREML_FOLDER
if azureml_folder.is_dir():
compare_folder_contents(azureml_folder, run=RUN_CONTEXT)
# Compare the set of files in the magic folder with the outputs stored in the run context of the parent run
azureml_parent_folder = expected / REGRESSION_TEST_AZUREML_PARENT_FOLDER
if azureml_parent_folder.is_dir():
if PARENT_RUN_CONTEXT is None:
raise ValueError(f"The set of expected test results in {expected} contains a folder "
f"{REGRESSION_TEST_AZUREML_PARENT_FOLDER}, but the present run is not a cross-validation "
"child run")
compare_folder_contents(azureml_parent_folder, run=PARENT_RUN_CONTEXT)
logging.debug(f"Current working directory: {Path.cwd()}")
messages = []
for (subfolder, message_prefix, actual_folder, run_to_compare) in \
[(REGRESSION_TEST_OUTPUT_FOLDER, "run output files", actual, None),
(REGRESSION_TEST_AZUREML_FOLDER, "AzureML outputs in present run", None, RUN_CONTEXT),
(REGRESSION_TEST_AZUREML_PARENT_FOLDER, "AzureML outputs in parent run", None, PARENT_RUN_CONTEXT)]:
folder = expected / subfolder
if folder.is_dir():
logging.info(f"Comparing results in {folder} against {message_prefix}:")
if actual_folder is None and run_to_compare is None:
raise ValueError(f"The set of expected test results in {expected} contains a folder "
f"{subfolder}, but there is no (parent) run to compare against.")
new_messages = compare_folder_contents(folder, actual_folder=actual_folder, run=run_to_compare)
if new_messages:
messages.append(f"Issues in {message_prefix}:")
messages.extend(new_messages)
else:
logging.info(f"Folder {subfolder} not found, skipping comparison against {message_prefix}.")
if messages:
raise ValueError(f"Some expected files were missing or did not have the expected contents:{os.linesep}"
f"{os.linesep.join(messages)}")
10 changes: 6 additions & 4 deletions InnerEye/ML/run_ml.py
@@ -34,7 +34,7 @@
change_working_directory, get_best_epoch_results_path, is_windows, logging_section, logging_to_file, \
print_exception, remove_file_or_directory
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, LOG_FILE_NAME, PYTHON_ENVIRONMENT_NAME
from InnerEye.ML.baselines_util import compare_folder_contents
from InnerEye.ML.baselines_util import compare_folders_and_run_outputs
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, FINAL_ENSEMBLE_MODEL_FOLDER, \
@@ -411,8 +411,8 @@ def run(self) -> None:
# run context.
logging.info("Comparing the current results against stored results")
if self.is_normal_run_or_crossval_child_0():
compare_folder_contents(expected_folder=self.container.regression_test_folder,
actual_folder=self.container.outputs_folder)
compare_folders_and_run_outputs(expected=self.container.regression_test_folder,
actual=self.container.outputs_folder)
else:
logging.info("Skipping because this is not cross-validation child run 0.")

@@ -712,9 +712,11 @@ def copy_file(source: Path, destination_file: str) -> None:
else:
raise ValueError(f"Expected an absolute path to a checkpoint file, but got: {checkpoint}")
model_folder.mkdir(parents=True, exist_ok=True)
# For reproducibility of the files used in regression tests, checkpoint paths need to be sorted.
checkpoints_sorted = sorted(relative_checkpoint_paths)
model_inference_config = ModelInferenceConfig(model_name=self.container.model_name,
model_configs_namespace=self.config_namespace,
checkpoint_paths=relative_checkpoint_paths)
checkpoint_paths=checkpoints_sorted)
# Inference configuration must live in the root folder of the registered model
full_path_to_config = model_folder / fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME
full_path_to_config.write_text(model_inference_config.to_json(), encoding='utf-8') # type: ignore
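
A minimal sketch (not from the commit) of why the sorting matters: sorted() produces the same ordering regardless of how the checkpoints were discovered. The two paths are the ones visible in the expected-results JSON further down:

    relative_checkpoint_paths = ["checkpoints/best_checkpoint.ckpt",
                                 "checkpoints/OTHER_RUNS/1/best_checkpoint.ckpt"]
    # Lexicographic sort: uppercase "OTHER_RUNS" sorts before lowercase "best_checkpoint.ckpt"
    print(sorted(relative_checkpoint_paths))
    # ['checkpoints/OTHER_RUNS/1/best_checkpoint.ckpt', 'checkpoints/best_checkpoint.ckpt']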
4 changes: 2 additions & 2 deletions InnerEye/ML/visualizers/metrics_scatterplot.py
@@ -206,7 +206,7 @@ def to_dict(data: pd.DataFrame) -> Dict[str, Dict[str, float]]:

def write_to_scatterplot_directory(root_folder: Path, plots: Dict[str, plt.Figure]) -> None:
"""
Writes a file root_folder/scatterplots/basename.jpg for every plot in plots with key "basename".
Writes a file root_folder/scatterplots/basename.png for every plot in plots with key "basename".
:param root_folder: path to a folder
:param plots: dictionary from plot basenames to plots (plt.Figure objects)
"""
@@ -217,7 +217,7 @@ def write_to_scatterplot_directory(root_folder: Path, plots: Dict[str, plt.Figure]) -> None:
scatterplot_dir.mkdir(parents=True, exist_ok=True)
logging.info(f"There are {len(plots)} plots to write to {scatterplot_dir}")
for basename, fig in plots.items():
fig.savefig(scatterplot_dir / f"{basename}.jpg")
fig.savefig(scatterplot_dir / f"{basename}.png")


def main() -> None:
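
For illustration only (not part of the diff), writing a single hypothetical plot now produces a PNG file; the plot name is made up:

    from pathlib import Path
    import matplotlib.pyplot as plt
    from InnerEye.ML.visualizers.metrics_scatterplot import write_to_scatterplot_directory

    fig = plt.figure()  # hypothetical figure, normally produced by the cross-validation analysis
    write_to_scatterplot_directory(root_folder=Path("outputs"), plots={"dice_comparison": fig})
    # Expected output file: outputs/scatterplots/dice_comparison.png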
2 changes: 1 addition & 1 deletion InnerEye/ML/visualizers/plot_cross_validation.py
@@ -647,7 +647,7 @@ def plot_metrics(config: PlotCrossValidationConfig,

# save plot
suffix = f"_{sub_df_index}" if len(df_list) > 1 else ""
plot_dst = root / f"{metric_type}_{mode.value}_splits{suffix}.jpg"
plot_dst = root / f"{metric_type}_{mode.value}_splits{suffix}.png"
fig.savefig(plot_dst, bbox_inches='tight')
logging.info("Saved box-plots to: {}".format(plot_dst))


5 files were deleted (file names not rendered in this view).

@@ -1 +1 @@
{"model_name": "BasicModel2Epochs", "checkpoint_paths": ["checkpoints/best_checkpoint.ckpt", "checkpoints/OTHER_RUNS/1/best_checkpoint.ckpt"], "model_configs_namespace": "InnerEye.ML.configs.segmentation.BasicModel2Epochs"}
{"model_name": "BasicModel2Epochs", "checkpoint_paths": ["checkpoints/OTHER_RUNS/1/best_checkpoint.ckpt", "checkpoints/best_checkpoint.ckpt"], "model_configs_namespace": "InnerEye.ML.configs.segmentation.BasicModel2Epochs"}