From b24dcaf86c9a8bfdefbe31acddcce1f2143449d4 Mon Sep 17 00:00:00 2001 From: Thomas Druez Date: Tue, 31 Aug 2021 12:06:15 +0400 Subject: [PATCH] Remove the run_extractcode pipe in favor of extractcode API #312 Signed-off-by: Thomas Druez --- CHANGELOG.rst | 5 +++ docs/custom-pipelines.rst | 2 +- docs/tutorial-2.rst | 2 +- scanpipe/pipelines/scan_codebase.py | 26 +++++++-------- scanpipe/pipes/scancode.py | 51 +++++++++++++++-------------- scanpipe/tests/test_models.py | 8 ++--- scanpipe/tests/test_pipes.py | 6 ---- 7 files changed, 51 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f8531ca7d..8b048f646 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,12 +3,17 @@ ### Unreleased +- Remove the run_extractcode pipe in favor of extractcode API. + https://github.com/nexB/scancode.io/issues/312 + - The `scancode.run_scancode` pipe now uses an optimal number of available CPUs for multiprocessing by default. The exact number of parallel processes available to ScanCode.io can be defined using the SCANCODEIO_PROCESSES setting. + https://github.com/nexB/scancode.io/issues/302 - Renamed the SCANCODE_DEFAULT_OPTIONS setting to SCANCODE_TOOLKIT_CLI_OPTIONS. + https://github.com/nexB/scancode.io/issues/302 - Log the outputs of run_scancode as progress indication. https://github.com/nexB/scancode.io/issues/300 diff --git a/docs/custom-pipelines.rst b/docs/custom-pipelines.rst index 3c0662a73..bfbe798ff 100644 --- a/docs/custom-pipelines.rst +++ b/docs/custom-pipelines.rst @@ -75,7 +75,7 @@ ones, or remove any of them. return ( # Original steps from the ScanCodebase pipeline cls.copy_inputs_to_codebase_directory, - cls.run_extractcode, + cls.extract_archives, cls.run_scancode, cls.build_inventory_from_scan, diff --git a/docs/tutorial-2.rst b/docs/tutorial-2.rst index 57abf0287..5dae9e251 100644 --- a/docs/tutorial-2.rst +++ b/docs/tutorial-2.rst @@ -86,7 +86,7 @@ Instructions 2021-07-12 17:45:53.85 Pipeline [scan_codebase] starting 2021-07-12 17:45:53.85 Step [copy_inputs_to_codebase_directory] starting 2021-07-12 17:45:53.86 Step [copy_inputs_to_codebase_directory] completed in 0.00 seconds - 2021-07-12 17:45:53.86 Step [run_extractcode] starting + 2021-07-12 17:45:53.86 Step [extract_archives] starting [...] 2021-07-12 17:46:01.61 Pipeline completed diff --git a/scanpipe/pipelines/scan_codebase.py b/scanpipe/pipelines/scan_codebase.py index da5aecb78..a5aba25a7 100644 --- a/scanpipe/pipelines/scan_codebase.py +++ b/scanpipe/pipelines/scan_codebase.py @@ -40,16 +40,15 @@ class ScanCodebase(Pipeline): def steps(cls): return ( cls.copy_inputs_to_codebase_directory, - cls.run_extractcode, + cls.extract_archives, cls.run_scancode, cls.build_inventory_from_scan, cls.csv_output, ) - extractcode_options = [ - "--shallow", - "--all-formats", - ] + # Set to True to extract recursively nested archives in archives. + extract_recursively = False + scancode_options = [ "--copyright", "--email", @@ -67,16 +66,17 @@ def copy_inputs_to_codebase_directory(self): """ copy_inputs(self.project.inputs(), self.project.codebase_path) - def run_extractcode(self): + def extract_archives(self): """ - Extracts with extractcode. + Extracts archives with extractcode. """ - with self.save_errors(scancode.ScancodeError): - scancode.run_extractcode( - location=str(self.project.codebase_path), - options=self.extractcode_options, - raise_on_error=True, - ) + extract_errors = scancode.extract_archives( + location=self.project.codebase_path, + recurse=self.extract_recursively, + ) + + if extract_errors: + self.add_error("\n".join(extract_errors)) def run_scancode(self): """ diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index 2533f57ae..6dfc374e5 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -37,6 +37,7 @@ from commoncode import fileutils from commoncode.resource import VirtualCodebase from extractcode import all_kinds +from extractcode import api as extractcode_api from extractcode.extract import extract_file from scancode import ScancodeError from scancode import Scanner @@ -73,7 +74,9 @@ def get_max_workers(keep_available): def extract(location, target): """ - Wraps the `extractcode.extract_file` to execute the extraction and return errors. + Extracts the file at `location` to the `target` and return errors. + + Wraps the `extractcode.extract_file` function. """ errors = [] @@ -84,6 +87,29 @@ def extract(location, target): return errors +def extract_archives(location, recurse, all_formats=True): + """ + Extracts all archives at `location` and return errors. + + Wraps the `extractcode.api.extract_archives` function. + + If `recurse` is True, extract nested archives-in-archives recursively. + If `all_formats` is True, extract all supported archives formats. + """ + options = { + "recurse": recurse, + "replace_originals": False, + "all_formats": all_formats, + } + + errors = [] + for event in extractcode_api.extract_archives(location, **options): + if event.done: + errors.extend(event.errors) + + return errors + + def get_resource_info(location): """ Returns a mapping suitable for the creation of a new CodebaseResource. @@ -285,29 +311,6 @@ def scan_for_application_packages(project): _scan_and_save(project, scan_for_package_info, save_scan_package_results) -def run_extractcode(location, options=None, raise_on_error=False): - """ - Extracts content at `location` with extractcode. - Optional arguments for the `extractcode` executable can be provided with the - `options` list. - If `raise_on_error` is enabled, a ScancodeError will be raised if the - exitcode is greater than 0. - """ - extractcode_args = [ - pipes.get_bin_executable("extractcode"), - shlex.quote(location), - ] - - if options: - extractcode_args.extend(options) - - exitcode, output = pipes.run_command(extractcode_args) - if exitcode > 0 and raise_on_error: - raise ScancodeError(output) - - return exitcode, output - - def run_scancode(location, output_file, options, raise_on_error=False): """ Scans the `location` content and write the results into an `output_file`. diff --git a/scanpipe/tests/test_models.py b/scanpipe/tests/test_models.py index af695e74e..8eef398cb 100644 --- a/scanpipe/tests/test_models.py +++ b/scanpipe/tests/test_models.py @@ -537,8 +537,8 @@ def test_scanpipe_run_model_profile_method(self): "2021-02-05 12:46:47.63 Step [copy_inputs_to_codebase_directory] starting\n" "2021-02-05 12:46:47.63 Step [copy_inputs_to_codebase_directory]" " completed in 0.00 seconds\n" - "2021-02-05 12:46:47.63 Step [run_extractcode] starting\n" - "2021-02-05 12:46:48.13 Step [run_extractcode] completed in 0.50 seconds\n" + "2021-02-05 12:46:47.63 Step [extract_archives] starting\n" + "2021-02-05 12:46:48.13 Step [extract_archives] completed in 0.50 seconds\n" "2021-02-05 12:46:48.14 Step [run_scancode] starting\n" "2021-02-05 12:46:52.59 Step [run_scancode] completed in 4.45 seconds\n" "2021-02-05 12:46:52.59 Step [build_inventory_from_scan] starting\n" @@ -558,7 +558,7 @@ def test_scanpipe_run_model_profile_method(self): "build_inventory_from_scan": 0.16, "copy_inputs_to_codebase_directory": 0.0, "csv_output": 0.06, - "run_extractcode": 0.5, + "extract_archives": 0.5, "run_scancode": 4.45, } self.assertEqual(expected, run1.profile()) @@ -569,7 +569,7 @@ def test_scanpipe_run_model_profile_method(self): expected = ( "copy_inputs_to_codebase_directory 0.0 seconds 0.0%\n" - "run_extractcode 0.5 seconds 9.7%\n" + "extract_archives 0.5 seconds 9.7%\n" "\x1b[41;37mrun_scancode 4.45 seconds 86.1%\x1b[m\n" "build_inventory_from_scan 0.16 seconds 3.1%\n" "csv_output 0.06 seconds 1.2%\n" diff --git a/scanpipe/tests/test_pipes.py b/scanpipe/tests/test_pipes.py index b5c4fe9cd..5f92ec1b4 100644 --- a/scanpipe/tests/test_pipes.py +++ b/scanpipe/tests/test_pipes.py @@ -494,12 +494,6 @@ def test_scanpipe_pipes_scancode_create_codebase_resources_inject_policy(self): } self.assertEqual(expected, resource2.licenses[0]["policy"]) - def test_scanpipe_pipes_scancode_run_extractcode(self): - project = Project.objects.create(name="name with space") - exitcode, output = scancode.run_extractcode(str(project.codebase_path)) - self.assertEqual(0, exitcode) - self.assertIn("Extracting done.", output) - def test_scanpipe_pipes_scancode_run_scancode(self): project = Project.objects.create(name="name with space") exitcode, output = scancode.run_scancode(