diff --git a/scanpipe/pipelines/root_filesystems.py b/scanpipe/pipelines/root_filesystems.py index dce6ac610..e4e376cc4 100644 --- a/scanpipe/pipelines/root_filesystems.py +++ b/scanpipe/pipelines/root_filesystems.py @@ -60,7 +60,7 @@ def extract_input_files_to_codebase_directory(self): for input_file in input_files: extract_target = target_path / f"{input_file.name}-extract" - extract_errors = scancode.extract(input_file, extract_target) + extract_errors = scancode.extract_archive(input_file, extract_target) errors.extend(extract_errors) if errors: diff --git a/scanpipe/pipelines/scan_package.py b/scanpipe/pipelines/scan_package.py index 57d4d943b..242c34cec 100644 --- a/scanpipe/pipelines/scan_package.py +++ b/scanpipe/pipelines/scan_package.py @@ -27,7 +27,8 @@ from commoncode.hash import multi_checksums from scanpipe.pipelines.scan_codebase import ScanCodebase -from scanpipe.pipes import scancode +from scanpipe.pipes.scancode import extract_archive +from scanpipe.pipes.scancode import make_results_summary class ScanPackage(ScanCodebase): @@ -84,7 +85,7 @@ def extract_archive_to_codebase_directory(self): """ Extracts package archive with extractcode. """ - extract_errors = scancode.extract(self.archive_path, self.project.codebase_path) + extract_errors = extract_archive(self.archive_path, self.project.codebase_path) if extract_errors: self.add_error("\n".join(extract_errors)) @@ -93,7 +94,7 @@ def make_summary_from_scan_results(self): """ Builds a summary in JSON format from the generated scan results. 
""" - summary = scancode.make_results_summary(self.project, str(self.scan_output)) + summary = make_results_summary(self.project, str(self.scan_output)) output_file = self.project.get_output_file_path("summary", "json") with output_file.open("w") as summary_file: diff --git a/scanpipe/pipes/docker.py b/scanpipe/pipes/docker.py index d0599fc59..74af0389c 100644 --- a/scanpipe/pipes/docker.py +++ b/scanpipe/pipes/docker.py @@ -29,8 +29,7 @@ from scanpipe import pipes from scanpipe.pipes import rootfs -from scanpipe.pipes import scancode -from scanpipe.pipes.rootfs import has_hash_diff +from scanpipe.pipes.scancode import extract_archive logger = logging.getLogger(__name__) @@ -47,7 +46,7 @@ def extract_images_from_inputs(project): for input_tarball in project.inputs(pattern="*.tar*"): extract_target = target_path / f"{input_tarball.name}-extract" - extract_errors = scancode.extract(input_tarball, extract_target) + extract_errors = extract_archive(input_tarball, extract_target) images.extend(Image.get_images_from_dir(extract_target)) errors.extend(extract_errors) @@ -67,7 +66,7 @@ def extract_layers_from_images(project, images): for layer in image.layers: extract_target = target_path / layer.layer_id - extract_errors = scancode.extract(layer.archive_location, extract_target) + extract_errors = extract_archive(layer.archive_location, extract_target) errors.extend(extract_errors) layer.extracted_location = str(extract_target) @@ -154,7 +153,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True): logger.info(f" added as system-package to: {purl}") codebase_resource.save() - if has_hash_diff(install_file, codebase_resource): + if rootfs.has_hash_diff(install_file, codebase_resource): if install_file.path not in modified_resources: modified_resources.append(install_file.path) diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index f14f31e96..10fed59cc 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -36,9 
+36,7 @@ import packagedcode from commoncode import fileutils from commoncode.resource import VirtualCodebase -from extractcode import all_kinds from extractcode import api as extractcode_api -from extractcode.extract import extract_file from scancode import ScancodeError from scancode import Scanner from scancode import api as scancode_api @@ -50,7 +48,7 @@ logger = logging.getLogger("scanpipe.pipes") """ -Utilities to deal with ScanCode objects, in particular Codebase and Package. +Utilities to deal with ScanCode toolkit features and objects. """ scanpipe_app = apps.get_app_config("scanpipe") @@ -59,8 +57,8 @@ def get_max_workers(keep_available): """ Returns the `SCANCODEIO_PROCESSES` if defined in the setting, - or returns a default value based on the number of available CPUs, minus the - provided `keep_available` value. + or returns a default value based on the number of available CPUs, + minus the provided `keep_available` value. """ processes = getattr(settings, "SCANCODEIO_PROCESSES", None) if processes is not None: @@ -72,34 +70,42 @@ def get_max_workers(keep_available): return max_workers -def extract(location, target): +def extract_archive(location, target): """ - Extracts the file at `location` to the `target` and return errors. + Extracts a single archive or compressed file at `location` to the `target` + directory. - Wraps the `extractcode.extract_file` function. + Returns a list of extraction errors. + + Wrapper of the `extractcode.api.extract_archive` function. """ errors = [] - for event in extract_file(location, target, kinds=all_kinds): + for event in extractcode_api.extract_archive(location, target): if event.done: errors.extend(event.errors) return errors -def extract_archives(location, recurse, all_formats=True): +def extract_archives(location, recurse=False): """ Extracts all archives at `location` and return errors. - Wraps the `extractcode.api.extract_archives` function. 
+ Archives and compressed files are extracted in a new directory named + "{archive_name}-extract" created in the same directory as each extracted + archive. If `recurse` is True, extract nested archives-in-archives recursively. - If `all_formats` is True, extract all supported archives formats. + + Returns a list of extraction errors. + + Wrapper of the `extractcode.api.extract_archives` function. """ options = { "recurse": recurse, "replace_originals": False, - "all_formats": all_formats, + "all_formats": True, } errors = [] diff --git a/scanpipe/tests/data/archive.zip b/scanpipe/tests/data/archive.zip new file mode 100644 index 000000000..c35e7dc53 Binary files /dev/null and b/scanpipe/tests/data/archive.zip differ diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 92bc0e108..a55dc9a19 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -181,8 +181,8 @@ def test_scanpipe_rootfs_pipeline_extract_input_files_errors(self): project1.move_input_from(tempfile.mkstemp()[1]) self.assertEqual(2, len(project1.input_files)) - with mock.patch("scanpipe.pipes.scancode.extract") as extract: - extract.return_value = ["Error"] + with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: + extract_archive.return_value = ["Error"] pipeline_instance.extract_input_files_to_codebase_directory() error = project1.projecterrors.get() diff --git a/scanpipe/tests/test_pipes.py b/scanpipe/tests/test_pipes.py index c7648af81..c45acd539 100644 --- a/scanpipe/tests/test_pipes.py +++ b/scanpipe/tests/test_pipes.py @@ -22,8 +22,8 @@ import collections import json -import os import shutil +import tempfile from pathlib import Path from unittest import mock from unittest.case import expectedFailure @@ -51,7 +51,7 @@ from scanpipe.pipes import strip_root from scanpipe.pipes import tag_not_analyzed_codebase_resources from scanpipe.pipes import windows -from scanpipe.pipes.input import copy_inputs +from scanpipe.pipes.input 
import copy_input from scanpipe.tests import license_policies_index from scanpipe.tests import mocked_now from scanpipe.tests import package_data1 @@ -229,6 +229,47 @@ def test_scanpipe_pipes_outputs_to_xlsx(self): output_file = output.to_xlsx(project=project1) self.assertEqual([output_file.name], project1.output_root) + def test_scanpipe_pipes_scancode_extract_archive(self): + target = tempfile.mkdtemp() + input_location = str(self.data_location / "archive.zip") + + errors = scancode.extract_archive(input_location, target) + self.assertEqual([], errors) + + results = [path.name for path in list(Path(target).glob("**/*"))] + expected = [ + "c", + "a", + "c", + "b", + "a.txt", + "a.txt", + "a.txt", + ] + self.assertEqual(expected, results) + + def test_scanpipe_pipes_scancode_extract_archives(self): + tempdir = Path(tempfile.mkdtemp()) + input_location = str(self.data_location / "archive.zip") + copy_input(input_location, tempdir) + + errors = scancode.extract_archives(tempdir) + self.assertEqual([], errors) + + results = [path.name for path in list(tempdir.glob("**/*"))] + expected = [ + "archive.zip-extract", + "archive.zip", + "c", + "a", + "c", + "b", + "a.txt", + "a.txt", + "a.txt", + ] + self.assertEqual(expected, results) + def test_scanpipe_pipes_scancode_get_resource_info(self): input_location = str(self.data_location / "notice.NOTICE") sha256 = "b323607418a36b5bd700fcf52ae9ca49f82ec6359bc4b89b1b2d73cf75321757" @@ -301,7 +342,7 @@ def test_scanpipe_pipes_scancode_scan_file_and_save_results(self): self.assertEqual("scanned-with-error", codebase_resource1.status) self.assertEqual(4, project1.projecterrors.count()) - copy_inputs([self.data_location / "notice.NOTICE"], project1.codebase_path) + copy_input(self.data_location / "notice.NOTICE", project1.codebase_path) codebase_resource2 = CodebaseResource.objects.create( project=project1, path="notice.NOTICE" ) @@ -319,7 +360,7 @@ def test_scanpipe_pipes_scancode_scan_file_and_save_results(self): def 
test_scanpipe_pipes_scancode_scan_file_and_save_results_timeout_error(self): project1 = Project.objects.create(name="Analysis") - copy_inputs([self.data_location / "notice.NOTICE"], project1.codebase_path) + copy_input(self.data_location / "notice.NOTICE", project1.codebase_path) codebase_resource = CodebaseResource.objects.create( project=project1, path="notice.NOTICE" ) @@ -398,7 +439,7 @@ def test_scanpipe_pipes_scancode_scan_for_package_info_timeout(self): def test_scanpipe_pipes_scancode_scan_package_and_save_results_timeout_error(self): project1 = Project.objects.create(name="Analysis") - copy_inputs([self.data_location / "notice.NOTICE"], project1.codebase_path) + copy_input(self.data_location / "notice.NOTICE", project1.codebase_path) codebase_resource = CodebaseResource.objects.create( project=project1, path="notice.NOTICE" ) @@ -1101,7 +1142,7 @@ def test_scanpipe_pipes_make_codebase_resource(self): self.assertIn("is not under project/codebase/", str(cm.exception)) - copy_inputs([resource_location], p1.codebase_path) + copy_input(resource_location, p1.codebase_path) resource_location = str(p1.codebase_path / "notice.NOTICE") make_codebase_resource(p1, resource_location)