Skip to content

Commit

Permalink
Refactored the extract pipes into clean API #312
Browse files: browse the repository at this point in the history
- 2 pipes: extract_archive (extract to) and extract_archives (in place)
- Added tests

Signed-off-by: Thomas Druez <tdruez@nexb.com>
  • Loading branch information
tdruez committed Sep 3, 2021
1 parent c5a9092 commit e52c4da
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 30 deletions.
2 changes: 1 addition & 1 deletion scanpipe/pipelines/root_filesystems.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def extract_input_files_to_codebase_directory(self):

for input_file in input_files:
extract_target = target_path / f"{input_file.name}-extract"
extract_errors = scancode.extract(input_file, extract_target)
extract_errors = scancode.extract_archive(input_file, extract_target)
errors.extend(extract_errors)

if errors:
Expand Down
7 changes: 4 additions & 3 deletions scanpipe/pipelines/scan_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from commoncode.hash import multi_checksums

from scanpipe.pipelines.scan_codebase import ScanCodebase
from scanpipe.pipes import scancode
from scanpipe.pipes.scancode import extract_archive
from scanpipe.pipes.scancode import make_results_summary


class ScanPackage(ScanCodebase):
Expand Down Expand Up @@ -84,7 +85,7 @@ def extract_archive_to_codebase_directory(self):
"""
Extracts package archive with extractcode.
"""
extract_errors = scancode.extract(self.archive_path, self.project.codebase_path)
extract_errors = extract_archive(self.archive_path, self.project.codebase_path)

if extract_errors:
self.add_error("\n".join(extract_errors))
Expand All @@ -93,7 +94,7 @@ def make_summary_from_scan_results(self):
"""
Builds a summary in JSON format from the generated scan results.
"""
summary = scancode.make_results_summary(self.project, str(self.scan_output))
summary = make_results_summary(self.project, str(self.scan_output))
output_file = self.project.get_output_file_path("summary", "json")

with output_file.open("w") as summary_file:
Expand Down
9 changes: 4 additions & 5 deletions scanpipe/pipes/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@

from scanpipe import pipes
from scanpipe.pipes import rootfs
from scanpipe.pipes import scancode
from scanpipe.pipes.rootfs import has_hash_diff
from scanpipe.pipes.scancode import extract_archive

logger = logging.getLogger(__name__)

Expand All @@ -47,7 +46,7 @@ def extract_images_from_inputs(project):

for input_tarball in project.inputs(pattern="*.tar*"):
extract_target = target_path / f"{input_tarball.name}-extract"
extract_errors = scancode.extract(input_tarball, extract_target)
extract_errors = extract_archive(input_tarball, extract_target)
images.extend(Image.get_images_from_dir(extract_target))
errors.extend(extract_errors)

Expand All @@ -67,7 +66,7 @@ def extract_layers_from_images(project, images):

for layer in image.layers:
extract_target = target_path / layer.layer_id
extract_errors = scancode.extract(layer.archive_location, extract_target)
extract_errors = extract_archive(layer.archive_location, extract_target)
errors.extend(extract_errors)
layer.extracted_location = str(extract_target)

Expand Down Expand Up @@ -154,7 +153,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
logger.info(f" added as system-package to: {purl}")
codebase_resource.save()

if has_hash_diff(install_file, codebase_resource):
if rootfs.has_hash_diff(install_file, codebase_resource):
if install_file.path not in modified_resources:
modified_resources.append(install_file.path)

Expand Down
32 changes: 19 additions & 13 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@
import packagedcode
from commoncode import fileutils
from commoncode.resource import VirtualCodebase
from extractcode import all_kinds
from extractcode import api as extractcode_api
from extractcode.extract import extract_file
from scancode import ScancodeError
from scancode import Scanner
from scancode import api as scancode_api
Expand All @@ -50,7 +48,7 @@
logger = logging.getLogger("scanpipe.pipes")

"""
Utilities to deal with ScanCode objects, in particular Codebase and Package.
Utilities to deal with ScanCode toolkit features and objects.
"""

scanpipe_app = apps.get_app_config("scanpipe")
Expand All @@ -59,8 +57,8 @@
def get_max_workers(keep_available):
"""
Returns the `SCANCODEIO_PROCESSES` if defined in the setting,
or returns a default value based on the number of available CPUs, minus the
provided `keep_available` value.
or returns a default value based on the number of available CPUs,
minus the provided `keep_available` value.
"""
processes = getattr(settings, "SCANCODEIO_PROCESSES", None)
if processes is not None:
Expand All @@ -72,34 +70,42 @@ def get_max_workers(keep_available):
return max_workers


def extract(location, target):
def extract_archive(location, target):
"""
Extracts the file at `location` to the `target` and return errors.
Extracts a single archive or compressed file at `location` to the `target`
directory.
Wraps the `extractcode.extract_file` function.
Returns a list of extraction errors.
Wrapper of the `extractcode.api.extract_archive` function.
"""
errors = []

for event in extract_file(location, target, kinds=all_kinds):
for event in extractcode_api.extract_archive(location, target):
if event.done:
errors.extend(event.errors)

return errors


def extract_archives(location, recurse, all_formats=True):
def extract_archives(location, recurse=False):
"""
Extracts all archives at `location` and return errors.
Wraps the `extractcode.api.extract_archives` function.
Archives and compressed files are extracted in a new directory named
"<file_name>-extract" created in the same directory as each extracted
archive.
If `recurse` is True, extract nested archives-in-archives recursively.
If `all_formats` is True, extract all supported archives formats.
Returns a list of extraction errors.
Wrapper of the `extractcode.api.extract_archives` function.
"""
options = {
"recurse": recurse,
"replace_originals": False,
"all_formats": all_formats,
"all_formats": True,
}

errors = []
Expand Down
Binary file added scanpipe/tests/data/archive.zip
Binary file not shown.
4 changes: 2 additions & 2 deletions scanpipe/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,8 @@ def test_scanpipe_rootfs_pipeline_extract_input_files_errors(self):
project1.move_input_from(tempfile.mkstemp()[1])
self.assertEqual(2, len(project1.input_files))

with mock.patch("scanpipe.pipes.scancode.extract") as extract:
extract.return_value = ["Error"]
with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive:
extract_archive.return_value = ["Error"]
pipeline_instance.extract_input_files_to_codebase_directory()

error = project1.projecterrors.get()
Expand Down
53 changes: 47 additions & 6 deletions scanpipe/tests/test_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@

import collections
import json
import os
import shutil
import tempfile
from pathlib import Path
from unittest import mock
from unittest.case import expectedFailure
Expand Down Expand Up @@ -51,7 +51,7 @@
from scanpipe.pipes import strip_root
from scanpipe.pipes import tag_not_analyzed_codebase_resources
from scanpipe.pipes import windows
from scanpipe.pipes.input import copy_inputs
from scanpipe.pipes.input import copy_input
from scanpipe.tests import license_policies_index
from scanpipe.tests import mocked_now
from scanpipe.tests import package_data1
Expand Down Expand Up @@ -229,6 +229,47 @@ def test_scanpipe_pipes_outputs_to_xlsx(self):
output_file = output.to_xlsx(project=project1)
self.assertEqual([output_file.name], project1.output_root)

def test_scanpipe_pipes_scancode_extract_archive(self):
    """
    Extracts the `archive.zip` test data file to a temporary target directory
    using `scancode.extract_archive` and checks that no errors are returned
    and that the expected entries are present under the target.
    """
    target = tempfile.mkdtemp()
    # Do not leave the extracted content behind once the test is done.
    self.addCleanup(shutil.rmtree, target, ignore_errors=True)
    input_location = str(self.data_location / "archive.zip")

    errors = scancode.extract_archive(input_location, target)
    self.assertEqual([], errors)

    # `Path.glob` yields entries in a filesystem-dependent order: compare
    # sorted names so that the assertion is deterministic across platforms.
    results = sorted(path.name for path in Path(target).glob("**/*"))
    expected = [
        "a",
        "a.txt",
        "a.txt",
        "a.txt",
        "b",
        "c",
        "c",
    ]
    self.assertEqual(expected, results)

def test_scanpipe_pipes_scancode_extract_archives(self):
    """
    Copies the `archive.zip` test data file to a temporary directory, runs
    `scancode.extract_archives` on that directory, and checks that no errors
    are returned and that an "archive.zip-extract" directory holding the
    extracted entries sits next to the original archive.
    """
    tempdir = Path(tempfile.mkdtemp())
    # Do not leave the copied archive and extracted content behind.
    self.addCleanup(shutil.rmtree, tempdir, ignore_errors=True)
    input_location = str(self.data_location / "archive.zip")
    copy_input(input_location, tempdir)

    errors = scancode.extract_archives(tempdir)
    self.assertEqual([], errors)

    # `Path.glob` yields entries in a filesystem-dependent order: compare
    # sorted names so that the assertion is deterministic across platforms.
    results = sorted(path.name for path in tempdir.glob("**/*"))
    expected = [
        "a",
        "a.txt",
        "a.txt",
        "a.txt",
        "archive.zip",
        "archive.zip-extract",
        "b",
        "c",
        "c",
    ]
    self.assertEqual(expected, results)

def test_scanpipe_pipes_scancode_get_resource_info(self):
input_location = str(self.data_location / "notice.NOTICE")
sha256 = "b323607418a36b5bd700fcf52ae9ca49f82ec6359bc4b89b1b2d73cf75321757"
Expand Down Expand Up @@ -301,7 +342,7 @@ def test_scanpipe_pipes_scancode_scan_file_and_save_results(self):
self.assertEqual("scanned-with-error", codebase_resource1.status)
self.assertEqual(4, project1.projecterrors.count())

copy_inputs([self.data_location / "notice.NOTICE"], project1.codebase_path)
copy_input(self.data_location / "notice.NOTICE", project1.codebase_path)
codebase_resource2 = CodebaseResource.objects.create(
project=project1, path="notice.NOTICE"
)
Expand All @@ -319,7 +360,7 @@ def test_scanpipe_pipes_scancode_scan_file_and_save_results(self):

def test_scanpipe_pipes_scancode_scan_file_and_save_results_timeout_error(self):
project1 = Project.objects.create(name="Analysis")
copy_inputs([self.data_location / "notice.NOTICE"], project1.codebase_path)
copy_input(self.data_location / "notice.NOTICE", project1.codebase_path)
codebase_resource = CodebaseResource.objects.create(
project=project1, path="notice.NOTICE"
)
Expand Down Expand Up @@ -398,7 +439,7 @@ def test_scanpipe_pipes_scancode_scan_for_package_info_timeout(self):

def test_scanpipe_pipes_scancode_scan_package_and_save_results_timeout_error(self):
project1 = Project.objects.create(name="Analysis")
copy_inputs([self.data_location / "notice.NOTICE"], project1.codebase_path)
copy_input(self.data_location / "notice.NOTICE", project1.codebase_path)
codebase_resource = CodebaseResource.objects.create(
project=project1, path="notice.NOTICE"
)
Expand Down Expand Up @@ -1101,7 +1142,7 @@ def test_scanpipe_pipes_make_codebase_resource(self):

self.assertIn("is not under project/codebase/", str(cm.exception))

copy_inputs([resource_location], p1.codebase_path)
copy_input(resource_location, p1.codebase_path)
resource_location = str(p1.codebase_path / "notice.NOTICE")
make_codebase_resource(p1, resource_location)

Expand Down

0 comments on commit e52c4da

Please sign in to comment.