Skip to content

Commit

Permalink
Implement progress logging in path_match #688
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Druez <tdruez@nexb.com>
  • Loading branch information
tdruez committed Apr 21, 2023
1 parent a7a9018 commit f427d6f
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 11 deletions.
2 changes: 1 addition & 1 deletion scanpipe/pipelines/develop_to_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def steps(cls):
cls.path_match,
)

purldb_match_extensions = [".jar", ".war"]
purldb_match_extensions = [".jar", ".war", ".zip"]

def get_inputs(self):
"""Locate the `from` and `to` archives."""
Expand Down
27 changes: 27 additions & 0 deletions scanpipe/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,30 @@ def remove_prefix(text, prefix):
prefix_len = len(prefix)
return text[prefix_len:]
return text


def get_progress_percentage(current_index, total_count):
    """
    Return the percentage of progress given the current index and total count of
    objects.

    `current_index` is zero-based and must satisfy
    ``0 <= current_index < total_count``. A ValueError is raised otherwise,
    which also covers any call made with a `total_count` of zero or less.
    """
    if current_index < 0 or current_index >= total_count:
        raise ValueError("current_index must be between 0 and total_count - 1")

    # Multiply before dividing: `29 / 100 * 100` evaluates to
    # 28.999999999999996 because 0.29 is not exactly representable, and callers
    # truncating with int() would then report 28%. `29 * 100 / 100` is exact.
    return current_index * 100 / total_count


def log_progress(log_func, current_index, total_count, last_percent, increment_percent):
    """
    Log progress updates every `increment_percent` percentage points, given the
    current index and total count of objects.
    Return the latest percent logged.

    `log_func` is called with a single formatted message string. It may be
    None (callers such as `path_match` forward an optional logger that
    defaults to None), in which case nothing is emitted but the returned
    percent still advances, so the caller's loop state stays consistent.

    `last_percent` is returned unchanged while the progress has not advanced by
    at least `increment_percent` points since the last reported value.
    A ValueError from `get_progress_percentage` propagates when
    `current_index` is outside ``0 .. total_count - 1``.
    """
    progress_percentage = int(get_progress_percentage(current_index, total_count))

    # Threshold not reached yet: keep the previously reported percent.
    if progress_percentage < last_percent + increment_percent:
        return last_percent

    # Logging is optional; guard against a missing logger instead of crashing
    # with "TypeError: 'NoneType' object is not callable".
    if log_func:
        log_func(
            f"Progress: {progress_percentage}% ({current_index:,d}/{total_count:,d})"
        )
    return progress_percentage
32 changes: 22 additions & 10 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,14 @@ def checksum_match(project, checksum_field, logger=None):
"""Match using checksum."""
project_files = project.codebaseresources.files().not_empty()
from_resources = project_files.from_codebase().has_value(checksum_field)
to_resources = project_files.to_codebase().has_value(checksum_field)
to_resources = (
project_files.to_codebase().has_value(checksum_field).has_no_relation()
)

if logger:
resource_count = to_resources.count()
logger(
f"Matching {resource_count} to/ resources using {checksum_field} "
f"Matching {resource_count:,d} to/ resources using {checksum_field} "
f"against from/ codebase"
)

Expand All @@ -80,7 +82,8 @@ def java_to_class_match(project, logger=None):

to_resources_dot_class = to_resources.filter(name__endswith=to_extension)
if logger:
logger(f"Matching {to_resources_dot_class.count()} .class resources to .java")
count = to_resources_dot_class.count()
logger(f"Matching {count:,d} .class resources to .java")

for to_resource in to_resources_dot_class:
qualified_class = to_resource.path.split("-extract/")[-1]
Expand All @@ -105,22 +108,29 @@ def java_to_class_match(project, logger=None):

def path_match(project, logger=None):
"""Match using path similarities."""
project_files = project.codebaseresources.files().only("path")
project_files = project.codebaseresources.files().not_empty().only("path")
from_resources = project_files.from_codebase()
to_resources = project_files.to_codebase().has_no_relation()
resource_count = to_resources.count()

if logger:
resource_count = to_resources.count()
logger(
f"Matching {resource_count} to/ resources using path match "
f"Matching {resource_count:,d} to/ resources using path match "
f"against from/ codebase"
)

for to_resource in to_resources:
resource_iterator = to_resources.iterator(chunk_size=2000)
last_percent = 0
for resource_index, to_resource in enumerate(resource_iterator):
last_percent = pipes.log_progress(
logger, resource_index, resource_count, last_percent, increment_percent=5
)

path_parts = Path(to_resource.path.lstrip("/")).parts
path_parts_len = len(path_parts)
for index in range(1, path_parts_len):
current_parts = path_parts[index:]

for path_parts_index in range(1, path_parts_len):
current_parts = path_parts[path_parts_index:]
current_path = "/".join(current_parts)
# The slash "/" prefix matters during the match as we do not want to
# match on filenames sharing the same ending.
Expand Down Expand Up @@ -157,7 +167,9 @@ def purldb_match(project, extensions, logger=None):
if logger:
resource_count = to_resources.count()
extensions_str = ", ".join(extensions)
logger(f"Matching {resource_count} {extensions_str} resources against PurlDB")
logger(
f"Matching {resource_count:,d} {extensions_str} resources against PurlDB"
)

for resource in to_resources:
if results := purldb.match_by_sha1(sha1=resource.sha1):
Expand Down
33 changes: 33 additions & 0 deletions scanpipe/tests/pipes/test_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# Visit https://github.com/nexB/scancode.io for support and download.

import datetime
import io
from pathlib import Path
from unittest import mock

Expand Down Expand Up @@ -203,3 +204,35 @@ def test_scanpipe_add_resource_to_package(self):
# resource.
scancode.add_resource_to_package(package1.package_uid, resource1, project1)
self.assertEqual(len(resource1.for_packages), 1)

def test_scanpipe_get_progress_percentage(self):
    # Each entry pairs the expected percentage with the
    # (current_index, total_count) arguments that should produce it.
    expectations = [
        (0.0, (0, 10)),
        (50.0, (5, 10)),
        (90.0, (9, 10)),
        (60.0, (3, 5)),
    ]
    for expected, arguments in expectations:
        self.assertEqual(expected, pipes.get_progress_percentage(*arguments))

    # An index past the last valid position must be rejected.
    self.assertRaises(ValueError, pipes.get_progress_percentage, 10, 1)

def test_scanpipe_log_progress(self):
    # Each scenario: (keyword arguments, expected returned percent,
    # expected message written through the log function).
    scenarios = [
        (
            {"current_index": 1, "total_count": 10, "last_percent": 0},
            10,
            "Progress: 10% (1/10)",
        ),
        (
            {"current_index": 20, "total_count": 100, "last_percent": 15},
            20,
            "Progress: 20% (20/100)",
        ),
    ]

    for call_kwargs, expected_percent, expected_message in scenarios:
        # A fresh in-memory stream per scenario captures only that call's output.
        stream = io.StringIO()
        returned_percent = pipes.log_progress(
            log_func=stream.write,
            increment_percent=5,
            **call_kwargs,
        )
        self.assertEqual(expected_percent, returned_percent)
        self.assertEqual(expected_message, stream.getvalue())

0 comments on commit f427d6f

Please sign in to comment.