Skip to content

Commit

Permalink
Add progress logging and ETA for all matcher pipes #688
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Druez <tdruez@nexb.com>
  • Loading branch information
tdruez committed Apr 24, 2023
1 parent 33717e2 commit d64307b
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 59 deletions.
6 changes: 3 additions & 3 deletions scanpipe/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@

import inspect
import logging
import timeit
import traceback
import warnings
from contextlib import contextmanager
from functools import wraps
from pydoc import getdoc
from pydoc import splitdoc
from timeit import default_timer as timer

from django.utils import timezone

Expand Down Expand Up @@ -113,7 +113,7 @@ def execute(self):
self.run.current_step = f"{current_index}/{steps_count} {step_name}"[:256]

self.log(f"Step [{step_name}] starting")
start_time = timeit.default_timer()
start_time = timer()

try:
step(self)
Expand All @@ -122,7 +122,7 @@ def execute(self):
tb = "".join(traceback.format_tb(e.__traceback__))
return 1, f"{e}\n\nTraceback:\n{tb}"

run_time = timeit.default_timer() - start_time
run_time = timer() - start_time
self.log(f"Step [{step.__name__}] completed in {run_time:.2f} seconds")

self.run.current_step = ""
Expand Down
23 changes: 19 additions & 4 deletions scanpipe/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from datetime import datetime
from pathlib import Path
from time import sleep
from timeit import default_timer as timer

from django.db.models import Count

Expand Down Expand Up @@ -302,7 +303,14 @@ def get_progress_percentage(current_index, total_count):
return progress


def log_progress(log_func, current_index, total_count, last_percent, increment_percent):
def log_progress(
log_func,
current_index,
total_count,
last_percent,
increment_percent,
start_time=None,
):
"""
Log progress updates every `increment_percent` percentage points, given the
current index and total count of objects.
Expand All @@ -311,7 +319,14 @@ def log_progress(log_func, current_index, total_count, last_percent, increment_p
progress_percentage = int(get_progress_percentage(current_index, total_count))
if progress_percentage >= last_percent + increment_percent:
last_percent = progress_percentage
log_func(
f"Progress: {progress_percentage}% ({current_index:,d}/{total_count:,d})"
)
msg = f"Progress: {progress_percentage}% ({current_index:,d}/{total_count:,d})"

if start_time:
run_time = timer() - start_time
eta = round(run_time / progress_percentage * (100 - progress_percentage))
if eta:
msg += f" ETA: {round(eta)} seconds"

log_func(msg)

return last_percent
153 changes: 101 additions & 52 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import difflib
from pathlib import Path
from timeit import default_timer as timer

from scanpipe import pipes
from scanpipe.models import CodebaseRelation
Expand Down Expand Up @@ -74,66 +75,93 @@ def get_best_checksum_matches(to_resource, matches):
return matches


def _resource_checksum_match(to_resource, from_resources, checksum_field):
checksum_value = getattr(to_resource, checksum_field)
matches = from_resources.filter(**{checksum_field: checksum_value})
for match in get_best_checksum_matches(to_resource, matches):
pipes.make_relationship(
from_resource=match,
to_resource=to_resource,
relationship=CodebaseRelation.Relationship.IDENTICAL,
match_type=checksum_field,
)


def checksum_match(project, checksum_field, logger=None):
"""Match using checksum."""
project_files = project.codebaseresources.files().no_status()
from_resources = project_files.from_codebase().has_value(checksum_field)
to_resources = (
project_files.to_codebase().has_value(checksum_field).has_no_relation()
)
resource_count = to_resources.count()

if logger:
resource_count = to_resources.count()
logger(
f"Matching {resource_count:,d} to/ resources using {checksum_field} "
f"against from/ codebase"
)

for to_resource in to_resources:
checksum_value = getattr(to_resource, checksum_field)
matches = from_resources.filter(**{checksum_field: checksum_value})
for match in get_best_checksum_matches(to_resource, matches):
pipes.make_relationship(
from_resource=match,
to_resource=to_resource,
relationship=CodebaseRelation.Relationship.IDENTICAL,
match_type=checksum_field,
)
resource_iterator = to_resources.iterator(chunk_size=2000)
last_percent = 0
start_time = timer()
for resource_index, to_resource in enumerate(resource_iterator):
last_percent = pipes.log_progress(
logger,
resource_index,
resource_count,
last_percent,
increment_percent=10,
start_time=start_time,
)
_resource_checksum_match(to_resource, from_resources, checksum_field)


def _resource_java_to_class_match(to_resource, from_resources):
qualified_class = get_extracted_subpath(to_resource.path)

if "$" in to_resource.name: # inner class
path_parts = Path(qualified_class.lstrip("/")).parts
parts_without_name = list(path_parts[:-1])
from_name = to_resource.name.split("$")[0] + ".java"
qualified_java = "/".join(parts_without_name + [from_name])
else:
qualified_java = qualified_class.replace(".class", ".java")

matches = from_resources.filter(path__endswith=qualified_java)
for match in matches:
pipes.make_relationship(
from_resource=match,
to_resource=to_resource,
relationship=CodebaseRelation.Relationship.COMPILED,
match_type="java_to_class",
)


def java_to_class_match(project, logger=None):
"""Match a .java source to its compiled .class using fully qualified name."""
from_extension = ".java"
to_extension = ".class"

project_files = project.codebaseresources.files().no_status()
from_resources = project_files.from_codebase()
to_resources = project_files.to_codebase().has_no_relation()

to_resources_dot_class = to_resources.filter(name__endswith=to_extension)
to_resources_dot_class = to_resources.filter(name__endswith=".class")
resource_count = to_resources_dot_class.count()
if logger:
count = to_resources_dot_class.count()
logger(f"Matching {count:,d} .class resources to .java")
logger(f"Matching {resource_count:,d} .class resources to .java")

for to_resource in to_resources_dot_class:
qualified_class = get_extracted_subpath(to_resource.path)

if "$" in to_resource.name: # inner class
path_parts = Path(qualified_class.lstrip("/")).parts
parts_without_name = list(path_parts[:-1])
from_name = to_resource.name.split("$")[0] + from_extension
qualified_java = "/".join(parts_without_name + [from_name])
else:
qualified_java = qualified_class.replace(to_extension, from_extension)

matches = from_resources.filter(path__endswith=qualified_java)
for match in matches:
pipes.make_relationship(
from_resource=match,
to_resource=to_resource,
relationship=CodebaseRelation.Relationship.COMPILED,
match_type="java_to_class",
)
resource_iterator = to_resources_dot_class.iterator(chunk_size=2000)
last_percent = 0
start_time = timer()
for resource_index, to_resource in enumerate(resource_iterator):
last_percent = pipes.log_progress(
logger,
resource_index,
resource_count,
last_percent,
increment_percent=10,
start_time=start_time,
)
_resource_java_to_class_match(to_resource, from_resources)


def get_diff_ratio(to_resource, from_resource):
Expand Down Expand Up @@ -208,13 +236,35 @@ def path_match(project, logger=None):

resource_iterator = to_resources.iterator(chunk_size=2000)
last_percent = 0
start_time = timer()
for resource_index, to_resource in enumerate(resource_iterator):
last_percent = pipes.log_progress(
logger, resource_index, resource_count, last_percent, increment_percent=5
logger,
resource_index,
resource_count,
last_percent,
increment_percent=10,
start_time=start_time,
)
_resource_path_match(to_resource, from_resources)


def _resource_purldb_match(project, resource):
if results := purldb.match_by_sha1(sha1=resource.sha1):
package_data = results[0]
package_data.pop("dependencies")
package = pipes.update_or_create_package(
project=project,
package_data=package_data,
codebase_resource=resource,
)
extracted_resources = project.codebaseresources.to_codebase().filter(
path__startswith=f"{resource.path}-extract"
)
package.add_resources(extracted_resources)
extracted_resources.update(status="application-package")


def purldb_match(project, extensions, logger=None):
to_resources = (
project.codebaseresources.files()
Expand All @@ -223,25 +273,24 @@ def purldb_match(project, extensions, logger=None):
.has_value("sha1")
.filter(extension__in=extensions)
)
resource_count = to_resources.count()

if logger:
resource_count = to_resources.count()
extensions_str = ", ".join(extensions)
logger(
f"Matching {resource_count:,d} {extensions_str} resources against PurlDB"
)

for resource in to_resources:
if results := purldb.match_by_sha1(sha1=resource.sha1):
package_data = results[0]
package_data.pop("dependencies")
package = pipes.update_or_create_package(
project=project,
package_data=package_data,
codebase_resource=resource,
)
extracted_resources = project.codebaseresources.to_codebase().filter(
path__startswith=f"{resource.path}-extract"
)
package.add_resources(extracted_resources)
extracted_resources.update(status="application-package")
resource_iterator = to_resources.iterator(chunk_size=2000)
last_percent = 0
start_time = timer()
for resource_index, to_resource in enumerate(resource_iterator):
last_percent = pipes.log_progress(
logger,
resource_index,
resource_count,
last_percent,
increment_percent=10,
start_time=start_time,
)
_resource_purldb_match(project, to_resource)

0 comments on commit d64307b

Please # to comment.