Skip to content

Commit

Permalink
Add logic to find the best_checksum_matches #688
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Druez <tdruez@nexb.com>
  • Loading branch information
tdruez committed Apr 21, 2023
1 parent 669bc18 commit 0787e1c
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 38 deletions.
13 changes: 7 additions & 6 deletions scanpipe/pipelines/develop_to_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ class DevelopToDeploy(Pipeline):
@classmethod
def steps(cls):
    """Return the pipeline step callables as an ordered tuple."""
    # Commented-out duplicates of the active steps were removed: they were
    # dead code and made it unclear which steps are actually part of the
    # pipeline.
    return (
        cls.get_inputs,
        cls.extract_inputs_to_codebase_directory,
        cls.extract_archives_in_place,
        cls.collect_and_create_codebase_resources,
        cls.tag_empty_and_ignored_files,
        cls.checksum_match,
        cls.purldb_match,
        cls.java_to_class_match,
        cls.path_match,
    )
Expand Down Expand Up @@ -84,6 +84,7 @@ def tag_empty_and_ignored_files(self):

def checksum_match(self):
    """Delete the project's existing relations, then match on SHA1 checksum."""
    project = self.project
    # NOTE(review): this removes every codebase relation of the project before
    # matching, not only the checksum ones -- confirm this is intended outside
    # of development re-runs.
    project.codebaserelations.all().delete()
    d2d.checksum_match(
        project=project,
        checksum_field="sha1",
        logger=self.log,
    )

def purldb_match(self):
Expand Down
95 changes: 63 additions & 32 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,39 @@ def get_inputs(project):
to_file = list(project.inputs("to*"))

if len(from_file) != 1:
raise Exception("from* archive not found.")
raise FileNotFoundError("from* archive not found.")

if len(to_file) != 1:
raise Exception("to* archive not found.")
raise FileNotFoundError("to* archive not found.")

return from_file[0], to_file[0]


def get_extracted_subpath(path):
    """
    Return the part of `path` located after its last `-extract/` segment.

    The `path` is returned unchanged when it contains no `-extract/` segment.
    """
    _, _, subpath = path.rpartition("-extract/")
    return subpath


def get_best_checksum_matches(to_resource, matches):
    """
    Return the candidates from `matches` that best relate to `to_resource`.

    The candidates are narrowed down using the first rule that selects at
    least one of them:

    1. the candidate path ends with the extracted subpath of `to_resource`,
       as returned by `get_extracted_subpath`;
    2. the candidate has the same name as `to_resource`.

    When no rule selects anything, `matches` is returned as provided.
    The narrowed results are returned as a list.
    """
    # The subpath only depends on `to_resource`: compute it once rather than
    # for each candidate.
    extracted_subpath = get_extracted_subpath(to_resource.path)

    extracted_subpath_matches = [
        from_resource
        for from_resource in matches
        if from_resource.path.endswith(extracted_subpath)
    ]
    if extracted_subpath_matches:
        return extracted_subpath_matches

    same_name_matches = [
        from_resource
        for from_resource in matches
        if from_resource.name == to_resource.name
    ]
    if same_name_matches:
        return same_name_matches

    return matches


def checksum_match(project, checksum_field, logger=None):
"""Match using checksum."""
project_files = project.codebaseresources.files().no_status()
Expand All @@ -66,7 +91,7 @@ def checksum_match(project, checksum_field, logger=None):
for to_resource in to_resources:
checksum_value = getattr(to_resource, checksum_field)
matches = from_resources.filter(**{checksum_field: checksum_value})
for match in matches:
for match in get_best_checksum_matches(to_resource, matches):
pipes.make_relationship(
from_resource=match,
to_resource=to_resource,
Expand All @@ -90,7 +115,7 @@ def java_to_class_match(project, logger=None):
logger(f"Matching {count:,d} .class resources to .java")

for to_resource in to_resources_dot_class:
qualified_class = to_resource.path.split("-extract/")[-1]
qualified_class = get_extracted_subpath(to_resource.path)

if "$" in to_resource.name: # inner class
path_parts = Path(qualified_class.lstrip("/")).parts
Expand All @@ -110,6 +135,39 @@ def java_to_class_match(project, logger=None):
)


def _resource_path_match(to_resource, from_resources):
    """
    Create PATH_MATCH relations for `to_resource` based on shared path endings.

    The path of `to_resource` is compared against `from_resources` using
    progressively shorter endings: starting with the path minus its first
    segment and finishing with the filename alone. A relation is created for
    each candidate that is not already related to `to_resource` through a
    PATH_MATCH, and the number of matched segments is stored as `path_score`
    in the relation extra data.
    """
    segments = Path(to_resource.path.lstrip("/")).parts
    segments_count = len(segments)

    for start_index in range(1, segments_count):
        suffix_parts = segments[start_index:]
        # The slash "/" prefix matters during the match as we do not want to
        # match on filenames sharing the same ending.
        # For example: Filter.java and FastFilter.java
        suffix = "/" + "/".join(suffix_parts)
        candidates = from_resources.filter(path__endswith=suffix)

        # Stop entirely once an ending returns more candidates than it has
        # segments (presumably too ambiguous to be useful -- confirm).
        if len(suffix_parts) < len(candidates):
            break

        path_score = f"{len(suffix_parts)}/{segments_count - 1}"
        for candidate in candidates:
            relation_lookup = {
                "from_resource": candidate,
                "to_resource": to_resource,
                "relationship": CodebaseRelation.Relationship.PATH_MATCH,
            }
            # Skip the pairs already related by a previous, longer ending.
            if CodebaseRelation.objects.filter(**relation_lookup).exists():
                continue

            pipes.make_relationship(
                **relation_lookup,
                match_type="path",
                extra_data={"path_score": path_score},
            )


def path_match(project, logger=None):
"""Match using path similarities."""
project_files = project.codebaseresources.files().no_status().only("path")
Expand All @@ -129,34 +187,7 @@ def path_match(project, logger=None):
last_percent = pipes.log_progress(
logger, resource_index, resource_count, last_percent, increment_percent=5
)

path_parts = Path(to_resource.path.lstrip("/")).parts
path_parts_len = len(path_parts)

for path_parts_index in range(1, path_parts_len):
current_parts = path_parts[path_parts_index:]
current_path = "/".join(current_parts)
# The slash "/" prefix matters during the match as we do not want to
# match on filenames sharing the same ending.
# For example: Filter.java and FastFilter.java
matches = from_resources.filter(path__endswith=f"/{current_path}")

for match in matches:
relation = CodebaseRelation.objects.filter(
from_resource=match,
to_resource=to_resource,
relationship=CodebaseRelation.Relationship.PATH_MATCH,
)
if not relation.exists():
pipes.make_relationship(
from_resource=match,
to_resource=to_resource,
relationship=CodebaseRelation.Relationship.PATH_MATCH,
match_type="path",
extra_data={
"path_score": f"{len(current_parts)}/{path_parts_len-1}",
},
)
_resource_path_match(to_resource, from_resources)


def purldb_match(project, extensions, logger=None):
Expand Down
60 changes: 60 additions & 0 deletions scanpipe/tests/pipes/test_d2d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

import tempfile

from django.test import TestCase

from scanpipe.models import Project
from scanpipe.pipes import d2d


class ScanPipeD2DPipesTest(TestCase):
    """Tests for the functions of the `scanpipe.pipes.d2d` module."""

    def setUp(self):
        self.project1 = Project.objects.create(name="Analysis")

    def _create_input_file(self, prefix):
        """
        Return the location of a new empty file whose name starts with `prefix`.

        The file is closed before returning and lives in a temporary directory
        removed at test cleanup, so no file descriptor or file is leaked
        (`tempfile.mkstemp` leaves both to the caller).
        """
        temp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(temp_dir.cleanup)
        with tempfile.NamedTemporaryFile(
            prefix=prefix, dir=temp_dir.name, delete=False
        ) as temp_file:
            return temp_file.name

    def test_scanpipe_d2d_get_inputs(self):
        """Check that `get_inputs` requires one `from*` and one `to*` input."""
        with self.assertRaises(FileNotFoundError) as error:
            d2d.get_inputs(self.project1)
        self.assertEqual("from* archive not found.", str(error.exception))

        input_location = self._create_input_file(prefix="from-")
        self.project1.copy_input_from(input_location)

        with self.assertRaises(FileNotFoundError) as error:
            d2d.get_inputs(self.project1)
        self.assertEqual("to* archive not found.", str(error.exception))

        input_location = self._create_input_file(prefix="to-")
        self.project1.copy_input_from(input_location)

        self.assertEqual(2, len(d2d.get_inputs(self.project1)))

    def test_scanpipe_d2d_get_extracted_subpath(self):
        """Check the subpath returned with zero, one, and nested extractions."""
        path = "not/an/extracted/path/"
        self.assertEqual(path, d2d.get_extracted_subpath(path))

        path = "a.jar-extract/subpath/file.ext"
        self.assertEqual("subpath/file.ext", d2d.get_extracted_subpath(path))

        path = "a.jar-extract/subpath/b.jar-extract/subpath/file.ext"
        self.assertEqual("subpath/file.ext", d2d.get_extracted_subpath(path))
1 change: 1 addition & 0 deletions scanpipe/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,7 @@ def get_queryset(self):
.files()
.to_codebase()
.prefetch_related("related_from__from_resource")
.distinct()
)
if self.request.GET.get("missing_only"):
queryset = queryset.has_no_relation()
Expand Down

0 comments on commit 0787e1c

Please sign in to comment.