diff --git a/scanpipe/pipelines/develop_to_deploy.py b/scanpipe/pipelines/develop_to_deploy.py
index a0cd1405c..de0d31444 100644
--- a/scanpipe/pipelines/develop_to_deploy.py
+++ b/scanpipe/pipelines/develop_to_deploy.py
@@ -35,13 +35,15 @@ class DevelopToDeploy(Pipeline):
     @classmethod
     def steps(cls):
         return (
-            cls.get_inputs,
-            cls.extract_inputs_to_codebase_directory,
-            cls.extract_archives_in_place,
-            cls.collect_and_create_codebase_resources,
-            cls.tag_empty_and_ignored_files,
+            # NOTE(review): the steps commented out below look like a temporary
+            # debugging shortcut -- confirm before merging.
+            # cls.get_inputs,
+            # cls.extract_inputs_to_codebase_directory,
+            # cls.extract_archives_in_place,
+            # cls.collect_and_create_codebase_resources,
+            # cls.tag_empty_and_ignored_files,
             cls.checksum_match,
-            cls.purldb_match,
+            # cls.purldb_match,
             cls.java_to_class_match,
             cls.path_match,
         )
@@ -84,6 +86,9 @@ def tag_empty_and_ignored_files(self):
 
     def checksum_match(self):
         """Match using SHA1 checksum."""
+        # NOTE(review): removes every existing relation of the project before
+        # matching, presumably to allow re-runs -- confirm this is intended.
+        self.project.codebaserelations.all().delete()
         d2d.checksum_match(project=self.project, checksum_field="sha1", logger=self.log)
 
     def purldb_match(self):
diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py
index ffe166665..7475040c1 100644
--- a/scanpipe/pipes/d2d.py
+++ b/scanpipe/pipes/d2d.py
@@ -40,14 +40,45 @@ def get_inputs(project):
     to_file = list(project.inputs("to*"))
 
     if len(from_file) != 1:
-        raise Exception("from* archive not found.")
+        raise FileNotFoundError("from* archive not found.")
 
     if len(to_file) != 1:
-        raise Exception("to* archive not found.")
+        raise FileNotFoundError("to* archive not found.")
 
     return from_file[0], to_file[0]
 
 
+def get_extracted_subpath(path):
+    """Return the path segments located after the last `-extract/` segment."""
+    return path.split("-extract/")[-1]
+
+
+def get_best_checksum_matches(to_resource, matches):
+    """
+    Return the subset of `matches` that best fits the provided `to_resource`.
+
+    Matches whose path ends with the extracted subpath of `to_resource` are
+    preferred, then matches with the same name, then all `matches` as-is.
+    """
+    extracted_subpath_matches = [
+        from_resource
+        for from_resource in matches
+        if from_resource.path.endswith(get_extracted_subpath(to_resource.path))
+    ]
+    if extracted_subpath_matches:
+        return extracted_subpath_matches
+
+    same_name_matches = [
+        from_resource
+        for from_resource in matches
+        if from_resource.name == to_resource.name
+    ]
+    if same_name_matches:
+        return same_name_matches
+
+    return matches
+
+
 def checksum_match(project, checksum_field, logger=None):
     """Match using checksum."""
     project_files = project.codebaseresources.files().no_status()
@@ -66,7 +97,7 @@ def checksum_match(project, checksum_field, logger=None):
     for to_resource in to_resources:
         checksum_value = getattr(to_resource, checksum_field)
         matches = from_resources.filter(**{checksum_field: checksum_value})
-        for match in matches:
+        for match in get_best_checksum_matches(to_resource, matches):
             pipes.make_relationship(
                 from_resource=match,
                 to_resource=to_resource,
@@ -90,7 +121,7 @@ def java_to_class_match(project, logger=None):
         logger(f"Matching {count:,d} .class resources to .java")
 
     for to_resource in to_resources_dot_class:
-        qualified_class = to_resource.path.split("-extract/")[-1]
+        qualified_class = get_extracted_subpath(to_resource.path)
 
         if "$" in to_resource.name:  # inner class
             path_parts = Path(qualified_class.lstrip("/")).parts
@@ -110,6 +141,45 @@ def java_to_class_match(project, logger=None):
             )
 
 
+def _resource_path_match(to_resource, from_resources):
+    """
+    Create PATH_MATCH relations from `from_resources` to `to_resource`.
+
+    Successive path suffixes of `to_resource` are tried, longest first, and the
+    search stops once a suffix yields more matches than it has segments.
+    """
+    path_parts = Path(to_resource.path.lstrip("/")).parts
+    path_parts_len = len(path_parts)
+
+    for path_parts_index in range(1, path_parts_len):
+        current_parts = path_parts[path_parts_index:]
+        current_path = "/".join(current_parts)
+        # The slash "/" prefix matters during the match as we do not want to
+        # match on filenames sharing the same ending.
+        # For example: Filter.java and FastFilter.java
+        matches = from_resources.filter(path__endswith=f"/{current_path}")
+
+        if len(matches) > len(current_parts):
+            break
+
+        for match in matches:
+            relation = CodebaseRelation.objects.filter(
+                from_resource=match,
+                to_resource=to_resource,
+                relationship=CodebaseRelation.Relationship.PATH_MATCH,
+            )
+            if not relation.exists():
+                pipes.make_relationship(
+                    from_resource=match,
+                    to_resource=to_resource,
+                    relationship=CodebaseRelation.Relationship.PATH_MATCH,
+                    match_type="path",
+                    extra_data={
+                        "path_score": f"{len(current_parts)}/{path_parts_len - 1}",
+                    },
+                )
+
+
 def path_match(project, logger=None):
     """Match using path similarities."""
     project_files = project.codebaseresources.files().no_status().only("path")
@@ -129,34 +199,7 @@ def path_match(project, logger=None):
         last_percent = pipes.log_progress(
             logger, resource_index, resource_count, last_percent, increment_percent=5
         )
-
-        path_parts = Path(to_resource.path.lstrip("/")).parts
-        path_parts_len = len(path_parts)
-
-        for path_parts_index in range(1, path_parts_len):
-            current_parts = path_parts[path_parts_index:]
-            current_path = "/".join(current_parts)
-            # The slash "/" prefix matters during the match as we do not want to
-            # match on filenames sharing the same ending.
-            # For example: Filter.java and FastFilter.java
-            matches = from_resources.filter(path__endswith=f"/{current_path}")
-
-            for match in matches:
-                relation = CodebaseRelation.objects.filter(
-                    from_resource=match,
-                    to_resource=to_resource,
-                    relationship=CodebaseRelation.Relationship.PATH_MATCH,
-                )
-                if not relation.exists():
-                    pipes.make_relationship(
-                        from_resource=match,
-                        to_resource=to_resource,
-                        relationship=CodebaseRelation.Relationship.PATH_MATCH,
-                        match_type="path",
-                        extra_data={
-                            "path_score": f"{len(current_parts)}/{path_parts_len-1}",
-                        },
-                    )
+        _resource_path_match(to_resource, from_resources)
 
 
 def purldb_match(project, extensions, logger=None):
diff --git a/scanpipe/tests/pipes/test_d2d.py b/scanpipe/tests/pipes/test_d2d.py
new file mode 100644
index 000000000..f9cceaf37
--- /dev/null
+++ b/scanpipe/tests/pipes/test_d2d.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+import tempfile
+
+from django.test import TestCase
+
+from scanpipe.models import Project
+from scanpipe.pipes import d2d
+
+
+class ScanPipeD2DPipesTest(TestCase):
+    def setUp(self):
+        self.project1 = Project.objects.create(name="Analysis")
+
+    def test_scanpipe_d2d_get_inputs(self):
+        with self.assertRaises(FileNotFoundError) as error:
+            d2d.get_inputs(self.project1)
+        self.assertEqual("from* archive not found.", str(error.exception))
+
+        with tempfile.NamedTemporaryFile(prefix="from-") as input_file:
+            self.project1.copy_input_from(input_file.name)
+
+        with self.assertRaises(FileNotFoundError) as error:
+            d2d.get_inputs(self.project1)
+        self.assertEqual("to* archive not found.", str(error.exception))
+
+        with tempfile.NamedTemporaryFile(prefix="to-") as input_file:
+            self.project1.copy_input_from(input_file.name)
+
+        self.assertEqual(2, len(d2d.get_inputs(self.project1)))
+
+    def test_scanpipe_d2d_get_extracted_subpath(self):
+        path = "not/an/extracted/path/"
+        self.assertEqual(path, d2d.get_extracted_subpath(path))
+
+        path = "a.jar-extract/subpath/file.ext"
+        self.assertEqual("subpath/file.ext", d2d.get_extracted_subpath(path))
+
+        path = "a.jar-extract/subpath/b.jar-extract/subpath/file.ext"
+        self.assertEqual("subpath/file.ext", d2d.get_extracted_subpath(path))
diff --git a/scanpipe/views.py b/scanpipe/views.py
index 2f477d35b..c18bf5f23 100644
--- a/scanpipe/views.py
+++ b/scanpipe/views.py
@@ -951,6 +951,7 @@ def get_queryset(self):
             .files()
             .to_codebase()
             .prefetch_related("related_from__from_resource")
+            .distinct()
         )
         if self.request.GET.get("missing_only"):
             queryset = queryset.has_no_relation()