Add logic to find the best_checksum_matches #688

Signed-off-by: Thomas Druez <tdruez@nexb.com>
aboutcode-org · Apr 21, 2023 · 0787e1c · 0787e1c
1 parent 669bc18
commit 0787e1c
Show file tree

Hide file tree

Showing 4 changed files with 131 additions and 38 deletions.
diff --git a/scanpipe/pipelines/develop_to_deploy.py b/scanpipe/pipelines/develop_to_deploy.py
@@ -35,13 +35,13 @@ class DevelopToDeploy(Pipeline):
     @classmethod
     def steps(cls):
+        """Return the tuple of step methods that make up this pipeline."""
         return (
-            cls.get_inputs,
-            cls.extract_inputs_to_codebase_directory,
-            cls.extract_archives_in_place,
-            cls.collect_and_create_codebase_resources,
-            cls.tag_empty_and_ignored_files,
+            # NOTE(review): the input, extraction, collection and tagging steps
+            # below are commented out, leaving only matching steps active.
+            # This looks like a development shortcut -- confirm it is intended.
+            # cls.get_inputs,
+            # cls.extract_inputs_to_codebase_directory,
+            # cls.extract_archives_in_place,
+            # cls.collect_and_create_codebase_resources,
+            # cls.tag_empty_and_ignored_files,
             cls.checksum_match,
-            cls.purldb_match,
+            # cls.purldb_match,
             cls.java_to_class_match,
             cls.path_match,
         )
@@ -84,6 +84,7 @@ def tag_empty_and_ignored_files(self):
 
     def checksum_match(self):
         """Match using SHA1 checksum."""
+        # Delete the project's existing relations before matching, so this
+        # step starts from a clean state.
+        # NOTE(review): presumably added to allow re-running the matching
+        # steps during development -- confirm it should stay in the pipeline.
+        self.project.codebaserelations.all().delete()
         d2d.checksum_match(project=self.project, checksum_field="sha1", logger=self.log)
 
     def purldb_match(self):

diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py
@@ -40,14 +40,39 @@ def get_inputs(project):
     to_file = list(project.inputs("to*"))
 
     if len(from_file) != 1:
-        raise Exception("from* archive not found.")
+        raise FileNotFoundError("from* archive not found.")
 
     if len(to_file) != 1:
-        raise Exception("to* archive not found.")
+        raise FileNotFoundError("to* archive not found.")
 
     return from_file[0], to_file[0]
 
 
+def get_extracted_subpath(path):
+    """
+    Return the part of `path` located after the last `-extract/` segment.
+
+    When `path` does not contain `-extract/`, it is returned unchanged.
+    """
+    return path.split("-extract/")[-1]
+
+
+def get_best_checksum_matches(to_resource, matches):
+    """
+    Return the most relevant resources from `matches` for `to_resource`.
+
+    The candidates are narrowed down using the first rule that yields results:
+
+    1. Resources whose path ends with the extracted subpath of `to_resource`.
+    2. Resources sharing the same name as `to_resource`.
+    3. All the provided `matches`, returned as-is.
+    """
+    # Preferred: the candidate path ends with the to/ subpath found after the
+    # last "-extract/" segment (or with the whole to/ path when there is none).
+    extracted_subpath_matches = [
+        from_resource
+        for from_resource in matches
+        if from_resource.path.endswith(get_extracted_subpath(to_resource.path))
+    ]
+    if extracted_subpath_matches:
+        return extracted_subpath_matches
+
+    # Fallback: keep only the candidates with an identical file name.
+    same_name_matches = [
+        from_resource
+        for from_resource in matches
+        if from_resource.name == to_resource.name
+    ]
+    if same_name_matches:
+        return same_name_matches
+
+    # Nothing better was found: every checksum match is kept.
+    return matches
+
+
 def checksum_match(project, checksum_field, logger=None):
     """Match using checksum."""
     project_files = project.codebaseresources.files().no_status()
@@ -66,7 +91,7 @@ def checksum_match(project, checksum_field, logger=None):
     for to_resource in to_resources:
         checksum_value = getattr(to_resource, checksum_field)
         matches = from_resources.filter(**{checksum_field: checksum_value})
-        for match in matches:
+        for match in get_best_checksum_matches(to_resource, matches):
             pipes.make_relationship(
                 from_resource=match,
                 to_resource=to_resource,
@@ -90,7 +115,7 @@ def java_to_class_match(project, logger=None):
         logger(f"Matching {count:,d} .class resources to .java")
 
     for to_resource in to_resources_dot_class:
-        qualified_class = to_resource.path.split("-extract/")[-1]
+        qualified_class = get_extracted_subpath(to_resource.path)
 
         if "$" in to_resource.name:  # inner class
             path_parts = Path(qualified_class.lstrip("/")).parts
@@ -110,6 +135,39 @@ def java_to_class_match(project, logger=None):
             )
 
 
+def _resource_path_match(to_resource, from_resources):
+    """
+    Create PATH_MATCH relations between `to_resource` and the `from_resources`
+    whose path ends with a trailing portion of the `to_resource` path.
+
+    Leading path segments are dropped one at a time, trying the longest
+    trailing portion first. The loop stops as soon as a portion yields more
+    matches than it has segments. Each created relation stores a `path_score`
+    formatted as "<matched segments>/<total segments - 1>".
+    """
+    path_parts = Path(to_resource.path.lstrip("/")).parts
+    path_parts_len = len(path_parts)
+
+    # Starting at index 1 means the first segment is never part of the match,
+    # and a single-segment path produces no iteration at all.
+    for path_parts_index in range(1, path_parts_len):
+        current_parts = path_parts[path_parts_index:]
+        current_path = "/".join(current_parts)
+        # The slash "/" prefix matters during the match as we do not want to
+        # match on filenames sharing the same ending.
+        # For example: Filter.java and FastFilter.java
+        matches = from_resources.filter(path__endswith=f"/{current_path}")
+
+        # More matches than remaining segments: stop here, shorter portions
+        # are not tried.
+        # NOTE(review): presumably a heuristic to avoid ambiguous matches --
+        # confirm the intended threshold.
+        if len(matches) > len(current_parts):
+            break
+
+        for match in matches:
+            # Skip pairs already related by a PATH_MATCH to avoid duplicates.
+            relation = CodebaseRelation.objects.filter(
+                from_resource=match,
+                to_resource=to_resource,
+                relationship=CodebaseRelation.Relationship.PATH_MATCH,
+            )
+            if not relation.exists():
+                pipes.make_relationship(
+                    from_resource=match,
+                    to_resource=to_resource,
+                    relationship=CodebaseRelation.Relationship.PATH_MATCH,
+                    match_type="path",
+                    extra_data={
+                        "path_score": f"{len(current_parts)}/{path_parts_len - 1}",
+                    },
+                )
+
+
 def path_match(project, logger=None):
     """Match using path similarities."""
     project_files = project.codebaseresources.files().no_status().only("path")
@@ -129,34 +187,7 @@ def path_match(project, logger=None):
         last_percent = pipes.log_progress(
             logger, resource_index, resource_count, last_percent, increment_percent=5
         )
-
-        path_parts = Path(to_resource.path.lstrip("/")).parts
-        path_parts_len = len(path_parts)
-
-        for path_parts_index in range(1, path_parts_len):
-            current_parts = path_parts[path_parts_index:]
-            current_path = "/".join(current_parts)
-            # The slash "/" prefix matters during the match as we do not want to
-            # match on filenames sharing the same ending.
-            # For example: Filter.java and FastFilter.java
-            matches = from_resources.filter(path__endswith=f"/{current_path}")
-
-            for match in matches:
-                relation = CodebaseRelation.objects.filter(
-                    from_resource=match,
-                    to_resource=to_resource,
-                    relationship=CodebaseRelation.Relationship.PATH_MATCH,
-                )
-                if not relation.exists():
-                    pipes.make_relationship(
-                        from_resource=match,
-                        to_resource=to_resource,
-                        relationship=CodebaseRelation.Relationship.PATH_MATCH,
-                        match_type="path",
-                        extra_data={
-                            "path_score": f"{len(current_parts)}/{path_parts_len-1}",
-                        },
-                    )
+        _resource_path_match(to_resource, from_resources)
 
 
 def purldb_match(project, extensions, logger=None):

diff --git a/scanpipe/tests/pipes/test_d2d.py b/scanpipe/tests/pipes/test_d2d.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+import tempfile
+
+from django.test import TestCase
+
+from scanpipe.models import Project
+from scanpipe.pipes import d2d
+
+
+class ScanPipeD2DPipesTest(TestCase):
+    """Tests for the helper functions of the `scanpipe.pipes.d2d` module."""
+
+    def setUp(self):
+        # A project without any input file, shared by the tests below.
+        self.project1 = Project.objects.create(name="Analysis")
+
+    def test_scanpipe_d2d_get_inputs(self):
+        # No input at all: the missing from* input is reported.
+        with self.assertRaises(FileNotFoundError) as error:
+            d2d.get_inputs(self.project1)
+        self.assertEqual("from* archive not found.", str(error.exception))
+
+        _, input_location = tempfile.mkstemp(prefix="from-")
+        self.project1.copy_input_from(input_location)
+
+        # A from* input is now present: the missing to* input is reported.
+        with self.assertRaises(FileNotFoundError) as error:
+            d2d.get_inputs(self.project1)
+        self.assertEqual("to* archive not found.", str(error.exception))
+
+        _, input_location = tempfile.mkstemp(prefix="to-")
+        self.project1.copy_input_from(input_location)
+
+        # Both inputs are available: a pair of two items is returned.
+        self.assertEqual(2, len(d2d.get_inputs(self.project1)))
+
+    def test_scanpipe_d2d_get_extracted_subpath(self):
+        # Without any "-extract/" segment, the path is returned unchanged.
+        path = "not/an/extracted/path/"
+        self.assertEqual(path, d2d.get_extracted_subpath(path))
+
+        path = "a.jar-extract/subpath/file.ext"
+        self.assertEqual("subpath/file.ext", d2d.get_extracted_subpath(path))
+
+        # With nested extractions, only the last "-extract/" segment counts.
+        path = "a.jar-extract/subpath/b.jar-extract/subpath/file.ext"
+        self.assertEqual("subpath/file.ext", d2d.get_extracted_subpath(path))
diff --git a/scanpipe/views.py b/scanpipe/views.py
@@ -951,6 +951,7 @@ def get_queryset(self):
             .files()
             .to_codebase()
             .prefetch_related("related_from__from_resource")
+            .distinct()
         )
         if self.request.GET.get("missing_only"):
             queryset = queryset.has_no_relation()