Skip to content

Commit

Permalink
Add step to tag resources based on content and name #688
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Druez <tdruez@nexb.com>
  • Loading branch information
tdruez committed Apr 21, 2023
1 parent f427d6f commit 669bc18
Show file tree
Hide file tree
Showing 10 changed files with 143 additions and 26 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generated by Django 4.2 on 2023-04-21 13:01

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add a database index on the ``status`` field of ``codebaseresource``."""

    # Must run after the scanpipe migration listed here.
    dependencies = [
        ("scanpipe", "0030_codebaserelation_scanpipe_co_relatio_40d26d_idx_and_more"),
    ]

    # Single schema operation: create the explicitly named index on ``status``.
    operations = [
        migrations.AddIndex(
            model_name="codebaseresource",
            index=models.Index(fields=["status"], name="scanpipe_co_status_20d02b_idx"),
        ),
    ]
1 change: 1 addition & 0 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1639,6 +1639,7 @@ class Meta:
models.Index(fields=["path"]),
models.Index(fields=["name"]),
models.Index(fields=["extension"]),
models.Index(fields=["status"]),
models.Index(fields=["programming_language"]),
models.Index(fields=["sha1"]),
]
Expand Down
8 changes: 8 additions & 0 deletions scanpipe/pipelines/develop_to_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from scanpipe.pipes import d2d
from scanpipe.pipes import purldb
from scanpipe.pipes import scancode
from scanpipe.pipes import tag
from scanpipe.pipes.scancode import extract_archives


Expand All @@ -38,6 +39,7 @@ def steps(cls):
cls.extract_inputs_to_codebase_directory,
cls.extract_archives_in_place,
cls.collect_and_create_codebase_resources,
cls.tag_empty_and_ignored_files,
cls.checksum_match,
cls.purldb_match,
cls.java_to_class_match,
Expand Down Expand Up @@ -74,6 +76,12 @@ def collect_and_create_codebase_resources(self):
for resource_path in self.project.walk_codebase_path():
pipes.make_codebase_resource(project=self.project, location=resource_path)

def tag_empty_and_ignored_files(self):
    """Tag empty files, then resources ignored by filename or by extension."""
    project = self.project

    # Order is kept as-is: the empty-file pass runs before the name and
    # extension passes, which only look at `no_status()` resources.
    tag.tag_empty_codebase_resources(project)
    tag.tag_ignored_filenames(project, filenames=d2d.IGNORE_FILENAMES)
    tag.tag_ignored_extensions(project, extensions=d2d.IGNORE_EXTENSIONS)

def checksum_match(self):
"""Match using SHA1 checksum."""
d2d.checksum_match(project=self.project, checksum_field="sha1", logger=self.log)
Expand Down
3 changes: 2 additions & 1 deletion scanpipe/pipelines/root_filesystems.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import rootfs
from scanpipe.pipes import scancode
from scanpipe.pipes import tag


class RootFS(Pipeline):
Expand Down Expand Up @@ -95,7 +96,7 @@ def tag_uninteresting_codebase_resources(self):

def tag_empty_files(self):
"""Flag empty files."""
rootfs.tag_empty_codebase_resources(self.project)
tag.tag_empty_codebase_resources(self.project)

def scan_for_application_packages(self):
"""Scan unknown resources for packages information."""
Expand Down
4 changes: 2 additions & 2 deletions scanpipe/pipelines/scan_codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@

from scanpipe import pipes
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import rootfs
from scanpipe.pipes import scancode
from scanpipe.pipes import tag
from scanpipe.pipes.input import copy_inputs


Expand Down Expand Up @@ -81,7 +81,7 @@ def collect_and_create_codebase_resources(self):

def tag_empty_files(self):
"""Flag empty files."""
rootfs.tag_empty_codebase_resources(self.project)
tag.tag_empty_codebase_resources(self.project)

def scan_for_application_packages(self):
"""Scan unknown resources for packages information."""
Expand Down
12 changes: 8 additions & 4 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
FROM = "from/"
TO = "to/"

IGNORE_FILENAMES = ("packageinfo",)

IGNORE_EXTENSIONS = ()


def get_inputs(project):
"""Locate the `from` and `to` archives in project inputs directory."""
Expand All @@ -46,7 +50,7 @@ def get_inputs(project):

def checksum_match(project, checksum_field, logger=None):
"""Match using checksum."""
project_files = project.codebaseresources.files().not_empty()
project_files = project.codebaseresources.files().no_status()
from_resources = project_files.from_codebase().has_value(checksum_field)
to_resources = (
project_files.to_codebase().has_value(checksum_field).has_no_relation()
Expand Down Expand Up @@ -76,7 +80,7 @@ def java_to_class_match(project, logger=None):
from_extension = ".java"
to_extension = ".class"

project_files = project.codebaseresources.files()
project_files = project.codebaseresources.files().no_status()
from_resources = project_files.from_codebase()
to_resources = project_files.to_codebase().has_no_relation()

Expand Down Expand Up @@ -108,7 +112,7 @@ def java_to_class_match(project, logger=None):

def path_match(project, logger=None):
"""Match using path similarities."""
project_files = project.codebaseresources.files().not_empty().only("path")
project_files = project.codebaseresources.files().no_status().only("path")
from_resources = project_files.from_codebase()
to_resources = project_files.to_codebase().has_no_relation()
resource_count = to_resources.count()
Expand Down Expand Up @@ -158,8 +162,8 @@ def path_match(project, logger=None):
def purldb_match(project, extensions, logger=None):
to_resources = (
project.codebaseresources.files()
.not_empty()
.to_codebase()
.no_status()
.has_value("sha1")
.filter(extension__in=extensions)
)
Expand Down
6 changes: 0 additions & 6 deletions scanpipe/pipes/rootfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,6 @@ def match_not_analyzed(
matchable.save()


def tag_empty_codebase_resources(project):
"""Tags empty files as ignored."""
qs = project.codebaseresources.files().empty()
qs.filter(status__in=("", "not-analyzed")).update(status="ignored-empty-file")


def tag_uninteresting_codebase_resources(project):
"""
Check any file that doesn’t belong to any system package and determine if it's:
Expand Down
43 changes: 43 additions & 0 deletions scanpipe/pipes/tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.


def tag_empty_codebase_resources(project):
    """
    Set the "ignored-empty-file" status on the empty file resources of `project`.

    Only resources whose current status is blank or "not-analyzed" are updated.
    Return the value of the queryset ``update()`` call (for a Django queryset,
    the number of matched rows).
    """
    taggable_statuses = ("", "not-analyzed")
    empty_files = project.codebaseresources.files().empty()
    candidates = empty_files.filter(status__in=taggable_statuses)
    return candidates.update(status="ignored-empty-file")


def tag_ignored_filenames(project, filenames):
    """
    Set the "ignored-filename" status on resources whose name is in `filenames`.

    Only resources returned by ``no_status()`` are considered.
    Return the value of the queryset ``update()`` call (for a Django queryset,
    the number of matched rows).
    """
    untagged = project.codebaseresources.no_status()
    matching = untagged.filter(name__in=filenames)
    return matching.update(status="ignored-filename")


def tag_ignored_extensions(project, extensions):
    """
    Set the "ignored-extension" status on resources whose extension is in
    `extensions`.

    Only resources returned by ``no_status()`` are considered.
    Return the value of the queryset ``update()`` call (for a Django queryset,
    the number of matched rows).
    """
    untagged = project.codebaseresources.no_status()
    matching = untagged.filter(extension__in=extensions)
    return matching.update(status="ignored-extension")
13 changes: 0 additions & 13 deletions scanpipe/tests/pipes/test_rootfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,6 @@ def test_scanpipe_pipes_rootfs_from_project_codebase_class_method(self):
self.assertEqual("windows", distro.os)
self.assertEqual("windows", distro.identifier)

def test_scanpipe_pipes_rootfs_tag_empty_codebase_resources(self):
p1 = Project.objects.create(name="Analysis")
resource1 = CodebaseResource.objects.create(project=p1, path="dir/")
resource2 = CodebaseResource.objects.create(
project=p1, path="filename.ext", type=CodebaseResource.Type.FILE
)

rootfs.tag_empty_codebase_resources(p1)
resource1.refresh_from_db()
resource2.refresh_from_db()
self.assertEqual("", resource1.status)
self.assertEqual("ignored-empty-file", resource2.status)

def test_scanpipe_pipes_rootfs_tag_uninteresting_codebase_resources(self):
p1 = Project.objects.create(name="Analysis")
resource1 = CodebaseResource.objects.create(project=p1, path="filename.ext")
Expand Down
63 changes: 63 additions & 0 deletions scanpipe/tests/pipes/test_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from django.test import TestCase

from scanpipe.models import CodebaseResource
from scanpipe.models import Project
from scanpipe.pipes import tag


class ScanPipeTagPipesTest(TestCase):
    """Exercise the status tagging helpers of ``scanpipe.pipes.tag``."""

    def setUp(self):
        """Create a project with one path-only resource and one file resource."""
        project = Project.objects.create(name="Analysis")
        self.project1 = project
        self.resource1 = CodebaseResource.objects.create(project=project, path="dir/")
        file_attributes = {
            "type": CodebaseResource.Type.FILE,
            "path": "dir/filename.ext",
            "name": "filename.ext",
            "extension": ".ext",
        }
        self.resource2 = CodebaseResource.objects.create(
            project=project, **file_attributes
        )

    def _reload_statuses(self):
        """Refresh both resources from the database and return their statuses."""
        for resource in (self.resource1, self.resource2):
            resource.refresh_from_db()
        return self.resource1.status, self.resource2.status

    def test_scanpipe_pipes_tag_tag_empty_codebase_resources(self):
        tag.tag_empty_codebase_resources(self.project1)
        first_status, second_status = self._reload_statuses()
        self.assertEqual("", first_status)
        self.assertEqual("ignored-empty-file", second_status)

    def test_scanpipe_pipes_tag_tag_ignored_filenames(self):
        ignored_names = [self.resource2.name]
        tag.tag_ignored_filenames(self.project1, filenames=ignored_names)
        first_status, second_status = self._reload_statuses()
        self.assertEqual("", first_status)
        self.assertEqual("ignored-filename", second_status)

    def test_scanpipe_pipes_tag_tag_ignored_extensions(self):
        ignored_extensions = [self.resource2.extension]
        tag.tag_ignored_extensions(self.project1, extensions=ignored_extensions)
        first_status, second_status = self._reload_statuses()
        self.assertEqual("", first_status)
        self.assertEqual("ignored-extension", second_status)

0 comments on commit 669bc18

Please sign in to comment.