Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix package scan only performance #3423

Merged
merged 3 commits into from
Jun 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,17 @@ v32.1.0 (next, roadmap)
See https://github.com/nexB/scancode-toolkit/issues/1745


v32.0.3 - 2023-05-26
v32.0.4 - 2023-06-07
---------------------

This is a minor bugfix release with the following updates:

- Fixes a performance issue arising out of license detection
on files happening in a single-threaded process_codebase step when the
license CLI option is disabled for a package scan.
Reference: https://github.com/nexB/scancode-toolkit/pull/3423

v32.0.3 - 2023-06-06
---------------------

This is a minor bugfix release with the following updates:
Expand Down
2 changes: 1 addition & 1 deletion setup-mini.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scancode-toolkit-mini
version = 32.0.3
version = 32.0.4
license = Apache-2.0 AND CC-BY-4.0 AND LicenseRef-scancode-other-permissive AND LicenseRef-scancode-other-copyleft

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scancode-toolkit
version = 32.0.3
version = 32.0.4
license = Apache-2.0 AND CC-BY-4.0 AND LicenseRef-scancode-other-permissive AND LicenseRef-scancode-other-copyleft

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
8 changes: 8 additions & 0 deletions src/packagedcode/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

from commoncode import fileutils

from licensedcode.cache import build_spdx_license_expression
from licensedcode.cache import get_cache
from licensedcode.tokenize import query_tokenizer
from licensedcode.detection import detect_licenses
from licensedcode.detection import get_unknown_license_detection
Expand Down Expand Up @@ -122,6 +124,11 @@ def assemble(cls, package_data, resource, codebase, package_adder):
resource=resource,
codebase=codebase,
)
if package.declared_license_expression:
package.declared_license_expression_spdx = str(build_spdx_license_expression(
license_expression=package.declared_license_expression,
licensing=get_cache().licensing,
))

cls.assign_package_to_resources(
package=package,
Expand All @@ -132,6 +139,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):

yield package


# we yield this as we do not want this further processed
yield resource

Expand Down
32 changes: 6 additions & 26 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))


def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
def add_referenced_license_matches_for_package(resource, codebase):
"""
Return an updated ``resource`` saving it in place, after adding new license
detections to the package manifests detected in this resource, following their
Expand Down Expand Up @@ -106,13 +106,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
if not referenced_resource:
continue

if no_licenses:
referenced_license_detections = get_license_detection_mappings(
location=referenced_resource.location
)

else:
referenced_license_detections = referenced_resource.license_detections
referenced_license_detections = referenced_resource.license_detections

if referenced_license_detections:
modified = True
Expand Down Expand Up @@ -160,7 +154,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
yield resource


def add_referenced_license_detection_from_package(resource, codebase, no_licenses):
def add_referenced_license_detection_from_package(resource, codebase):
"""
Return an updated ``resource`` saving it in place, after adding new license
matches (licenses and license_expressions) following their Rule
Expand Down Expand Up @@ -209,7 +203,6 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
sibling_license_detections, _le = get_license_detections_from_sibling_file(
resource=root_resource,
codebase=codebase,
no_licenses=no_licenses,
)
if TRACE:
logger_debug(
Expand Down Expand Up @@ -278,12 +271,10 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
yield resource


def add_license_from_sibling_file(resource, codebase, no_licenses):
def add_license_from_sibling_file(resource, codebase):
"""
Given a resource and it's codebase object, assign licenses to the package
detections in that resource, from the sibling files of it.

If `no_license` is True, then license scan (for resources) is disabled.
"""
if TRACE:
logger_debug(f'packagedcode.licensing: add_license_from_sibling_file: resource: {resource.path}')
Expand All @@ -303,7 +294,6 @@ def add_license_from_sibling_file(resource, codebase, no_licenses):
license_detections, license_expression = get_license_detections_from_sibling_file(
resource=resource,
codebase=codebase,
no_licenses=no_licenses,
)
if not license_detections:
return
Expand Down Expand Up @@ -333,13 +323,11 @@ def is_legal_or_readme(resource):
return False


def get_license_detections_from_sibling_file(resource, codebase, no_licenses):
def get_license_detections_from_sibling_file(resource, codebase):
"""
Return `license_detections`, a list of LicenseDetection objects and a
`license_expression`, given a resource and it's codebase object, from
the sibling files of the resource.

If `no_license` is True, then license scan (for resources) is disabled.
"""
siblings = []

Expand All @@ -357,15 +345,7 @@ def get_license_detections_from_sibling_file(resource, codebase, no_licenses):

license_detections = []
for sibling in siblings:
if no_licenses:
detections = get_license_detection_mappings(
location=sibling.location,
analysis=DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE.value,
post_scan=True,
)
license_detections.extend(detections)
else:
license_detections.extend(sibling.license_detections)
license_detections.extend(sibling.license_detections)

if not license_detections:
return [], None
Expand Down
25 changes: 12 additions & 13 deletions src/packagedcode/plugin_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,17 +194,19 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):
Also perform additional package license detection that depends on either
file license detection or the package detections.
"""
no_licenses = False
has_licenses = hasattr(codebase.root, 'license_detections')

# These steps add proper license detections to package_data and hence
# this is performed before top level packages creation
for resource in codebase.walk(topdown=False):
if not hasattr(resource, 'license_detections'):
no_licenses = True
if not has_licenses:
#TODO: Add the steps where we detect licenses from files for only a package scan
# in the multiprocessing get_package_data API function
continue

# If we don't detect license in package_data but there is license detected in file
# we add the license expression from the file to a package
modified = add_license_from_file(resource, codebase, no_licenses)
modified = add_license_from_file(resource, codebase)
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_license_from_file: modified: {modified}')

Expand All @@ -213,30 +215,30 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):

# If there is referenced files in a extracted license statement, we follow
# the references, look for license detections and add them back
modified = list(add_referenced_license_matches_for_package(resource, codebase, no_licenses))
modified = list(add_referenced_license_matches_for_package(resource, codebase))
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_for_package: modified: {modified}')

# If there is a LICENSE file on the same level as the manifest, and no license
# is detected in the package_data, we add the license from the file
modified = add_license_from_sibling_file(resource, codebase, no_licenses)
modified = add_license_from_sibling_file(resource, codebase)
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_license_from_sibling_file: modified: {modified}')

# Create codebase-level packages and dependencies
create_package_and_deps(codebase, strip_root=strip_root, **kwargs)

if not no_licenses:
if has_licenses:
# This step is dependent on top level packages
for resource in codebase.walk(topdown=False):
# If there is a unknown reference to a package we add the license
# from the package license detection
modified = list(add_referenced_license_detection_from_package(resource, codebase, no_licenses))
modified = list(add_referenced_license_detection_from_package(resource, codebase))
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_from_package: modified: {modified}')


def add_license_from_file(resource, codebase, no_licenses):
def add_license_from_file(resource, codebase):
"""
Given a Resource, check if the detected package_data doesn't have license detections
and the file has license detections, and if so, populate the package_data license
Expand All @@ -248,10 +250,7 @@ def add_license_from_file(resource, codebase, no_licenses):
if not resource.is_file:
return

if no_licenses:
license_detections_file = get_license_detection_mappings(location=resource.location)
else:
license_detections_file = resource.license_detections
license_detections_file = resource.license_detections

if TRACE:
logger_debug(f'add_license_from_file: license_detections_file: {license_detections_file}')
Expand Down
5 changes: 3 additions & 2 deletions src/scancode_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,12 @@ def _create_dir(location):
# 4. hardcoded This is the default, fallback version in case package is not installed or we
# do not have a proper version otherwise.
if not __version__:
__version__ = '32.0.3'
__version__ = '32.0.4'

#######################
# used to warn user when the version is out of date
__release_date__ = datetime.datetime(2023, 6, 6)
# this is (year, month, day)
__release_date__ = datetime.datetime(2023, 6, 7)

# See https://github.com/nexB/scancode-toolkit/issues/2653 for more information
# on the data format version
Expand Down
6 changes: 3 additions & 3 deletions tests/formattedcode/test_output_cyclonedx.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,22 +228,22 @@ def test_cyclonedx_plugin_does_not_fail_without_packages():
def test_cyclonedx_plugin_json():
test_dir = test_env.get_test_loc('cyclonedx/simple')
result_file = test_env.get_temp_file('cyclonedx.json')
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
run_scan_click(['--package', test_dir, '--cyclonedx', result_file])
expected_file = test_env.get_test_loc('cyclonedx/simple-expected.json')
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)


def test_cyclonedx_plugin_json_simple_package_icu():
test_dir = test_env.get_test_loc('cyclonedx/simple-icu')
result_file = test_env.get_temp_file('cyclonedx.json')
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
run_scan_click(['--package', '--license', test_dir, '--cyclonedx', result_file])
expected_file = test_env.get_test_loc('cyclonedx/simple-icu-expected.json')
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)


def test_cyclonedx_plugin_xml_components_and_dependencies_are_serialized_correctly():
test_dir = test_env.get_test_loc('cyclonedx/simple')
result_file = test_env.get_temp_file('cyclonedx.xml')
run_scan_click(['-p', test_dir, '--cyclonedx-xml', result_file])
run_scan_click(['--package', test_dir, '--cyclonedx-xml', result_file])
expected_file = test_env.get_test_loc('cyclonedx/expected.xml')
check_cyclone_xml_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)
26 changes: 3 additions & 23 deletions tests/packagedcode/data/build/buck/end2end-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -247,29 +247,9 @@
"vcs_url": null,
"copyright": null,
"holder": null,
"declared_license_expression": "apache-2.0",
"declared_license_expression_spdx": "Apache-2.0",
"license_detections": [
{
"license_expression": "apache-2.0",
"matches": [
{
"score": 100.0,
"start_line": 1,
"end_line": 1,
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "1-hash",
"license_expression": "apache-2.0",
"rule_identifier": "spdx_license_id_apache-2.0_for_apache-2.0.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE",
"matched_text": "apache-2.0"
}
],
"identifier": "apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8"
}
],
"declared_license_expression": null,
"declared_license_expression_spdx": null,
"license_detections": [],
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
Expand Down
Loading