Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix package scan only performance #3423

Merged
merged 3 commits into from
Jun 7, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/packagedcode/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

from commoncode import fileutils

from licensedcode.cache import build_spdx_license_expression
from licensedcode.cache import get_cache
from licensedcode.tokenize import query_tokenizer
from licensedcode.detection import detect_licenses
from licensedcode.detection import get_unknown_license_detection
Expand Down Expand Up @@ -122,6 +124,11 @@ def assemble(cls, package_data, resource, codebase, package_adder):
resource=resource,
codebase=codebase,
)
if package.declared_license_expression:
package.declared_license_expression_spdx = str(build_spdx_license_expression(
license_expression=package.declared_license_expression,
licensing=get_cache().licensing,
))

cls.assign_package_to_resources(
package=package,
Expand All @@ -132,6 +139,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):

yield package


# we yield this as we do not want this further processed
yield resource

Expand Down
32 changes: 6 additions & 26 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))


def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
def add_referenced_license_matches_for_package(resource, codebase):
"""
Return an updated ``resource`` saving it in place, after adding new license
detections to the package manifests detected in this resource, following their
Expand Down Expand Up @@ -106,13 +106,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
if not referenced_resource:
continue

if no_licenses:
referenced_license_detections = get_license_detection_mappings(
location=referenced_resource.location
)

else:
referenced_license_detections = referenced_resource.license_detections
referenced_license_detections = referenced_resource.license_detections

if referenced_license_detections:
modified = True
Expand Down Expand Up @@ -160,7 +154,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
yield resource


def add_referenced_license_detection_from_package(resource, codebase, no_licenses):
def add_referenced_license_detection_from_package(resource, codebase):
"""
Return an updated ``resource`` saving it in place, after adding new license
matches (licenses and license_expressions) following their Rule
Expand Down Expand Up @@ -209,7 +203,6 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
sibling_license_detections, _le = get_license_detections_from_sibling_file(
resource=root_resource,
codebase=codebase,
no_licenses=no_licenses,
)
if TRACE:
logger_debug(
Expand Down Expand Up @@ -278,12 +271,10 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
yield resource


def add_license_from_sibling_file(resource, codebase, no_licenses):
def add_license_from_sibling_file(resource, codebase):
"""
Given a resource and its codebase object, assign licenses to the package
detections in that resource, from its sibling files.

If `no_license` is True, then license scan (for resources) is disabled.
"""
if TRACE:
logger_debug(f'packagedcode.licensing: add_license_from_sibling_file: resource: {resource.path}')
Expand All @@ -303,7 +294,6 @@ def add_license_from_sibling_file(resource, codebase, no_licenses):
license_detections, license_expression = get_license_detections_from_sibling_file(
resource=resource,
codebase=codebase,
no_licenses=no_licenses,
)
if not license_detections:
return
Expand Down Expand Up @@ -333,13 +323,11 @@ def is_legal_or_readme(resource):
return False


def get_license_detections_from_sibling_file(resource, codebase, no_licenses):
def get_license_detections_from_sibling_file(resource, codebase):
"""
Return `license_detections`, a list of LicenseDetection objects and a
`license_expression`, given a resource and its codebase object, from
the sibling files of the resource.

If `no_license` is True, then license scan (for resources) is disabled.
"""
siblings = []

Expand All @@ -357,15 +345,7 @@ def get_license_detections_from_sibling_file(resource, codebase, no_licenses):

license_detections = []
for sibling in siblings:
if no_licenses:
detections = get_license_detection_mappings(
location=sibling.location,
analysis=DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE.value,
post_scan=True,
)
license_detections.extend(detections)
else:
license_detections.extend(sibling.license_detections)
license_detections.extend(sibling.license_detections)

if not license_detections:
return [], None
Expand Down
25 changes: 12 additions & 13 deletions src/packagedcode/plugin_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,17 +194,19 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):
Also perform additional package license detection that depends on either
file license detection or the package detections.
"""
no_licenses = False
has_licenses = hasattr(codebase.root, 'license_detections')

# These steps add proper license detections to package_data and hence
# this is performed before top level packages creation
for resource in codebase.walk(topdown=False):
if not hasattr(resource, 'license_detections'):
no_licenses = True
if not has_licenses:
#TODO: Add the steps where we detect licenses from files for only a package scan
# in the multiprocessing get_package_data API function
continue

# If we don't detect a license in package_data but there is a license detected in the file,
# we add the license expression from the file to the package
modified = add_license_from_file(resource, codebase, no_licenses)
modified = add_license_from_file(resource, codebase)
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_license_from_file: modified: {modified}')

Expand All @@ -213,30 +215,30 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):

# If there are referenced files in an extracted license statement, we follow
# the references, look for license detections and add them back
modified = list(add_referenced_license_matches_for_package(resource, codebase, no_licenses))
modified = list(add_referenced_license_matches_for_package(resource, codebase))
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_for_package: modified: {modified}')

# If there is a LICENSE file on the same level as the manifest, and no license
# is detected in the package_data, we add the license from the file
modified = add_license_from_sibling_file(resource, codebase, no_licenses)
modified = add_license_from_sibling_file(resource, codebase)
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_license_from_sibling_file: modified: {modified}')

# Create codebase-level packages and dependencies
create_package_and_deps(codebase, strip_root=strip_root, **kwargs)

if not no_licenses:
if has_licenses:
# This step is dependent on top level packages
for resource in codebase.walk(topdown=False):
# If there is an unknown reference to a package we add the license
# from the package license detection
modified = list(add_referenced_license_detection_from_package(resource, codebase, no_licenses))
modified = list(add_referenced_license_detection_from_package(resource, codebase))
if TRACE and modified:
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_from_package: modified: {modified}')


def add_license_from_file(resource, codebase, no_licenses):
def add_license_from_file(resource, codebase):
"""
Given a Resource, check if the detected package_data doesn't have license detections
and the file has license detections, and if so, populate the package_data license
Expand All @@ -248,10 +250,7 @@ def add_license_from_file(resource, codebase, no_licenses):
if not resource.is_file:
return

if no_licenses:
license_detections_file = get_license_detection_mappings(location=resource.location)
else:
license_detections_file = resource.license_detections
license_detections_file = resource.license_detections

if TRACE:
logger_debug(f'add_license_from_file: license_detections_file: {license_detections_file}')
Expand Down
6 changes: 3 additions & 3 deletions tests/formattedcode/test_output_cyclonedx.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,22 +228,22 @@ def test_cyclonedx_plugin_does_not_fail_without_packages():
def test_cyclonedx_plugin_json():
test_dir = test_env.get_test_loc('cyclonedx/simple')
result_file = test_env.get_temp_file('cyclonedx.json')
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
run_scan_click(['--package', test_dir, '--cyclonedx', result_file])
expected_file = test_env.get_test_loc('cyclonedx/simple-expected.json')
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)


def test_cyclonedx_plugin_json_simple_package_icu():
test_dir = test_env.get_test_loc('cyclonedx/simple-icu')
result_file = test_env.get_temp_file('cyclonedx.json')
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
run_scan_click(['--package', '--license', test_dir, '--cyclonedx', result_file])
expected_file = test_env.get_test_loc('cyclonedx/simple-icu-expected.json')
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)


def test_cyclonedx_plugin_xml_components_and_dependencies_are_serialized_correctly():
test_dir = test_env.get_test_loc('cyclonedx/simple')
result_file = test_env.get_temp_file('cyclonedx.xml')
run_scan_click(['-p', test_dir, '--cyclonedx-xml', result_file])
run_scan_click(['--package', test_dir, '--cyclonedx-xml', result_file])
expected_file = test_env.get_test_loc('cyclonedx/expected.xml')
check_cyclone_xml_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)
26 changes: 3 additions & 23 deletions tests/packagedcode/data/build/buck/end2end-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -247,29 +247,9 @@
"vcs_url": null,
"copyright": null,
"holder": null,
"declared_license_expression": "apache-2.0",
"declared_license_expression_spdx": "Apache-2.0",
"license_detections": [
{
"license_expression": "apache-2.0",
"matches": [
{
"score": 100.0,
"start_line": 1,
"end_line": 1,
"matched_length": 3,
"match_coverage": 100.0,
"matcher": "1-hash",
"license_expression": "apache-2.0",
"rule_identifier": "spdx_license_id_apache-2.0_for_apache-2.0.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE",
"matched_text": "apache-2.0"
}
],
"identifier": "apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8"
}
],
"declared_license_expression": null,
"declared_license_expression_spdx": null,
"license_detections": [],
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
Expand Down
Loading