def is_cyclonedx_bom(input_location):
    """
    Return True if the file at `input_location` is a CycloneDX BOM.

    The file is identified from its JSON content, not from its name. It is
    considered a BOM when its top-level JSON object either declares
    ``"bomFormat": "CycloneDX"`` or has a ``$schema`` string that ends with
    ``CYCLONEDX_SCHEMA_NAME``. The ``specVersion`` value is not checked here.

    Return False, instead of raising, when the file is missing or unreadable,
    is not valid JSON, or when its top-level value is not a JSON object.
    """
    data = None

    # Read raw bytes and let `json` detect the encoding (UTF-8, UTF-16 or
    # UTF-32, with or without a byte order mark) rather than decoding with the
    # locale default used by `Path.read_text()`.
    # Only "this is not a readable JSON file" failures are suppressed:
    #   OSError: missing file, directory, permission problem
    #   ValueError: malformed JSON or undecodable bytes
    #   TypeError: `input_location` is not path-like
    #   RecursionError: pathologically nested JSON
    with suppress(OSError, RecursionError, TypeError, ValueError):
        data = json.loads(Path(input_location).read_bytes())

    # Valid JSON whose top-level value is a list, string, number... is not a BOM.
    if not isinstance(data, dict):
        return False

    # `bomFormat` is the marker the CycloneDX JSON format uses to identify
    # itself; unlike `$schema`, a BOM is not valid without it.
    if data.get("bomFormat") == "CycloneDX":
        return True

    # Fallback on the `$schema` reference; ignore non-string values.
    schema = data.get("$schema")
    return isinstance(schema, str) and schema.endswith(CYCLONEDX_SCHEMA_NAME)
def is_spdx_document(input_location):
    """
    Return True if the file at `input_location` is an SPDX document.

    The file is identified from its JSON content, not from its name: it is
    considered an SPDX document when its top-level JSON object carries a
    non-empty ``SPDXID`` value.

    Return False, instead of raising, when the file is missing or unreadable,
    is not valid JSON, or when its top-level value is not a JSON object.
    """
    data = None

    # Read raw bytes and let `json` detect the encoding (UTF-8, UTF-16 or
    # UTF-32, with or without a byte order mark) rather than decoding with the
    # locale default used by `Path.read_text()`.
    # Only "this is not a readable JSON file" failures are suppressed:
    #   OSError: missing file, directory, permission problem
    #   ValueError: malformed JSON or undecodable bytes
    #   TypeError: `input_location` is not path-like
    #   RecursionError: pathologically nested JSON
    with suppress(OSError, RecursionError, TypeError, ValueError):
        data = json.loads(Path(input_location).read_bytes())

    # Valid JSON whose top-level value is a list, string, number... has no
    # `SPDXID` property to look at.
    if not isinstance(data, dict):
        return False

    return bool(data.get("SPDXID"))
a/scanpipe/tests/data/cyclonedx/asgiref-3.3.0.json b/scanpipe/tests/data/cyclonedx/asgiref-3.3.0.json new file mode 100644 index 000000000..083eebcf0 --- /dev/null +++ b/scanpipe/tests/data/cyclonedx/asgiref-3.3.0.json @@ -0,0 +1,77 @@ +{ + "$schema": "http://cyclonedx.org/schema/bom-1.4.schema.json", + "bomFormat": "CycloneDX", + "specVersion": "1.4", + "serialNumber": "urn:uuid:b74fe5df-e965-415e-ba65-f38421a0695d", + "version": 1, + "metadata": { + "tools": [ + { + "name": "ScanCode.io", + "version": "31.0.0" + } + ], + "component": { + "type": "library", + "bom-ref": "4c7f2e2e-0c41-45d2-a6cc-7bbde9a9f440", + "name": "asgiref" + }, + "properties": [ + { + "name": "notice", + "value": "Generated with ScanCode.io and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied.\nNo content created from ScanCode.io should be considered or used as legal advice.\nConsult an Attorney for any legal advice.\nScanCode.io is a free software code scanning tool from nexB Inc. and others\nlicensed under the Apache License version 2.0.\nScanCode is a trademark of nexB Inc.\nVisit https://github.com/nexB/scancode.io for support and download.\n" + } + ] + }, + "components": [ + { + "type": "library", + "bom-ref": "pkg:pypi/asgiref@3.3.0", + "name": "asgiref", + "version": "3.3.0", + "description": "ASGI specs, helper code, and adapters\nasgiref\n=======\n\n.. image:: https://api.travis-ci.org/django/asgiref.svg\n :target: https://travis-ci.org/django/asgiref\n\n.. image:: https://img.shields.io/pypi/v/asgiref.svg\n :target: https://pypi.python.org/pypi/asgiref\n\nASGI is a standard for Python asynchronous web apps and servers to communicate\nwith each other, and positioned as an asynchronous successor to WSGI. 
You can\nread more at https://asgi.readthedocs.io/en/latest/\n\nThis package includes ASGI base libraries, such as:\n\n* Sync-to-async and async-to-sync function wrappers, ``asgiref.sync``\n* Server base classes, ``asgiref.server``\n* A WSGI-to-ASGI adapter, in ``asgiref.wsgi``\n\n\nFunction wrappers\n-----------------\n\nThese allow you to wrap or decorate async or sync functions to call them from\nthe other style (so you can call async functions from a synchronous thread,\nor vice-versa).\n\nIn particular:\n\n* AsyncToSync lets a synchronous subthread stop and wait while the async\n function is called on the main thread's event loop, and then control is\n returned to the thread when the async function is finished.\n\n* SyncToAsync lets async code call a synchronous function, which is run in\n a threadpool and control returned to the async coroutine when the synchronous\n function completes.\n\nThe idea is to make it easier to call synchronous APIs from async code and\nasynchronous APIs from synchronous code so it's easier to transition code from\none style to the other. In the case of Channels, we wrap the (synchronous)\nDjango view system with SyncToAsync to allow it to run inside the (asynchronous)\nASGI server.\n\nNote that exactly what threads things run in is very specific, and aimed to\nkeep maximum compatibility with old synchronous code. See\n\"Synchronous code & Threads\" below for a full explanation. By default,\n``sync_to_async`` will run all synchronous code in the program in the same\nthread for safety reasons; you can disable this for more performance with\n``@sync_to_async(thread_sensitive=False)``, but make sure that your code does\nnot rely on anything bound to threads (like database connections) when you do.\n\n\nThreadlocal replacement\n-----------------------\n\nThis is a drop-in replacement for ``threading.local`` that works with both\nthreads and asyncio Tasks. 
Even better, it will proxy values through from a\ntask-local context to a thread-local context when you use ``sync_to_async``\nto run things in a threadpool, and vice-versa for ``async_to_sync``.\n\nIf you instead want true thread- and task-safety, you can set\n``thread_critical`` on the Local object to ensure this instead.\n\n\nServer base classes\n-------------------\n\nIncludes a ``StatelessServer`` class which provides all the hard work of\nwriting a stateless server (as in, does not handle direct incoming sockets\nbut instead consumes external streams or sockets to work out what is happening).\n\nAn example of such a server would be a chatbot server that connects out to\na central chat server and provides a \"connection scope\" per user chatting to\nit. There's only one actual connection, but the server has to separate things\ninto several scopes for easier writing of the code.\n\nYou can see an example of this being used in `frequensgi `_.\n\n\nWSGI-to-ASGI adapter\n--------------------\n\nAllows you to wrap a WSGI application so it appears as a valid ASGI application.\n\nSimply wrap it around your WSGI application like so::\n\n asgi_application = WsgiToAsgi(wsgi_application)\n\nThe WSGI application will be run in a synchronous threadpool, and the wrapped\nASGI application will be one that accepts ``http`` class messages.\n\nPlease note that not all extended features of WSGI may be supported (such as\nfile handles for incoming POST bodies).\n\n\nDependencies\n------------\n\n``asgiref`` requires Python 3.5 or higher.\n\n\nContributing\n------------\n\nPlease refer to the\n`main Channels contributing docs `_.\n\n\nTesting\n'''''''\n\nTo run tests, make sure you have installed the ``tests`` extra with the package::\n\n cd asgiref/\n pip install -e .[tests]\n pytest\n\n\nBuilding the documentation\n''''''''''''''''''''''''''\n\nThe documentation uses `Sphinx `_::\n\n cd asgiref/docs/\n pip install sphinx\n\nTo build the docs, you can use the default tools::\n\n 
sphinx-build -b html . _build/html # or `make html`, if you've got make set up\n cd _build/html\n python -m http.server\n\n...or you can use ``sphinx-autobuild`` to run a server and rebuild/reload\nyour documentation changes automatically::\n\n pip install sphinx-autobuild\n sphinx-autobuild . _build/html\n\n\nImplementation Details\n----------------------\n\nSynchronous code & threads\n''''''''''''''''''''''''''\n\nThe ``asgiref.sync`` module provides two wrappers that let you go between\nasynchronous and synchronous code at will, while taking care of the rough edges\nfor you.\n\nUnfortunately, the rough edges are numerous, and the code has to work especially\nhard to keep things in the same thread as much as possible. Notably, the\nrestrictions we are working with are:\n\n* All synchronous code called through ``SyncToAsync`` and marked with\n ``thread_sensitive`` should run in the same thread as each other (and if the\n outer layer of the program is synchronous, the main thread)\n\n* If a thread already has a running async loop, ``AsyncToSync`` can't run things\n on that loop if it's blocked on synchronous code that is above you in the\n call stack.\n\nThe first compromise you get to might be that ``thread_sensitive`` code should\njust run in the same thread and not spawn in a sub-thread, fulfilling the first\nrestriction, but that immediately runs you into the second restriction.\n\nThe only real solution is to essentially have a variant of ThreadPoolExecutor\nthat executes any ``thread_sensitive`` code on the outermost synchronous\nthread - either the main thread, or a single spawned subthread.\n\nThis means you now have two basic states:\n\n* If the outermost layer of your program is synchronous, then all async code\n run through ``AsyncToSync`` will run in a per-call event loop in arbitary\n sub-threads, while all ``thread_sensitive`` code will run in the main thread.\n\n* If the outermost layer of your program is asynchronous, then all async code\n runs on 
the main thread's event loop, and all ``thread_sensitive`` synchronous\n code will run in a single shared sub-thread.\n\nCruicially, this means that in both cases there is a thread which is a shared\nresource that all ``thread_sensitive`` code must run on, and there is a chance\nthat this thread is currently blocked on its own ``AsyncToSync`` call. Thus,\n``AsyncToSync`` needs to act as an executor for thread code while it's blocking.\n\nThe ``CurrentThreadExecutor`` class provides this functionality; rather than\nsimply waiting on a Future, you can call its ``run_until_future`` method and\nit will run submitted code until that Future is done. This means that code\ninside the call can then run code on your thread.\n\n\nMaintenance and Security\n------------------------\n\nTo report security issues, please contact security@djangoproject.com. For GPG\nsignatures and more security process information, see\nhttps://docs.djangoproject.com/en/dev/internals/security/.\n\nTo report bugs or request new features, please open a new GitHub issue.\n\nThis repository is part of the Channels project. 
For the shepherd and maintenance team, please see the\n`main Channels readme `_.", + "licenses": [ + { + "expression": "BSD-3-Clause AND BSD-3-Clause" + } + ], + "purl": "pkg:pypi/asgiref@3.3.0", + "externalReferences": [ + { + "url": "https://pypi.org/pypi/asgiref/3.3.0/json", + "type": "bom" + }, + { + "url": "https://pypi.org/packages/source/a/asgiref/asgiref-3.3.0.tar.gz", + "type": "distribution" + }, + { + "url": "https://pypi.org/project/asgiref", + "type": "website" + } + ], + "properties": [ + { + "name": "aboutcode:homepage_url", + "value": "https://github.com/django/asgiref/" + }, + { + "name": "aboutcode:primary_language", + "value": "Python" + } + ] + } + ], + "dependencies": [ + { + "ref": "4c7f2e2e-0c41-45d2-a6cc-7bbde9a9f440", + "dependsOn": [ + "pkg:pypi/asgiref@3.3.0" + ] + }, + { + "ref": "pkg:pypi/asgiref@3.3.0", + "dependsOn": [] + } + ] +} \ No newline at end of file diff --git a/scanpipe/tests/data/manifests/toml.json b/scanpipe/tests/data/manifests/toml.json new file mode 100644 index 000000000..62cf07cc0 --- /dev/null +++ b/scanpipe/tests/data/manifests/toml.json @@ -0,0 +1,46 @@ +{ + "spdxVersion": "SPDX-2.3", + "dataLicense": "CC0-1.0", + "SPDXID": "SPDXRef-DOCUMENT", + "name": "scancodeio_import_spdx", + "documentNamespace": "https://scancode.io/spdxdocs/dd4d94a7-debe-40b6-b831-6c562ca87ede", + "creationInfo": { + "created": "2022-11-22T18:12:45Z", + "creators": [ + "Tool: ScanCode.io-31.0.0" + ], + "licenseListVersion": "3.18" + }, + "packages": [ + { + "name": "toml", + "SPDXID": "SPDXRef-scancodeio-discoveredpackage-a0d1d7e4-72c0-4c11-9509-80168acd2e62", + "downloadLocation": "https://files.pythonhosted.org/packages/be/ba/1f744cdc819428fc6b5084ec34d9b30660f6f9daaf70eead706e3203ec3c/toml-0.10.2.tar.gz", + "licenseConcluded": "MIT", + "copyrightText": "NOASSERTION", + "filesAnalyzed": false, + "versionInfo": "0.10.2", + "licenseDeclared": "MIT", + "homepage": "https://github.com/uiri/toml", + "description": "Python Library for Tom's 
Obvious, Minimal Language\n****\nTOML\n****\n\n.. image:: https://img.shields.io/pypi/v/toml\n :target: https://pypi.org/project/toml/\n\n.. image:: https://travis-ci.org/uiri/toml.svg?branch=master\n :target: https://travis-ci.org/uiri/toml\n\n.. image:: https://img.shields.io/pypi/pyversions/toml.svg\n :target: https://pypi.org/project/toml/\n\n\nA Python library for parsing and creating `TOML `_.\n\nThe module passes `the TOML test suite `_.\n\nSee also:\n\n* `The TOML Standard `_\n* `The currently supported TOML specification `_\n\nInstallation\n============\n\nTo install the latest release on `PyPI `_,\nsimply run:\n\n::\n\n pip install toml\n\nOr to install the latest development version, run:\n\n::\n\n git clone https://github.com/uiri/toml.git\n cd toml\n python setup.py install\n\nQuick Tutorial\n==============\n\n*toml.loads* takes in a string containing standard TOML-formatted data and\nreturns a dictionary containing the parsed data.\n\n.. code:: pycon\n\n >>> import toml\n >>> toml_string = \"\"\"\n ... # This is a TOML document.\n ...\n ... title = \"TOML Example\"\n ...\n ... [owner]\n ... name = \"Tom Preston-Werner\"\n ... dob = 1979-05-27T07:32:00-08:00 # First class dates\n ...\n ... [database]\n ... server = \"192.168.1.1\"\n ... ports = [ 8001, 8001, 8002 ]\n ... connection_max = 5000\n ... enabled = true\n ...\n ... [servers]\n ...\n ... # Indentation (tabs and/or spaces) is allowed but not required\n ... [servers.alpha]\n ... ip = \"10.0.0.1\"\n ... dc = \"eqdc10\"\n ...\n ... [servers.beta]\n ... ip = \"10.0.0.2\"\n ... dc = \"eqdc10\"\n ...\n ... [clients]\n ... data = [ [\"gamma\", \"delta\"], [1, 2] ]\n ...\n ... # Line breaks are OK when inside arrays\n ... hosts = [\n ... \"alpha\",\n ... \"omega\"\n ... ]\n ... \"\"\"\n >>> parsed_toml = toml.loads(toml_string)\n\n\n*toml.dumps* takes a dictionary and returns a string containing the\ncorresponding TOML-formatted data.\n\n.. 
code:: pycon\n\n >>> new_toml_string = toml.dumps(parsed_toml)\n >>> print(new_toml_string)\n title = \"TOML Example\"\n [owner]\n name = \"Tom Preston-Werner\"\n dob = 1979-05-27T07:32:00Z\n [database]\n server = \"192.168.1.1\"\n ports = [ 8001, 8001, 8002,]\n connection_max = 5000\n enabled = true\n [clients]\n data = [ [ \"gamma\", \"delta\",], [ 1, 2,],]\n hosts = [ \"alpha\", \"omega\",]\n [servers.alpha]\n ip = \"10.0.0.1\"\n dc = \"eqdc10\"\n [servers.beta]\n ip = \"10.0.0.2\"\n dc = \"eqdc10\"\n\n*toml.dump* takes a dictionary and a file descriptor and returns a string containing the\ncorresponding TOML-formatted data.\n\n.. code:: pycon\n\n >>> with open('new_toml_file.toml', 'w') as f:\n ... new_toml_string = toml.dump(parsed_toml, f)\n >>> print(new_toml_string)\n title = \"TOML Example\"\n [owner]\n name = \"Tom Preston-Werner\"\n dob = 1979-05-27T07:32:00Z\n [database]\n server = \"192.168.1.1\"\n ports = [ 8001, 8001, 8002,]\n connection_max = 5000\n enabled = true\n [clients]\n data = [ [ \"gamma\", \"delta\",], [ 1, 2,],]\n hosts = [ \"alpha\", \"omega\",]\n [servers.alpha]\n ip = \"10.0.0.1\"\n dc = \"eqdc10\"\n [servers.beta]\n ip = \"10.0.0.2\"\n dc = \"eqdc10\"\n\nFor more functions, view the API Reference below.\n\nNote\n----\n\nFor Numpy users, by default the data types ``np.floatX`` will not be translated to floats by toml, but will instead be encoded as strings. To get around this, specify the ``TomlNumpyEncoder`` when saving your data.\n\n.. 
code:: pycon\n\n >>> import toml\n >>> import numpy as np\n >>> a = np.arange(0, 10, dtype=np.double)\n >>> output = {'a': a}\n >>> toml.dumps(output)\n 'a = [ \"0.0\", \"1.0\", \"2.0\", \"3.0\", \"4.0\", \"5.0\", \"6.0\", \"7.0\", \"8.0\", \"9.0\",]\\n'\n >>> toml.dumps(output, encoder=toml.TomlNumpyEncoder())\n 'a = [ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0,]\\n'\n\nAPI Reference\n=============\n\n``toml.load(f, _dict=dict)``\n Parse a file or a list of files as TOML and return a dictionary.\n\n :Args:\n * ``f``: A path to a file, list of filepaths (to be read into single\n object) or a file descriptor\n * ``_dict``: The class of the dictionary object to be returned\n\n :Returns:\n A dictionary (or object ``_dict``) containing parsed TOML data\n\n :Raise:\n * ``TypeError``: When ``f`` is an invalid type or is a list containing\n invalid types\n * ``TomlDecodeError``: When an error occurs while decoding the file(s)\n\n``toml.loads(s, _dict=dict)``\n Parse a TOML-formatted string to a dictionary.\n\n :Args:\n * ``s``: The TOML-formatted string to be parsed\n * ``_dict``: Specifies the class of the returned toml dictionary\n\n :Returns:\n A dictionary (or object ``_dict``) containing parsed TOML data\n\n :Raise:\n * ``TypeError``: When a non-string object is passed\n * ``TomlDecodeError``: When an error occurs while decoding the\n TOML-formatted string\n\n``toml.dump(o, f, encoder=None)``\n Write a dictionary to a file containing TOML-formatted data\n\n :Args:\n * ``o``: An object to be converted into TOML\n * ``f``: A File descriptor where the TOML-formatted output should be stored\n * ``encoder``: An instance of ``TomlEncoder`` (or subclass) for encoding the object. 
If ``None``, will default to ``TomlEncoder``\n\n :Returns:\n A string containing the TOML-formatted data corresponding to object ``o``\n\n :Raise:\n * ``TypeError``: When anything other than file descriptor is passed\n\n``toml.dumps(o, encoder=None)``\n Create a TOML-formatted string from an input object\n\n :Args:\n * ``o``: An object to be converted into TOML\n * ``encoder``: An instance of ``TomlEncoder`` (or subclass) for encoding the object. If ``None``, will default to ``TomlEncoder``\n\n :Returns:\n A string containing the TOML-formatted data corresponding to object ``o``\n\n\n\nLicensing\n=========\n\nThis project is released under the terms of the MIT Open Source License. View\n*LICENSE.txt* for more information.", + "releaseDate": "2020-11-01T00:00:00Z", + "checksums": [ + { + "algorithm": "MD5", + "checksumValue": "59bce5d8d67e858735ec3f399ec90253" + } + ], + "externalRefs": [ + { + "referenceCategory": "PACKAGE-MANAGER", + "referenceType": "purl", + "referenceLocator": "pkg:pypi/toml@0.10.2" + } + ] + } + ], + "documentDescribes": [ + "SPDXRef-DOCUMENT" + ], + "comment": "Generated with ScanCode.io and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied.\nNo content created from ScanCode.io should be considered or used as legal advice.\nConsult an Attorney for any legal advice.\nScanCode.io is a free software code scanning tool from nexB Inc. 
and others\nlicensed under the Apache License version 2.0.\nScanCode is a trademark of nexB Inc.\nVisit https://github.com/nexB/scancode.io for support and download.\n" +} \ No newline at end of file diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py index 128af98b6..af081a05c 100644 --- a/scanpipe/tests/pipes/test_resolve.py +++ b/scanpipe/tests/pipes/test_resolve.py @@ -41,9 +41,15 @@ def test_scanpipe_pipes_resolve_get_default_package_type(self): input_location = self.manifest_location / "toml.spdx.json" self.assertEqual("spdx", resolve.get_default_package_type(input_location)) + input_location = self.manifest_location / "toml.json" + self.assertEqual("spdx", resolve.get_default_package_type(input_location)) + input_location = self.data_location / "cyclonedx/nested.bom.json" self.assertEqual("cyclonedx", resolve.get_default_package_type(input_location)) + input_location = self.data_location / "cyclonedx/asgiref-3.3.0.json" + self.assertEqual("cyclonedx", resolve.get_default_package_type(input_location)) + def test_scanpipe_pipes_resolve_set_license_expression(self): declared_license = {"license": "MIT"} data = resolve.set_license_expression({"declared_license": declared_license})