-
-
Notifications
You must be signed in to change notification settings - Fork 588
/
Copy pathoutput_spdx.py
401 lines (330 loc) · 13.5 KB
/
output_spdx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import os
import sys
import uuid
from datetime import datetime
from io import BytesIO
from io import StringIO
from license_expression import Licensing
from commoncode.cliutils import OUTPUT_GROUP
from commoncode.cliutils import PluggableCommandLineOption
from commoncode.fileutils import file_name
from commoncode.fileutils import parent_directory
from commoncode.text import python_safe_name
from spdx_tools.spdx.model import SpdxNoAssertion
from spdx_tools.spdx.model import Version
from spdx_tools.spdx.model import CreationInfo
from spdx_tools.spdx.model import Actor
from spdx_tools.spdx.model import ActorType
from spdx_tools.spdx.model import Document
from spdx_tools.spdx.model import Package
from spdx_tools.spdx.model import File
from spdx_tools.spdx.model import Checksum
from spdx_tools.spdx.model import ChecksumAlgorithm
from spdx_tools.spdx.model import ExtractedLicensingInfo
from spdx_tools.spdx.model import SpdxNone
from spdx_tools.spdx.model import Relationship
from spdx_tools.spdx.model import RelationshipType
from spdx_tools.spdx.spdx_element_utils import calculate_package_verification_code
from formattedcode import FileOptionType
from licensedcode.detection import get_matches_from_detection_mappings
from plugincode.output import output_impl
from plugincode.output import OutputPlugin
import scancode_config
# Tracing flags
TRACE = False
TRACE_DEEP = False
def logger_debug(*args):
pass
if TRACE or TRACE_DEEP:
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
def logger_debug(*args):
return logger.debug(
' '.join(isinstance(a, str) and a or repr(a) for a in args)
)
"""
Output plugins to write scan results in SPDX format.
"""
@output_impl
class SpdxTvOutput(OutputPlugin):
options = [
PluggableCommandLineOption(('--spdx-tv',),
type=FileOptionType(mode='w', encoding='utf-8', lazy=True),
metavar='FILE',
help='Write scan output as SPDX Tag/Value to FILE.',
help_group=OUTPUT_GROUP)
]
def is_enabled(self, spdx_tv, **kwargs):
return spdx_tv
def process_codebase(self, codebase, spdx_tv, **kwargs):
_process_codebase(
spdx_plugin=self,
codebase=codebase,
input_path=kwargs.get('input', ''),
output_file=spdx_tv,
as_tagvalue=True,
**kwargs
)
@output_impl
class SpdxRdfOutput(OutputPlugin):
options = [
PluggableCommandLineOption(('--spdx-rdf',),
type=FileOptionType(mode='w', encoding='utf-8', lazy=True),
metavar='FILE',
help='Write scan output as SPDX RDF to FILE.',
help_group=OUTPUT_GROUP)
]
def is_enabled(self, spdx_rdf, **kwargs):
return spdx_rdf
def process_codebase(self, codebase, spdx_rdf, **kwargs):
_process_codebase(
spdx_plugin=self,
codebase=codebase,
input_path=kwargs.get('input', ''),
output_file=spdx_rdf,
as_tagvalue=False,
**kwargs
)
def _process_codebase(
spdx_plugin,
codebase,
input_path,
output_file,
as_tagvalue=True,
**kwargs,
):
check_sha1(codebase)
files = spdx_plugin.get_files(codebase, **kwargs)
header = codebase.get_or_create_current_header()
tool_name = header.tool_name
tool_version = header.tool_version
notice = header.notice
package_name = build_package_name(input_path)
write_spdx(
codebase=codebase,
output_file=output_file,
files=files,
tool_name=tool_name,
tool_version=tool_version,
notice=notice,
package_name=package_name,
as_tagvalue=as_tagvalue,
)
def build_package_name(input_path):
"""
Return a package name built from an ``input_path`` path.
"""
if input_path:
absinput = absinput = os.path.abspath(input_path)
if os.path.isfile(absinput):
input_path = parent_directory(absinput)
return python_safe_name(file_name(input_path))
return 'scancode-toolkit-analyzed-package'
def check_sha1(codebase):
has_sha1 = hasattr(codebase.root, 'sha1')
if not has_sha1:
import click
click.secho(
'WARNING: Files are missing a SHA1 attribute. '
'Incomplete SPDX document created.',
err=True,
fg='red',
)
def write_spdx(
codebase,
output_file,
files,
tool_name,
tool_version,
notice,
package_name='',
download_location=SpdxNoAssertion(),
as_tagvalue=True,
spdx_version = (2, 2),
with_notice_text=False,
):
"""
Write scan output as SPDX Tag/value to ``output_file`` file-like
object using the ``files`` list of scanned file data.
Write as RDF XML if ``as_tagvalue`` is False.
Use the ``notice`` string as a notice included in a document comment.
Include the ``tool_name`` and ``tool_version`` to indicate which tool is
producing this SPDX document.
Use ``package_name`` as a Package name and as a namespace prefix base.
"""
from licensedcode import cache
licenses = cache.get_licenses_db()
licensing = Licensing()
as_rdf = not as_tagvalue
ns_prefix = '_'.join(package_name.lower().split())
comment = notice + f'\nSPDX License List: {scancode_config.spdx_license_list_version}'
version_major, version_minor = scancode_config.spdx_license_list_version.split(".")
spdx_license_list_version = Version(major=version_major, minor=version_minor)
tool_name = tool_name or 'ScanCode'
creator = Actor(ActorType.TOOL, f'{tool_name} {tool_version}')
creation_info = CreationInfo(
spdx_id="SPDXRef-DOCUMENT",
spdx_version=f"SPDX-{spdx_version[0]}.{spdx_version[1]}",
data_license='CC0-1.0',
document_comment=comment,
document_namespace=f'http://spdx.org/spdxdocs/{ns_prefix}-{uuid.uuid4()}',
license_list_version=spdx_license_list_version,
name='SPDX Document created by ScanCode Toolkit',
creators=[creator],
created=datetime.now(),
)
package_id = '001'
package = Package(
name=package_name,
download_location=download_location,
spdx_id=f'SPDXRef-{package_id}',
)
doc = Document(
creation_info=creation_info,
packages=[package],
)
# Use a set of unique copyrights for the package.
package_copyright_texts = set()
all_files_have_no_license = True
all_files_have_no_copyright = True
# FIXME: this should walk the codebase instead!!!
for sid, file_data in enumerate(files, 1):
# Skip directories.
if file_data.get('type') != 'file':
continue
# Set a relative file name as that is what we want in
# SPDX output (with explicit leading './').
name = './' + file_data.get('path')
if file_data.get('file_type') == 'empty':
checksum = Checksum(ChecksumAlgorithm.SHA1, "da39a3ee5e6b4b0d3255bfef95601890afd80709")
else:
# FIXME: this sets the checksum of a file to the empty string hash if unknown; tracked in https://github.com/nexB/scancode-toolkit/issues/3453
checksum = Checksum(ChecksumAlgorithm.SHA1, file_data.get('sha1') or 'da39a3ee5e6b4b0d3255bfef95601890afd80709')
file_entry = File(
spdx_id=f'SPDXRef-{sid}',
name=name,
checksums=[checksum]
)
file_license_detections = file_data.get('license_detections')
license_matches = get_matches_from_detection_mappings(file_license_detections)
if license_matches:
all_files_have_no_license = False
for match in license_matches:
file_license_expression = match["license_expression"]
file_license_keys = licensing.license_keys(
expression=file_license_expression,
unique=True
)
for license_key in file_license_keys:
file_license = licenses.get(license_key)
license_key = file_license.key
spdx_id = file_license.spdx_license_key
if not spdx_id:
spdx_id = f'LicenseRef-scancode-{license_key}'
is_license_ref = spdx_id.lower().startswith('licenseref-')
spdx_license = licensing.parse(spdx_id)
if is_license_ref:
text = match.get('matched_text')
# FIXME: replace this with the licensedb URL
comment = (
f'See details at https://github.com/nexB/scancode-toolkit'
f'/blob/develop/src/licensedcode/data/licenses/{license_key}.LICENSE\n'
)
extracted_license = ExtractedLicensingInfo(
license_id=spdx_id,
# always set some text, even if we did not extract the
# matched text
extracted_text=text if text else comment,
license_name=file_license.short_name,
comment=comment,
)
doc.extracted_licensing_info.append(extracted_license)
# Add licenses in the order they appear in the file. Maintaining
# the order might be useful for provenance purposes.
file_entry.license_info_in_file.append(spdx_license)
package.license_info_from_files.append(spdx_license)
elif license_matches is None:
all_files_have_no_license = False
file_entry.license_info_in_file.append(SpdxNoAssertion())
else:
file_entry.license_info_in_file.append(SpdxNone())
file_entry.license_concluded = SpdxNoAssertion()
file_copyrights = file_data.get('copyrights')
if file_copyrights:
all_files_have_no_copyright = False
copyrights = []
for file_copyright in file_copyrights:
copyrights.append(file_copyright.get('copyright'))
package_copyright_texts.update(copyrights)
# Create a text of copyright statements in the order they appear in
# the file. Maintaining the order might be useful for provenance
# purposes.
file_entry.copyright_text = '\n'.join(copyrights) + '\n'
elif file_copyrights is None:
all_files_have_no_copyright = False
file_entry.copyright_text = SpdxNoAssertion()
else:
file_entry.copyright_text = SpdxNone()
doc.files.append(file_entry)
relationship = Relationship(package.spdx_id, RelationshipType.CONTAINS, file_entry.spdx_id)
doc.relationships.append(relationship)
if not doc.files:
if as_tagvalue:
msg = "# No results for package '{}'.\n".format(package.name)
else:
# rdf
msg = "<!-- No results for package '{}'. -->\n".format(package.name)
output_file.write(msg)
# Remove duplicate licenses from the list for the package.
package.license_info_from_files = list(set(package.license_info_from_files))
if not package.license_info_from_files:
if all_files_have_no_license:
package.license_info_from_files = [SpdxNone()]
else:
package.license_info_from_files = [SpdxNoAssertion()]
else:
# List license identifiers alphabetically for the package.
package.license_info_from_files = sorted(package.license_info_from_files)
if not package_copyright_texts:
if all_files_have_no_copyright:
package.copyright_text = SpdxNone()
else:
package.copyright_text = SpdxNoAssertion()
else:
# Create a text of alphabetically sorted copyright
# statements for the package.
package.copyright_text = '\n'.join(sorted(package_copyright_texts)) + '\n'
package.verification_code = calculate_package_verification_code(doc.files)
package.license_declared = SpdxNoAssertion()
package.license_concluded = SpdxNoAssertion()
# The spdx-tools write_document returns either:
# - unicode for tag values
# - UTF8-encoded bytes for rdf because somehow the rdf and xml
# libraries do the encoding and do not return text but bytes
# The file passed by ScanCode for output is opened in text mode Therefore in
# one case we do need to deal with bytes and decode before writing (rdf) and
# in the other case we deal with text all the way.
if doc.files:
if as_tagvalue:
from spdx_tools.spdx.writer.tagvalue.tagvalue_writer import write_document_to_stream # NOQA
spdx_output = StringIO()
elif as_rdf:
from spdx_tools.spdx.writer.rdf.rdf_writer import write_document_to_stream # NOQA
# rdf is utf-encoded bytes
spdx_output = BytesIO()
write_document_to_stream(doc, spdx_output, validate=False)
result = spdx_output.getvalue()
if as_rdf:
# rdf is utf-encoded bytes
result = result.decode('utf-8')
output_file.write(result)