Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update debian package manifest parsing #3647

Merged
merged 8 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ commoncode==31.0.3
construct==2.10.68
container-inspector==31.1.0
cryptography==37.0.4
debian-inspector==31.0.0
debian-inspector==31.1.0
dockerfile-parse==1.2.0
dparse2==0.7.0
extractcode==31.0.0
Expand Down
2 changes: 1 addition & 1 deletion setup-mini.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ install_requires =
colorama >= 0.3.9
commoncode >= 31.0.2
container-inspector >= 31.0.0
debian-inspector >= 31.0.0
debian-inspector >= 31.1.0
dparse2 >= 0.7.0
fasteners
fingerprints >= 0.6.0
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ install_requires =
colorama >= 0.3.9
commoncode >= 31.0.3
container-inspector >= 31.0.0
debian-inspector >= 31.0.0
debian-inspector >= 31.1.0
dparse2 >= 0.7.0
fasteners
fingerprints >= 0.6.0
Expand Down
8 changes: 6 additions & 2 deletions src/packagedcode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,13 +208,17 @@
debian_copyright.DebianCopyrightFileInPackageHandler,
debian_copyright.DebianCopyrightFileInSourceHandler,

# TODO: consider activating? debian_copyright.StandaloneDebianCopyrightFileHandler,

debian.DebianDistrolessInstalledDatabaseHandler,

debian.DebianInstalledFilelistHandler,
debian.DebianInstalledMd5sumFilelistHandler,
debian.DebianInstalledStatusDatabaseHandler,
debian.DebianControlFileInSourceHandler,
debian.DebianDscFileHandler,
debian.DebianSourcePackageTarballHandler,
debian.DebianSourcePackageMetadataTarballHandler,
debian.DebianDebPackageHandler,
debian_copyright.StandaloneDebianCopyrightFileHandler
]

if on_linux:
Expand Down
122 changes: 102 additions & 20 deletions src/packagedcode/debian.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import os
import logging
from collections import Counter
from pathlib import Path

from commoncode import fileutils
Expand Down Expand Up @@ -137,6 +138,7 @@ def parse(cls, location):
debian_data=get_paragraph_data_from_file(location=location),
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
distro='debian',
)

@classmethod
Expand All @@ -157,15 +159,19 @@ class DebianControlFileInSourceHandler(models.DatafileHandler):

@classmethod
def parse(cls, location):
# TODO: we cannot know the distro from the name only
# NOTE: a control file in a source repo or debian.tar tarball can contain more than one package
debian_packages = []
for debian_data in get_paragraphs_data_from_file(location=location):
yield build_package_data(
debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
debian_packages.append(
build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
)

yield from populate_debian_namespace(debian_packages)

@classmethod
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
# two levels up
Expand All @@ -191,11 +197,19 @@ def parse(cls, location):
location=location,
remove_pgp_signature=True,
)
yield build_package_data(

package_data_from_file = build_package_data_from_package_filename(
filename=os.path.basename(location),
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
package_data = build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
package_data.update_purl_fields(package_data=package_data_from_file)
yield package_data

@classmethod
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
Expand All @@ -214,13 +228,18 @@ class DebianInstalledStatusDatabaseHandler(models.DatafileHandler):
def parse(cls, location):
# note that we do not know yet the distro at this stage
# we could get it... but we get that later during assemble()
for debian_data in get_paragraphs_data_from_file(location):
yield build_package_data(
debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
debian_packages = []
for debian_data in get_paragraphs_data_from_file(location=location):
debian_packages.append(
build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
)

yield from populate_debian_namespace(debian_packages)

@classmethod
def assemble(cls, package_data, resource, codebase, package_adder):
# get the root resource of the rootfs
Expand Down Expand Up @@ -260,7 +279,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):

# We only need to adjust the md5sum/list path in the case of `same`
qualifiers = package_data.qualifiers or {}
architecture = qualifiers.get('architecture')
architecture = qualifiers.get('arch')

multi_arch = package_data.extra_data.get('multi_arch')

Expand Down Expand Up @@ -305,6 +324,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
package.update(
package_data=package_data,
datafile_path=res.path,
check_compatible=False,
replace=False,
include_version=False,
include_qualifiers=False,
Expand Down Expand Up @@ -379,14 +399,18 @@ def parse(cls, location):
rootfs installation. distroless is derived from Debian but each package
has its own status file.
"""
for debian_data in get_paragraphs_data_from_file(location):
yield build_package_data(
debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
distro='distroless',
debian_packages = []
for debian_data in get_paragraphs_data_from_file(location=location):
debian_packages.append(
build_package_data(
debian_data=debian_data,
datasource_id=cls.datasource_id,
package_type=cls.default_package_type,
)
)

yield from populate_debian_namespace(debian_packages)

@classmethod
def assemble(cls, package_data, resource, codebase, package_adder):
# get the root resource of the rootfs
Expand Down Expand Up @@ -523,6 +547,9 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
"""

# TODO: we cannot know the distro from the name only
# PURLs without a namespace are invalid, so we need to
# have a default value for this
distro = 'debian'
deb = DebArchive.from_filename(filename=filename)

if deb.architecture:
Expand All @@ -538,6 +565,7 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
datasource_id=datasource_id,
type=package_type,
name=deb.name,
namespace=distro,
version=version,
qualifiers=qualifiers,
)
Expand Down Expand Up @@ -598,7 +626,7 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
qualifiers = {}
architecture = debian_data.get('architecture')
if architecture:
qualifiers['architecture'] = architecture
qualifiers['arch'] = architecture

extra_data = {}
# Multi-Arch can be: "foreign", "same", "allowed", "all", "optional" or
Expand Down Expand Up @@ -628,13 +656,27 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
if keyword:
keywords.append(keyword)

# Get distro/namespace information from clues in package data
if not distro:
if version:
for clue, namespace in version_clues_for_namespace.items():
if clue in version:
distro = namespace
break

if maintainer:
for clue, namespace in maintainer_clues_for_namespace.items():
if clue in maintainer:
distro = namespace
break

source_packages = []
source = debian_data.get('source')
if source:
source_pkg_purl = PackageURL(
type=package_type,
name=source,
namespace=distro
namespace=distro,
).to_string()

source_packages.append(source_pkg_purl)
Expand All @@ -656,6 +698,46 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
)


def populate_debian_namespace(packages):
    """
    Yield each package of the `packages` iterable of Debian packages, after
    setting a namespace on the packages that do not have one.

    The namespace used as a fill-in value is the most frequently occurring
    non-empty namespace found among `packages`, or the default 'debian'
    namespace when no package has a namespace at all. Packages that already
    have a namespace are yielded unchanged. Packages are updated in place and
    yielded in their original order.
    """
    # Two passes are needed (count, then fill in), so materialize the input
    # once: `packages` may be a one-shot iterator such as a generator.
    packages = list(packages)
    if not packages:
        return

    # Only real namespaces are candidates: a missing namespace must not be
    # able to win the vote and hide a namespace detected in other packages.
    namespace_counts = Counter(
        package.namespace
        for package in packages
        if package.namespace
    )
    if namespace_counts:
        # On a tie, the namespace seen first wins.
        distro = max(namespace_counts, key=namespace_counts.get)
    else:
        distro = 'debian'

    for package in packages:
        if not package.namespace:
            package.namespace = distro
        yield package


# Mapping of {clue: namespace} where a clue is a substring searched for in a
# package version string. Clues are tried in insertion order and the first
# clue found in the version determines the namespace.
version_clues_for_namespace = {
    'deb': 'debian',
    'ubuntu': 'ubuntu',
}


# Mapping of {clue: namespace} where a clue is a substring searched for in a
# package maintainer field. Clues are tried in insertion order and the first
# clue found in the maintainer determines the namespace.
maintainer_clues_for_namespace = {
    # Debian mailing lists, package tracker and developer addresses
    'packages.debian.org': 'debian',
    'lists.debian.org': 'debian',
    'lists.alioth.debian.org': 'debian',
    '@debian.org': 'debian',
    'debian-init-diversity@': 'debian',
    # Ubuntu mailing lists and Canonical addresses
    'lists.ubuntu.com': 'ubuntu',
    '@canonical.com': 'ubuntu',
}


ignored_root_dirs = {
'/.',
'/bin',
Expand Down
56 changes: 56 additions & 0 deletions src/packagedcode/debian_copyright.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from debian_inspector.copyright import CopyrightLicenseParagraph
from debian_inspector.copyright import CopyrightHeaderParagraph
from debian_inspector.copyright import DebianCopyright
from debian_inspector.package import CodeMetadata
from debian_inspector.version import Version as DebVersion
from license_expression import ExpressionError
from license_expression import LicenseSymbolLike
from license_expression import Licensing
Expand Down Expand Up @@ -263,11 +265,65 @@ class StandaloneDebianCopyrightFileHandler(BaseDebianCopyrightFileHandler):
'*_copyright',
)

@classmethod
def is_datafile(cls, location, filetypes=tuple()):
    """
    Return a truthy value if the file at `location` is a standalone Debian
    copyright file: it must be recognized by the parent handler check and
    must not be recognized by either the in-package or the in-source
    copyright file handlers.
    """
    recognized = super().is_datafile(location, filetypes=filetypes)
    if not recognized:
        # Hand back the parent's own falsy value untouched.
        return recognized

    if DebianCopyrightFileInPackageHandler.is_datafile(location):
        return False

    return not DebianCopyrightFileInSourceHandler.is_datafile(location)

@classmethod
def assemble(cls, package_data, resource, codebase, package_adder):
    """
    Yield the items produced by the parent class `assemble()` for this
    `package_data`: the default assembly is used as-is.
    """
    default_assembly = super().assemble(package_data, resource, codebase, package_adder)
    yield from default_assembly

@classmethod
def parse(cls, location):
    """
    Yield a single PackageData for the copyright file at `location`.

    The package data comes from the parent handler `parse()`. When the
    copyright filename itself carries package information (as in files
    obtained from an upstream metadata archive), the purl fields built
    from the filename are merged in with `update_purl_fields()`.
    """
    # The last item yielded by the parent parse() is the one we keep.
    parsed_packages = list(super().parse(location))
    copyright_package = parsed_packages.pop()

    filename_package = build_package_data_from_metadata_filename(
        filename=os.path.basename(location),
        datasource_id=cls.datasource_id,
        package_type=cls.default_package_type,
    )
    # Nothing is returned when the filename is not a metadata filename.
    if filename_package:
        copyright_package.update_purl_fields(package_data=filename_package)

    yield copyright_package


def build_package_data_from_metadata_filename(filename, datasource_id, package_type):
    """
    Return a PackageData built from the `filename` of a Debian package
    metadata file, or None if `CodeMetadata.from_filename()` raises a
    ValueError for this `filename`.

    The returned PackageData has the `datasource_id` and `package_type`
    provided, the name and version obtained from the filename, and a
    'debian' namespace.
    """
    try:
        metadata = CodeMetadata.from_filename(filename=filename)
    except ValueError:
        return None

    # A debian_inspector Version object is stored as its string form.
    version = metadata.version
    version = str(version) if isinstance(version, DebVersion) else version

    # TODO: we cannot know the distro from the name only, so the namespace
    # always defaults to 'debian' to avoid building a purl with no namespace.
    return models.PackageData(
        datasource_id=datasource_id,
        type=package_type,
        name=metadata.name,
        namespace='debian',
        version=version,
    )


class NotReallyStructuredCopyrightFile(Exception):
"""
Expand Down
Loading
Loading