From e4e985b9b05b9570648399adf292ab7f659b4cef Mon Sep 17 00:00:00 2001 From: Alex Goodman Date: Mon, 21 Oct 2024 12:17:12 -0400 Subject: [PATCH] Create single license scanner for all catalogers (#3348) * add single license scanner instance Signed-off-by: Alex Goodman * rename testing license scanner Signed-off-by: Alex Goodman --------- Signed-off-by: Alex Goodman --- internal/licenses/context.go | 18 +++++ internal/licenses/parser.go | 45 ------------ internal/licenses/scanner.go | 68 +++++++++++++++++++ internal/licenses/search.go | 28 ++++++++ syft/create_sbom.go | 4 ++ syft/pkg/cataloger/generic/cataloger.go | 4 +- syft/pkg/cataloger/golang/licenses.go | 30 ++++---- syft/pkg/cataloger/golang/licenses_test.go | 24 ++++--- syft/pkg/cataloger/golang/package.go | 8 +-- syft/pkg/cataloger/golang/parse_go_binary.go | 28 ++++++-- .../cataloger/golang/parse_go_binary_test.go | 6 +- syft/pkg/cataloger/golang/parse_go_mod.go | 13 ++-- syft/pkg/cataloger/java/archive_parser.go | 64 +++++++++-------- .../pkg/cataloger/java/archive_parser_test.go | 28 +++++--- syft/pkg/cataloger/python/package.go | 42 ++++++------ syft/pkg/cataloger/python/parse_wheel_egg.go | 15 +++- 16 files changed, 277 insertions(+), 148 deletions(-) create mode 100644 internal/licenses/context.go delete mode 100644 internal/licenses/parser.go create mode 100644 internal/licenses/scanner.go create mode 100644 internal/licenses/search.go diff --git a/internal/licenses/context.go b/internal/licenses/context.go new file mode 100644 index 00000000000..9a1224dae94 --- /dev/null +++ b/internal/licenses/context.go @@ -0,0 +1,18 @@ +package licenses + +import ( + "context" +) + +type licenseScannerKey struct{} + +func SetContextLicenseScanner(ctx context.Context, s Scanner) context.Context { + return context.WithValue(ctx, licenseScannerKey{}, s) +} + +func ContextLicenseScanner(ctx context.Context) Scanner { + if s, ok := ctx.Value(licenseScannerKey{}).(Scanner); ok { + return s + } + return 
NewDefaultScanner() +} diff --git a/internal/licenses/parser.go b/internal/licenses/parser.go deleted file mode 100644 index c9f77d56784..00000000000 --- a/internal/licenses/parser.go +++ /dev/null @@ -1,45 +0,0 @@ -package licenses - -import ( - "io" - - "github.com/google/licensecheck" - - "github.com/anchore/syft/syft/file" - "github.com/anchore/syft/syft/license" - "github.com/anchore/syft/syft/pkg" -) - -const ( - coverageThreshold = 75 - unknownLicenseType = "UNKNOWN" -) - -// Parse scans the contents of a license file to attempt to determine the type of license it is -func Parse(reader io.Reader, l file.Location) (licenses []pkg.License, err error) { - licenses = make([]pkg.License, 0) - contents, err := io.ReadAll(reader) - if err != nil { - return nil, err - } - - scanner, err := licensecheck.NewScanner(licensecheck.BuiltinLicenses()) - if err != nil { - return nil, err - } - - cov := scanner.Scan(contents) - if cov.Percent < coverageThreshold { - // unknown or no licenses here? - return licenses, nil - } - - for _, m := range cov.Match { - lic := pkg.NewLicenseFromLocations(m.ID, l) - lic.Type = license.Concluded - - licenses = append(licenses, lic) - } - - return licenses, nil -} diff --git a/internal/licenses/scanner.go b/internal/licenses/scanner.go new file mode 100644 index 00000000000..5efb1d3a3e9 --- /dev/null +++ b/internal/licenses/scanner.go @@ -0,0 +1,68 @@ +package licenses + +import ( + "context" + "io" + + "github.com/google/licensecheck" + + "github.com/anchore/syft/internal/log" +) + +const coverageThreshold = 75 // determined by experimentation + +type Scanner interface { + IdentifyLicenseIDs(context.Context, io.Reader) ([]string, error) +} + +var _ Scanner = (*scanner)(nil) + +type scanner struct { + coverageThreshold float64 // between 0 and 100 + scanner func([]byte) licensecheck.Coverage +} + +// NewDefaultScanner returns a scanner that uses a new instance of the default licensecheck package scanner. 
+func NewDefaultScanner() Scanner {
+	s, err := licensecheck.NewScanner(licensecheck.BuiltinLicenses())
+	if err != nil {
+		log.WithFields("error", err).Trace("unable to create default license scanner")
+		return &scanner{coverageThreshold: coverageThreshold} // leave scan func nil (s.Scan on a nil *Scanner is non-nil and would panic when called)
+	}
+	return &scanner{
+		coverageThreshold: coverageThreshold,
+		scanner:           s.Scan,
+	}
+}
+
+// TestingOnlyScanner returns a scanner that uses the built-in license scanner from the licensecheck package.
+// THIS IS ONLY MEANT FOR TEST CODE, NOT PRODUCTION CODE.
+func TestingOnlyScanner() Scanner {
+	return &scanner{
+		coverageThreshold: coverageThreshold,
+		scanner:           licensecheck.Scan,
+	}
+}
+
+func (s scanner) IdentifyLicenseIDs(_ context.Context, reader io.Reader) ([]string, error) {
+	if s.scanner == nil {
+		return nil, nil
+	}
+
+	content, err := io.ReadAll(reader)
+	if err != nil {
+		return nil, err
+	}
+
+	cov := s.scanner(content)
+	if cov.Percent < s.coverageThreshold {
+		// unknown or no licenses here?
+		return nil, nil
+	}
+
+	var ids []string
+	for _, m := range cov.Match {
+		ids = append(ids, m.ID)
+	}
+	return ids, nil
+}
diff --git a/internal/licenses/search.go b/internal/licenses/search.go
new file mode 100644
index 00000000000..47dc14e453c
--- /dev/null
+++ b/internal/licenses/search.go
@@ -0,0 +1,28 @@
+package licenses
+
+import (
+	"context"
+
+	"github.com/anchore/syft/syft/file"
+	"github.com/anchore/syft/syft/license"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+// Search scans the contents of a license file to attempt to determine the type of license it is
+func Search(ctx context.Context, scanner Scanner, reader file.LocationReadCloser) (licenses []pkg.License, err error) {
+	licenses = make([]pkg.License, 0)
+
+	ids, err := scanner.IdentifyLicenseIDs(ctx, reader)
+	if err != nil {
+		return nil, err
+	}
+
+	for _, id := range ids {
+		lic := pkg.NewLicenseFromLocations(id, reader.Location)
+		lic.Type = license.Concluded
+
+		licenses = append(licenses, lic)
+	}
+
+	return licenses, nil
+}
diff --git a/syft/create_sbom.go
b/syft/create_sbom.go index 3e4010eeadf..1e526fd90c6 100644 --- a/syft/create_sbom.go +++ b/syft/create_sbom.go @@ -9,6 +9,7 @@ import ( "github.com/scylladb/go-set/strset" "github.com/anchore/syft/internal/bus" + "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/internal/sbomsync" "github.com/anchore/syft/internal/task" "github.com/anchore/syft/syft/artifact" @@ -60,6 +61,9 @@ func CreateSBOM(ctx context.Context, src source.Source, cfg *CreateSBOMConfig) ( }, } + // inject a single license scanner for all package cataloging tasks into context + ctx = licenses.SetContextLicenseScanner(ctx, licenses.NewDefaultScanner()) + catalogingProgress := monitorCatalogingTask(src.ID(), taskGroups) packageCatalogingProgress := monitorPackageCatalogingTask() diff --git a/syft/pkg/cataloger/generic/cataloger.go b/syft/pkg/cataloger/generic/cataloger.go index f955ed35076..db23fad7306 100644 --- a/syft/pkg/cataloger/generic/cataloger.go +++ b/syft/pkg/cataloger/generic/cataloger.go @@ -154,7 +154,7 @@ func (c *Cataloger) Catalog(ctx context.Context, resolver file.Resolver) ([]pkg. var relationships []artifact.Relationship var errs error - logger := log.Nested("cataloger", c.upstreamCataloger) + lgr := log.Nested("cataloger", c.upstreamCataloger) env := Environment{ // TODO: consider passing into the cataloger, this would affect the cataloger interface (and all implementations). This can be deferred until later. @@ -166,7 +166,7 @@ func (c *Cataloger) Catalog(ctx context.Context, resolver file.Resolver) ([]pkg. 
log.WithFields("path", location.RealPath).Trace("parsing file contents") - discoveredPackages, discoveredRelationships, err := invokeParser(ctx, resolver, location, logger, parser, &env) + discoveredPackages, discoveredRelationships, err := invokeParser(ctx, resolver, location, lgr, parser, &env) if err != nil { // parsers may return errors and valid packages / relationships errs = unknown.Append(errs, location, err) diff --git a/syft/pkg/cataloger/golang/licenses.go b/syft/pkg/cataloger/golang/licenses.go index 0b02c6581c1..402a014485a 100644 --- a/syft/pkg/cataloger/golang/licenses.go +++ b/syft/pkg/cataloger/golang/licenses.go @@ -3,6 +3,7 @@ package golang import ( "archive/zip" "bytes" + "context" "fmt" "io" "io/fs" @@ -79,9 +80,9 @@ func remotesForModule(proxies []string, noProxy []string, module string) []strin return proxies } -func (c *goLicenseResolver) getLicenses(resolver file.Resolver, moduleName, moduleVersion string) ([]pkg.License, error) { +func (c *goLicenseResolver) getLicenses(ctx context.Context, scanner licenses.Scanner, resolver file.Resolver, moduleName, moduleVersion string) ([]pkg.License, error) { // search the scan target first, ignoring local and remote sources - goLicenses, err := c.findLicensesInSource(resolver, + goLicenses, err := c.findLicensesInSource(ctx, scanner, resolver, fmt.Sprintf(`**/go/pkg/mod/%s@%s/*`, processCaps(moduleName), moduleVersion), ) if err != nil || len(goLicenses) > 0 { @@ -90,7 +91,7 @@ func (c *goLicenseResolver) getLicenses(resolver file.Resolver, moduleName, modu // look in the local host mod directory... 
if c.opts.SearchLocalModCacheLicenses { - goLicenses, err = c.getLicensesFromLocal(moduleName, moduleVersion) + goLicenses, err = c.getLicensesFromLocal(ctx, scanner, moduleName, moduleVersion) if err != nil || len(goLicenses) > 0 { return toPkgLicenses(goLicenses), err } @@ -98,13 +99,13 @@ func (c *goLicenseResolver) getLicenses(resolver file.Resolver, moduleName, modu // download from remote sources if c.opts.SearchRemoteLicenses { - goLicenses, err = c.getLicensesFromRemote(moduleName, moduleVersion) + goLicenses, err = c.getLicensesFromRemote(ctx, scanner, moduleName, moduleVersion) } return toPkgLicenses(goLicenses), err } -func (c *goLicenseResolver) getLicensesFromLocal(moduleName, moduleVersion string) ([]goLicense, error) { +func (c *goLicenseResolver) getLicensesFromLocal(ctx context.Context, scanner licenses.Scanner, moduleName, moduleVersion string) ([]goLicense, error) { if c.localModCacheDir == nil { return nil, nil } @@ -120,10 +121,10 @@ func (c *goLicenseResolver) getLicensesFromLocal(moduleName, moduleVersion strin // if we're running against a directory on the filesystem, it may not include the // user's homedir / GOPATH, so we defer to using the localModCacheResolver // we use $GOPATH/pkg/mod to avoid leaking information about the user's system - return c.findLicensesInFS("file://$GOPATH/pkg/mod/"+subdir+"/", dir) + return c.findLicensesInFS(ctx, scanner, "file://$GOPATH/pkg/mod/"+subdir+"/", dir) } -func (c *goLicenseResolver) getLicensesFromRemote(moduleName, moduleVersion string) ([]goLicense, error) { +func (c *goLicenseResolver) getLicensesFromRemote(ctx context.Context, scanner licenses.Scanner, moduleName, moduleVersion string) ([]goLicense, error) { return c.licenseCache.Resolve(fmt.Sprintf("%s/%s", moduleName, moduleVersion), func() ([]goLicense, error) { proxies := remotesForModule(c.opts.Proxies, c.opts.NoProxy, moduleName) @@ -132,11 +133,11 @@ func (c *goLicenseResolver) getLicensesFromRemote(moduleName, moduleVersion stri return 
nil, err } - return c.findLicensesInFS(urlPrefix, fsys) + return c.findLicensesInFS(ctx, scanner, urlPrefix, fsys) }) } -func (c *goLicenseResolver) findLicensesInFS(urlPrefix string, fsys fs.FS) ([]goLicense, error) { +func (c *goLicenseResolver) findLicensesInFS(ctx context.Context, scanner licenses.Scanner, urlPrefix string, fsys fs.FS) ([]goLicense, error) { var out []goLicense err := fs.WalkDir(fsys, ".", func(filePath string, d fs.DirEntry, err error) error { if err != nil { @@ -156,7 +157,8 @@ func (c *goLicenseResolver) findLicensesInFS(urlPrefix string, fsys fs.FS) ([]go return nil } defer internal.CloseAndLogError(rdr, filePath) - parsed, err := licenses.Parse(rdr, file.NewLocation(filePath)) + + parsed, err := licenses.Search(ctx, scanner, file.NewLocationReadCloser(file.NewLocation(filePath), rdr)) if err != nil { log.Debugf("error parsing license file %s: %v", filePath, err) return nil @@ -174,7 +176,7 @@ func (c *goLicenseResolver) findLicensesInFS(urlPrefix string, fsys fs.FS) ([]go return out, err } -func (c *goLicenseResolver) findLicensesInSource(resolver file.Resolver, globMatch string) ([]goLicense, error) { +func (c *goLicenseResolver) findLicensesInSource(ctx context.Context, scanner licenses.Scanner, resolver file.Resolver, globMatch string) ([]goLicense, error) { var out []goLicense locations, err := resolver.FilesByGlob(globMatch) if err != nil { @@ -182,7 +184,7 @@ func (c *goLicenseResolver) findLicensesInSource(resolver file.Resolver, globMat } for _, l := range locations { - parsed, err := c.parseLicenseFromLocation(l, resolver) + parsed, err := c.parseLicenseFromLocation(ctx, scanner, l, resolver) if err != nil { return nil, err } @@ -200,7 +202,7 @@ func (c *goLicenseResolver) findLicensesInSource(resolver file.Resolver, globMat return out, nil } -func (c *goLicenseResolver) parseLicenseFromLocation(l file.Location, resolver file.Resolver) ([]goLicense, error) { +func (c *goLicenseResolver) parseLicenseFromLocation(ctx 
context.Context, scanner licenses.Scanner, l file.Location, resolver file.Resolver) ([]goLicense, error) { var out []goLicense fileName := path.Base(l.RealPath) if c.lowerLicenseFileNames.Has(strings.ToLower(fileName)) { @@ -209,7 +211,7 @@ func (c *goLicenseResolver) parseLicenseFromLocation(l file.Location, resolver f return nil, err } defer internal.CloseAndLogError(contents, l.RealPath) - parsed, err := licenses.Parse(contents, l) + parsed, err := licenses.Search(ctx, scanner, file.NewLocationReadCloser(l, contents)) if err != nil { return nil, err } diff --git a/syft/pkg/cataloger/golang/licenses_test.go b/syft/pkg/cataloger/golang/licenses_test.go index 0ac644faf89..0acde6de95c 100644 --- a/syft/pkg/cataloger/golang/licenses_test.go +++ b/syft/pkg/cataloger/golang/licenses_test.go @@ -3,6 +3,7 @@ package golang import ( "archive/zip" "bytes" + "context" "fmt" "io/fs" "net/http" @@ -15,6 +16,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/internal/fileresolver" "github.com/anchore/syft/syft/license" @@ -26,6 +28,8 @@ func Test_LocalLicenseSearch(t *testing.T) { loc2 := file.NewLocation("github.com/!cap!o!r!g/!cap!project@v4.111.5/LICENSE.txt") loc3 := file.NewLocation("github.com/someorg/strangelicense@v1.2.3/LiCeNsE.tXt") + licenseScanner := licenses.TestingOnlyScanner() + tests := []struct { name string version string @@ -78,12 +82,12 @@ func Test_LocalLicenseSearch(t *testing.T) { LocalModCacheDir: filepath.Join(wd, "test-fixtures", "licenses", "pkg", "mod"), }, ) - licenses, err := l.getLicenses(fileresolver.Empty{}, test.name, test.version) + lics, err := l.getLicenses(context.Background(), licenseScanner, fileresolver.Empty{}, test.name, test.version) require.NoError(t, err) - require.Len(t, licenses, 1) + require.Len(t, lics, 1) - require.Equal(t, test.expected, licenses[0]) + require.Equal(t, test.expected, lics[0]) }) } } @@ -92,6 +96,8 
@@ func Test_RemoteProxyLicenseSearch(t *testing.T) { loc1 := file.NewLocation("github.com/someorg/somename@v0.3.2/LICENSE") loc2 := file.NewLocation("github.com/!cap!o!r!g/!cap!project@v4.111.5/LICENSE.txt") + licenseScanner := licenses.TestingOnlyScanner() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { buf := &bytes.Buffer{} uri := strings.TrimPrefix(strings.TrimSuffix(r.RequestURI, ".zip"), "/") @@ -169,12 +175,12 @@ func Test_RemoteProxyLicenseSearch(t *testing.T) { }, ) - licenses, err := l.getLicenses(fileresolver.Empty{}, test.name, test.version) + lics, err := l.getLicenses(context.Background(), licenseScanner, fileresolver.Empty{}, test.name, test.version) require.NoError(t, err) - require.Len(t, licenses, 1) + require.Len(t, lics, 1) - require.Equal(t, test.expected, licenses[0]) + require.Equal(t, test.expected, lics[0]) }) } } @@ -248,7 +254,7 @@ func Test_findVersionPath(t *testing.T) { func Test_walkDirErrors(t *testing.T) { resolver := newGoLicenseResolver("", CatalogerConfig{}) - _, err := resolver.findLicensesInFS("somewhere", badFS{}) + _, err := resolver.findLicensesInFS(context.Background(), licenses.TestingOnlyScanner(), "somewhere", badFS{}) require.Error(t, err) } @@ -266,6 +272,8 @@ func Test_noLocalGoModDir(t *testing.T) { validTmp := t.TempDir() require.NoError(t, os.MkdirAll(filepath.Join(validTmp, "mod@ver"), 0700|os.ModeDir)) + licenseScanner := licenses.TestingOnlyScanner() + tests := []struct { name string dir string @@ -299,7 +307,7 @@ func Test_noLocalGoModDir(t *testing.T) { SearchLocalModCacheLicenses: true, LocalModCacheDir: test.dir, }) - _, err := resolver.getLicensesFromLocal("mod", "ver") + _, err := resolver.getLicensesFromLocal(context.Background(), licenseScanner, "mod", "ver") test.wantErr(t, err) }) } diff --git a/syft/pkg/cataloger/golang/package.go b/syft/pkg/cataloger/golang/package.go index 8946b5f06f5..c3071420c40 100644 --- a/syft/pkg/cataloger/golang/package.go +++ 
b/syft/pkg/cataloger/golang/package.go @@ -5,21 +5,15 @@ import ( "strings" "github.com/anchore/packageurl-go" - "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" ) -func (c *goBinaryCataloger) newGoBinaryPackage(resolver file.Resolver, dep *debug.Module, mainModule, goVersion, architecture string, buildSettings pkg.KeyValues, cryptoSettings, experiments []string, locations ...file.Location) pkg.Package { +func (c *goBinaryCataloger) newGoBinaryPackage(dep *debug.Module, mainModule, goVersion, architecture string, buildSettings pkg.KeyValues, cryptoSettings, experiments []string, licenses []pkg.License, locations ...file.Location) pkg.Package { if dep.Replace != nil { dep = dep.Replace } - licenses, err := c.licenseResolver.getLicenses(resolver, dep.Path, dep.Version) - if err != nil { - log.Tracef("error getting licenses for golang package: %s %v", dep.Path, err) - } - p := pkg.Package{ Name: dep.Path, Version: dep.Version, diff --git a/syft/pkg/cataloger/golang/parse_go_binary.go b/syft/pkg/cataloger/golang/parse_go_binary.go index 91957f19d81..b0fe56a4e4c 100644 --- a/syft/pkg/cataloger/golang/parse_go_binary.go +++ b/syft/pkg/cataloger/golang/parse_go_binary.go @@ -18,6 +18,7 @@ import ( "golang.org/x/mod/module" "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" @@ -59,9 +60,11 @@ func newGoBinaryCataloger(opts CatalogerConfig) *goBinaryCataloger { } // parseGoBinary catalogs packages found in the "buildinfo" section of a binary built by the go compiler. 
-func (c *goBinaryCataloger) parseGoBinary(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { +func (c *goBinaryCataloger) parseGoBinary(ctx context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { var pkgs []pkg.Package + licenseScanner := licenses.ContextLicenseScanner(ctx) + unionReader, err := unionreader.GetUnionReader(reader.ReadCloser) if err != nil { return nil, nil, err @@ -73,7 +76,7 @@ func (c *goBinaryCataloger) parseGoBinary(_ context.Context, resolver file.Resol var rels []artifact.Relationship for _, mod := range mods { var depPkgs []pkg.Package - mainPkg, depPkgs := c.buildGoPkgInfo(resolver, reader.Location, mod, mod.arch, unionReader) + mainPkg, depPkgs := c.buildGoPkgInfo(ctx, licenseScanner, resolver, reader.Location, mod, mod.arch, unionReader) if mainPkg != nil { rels = createModuleRelationships(*mainPkg, depPkgs) pkgs = append(pkgs, *mainPkg) @@ -101,7 +104,7 @@ func createModuleRelationships(main pkg.Package, deps []pkg.Package) []artifact. 
var emptyModule debug.Module var moduleFromPartialPackageBuild = debug.Module{Path: "command-line-arguments"} -func (c *goBinaryCataloger) buildGoPkgInfo(resolver file.Resolver, location file.Location, mod *extendedBuildInfo, arch string, reader io.ReadSeekCloser) (*pkg.Package, []pkg.Package) { +func (c *goBinaryCataloger) buildGoPkgInfo(ctx context.Context, licenseScanner licenses.Scanner, resolver file.Resolver, location file.Location, mod *extendedBuildInfo, arch string, reader io.ReadSeekCloser) (*pkg.Package, []pkg.Package) { if mod == nil { return nil, nil } @@ -116,9 +119,13 @@ func (c *goBinaryCataloger) buildGoPkgInfo(resolver file.Resolver, location file continue } + lics, err := c.licenseResolver.getLicenses(ctx, licenseScanner, resolver, dep.Path, dep.Version) + if err != nil { + log.Tracef("error getting licenses for golang package: %s %v", dep.Path, err) + } + gover, experiments := getExperimentsFromVersion(mod.GoVersion) p := c.newGoBinaryPackage( - resolver, dep, mod.Main.Path, gover, @@ -126,6 +133,7 @@ func (c *goBinaryCataloger) buildGoPkgInfo(resolver file.Resolver, location file nil, mod.cryptoSettings, experiments, + lics, location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ) if pkg.IsValid(&p) { @@ -137,7 +145,7 @@ func (c *goBinaryCataloger) buildGoPkgInfo(resolver file.Resolver, location file return nil, pkgs } - main := c.makeGoMainPackage(resolver, mod, arch, location, reader) + main := c.makeGoMainPackage(ctx, licenseScanner, resolver, mod, arch, location, reader) return &main, pkgs } @@ -152,11 +160,16 @@ func missingMainModule(mod *extendedBuildInfo) bool { return mod.Main == moduleFromPartialPackageBuild } -func (c *goBinaryCataloger) makeGoMainPackage(resolver file.Resolver, mod *extendedBuildInfo, arch string, location file.Location, reader io.ReadSeekCloser) pkg.Package { +func (c *goBinaryCataloger) makeGoMainPackage(ctx context.Context, licenseScanner licenses.Scanner, resolver file.Resolver, mod 
*extendedBuildInfo, arch string, location file.Location, reader io.ReadSeekCloser) pkg.Package { gbs := getBuildSettings(mod.Settings) + + lics, err := c.licenseResolver.getLicenses(ctx, licenseScanner, resolver, mod.Main.Path, mod.Main.Version) + if err != nil { + log.Tracef("error getting licenses for golang package: %s %v", mod.Main.Path, err) + } + gover, experiments := getExperimentsFromVersion(mod.GoVersion) main := c.newGoBinaryPackage( - resolver, &mod.Main, mod.Main.Path, gover, @@ -164,6 +177,7 @@ func (c *goBinaryCataloger) makeGoMainPackage(resolver file.Resolver, mod *exten gbs, mod.cryptoSettings, experiments, + lics, location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ) diff --git a/syft/pkg/cataloger/golang/parse_go_binary_test.go b/syft/pkg/cataloger/golang/parse_go_binary_test.go index b2af3ffca85..565fda09046 100644 --- a/syft/pkg/cataloger/golang/parse_go_binary_test.go +++ b/syft/pkg/cataloger/golang/parse_go_binary_test.go @@ -3,6 +3,7 @@ package golang import ( "bufio" "bytes" + "context" "errors" "io" "os" @@ -17,6 +18,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/internal/fileresolver" "github.com/anchore/syft/syft/internal/unionreader" @@ -167,6 +169,8 @@ func TestBuildGoPkgInfo(t *testing.T) { }, } + licenseScanner := licenses.TestingOnlyScanner() + tests := []struct { name string mod *extendedBuildInfo @@ -1053,7 +1057,7 @@ func TestBuildGoPkgInfo(t *testing.T) { c := newGoBinaryCataloger(DefaultCatalogerConfig()) reader, err := unionreader.GetUnionReader(io.NopCloser(strings.NewReader(test.binaryContent))) require.NoError(t, err) - mainPkg, pkgs := c.buildGoPkgInfo(fileresolver.Empty{}, location, test.mod, test.mod.arch, reader) + mainPkg, pkgs := c.buildGoPkgInfo(context.Background(), licenseScanner, fileresolver.Empty{}, location, test.mod, 
test.mod.arch, reader) if mainPkg != nil { pkgs = append(pkgs, *mainPkg) } diff --git a/syft/pkg/cataloger/golang/parse_go_mod.go b/syft/pkg/cataloger/golang/parse_go_mod.go index 8faf4acb509..cfb373e9048 100644 --- a/syft/pkg/cataloger/golang/parse_go_mod.go +++ b/syft/pkg/cataloger/golang/parse_go_mod.go @@ -11,6 +11,7 @@ import ( "golang.org/x/mod/modfile" "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" @@ -31,9 +32,11 @@ func newGoModCataloger(opts CatalogerConfig) *goModCataloger { // parseGoModFile takes a go.mod and lists all packages discovered. // //nolint:funlen -func (c *goModCataloger) parseGoModFile(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { +func (c *goModCataloger) parseGoModFile(ctx context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { packages := make(map[string]pkg.Package) + licenseScanner := licenses.ContextLicenseScanner(ctx) + contents, err := io.ReadAll(reader) if err != nil { return nil, nil, fmt.Errorf("failed to read go module: %w", err) @@ -50,7 +53,7 @@ func (c *goModCataloger) parseGoModFile(_ context.Context, resolver file.Resolve } for _, m := range f.Require { - licenses, err := c.licenseResolver.getLicenses(resolver, m.Mod.Path, m.Mod.Version) + lics, err := c.licenseResolver.getLicenses(ctx, licenseScanner, resolver, m.Mod.Path, m.Mod.Version) if err != nil { log.Tracef("error getting licenses for package: %s %v", m.Mod.Path, err) } @@ -58,7 +61,7 @@ func (c *goModCataloger) parseGoModFile(_ context.Context, resolver file.Resolve packages[m.Mod.Path] = pkg.Package{ Name: m.Mod.Path, Version: m.Mod.Version, - Licenses: pkg.NewLicenseSet(licenses...), + Licenses: pkg.NewLicenseSet(lics...), 
Locations: file.NewLocationSet(reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)), PURL: packageURL(m.Mod.Path, m.Mod.Version), Language: pkg.Go, @@ -71,7 +74,7 @@ func (c *goModCataloger) parseGoModFile(_ context.Context, resolver file.Resolve // remove any old packages and replace with new ones... for _, m := range f.Replace { - licenses, err := c.licenseResolver.getLicenses(resolver, m.New.Path, m.New.Version) + lics, err := c.licenseResolver.getLicenses(ctx, licenseScanner, resolver, m.New.Path, m.New.Version) if err != nil { log.Tracef("error getting licenses for package: %s %v", m.New.Path, err) } @@ -83,7 +86,7 @@ func (c *goModCataloger) parseGoModFile(_ context.Context, resolver file.Resolve packages[m.New.Path] = pkg.Package{ Name: m.New.Path, Version: m.New.Version, - Licenses: pkg.NewLicenseSet(licenses...), + Licenses: pkg.NewLicenseSet(lics...), Locations: file.NewLocationSet(reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)), PURL: packageURL(m.New.Path, m.New.Version), Language: pkg.Go, diff --git a/syft/pkg/cataloger/java/archive_parser.go b/syft/pkg/cataloger/java/archive_parser.go index dc5ff5f1336..d084ac3753b 100644 --- a/syft/pkg/cataloger/java/archive_parser.go +++ b/syft/pkg/cataloger/java/archive_parser.go @@ -4,6 +4,7 @@ import ( "context" "crypto" "fmt" + "io" "os" "path" "slices" @@ -49,14 +50,15 @@ var javaArchiveHashes = []crypto.Hash{ } type archiveParser struct { - fileManifest intFile.ZipFileManifest - location file.Location - archivePath string - contentPath string - fileInfo archiveFilename - detectNested bool - cfg ArchiveCatalogerConfig - maven *mavenResolver + fileManifest intFile.ZipFileManifest + location file.Location + archivePath string + contentPath string + fileInfo archiveFilename + detectNested bool + cfg ArchiveCatalogerConfig + maven *mavenResolver + licenseScanner licenses.Scanner } type genericArchiveParserAdapter struct { @@ -69,7 +71,7 
@@ func newGenericArchiveParserAdapter(cfg ArchiveCatalogerConfig) genericArchivePa // parseJavaArchive is a parser function for java archive contents, returning all Java libraries and nested archives. func (gap genericArchiveParserAdapter) parseJavaArchive(ctx context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { - parser, cleanupFn, err := newJavaArchiveParser(reader, true, gap.cfg) + parser, cleanupFn, err := newJavaArchiveParser(ctx, reader, true, gap.cfg) // note: even on error, we should always run cleanup functions defer cleanupFn() if err != nil { @@ -88,7 +90,9 @@ func uniquePkgKey(groupID string, p *pkg.Package) string { // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover // and parse nested archives or ignore them. -func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig) (*archiveParser, func(), error) { +func newJavaArchiveParser(ctx context.Context, reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig) (*archiveParser, func(), error) { + licenseScanner := licenses.ContextLicenseScanner(ctx) + // fetch the last element of the virtual path virtualElements := strings.Split(reader.Path(), ":") currentFilepath := virtualElements[len(virtualElements)-1] @@ -104,14 +108,15 @@ func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool, cfg } return &archiveParser{ - fileManifest: fileManifest, - location: reader.Location, - archivePath: archivePath, - contentPath: contentPath, - fileInfo: newJavaArchiveFilename(currentFilepath), - detectNested: detectNested, - cfg: cfg, - maven: newMavenResolver(nil, cfg), + fileManifest: fileManifest, + location: reader.Location, + archivePath: archivePath, + contentPath: contentPath, + fileInfo: newJavaArchiveFilename(currentFilepath), + detectNested: detectNested, + cfg: cfg, + maven: 
newMavenResolver(nil, cfg), + licenseScanner: licenseScanner, }, cleanupFn, nil } @@ -220,7 +225,7 @@ func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, return nil, err } - name, version, licenses, err := j.discoverNameVersionLicense(ctx, manifest) + name, version, lics, err := j.discoverNameVersionLicense(ctx, manifest) if err != nil { return nil, err } @@ -230,7 +235,7 @@ func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, Name: name, Version: version, Language: pkg.Java, - Licenses: pkg.NewLicenseSet(licenses...), + Licenses: pkg.NewLicenseSet(lics...), Locations: file.NewLocationSet( j.location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ), @@ -246,7 +251,7 @@ func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest *pkg.JavaManifest) (string, string, []pkg.License, error) { // we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest // TODO: when we support locations of paths within archives we should start passing the specific manifest location object instead of the top jar - licenses := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...) + lics := pkg.NewLicensesFromLocation(j.location, selectLicenses(manifest)...) /* We should name and version from, in this order: 1. pom.properties if we find exactly 1 @@ -262,25 +267,25 @@ func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest version = selectVersion(manifest, j.fileInfo) } - if len(licenses) == 0 { - fileLicenses, err := j.getLicenseFromFileInArchive() + if len(lics) == 0 { + fileLicenses, err := j.getLicenseFromFileInArchive(ctx) if err != nil { return "", "", nil, err } if fileLicenses != nil { - licenses = append(licenses, fileLicenses...) + lics = append(lics, fileLicenses...) 
} } // If we didn't find any licenses in the archive so far, we'll try again in Maven Central using groupIDFromJavaMetadata - if len(licenses) == 0 { + if len(lics) == 0 { // Today we don't have a way to distinguish between licenses from the manifest and licenses from the pom.xml // until the file.Location object can support sub-paths (i.e. paths within archives, recursively; issue https://github.com/anchore/syft/issues/2211). // Until then it's less confusing to use the licenses from the pom.xml only if the manifest did not list any. - licenses = j.findLicenseFromJavaMetadata(ctx, groupID, artifactID, version, parsedPom, manifest) + lics = j.findLicenseFromJavaMetadata(ctx, groupID, artifactID, version, parsedPom, manifest) } - return artifactID, version, licenses, nil + return artifactID, version, lics, nil } // findLicenseFromJavaMetadata attempts to find license information from all available maven metadata properties and pom info @@ -446,7 +451,7 @@ func getDigestsFromArchive(archivePath string) ([]file.Digest, error) { return digests, nil } -func (j *archiveParser) getLicenseFromFileInArchive() ([]pkg.License, error) { +func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) ([]pkg.License, error) { var fileLicenses []pkg.License for _, filename := range licenses.FileNames() { licenseMatches := j.fileManifest.GlobMatch(true, "/META-INF/"+filename) @@ -463,7 +468,8 @@ func (j *archiveParser) getLicenseFromFileInArchive() ([]pkg.License, error) { for _, licenseMatch := range licenseMatches { licenseContents := contents[licenseMatch] - parsed, err := licenses.Parse(strings.NewReader(licenseContents), j.location) + r := strings.NewReader(licenseContents) + parsed, err := licenses.Search(ctx, j.licenseScanner, file.NewLocationReadCloser(j.location, io.NopCloser(r))) if err != nil { return nil, err } diff --git a/syft/pkg/cataloger/java/archive_parser_test.go b/syft/pkg/cataloger/java/archive_parser_test.go index 6fa0acc4755..b0aad4faf5b 100644 --- 
a/syft/pkg/cataloger/java/archive_parser_test.go +++ b/syft/pkg/cataloger/java/archive_parser_test.go @@ -20,6 +20,7 @@ import ( "github.com/stretchr/testify/require" "github.com/vifraa/gopom" + "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/license" @@ -30,6 +31,8 @@ import ( func TestSearchMavenForLicenses(t *testing.T) { url := mockMavenRepo(t) + ctx := licenses.SetContextLicenseScanner(context.Background(), licenses.TestingOnlyScanner()) + tests := []struct { name string fixture string @@ -71,6 +74,7 @@ func TestSearchMavenForLicenses(t *testing.T) { // setup parser ap, cleanupFn, err := newJavaArchiveParser( + ctx, file.LocationReadCloser{ Location: file.NewLocation(fixture.Name()), ReadCloser: fixture, @@ -86,6 +90,8 @@ func TestSearchMavenForLicenses(t *testing.T) { } func TestParseJar(t *testing.T) { + ctx := licenses.SetContextLicenseScanner(context.Background(), licenses.TestingOnlyScanner()) + tests := []struct { name string fixture string @@ -347,10 +353,12 @@ func TestParseJar(t *testing.T) { UseNetwork: false, UseMavenLocalRepository: false, } - parser, cleanupFn, err := newJavaArchiveParser(file.LocationReadCloser{ - Location: file.NewLocation(fixture.Name()), - ReadCloser: fixture, - }, false, cfg) + parser, cleanupFn, err := newJavaArchiveParser( + ctx, + file.LocationReadCloser{ + Location: file.NewLocation(fixture.Name()), + ReadCloser: fixture, + }, false, cfg) defer cleanupFn() require.NoError(t, err) @@ -1352,6 +1360,8 @@ func Test_parseJavaArchive_regressions(t *testing.T) { } func Test_deterministicMatchingPomProperties(t *testing.T) { + ctx := licenses.SetContextLicenseScanner(context.Background(), licenses.TestingOnlyScanner()) + tests := []struct { fixture string expected mavenID @@ -1371,10 +1381,12 @@ func Test_deterministicMatchingPomProperties(t *testing.T) { fixture, err := os.Open(fixturePath) require.NoError(t, err) - parser, 
cleanupFn, err := newJavaArchiveParser(file.LocationReadCloser{ - Location: file.NewLocation(fixture.Name()), - ReadCloser: fixture, - }, false, ArchiveCatalogerConfig{UseNetwork: false}) + parser, cleanupFn, err := newJavaArchiveParser( + ctx, + file.LocationReadCloser{ + Location: file.NewLocation(fixture.Name()), + ReadCloser: fixture, + }, false, ArchiveCatalogerConfig{UseNetwork: false}) defer cleanupFn() require.NoError(t, err) diff --git a/syft/pkg/cataloger/python/package.go b/syft/pkg/cataloger/python/package.go index e574d583a14..e7c86125651 100644 --- a/syft/pkg/cataloger/python/package.go +++ b/syft/pkg/cataloger/python/package.go @@ -1,6 +1,7 @@ package python import ( + "context" "fmt" "regexp" "strings" @@ -72,7 +73,26 @@ func newPackageForRequirementsWithMetadata(name, version string, metadata pkg.Py return p } -func newPackageForPackage(resolver file.Resolver, m parsedData, sources ...file.Location) pkg.Package { +func newPackageForPackage(m parsedData, licenses pkg.LicenseSet, sources ...file.Location) pkg.Package { + name := normalize(m.Name) + + p := pkg.Package{ + Name: name, + Version: m.Version, + PURL: packageURL(name, m.Version, &m.PythonPackage), + Locations: file.NewLocationSet(sources...), + Licenses: licenses, + Language: pkg.Python, + Type: pkg.PythonPkg, + Metadata: m.PythonPackage, + } + + p.SetID() + + return p +} + +func findLicenses(ctx context.Context, scanner licenses.Scanner, resolver file.Resolver, m parsedData) pkg.LicenseSet { var licenseSet pkg.LicenseSet switch { @@ -89,7 +109,7 @@ func newPackageForPackage(resolver file.Resolver, m parsedData, sources ...file. 
if len(found) > 0 { metadataContents, err := resolver.FileContentsByLocation(found[0]) if err == nil { - parsed, err := licenses.Parse(metadataContents, m.LicenseLocation) + parsed, err := licenses.Search(ctx, scanner, file.NewLocationReadCloser(m.LicenseLocation, metadataContents)) if err != nil { log.WithFields("error", err).Tracef("unable to parse a license from the file in %s", m.LicenseLocation.Path()) } @@ -101,23 +121,7 @@ func newPackageForPackage(resolver file.Resolver, m parsedData, sources ...file. } } } - - name := normalize(m.Name) - - p := pkg.Package{ - Name: name, - Version: m.Version, - PURL: packageURL(name, m.Version, &m.PythonPackage), - Locations: file.NewLocationSet(sources...), - Licenses: licenseSet, - Language: pkg.Python, - Type: pkg.PythonPkg, - Metadata: m.PythonPackage, - } - - p.SetID() - - return p + return licenseSet } func packageURL(name, version string, m *pkg.PythonPackage) string { diff --git a/syft/pkg/cataloger/python/parse_wheel_egg.go b/syft/pkg/cataloger/python/parse_wheel_egg.go index 5fe8ad98151..2d2b9487c1a 100644 --- a/syft/pkg/cataloger/python/parse_wheel_egg.go +++ b/syft/pkg/cataloger/python/parse_wheel_egg.go @@ -9,6 +9,7 @@ import ( "path/filepath" "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" @@ -18,7 +19,9 @@ import ( // parseWheelOrEgg takes the primary metadata file reference and returns the python package it represents. Contained // fields are governed by the PyPA core metadata specification (https://packaging.python.org/en/latest/specifications/core-metadata/). 
-func parseWheelOrEgg(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { +func parseWheelOrEgg(ctx context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { + licenseScanner := licenses.ContextLicenseScanner(ctx) + pd, sources, err := assembleEggOrWheelMetadata(resolver, reader.Location) if err != nil { return nil, nil, err @@ -33,7 +36,13 @@ func parseWheelOrEgg(_ context.Context, resolver file.Resolver, _ *generic.Envir return nil, nil, nil } - pkgs := []pkg.Package{newPackageForPackage(resolver, *pd, sources...)} + pkgs := []pkg.Package{ + newPackageForPackage( + *pd, + findLicenses(ctx, licenseScanner, resolver, *pd), + sources..., + ), + } return pkgs, nil, nil } @@ -60,7 +69,7 @@ func fetchInstalledFiles(resolver file.Resolver, metadataLocation file.Location, // parse the installed-files contents installedFiles, err := parseInstalledFiles(installedFilesContents, metadataLocation.RealPath, sitePackagesRootPath) if err != nil { - log.Warnf("unable to parse installed-files.txt for python package=%+v: %w", metadataLocation.RealPath, err) + log.WithFields("error", err, "path", metadataLocation.RealPath).Trace("unable to parse installed-files.txt for python package") return files, sources, nil }