From 6ea218b1f654277eb1c331d0cd275c31536be6c9 Mon Sep 17 00:00:00 2001 From: Parth Patel <88045217+pxp928@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:08:57 -0400 Subject: [PATCH] improve on ingestion license check (#2152) * remove unneeded batching and remove source license query on ingestion Signed-off-by: pxp928 * fix unit test for on ingestion scanner Signed-off-by: pxp928 --------- Signed-off-by: pxp928 --- .../clearlydefined/clearlydefined.go | 21 +++--- pkg/ingestor/parser/common/scanner/scanner.go | 40 ++---------- .../parser/common/scanner/scanner_test.go | 21 ------ pkg/ingestor/parser/parser.go | 64 +++++++++++-------- 4 files changed, 57 insertions(+), 89 deletions(-) diff --git a/pkg/certifier/clearlydefined/clearlydefined.go b/pkg/certifier/clearlydefined/clearlydefined.go index 7bfa3e4b1a..ac9d85ab51 100644 --- a/pkg/certifier/clearlydefined/clearlydefined.go +++ b/pkg/certifier/clearlydefined/clearlydefined.go @@ -120,7 +120,8 @@ func getDefinitions(ctx context.Context, client *http.Client, purls []string, co } // EvaluateClearlyDefinedDefinition converts the purls into coordinates to query clearly defined -func EvaluateClearlyDefinedDefinition(ctx context.Context, client *http.Client, purls []string, docChannel chan<- *processor.Document) ([]*processor.Document, error) { +func EvaluateClearlyDefinedDefinition(ctx context.Context, client *http.Client, + purls []string, docChannel chan<- *processor.Document, collectSourceLicenses bool) ([]*processor.Document, error) { logger := logging.FromContext(ctx) var batchCoordinates []string var queryPurls []string @@ -143,7 +144,8 @@ func EvaluateClearlyDefinedDefinition(ctx context.Context, client *http.Client, batchCoordinates = append(batchCoordinates, coordinate.ToString()) } } - if genCDDocs, err := generateDefinitions(ctx, client, batchCoordinates, queryPurls, docChannel); err != nil { + if genCDDocs, err := generateDefinitions(ctx, client, batchCoordinates, queryPurls, docChannel, + collectSourceLicenses); err != nil { return nil, fmt.Errorf("generateDefinitions failed with error: %w", err) } else { generatedCDDocs = append(generatedCDDocs, genCDDocs...) @@ -154,7 +156,8 @@ func EvaluateClearlyDefinedDefinition(ctx context.Context, client *http.Client, // generateDefinitions takes in the batched coordinated to retrieve the definition. It uses the definition to check if source // information can be queried in clearly defined. -func generateDefinitions(ctx context.Context, client *http.Client, batchCoordinates, queryPurls []string, docChannel chan<- *processor.Document) ([]*processor.Document, error) { +func generateDefinitions(ctx context.Context, client *http.Client, batchCoordinates, + queryPurls []string, docChannel chan<- *processor.Document, collectSourceLicenses bool) ([]*processor.Document, error) { var generatedCDDocs []*processor.Document if len(batchCoordinates) > 0 { definitionMap, err := getDefinitions(ctx, client, queryPurls, batchCoordinates) @@ -168,10 +171,12 @@ func generateDefinitions(ctx context.Context, client *http.Client, batchCoordina generatedCDDocs = append(generatedCDDocs, genCDPkgDocs...) } - if genCDSrcDocs, err := evaluateDefinitionForSource(ctx, client, definitionMap, docChannel); err != nil { - return nil, fmt.Errorf("evaluateDefinitionForSource failed with error: %w", err) - } else { - generatedCDDocs = append(generatedCDDocs, genCDSrcDocs...) + if collectSourceLicenses { + if genCDSrcDocs, err := evaluateDefinitionForSource(ctx, client, definitionMap, docChannel); err != nil { + return nil, fmt.Errorf("evaluateDefinitionForSource failed with error: %w", err) + } else { + generatedCDDocs = append(generatedCDDocs, genCDSrcDocs...) + } } } return generatedCDDocs, nil @@ -190,7 +195,7 @@ func (c *cdCertifier) CertifyComponent(ctx context.Context, rootComponent interf purls = append(purls, node.Purl) } - if _, err := EvaluateClearlyDefinedDefinition(ctx, c.cdHTTPClient, purls, docChannel); err != nil { + if _, err := EvaluateClearlyDefinedDefinition(ctx, c.cdHTTPClient, purls, docChannel, true); err != nil { return fmt.Errorf("could not generate document from Clearly Defined results: %w", err) } diff --git a/pkg/ingestor/parser/common/scanner/scanner.go b/pkg/ingestor/parser/common/scanner/scanner.go index f120f5ff82..a3f9a98e53 100644 --- a/pkg/ingestor/parser/common/scanner/scanner.go +++ b/pkg/ingestor/parser/common/scanner/scanner.go @@ -64,40 +64,12 @@ func PurlsLicenseScan(ctx context.Context, purls []string) ([]assembler.CertifyL var certLegalIngest []assembler.CertifyLegalIngest var hasSourceAtIngest []assembler.HasSourceAtIngest - if len(purls) > 249 { - i := 0 - var batchPurls []string - for _, purl := range purls { - if i < 248 { - batchPurls = append(batchPurls, purl) - i++ - } else { - batchPurls = append(batchPurls, purl) - batchedCL, batchedHSA, err := runQueryOnBatchedPurls(ctx, cdParser, batchPurls) - if err != nil { - return nil, nil, fmt.Errorf("runQueryOnBatchedPurls failed with error: %w", err) - } - certLegalIngest = append(certLegalIngest, batchedCL...) - hasSourceAtIngest = append(hasSourceAtIngest, batchedHSA...) - batchPurls = make([]string, 0) - } - } - if len(batchPurls) > 0 { - batchedCL, batchedHSA, err := runQueryOnBatchedPurls(ctx, cdParser, batchPurls) - if err != nil { - return nil, nil, fmt.Errorf("runQueryOnBatchedPurls failed with error: %w", err) - } - certLegalIngest = append(certLegalIngest, batchedCL...) - hasSourceAtIngest = append(hasSourceAtIngest, batchedHSA...) - } - } else { - batchedCL, batchedHSA, err := runQueryOnBatchedPurls(ctx, cdParser, purls) - if err != nil { - return nil, nil, fmt.Errorf("runQueryOnBatchedPurls failed with error: %w", err) - } - certLegalIngest = append(certLegalIngest, batchedCL...) - hasSourceAtIngest = append(hasSourceAtIngest, batchedHSA...) + batchedCL, batchedHSA, err := runQueryOnBatchedPurls(ctx, cdParser, purls) + if err != nil { + return nil, nil, fmt.Errorf("runQueryOnBatchedPurls failed with error: %w", err) } + certLegalIngest = append(certLegalIngest, batchedCL...) + hasSourceAtIngest = append(hasSourceAtIngest, batchedHSA...) return certLegalIngest, hasSourceAtIngest, nil } @@ -109,7 +81,7 @@ func runQueryOnBatchedPurls(ctx context.Context, cdParser common.DocumentParser, var hasSourceAtIngest []assembler.HasSourceAtIngest if cdProcessorDocs, err := cd_certifier.EvaluateClearlyDefinedDefinition(ctx, &http.Client{ Transport: version.UATransport, - }, batchPurls, nil); err != nil { + }, batchPurls, nil, false); err != nil { return nil, nil, fmt.Errorf("failed get definition from clearly defined with error: %w", err) } else { for _, doc := range cdProcessorDocs { diff --git a/pkg/ingestor/parser/common/scanner/scanner_test.go b/pkg/ingestor/parser/common/scanner/scanner_test.go index 57851627ec..37779dacac 100644 --- a/pkg/ingestor/parser/common/scanner/scanner_test.go +++ b/pkg/ingestor/parser/common/scanner/scanner_test.go @@ -373,27 +373,6 @@ func TestPurlsLicenseScan(t *testing.T) { Collector: "clearlydefined", }, }, - { - Src: &generated.SourceInputSpec{ - Type: "sourcearchive", - Namespace: "org.apache.logging.log4j", - Name: "log4j-core", - Tag: ptrfrom.String("2.8.1"), - }, - Declared: []generated.LicenseInputSpec{}, - Discovered: []generated.LicenseInputSpec{ - {Name: "Apache-2.0", ListVersion: &lvUnknown}, - {Name: "NOASSERTION", ListVersion: &lvUnknown}, - }, - CertifyLegal: &generated.CertifyLegalInputSpec{ - DiscoveredLicense: "Apache-2.0 AND NOASSERTION", - Attribution: "Copyright 2005-2006 Tim Fennell,Copyright 1999-2012 Apache Software Foundation,Copyright 1999-2005 The Apache Software Foundation", - Justification: "Retrieved from ClearlyDefined", - TimeScanned: tm, - Origin: "clearlydefined", - Collector: "clearlydefined", - }, - }, }, wantHSAs: []assembler.HasSourceAtIngest{ { diff --git a/pkg/ingestor/parser/parser.go b/pkg/ingestor/parser/parser.go index bd83ec8a46..cff72e8db3 100644 --- a/pkg/ingestor/parser/parser.go +++ b/pkg/ingestor/parser/parser.go @@ -18,6 +18,7 @@ package parser import ( "context" "fmt" + "sync" "github.com/guacsec/guac/pkg/assembler" "github.com/guacsec/guac/pkg/handler/processor" @@ -75,6 +76,8 @@ func RegisterDocumentParser(p func() common.DocumentParser, d processor.Document // ParseDocumentTree takes the DocumentTree and create graph inputs (nodes and edges) per document node. func ParseDocumentTree(ctx context.Context, docTree processor.DocumentTree, scanForVulns bool, scanForLicense bool) ([]assembler.IngestPredicates, []*common.IdentifierStrings, error) { + var wg sync.WaitGroup + assemblerInputs := []assembler.IngestPredicates{} identifierStrings := []*common.IdentifierStrings{} logger := docTree.Document.ChildLogger @@ -98,40 +101,49 @@ func ParseDocumentTree(ctx context.Context, docTree processor.DocumentTree, scan } if scanForVulns { - // scan purls via OSV on initial ingestion to capture vulnerability information - var purls []string - for _, idString := range identifierStrings { - purls = append(purls, idString.PurlStrings...) - } + wg.Add(1) + go func() { + defer wg.Done() + // scan purls via OSV on initial ingestion to capture vulnerability information + var purls []string + for _, idString := range identifierStrings { + purls = append(purls, idString.PurlStrings...) + } - vulnEquals, certVulns, err := scanner.PurlsVulnScan(ctx, purls) - if err != nil { - logger.Errorf("error scanning purls for vulnerabilities %v", err) - } else { - if len(assemblerInputs) > 0 { - assemblerInputs[0].VulnEqual = append(assemblerInputs[0].VulnEqual, vulnEquals...) - assemblerInputs[0].CertifyVuln = append(assemblerInputs[0].CertifyVuln, certVulns...) + vulnEquals, certVulns, err := scanner.PurlsVulnScan(ctx, purls) + if err != nil { + logger.Errorf("error scanning purls for vulnerabilities %v", err) + } else { + if len(assemblerInputs) > 0 { + assemblerInputs[0].VulnEqual = append(assemblerInputs[0].VulnEqual, vulnEquals...) + assemblerInputs[0].CertifyVuln = append(assemblerInputs[0].CertifyVuln, certVulns...) + } } - } + }() } if scanForLicense { - // scan purls via clearly defined on initial ingestion to capture license information - var purls []string - for _, idString := range identifierStrings { - purls = append(purls, idString.PurlStrings...) - } + wg.Add(1) + go func() { + defer wg.Done() + // scan purls via clearly defined on initial ingestion to capture license information + var purls []string + for _, idString := range identifierStrings { + purls = append(purls, idString.PurlStrings...) + } - certLegal, hasSourceAt, err := scanner.PurlsLicenseScan(ctx, purls) - if err != nil { - logger.Errorf("error scanning purls for licenses %v", err) - } else { - if len(assemblerInputs) > 0 { - assemblerInputs[0].CertifyLegal = append(assemblerInputs[0].CertifyLegal, certLegal...) - assemblerInputs[0].HasSourceAt = append(assemblerInputs[0].HasSourceAt, hasSourceAt...) + certLegal, hasSourceAt, err := scanner.PurlsLicenseScan(ctx, purls) + if err != nil { + logger.Errorf("error scanning purls for licenses %v", err) + } else { + if len(assemblerInputs) > 0 { + assemblerInputs[0].CertifyLegal = append(assemblerInputs[0].CertifyLegal, certLegal...) + assemblerInputs[0].HasSourceAt = append(assemblerInputs[0].HasSourceAt, hasSourceAt...) + } } - } + }() } + wg.Wait() return assemblerInputs, identifierStrings, nil }