Skip to content

Commit ff9ef9a

Browse files
committed
using osv modified csv file to get the correct modified vulnerabilities, using osv for malicious packages, solves: #1834
1 parent 046ce9b commit ff9ef9a

7 files changed

Lines changed: 180 additions & 282 deletions

File tree

transformer/osv_transformer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ func affectedComponentsFromGitRange(affected dtos.Affected) []models.AffectedCom
286286
}
287287

288288
// MaliciousAffectedComponentFromOSV converts OSV data to MaliciousAffectedComponent entries
289-
func MaliciousAffectedComponentFromOSV(osv dtos.OSV, maliciousPackageID string) []models.MaliciousAffectedComponent {
289+
func MaliciousAffectedComponentFromOSV(osv *dtos.OSV, maliciousPackageID string) []models.MaliciousAffectedComponent {
290290
affectedComponents := make([]models.MaliciousAffectedComponent, 0)
291291
for _, affected := range osv.Affected {
292292
for _, c := range affectedComponentsFromAffected(affected) {

vulndb/gob.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ func readGobFile(path string, out any) error {
222222

223223
var batchSize = 5_000
224224

225-
func readGobFileStream[T any, Transformed any](ctx context.Context, path string, out chan<- Transformed, transformer func([]T) Transformed) error {
225+
func readGobFileStream[T any](ctx context.Context, path string, handleItems func([]T) error) error {
226226
fd, err := os.Open(path)
227227
if err != nil {
228228
return fmt.Errorf("could not open gob file %s: %w", path, err)
@@ -241,16 +241,19 @@ func readGobFileStream[T any, Transformed any](ctx context.Context, path string,
241241
batch = append(batch, item)
242242
if len(batch) == batchSize {
243243
select {
244-
case out <- transformer(batch):
245244
case <-ctx.Done():
246245
return ctx.Err()
246+
default:
247+
handleItems(batch)
247248
}
248249
batch = batch[:0]
249250
}
250251
}
251252
if len(batch) > 0 {
252253
select {
253-
case out <- transformer(batch):
254+
default:
255+
handleItems(batch)
256+
return nil
254257
case <-ctx.Done():
255258
return ctx.Err()
256259
}

vulndb/integrity.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,13 +184,13 @@ func calculateTotalIntegrityInformation(ctx context.Context, tx pgx.Tx) ([]table
184184
md5(string_agg(row_hash, '' ORDER BY id)) AS checksum
185185
FROM (
186186
SELECT id, md5(coalesce(id, '\0') || '|' || coalesce(modified::text, '\0')) AS row_hash
187-
FROM malicious_packages WHERE id NOT LIKE 'FAKE-TEST-%'
187+
FROM malicious_packages WHERE id NOT LIKE 'MAL-FAKE-TEST-%'
188188
) sub
189189
),
190190
malicious_affected_components_integrity AS (
191191
SELECT 'malicious_affected_components' AS table_name, count(*) AS row_count,
192192
md5(string_agg(id::text, '' ORDER BY id)) AS checksum
193-
FROM malicious_affected_components WHERE malicious_package_id NOT LIKE 'FAKE-TEST-%'
193+
FROM malicious_affected_components WHERE malicious_package_id NOT LIKE 'MAL-FAKE-TEST-%'
194194
)
195195
SELECT table_name, row_count, checksum FROM cves_integrity
196196
UNION ALL SELECT table_name, row_count, checksum FROM cve_relationships_integrity

vulndb/malicious_packages_checker.go

Lines changed: 3 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,11 @@
1616
package vulndb
1717

1818
import (
19-
"archive/tar"
20-
"compress/gzip"
2119
"context"
22-
"encoding/json"
2320
"fmt"
24-
"io"
2521
"log/slog"
2622
"net/http"
2723
"strings"
28-
"sync"
2924
"time"
3025

3126
"github.com/jackc/pgx/v5"
@@ -50,12 +45,11 @@ type MaliciousPackageChecker struct {
5045
httpClient *http.Client
5146
}
5247

53-
type malRow struct {
48+
type malRows struct {
5449
pkgs []models.MaliciousPackage
5550
comps []models.MaliciousAffectedComponent
5651
}
5752

58-
5953
func NewMaliciousPackageChecker(
6054
repository *repositories.MaliciousPackageRepository,
6155
) (*MaliciousPackageChecker, error) {
@@ -68,145 +62,6 @@ func NewMaliciousPackageChecker(
6862

6963
// FetchAll downloads the malicious packages archive and returns all parsed packages
7064
// and affected components without touching the database.
71-
func (c *MaliciousPackageChecker) FetchAll(ctx context.Context) ([]models.MaliciousPackage, []models.MaliciousAffectedComponent, error) {
72-
slog.Info("Downloading malicious packages archive", "url", c.repoURL)
73-
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.repoURL, nil)
74-
if err != nil {
75-
return nil, nil, fmt.Errorf("failed to create download request: %w", err)
76-
}
77-
resp, err := c.httpClient.Do(req)
78-
if err != nil {
79-
return nil, nil, fmt.Errorf("failed to download archive: %w", err)
80-
}
81-
defer resp.Body.Close()
82-
83-
if resp.StatusCode != http.StatusOK {
84-
return nil, nil, fmt.Errorf("failed to download archive: HTTP %d", resp.StatusCode)
85-
}
86-
87-
gzr, err := gzip.NewReader(resp.Body)
88-
if err != nil {
89-
return nil, nil, fmt.Errorf("failed to create gzip reader: %w", err)
90-
}
91-
defer gzr.Close()
92-
93-
tr := tar.NewReader(gzr)
94-
ecosystems := []string{"npm", "go", "maven", "pypi", "crates.io"}
95-
96-
processWG := &sync.WaitGroup{}
97-
collectWG := &sync.WaitGroup{}
98-
99-
fileJobs := make(chan []byte, malPkgNumOfGoRoutines*20)
100-
resultJobs := make(chan processingResults, BatchSize*2)
101-
102-
for range malPkgNumOfGoRoutines {
103-
processWG.Add(1)
104-
go processMaliciousPackageFile(processWG, fileJobs, resultJobs)
105-
}
106-
107-
var (
108-
packages []models.MaliciousPackage
109-
components []models.MaliciousAffectedComponent
110-
mu sync.Mutex
111-
)
112-
collectWG.Add(1)
113-
go func() {
114-
defer collectWG.Done()
115-
for r := range resultJobs {
116-
// pre-compute component IDs so they are stable in the gob file
117-
for i := range r.AffectedComponents {
118-
if r.AffectedComponents[i].ID == "" {
119-
r.AffectedComponents[i].ID = r.AffectedComponents[i].CalculateHash()
120-
}
121-
}
122-
mu.Lock()
123-
packages = append(packages, r.Package)
124-
components = append(components, r.AffectedComponents...)
125-
mu.Unlock()
126-
}
127-
}()
128-
129-
for {
130-
header, err := tr.Next()
131-
if err != nil {
132-
if err == io.EOF {
133-
break
134-
}
135-
return nil, nil, fmt.Errorf("failed to read tar: %w", err)
136-
}
137-
if !strings.HasSuffix(header.Name, ".json") || header.Typeflag != tar.TypeReg {
138-
continue
139-
}
140-
isTarget := false
141-
for _, eco := range ecosystems {
142-
if strings.Contains(header.Name, "/osv/malicious/"+eco+"/") {
143-
isTarget = true
144-
break
145-
}
146-
}
147-
if !isTarget {
148-
continue
149-
}
150-
data, err := io.ReadAll(tr)
151-
if err != nil {
152-
slog.Debug("Failed to read file from tar", "name", header.Name, "error", err)
153-
continue
154-
}
155-
fileJobs <- data
156-
}
157-
158-
close(fileJobs)
159-
processWG.Wait()
160-
close(resultJobs)
161-
collectWG.Wait()
162-
163-
slog.Info("Fetched malicious packages", "packages", len(packages), "components", len(components))
164-
return packages, components, nil
165-
}
166-
167-
type processingResults struct {
168-
Package models.MaliciousPackage
169-
AffectedComponents []models.MaliciousAffectedComponent
170-
}
171-
172-
// this function grabs json file contents from the jobs channel and builds the package as well as the affected components from it. These are then sent to the db worker function
173-
func processMaliciousPackageFile(waitGroup *sync.WaitGroup, jobs chan []byte, results chan processingResults) {
174-
defer waitGroup.Done()
175-
for data := range jobs {
176-
var entry dtos.OSV
177-
if err := json.Unmarshal(data, &entry); err != nil {
178-
slog.Debug("Failed to unmarshal JSON", "error", err)
179-
continue
180-
}
181-
182-
if entry.ID == "" {
183-
slog.Warn("Skipping malicious package with empty ID", "summary", entry.Summary)
184-
continue
185-
}
186-
187-
if len(entry.Affected) == 0 {
188-
continue
189-
}
190-
191-
// Create malicious package record
192-
pkg := models.MaliciousPackage{
193-
ID: entry.ID,
194-
Summary: entry.Summary,
195-
Details: entry.Details,
196-
Published: entry.Published,
197-
Modified: entry.Modified,
198-
}
199-
200-
// Create affected components
201-
components := transformer.MaliciousAffectedComponentFromOSV(entry, entry.ID)
202-
// send both as a job to the db writer function
203-
results <- processingResults{
204-
Package: pkg,
205-
AffectedComponents: components,
206-
}
207-
}
208-
}
209-
21065
func buildFakePackages() ([]models.MaliciousPackage, []models.MaliciousAffectedComponent) {
21166
testPackages := map[string][]string{
21267
"npm": {"fake-malicious-npm-package", "@fake-org/malicious-package"},
@@ -223,8 +78,8 @@ func buildFakePackages() ([]models.MaliciousPackage, []models.MaliciousAffectedC
22378
for ecosystem, pkgNames := range testPackages {
22479
for _, pkgName := range pkgNames {
22580
normalizedPkgName := strings.NewReplacer("/", "-", "@", "-", ":", "-", ".", "-").Replace(pkgName)
226-
fakeID := fmt.Sprintf("FAKE-TEST-%s-%s", strings.ToUpper(ecosystem), strings.ToUpper(normalizedPkgName))
227-
fakeEntry := dtos.OSV{
81+
fakeID := fmt.Sprintf("MAL-FAKE-TEST-%s-%s", strings.ToUpper(ecosystem), strings.ToUpper(normalizedPkgName))
82+
fakeEntry := &dtos.OSV{
22883
ID: fakeID,
22984
Summary: fmt.Sprintf("Fake malicious %s package for testing", ecosystem),
23085
Details: "This is a fake malicious package entry used for testing the dependency proxy",

0 commit comments

Comments
 (0)