Skip to content

Commit af6a460

Browse files
joanreyero and Copilot authored
feat: osv advisories ingestion (#4149)
Signed-off-by: Joan Reyero <joan@reyero.io>
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
1 parent 2f3f03d commit af6a460

30 files changed

Lines changed: 2652 additions & 104 deletions

backend/.env.dist.local

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -183,3 +183,14 @@ ENRICHER_GITHUB_TOKENS=
183183
ENRICHER_BATCH_SIZE=100
184184
ENRICHER_REPO_UPDATE_INTERVAL_HOURS=24
185185
ENRICHER_IDLE_SLEEP_SEC=60
186+
187+
# osv-sync (Temporal-scheduled; see services/apps/packages_worker/src/osv/schedule.ts)
188+
# OSV_ECOSYSTEMS uses OSV's canonical bucket case (npm lowercase, Maven titlecase) because
189+
# the bucket URL <BASE>/<ecosystem>/all.zip is case-sensitive (Maven/all.zip exists,
190+
# maven/all.zip 404s). The allowlist check and DB storage normalize to lowercase
191+
# internally per ADR-0001 §OSV "Ecosystem normalization", so downstream stays lowercase.
192+
OSV_BULK_BASE_URL=https://osv-vulnerabilities.storage.googleapis.com
193+
OSV_ECOSYSTEMS=npm,Maven
194+
OSV_TMP_DIR=/tmp/osv
195+
OSV_BATCH_SIZE=500
196+
OSV_DERIVE_BATCH_SIZE=1000

backend/src/osspckgs/migrations/V1779710880__initial_schema.sql

Lines changed: 53 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -58,10 +58,12 @@ CREATE TABLE packages (
5858
latest_release_at timestamptz,
5959
dependent_packages_count int,
6060
dependent_repos_count int,
61-
-- has_critical_vulnerability bool NOT NULL DEFAULT FALSE,
62-
-- Deferred: semantics undecided between (a) any advisory with no fixed_version vs
63-
-- (b) latest_version falls inside an affected semver range. Lateral join against
64-
-- advisory_packages used in queries until this is resolved.
61+
-- has_critical_vulnerability: TRUE iff latest_version is inside an active
62+
-- affected range of a critical advisory (CVSS >= 7.0) OR a MAL-* malicious-
63+
-- package advisory matches the package. Maintained by the deriveCriticalFlag
64+
-- activity in packages_worker/src/osv/. See ADR-0001 §`has_critical_vulnerability`
65+
-- semantics for the option-b + MAL- override rationale.
66+
has_critical_vulnerability bool NOT NULL DEFAULT FALSE,
6567
criticality_score numeric(10, 4),
6668
-- is_critical and last_rank_pass_at are not in the original pckgs.md spec; added so
6769
-- the packages table can answer "is this package critical?" without joining packages_universe,
@@ -82,8 +84,12 @@ CREATE INDEX ON packages (ecosystem, name);
8284

8385
CREATE INDEX ON packages USING gin (keywords);
8486

85-
-- INDEX on has_critical_vulnerability removed — column is commented out above.
86-
-- Uncomment both when semantics are decided.
87+
-- Partial index on has_critical_vulnerability TRUE rows only — that's the bucket
88+
-- the security overlay query needs ("list all packages with a known critical
89+
-- vuln"). The FALSE rows dominate the table and don't need an index.
90+
CREATE INDEX ON packages (has_critical_vulnerability)
91+
WHERE
92+
has_critical_vulnerability;
8793

8894
CREATE INDEX ON packages (criticality_score DESC)
8995
WHERE
@@ -569,6 +575,15 @@ CREATE TABLE advisories (
569575
aliases text[], -- CVE-XXXX, GHSA-...
570576
severity text, -- 'LOW' | 'MEDIUM' | 'HIGH' | 'CRITICAL'
571577
cvss numeric(3, 1),
578+
-- Provenance of the cvss value above. Lets downstream consumers distinguish
579+
-- a real vendor-supplied vector from a synthesized qualitative fallback.
580+
-- See ADR-0001 §CVSS scoring strategy. Allowed values:
581+
-- 'osv_cvss_v3' numeric score from a CVSS_V3 vector
582+
-- 'osv_cvss_v4' reserved; v4 numeric scoring deferred
583+
-- 'osv_qualitative_fallback' synthesized from database_specific.severity
584+
-- 'osv_malicious_package' MAL-* id with no CVSS vector
585+
-- Extensible to 'ghsa' | 'nvd' as additional sources come online.
586+
cvss_source text,
572587
-- >= 7.0 intentional: treat HIGH + CRITICAL both as actionable
573588
is_critical bool GENERATED ALWAYS AS (cvss >= 7.0) STORED,
574589
summary text,
@@ -599,10 +614,27 @@ CREATE INDEX ON advisory_packages (package_id)
599614
WHERE
600615
package_id IS NOT NULL;
601616

602-
-- Version ranges affected by an advisory per package.
617+
-- Drives the resolveMissingPackageIds catch-up UPDATE in deriveCriticalFlag:
618+
-- the query filters WHERE package_id IS NULL and joins on (ecosystem,
619+
-- package_name). The non-partial (ecosystem, package_name) index above is
620+
-- usable here too (the planner just adds a Filter on package_id IS NULL), but
621+
-- as the table grows the vast majority of rows have package_id IS NOT NULL,
622+
-- so the non-partial scan ends up filtering out most of what it reads. This
623+
-- partial index only contains the still-unresolved rows, keeping it tiny
624+
-- regardless of total table size and making the daily catch-up O(unresolved)
625+
-- instead of O(total).
626+
CREATE INDEX ON advisory_packages (ecosystem, package_name)
627+
WHERE
628+
package_id IS NULL;
629+
630+
-- Version ranges affected by an advisory per package. Populated by the OSV
631+
-- ingest worker (packages_worker/src/osv) using introduced_version /
632+
-- fixed_version / last_affected. range_raw / unaffected_raw are reserved
633+
-- for the deps.dev BQ ingest worker (future): that worker writes the raw
634+
-- range strings without parsing into structured boundaries. The OSV upsert
635+
-- path only deletes rows where range_raw / unaffected_raw are both NULL,
636+
-- so deps.dev rows are not clobbered when OSV re-syncs.
603637
-- COALESCE prevents silent duplicates when introduced_version, fixed_version, or last_affected is NULL.
604-
-- BQ-sourced rows populate range_raw / unaffected_raw only; introduced/fixed/last_affected
605-
-- are populated by a future range-parsing workstream.
606638
CREATE TABLE advisory_affected_ranges (
607639
id bigserial PRIMARY KEY,
608640
advisory_package_id bigint NOT NULL REFERENCES advisory_packages (id),
@@ -613,7 +645,18 @@ CREATE TABLE advisory_affected_ranges (
613645
unaffected_raw text -- raw UnaffectedVersions string from deps.dev BQ
614646
);
615647

616-
CREATE UNIQUE INDEX ON advisory_affected_ranges (advisory_package_id, COALESCE(introduced_version, ''));
648+
-- Full-tuple uniqueness so two ranges sharing introduced_version but differing
649+
-- in fixed_version or last_affected (cross-distro patches, partial fixes in a
650+
-- single advisory) both survive insertion. The narrower (advisory_package_id,
651+
-- introduced_version) form silently collapsed those cases to one row, dropping
652+
-- the wider range and under-reporting vulnerable windows in the derive step.
653+
-- See ADR-0001 §`advisory_affected_ranges` uniqueness scope.
654+
CREATE UNIQUE INDEX ON advisory_affected_ranges (
655+
advisory_package_id,
656+
COALESCE(introduced_version, ''),
657+
COALESCE(fixed_version, ''),
658+
COALESCE(last_affected, '')
659+
);
617660

618661
CREATE INDEX ON advisory_affected_ranges (advisory_package_id);
619662

0 commit comments

Comments
 (0)