Skip to content

Commit fe68bea

Browse files
authored
chore: schema changes (#4148)
Signed-off-by: Uroš Marolt <uros@marolt.me>
1 parent afd950e commit fe68bea

3 files changed

Lines changed: 143 additions & 105 deletions

File tree

backend/src/osspckgs/migrations/V1779710880__initial_schema.sql

Lines changed: 143 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@ CREATE TABLE packages_universe (
1313
criticality_score numeric(10, 4),
1414
rank_in_ecosystem int,
1515
is_critical bool NOT NULL DEFAULT FALSE,
16-
last_ranked_at timestamptz
16+
-- Renamed from last_ranked_at (original pckgs.md spec) to last_rank_pass_at to make the
17+
-- every-pass update semantic explicit: updated unconditionally on each ranking run,
18+
-- not only when the rank changes. Mirrors the same column added to packages.
19+
last_rank_pass_at timestamptz
1720
);
1821

1922
CREATE INDEX ON packages_universe (ecosystem, rank_in_ecosystem);
@@ -52,17 +55,26 @@ CREATE TABLE packages (
5255
dependent_packages_count int,
5356
dependent_repos_count int,
5457
downloads_last_month bigint,
55-
-- TODO: define semantics before enabling. Options:
56-
-- a) fixed_version IS NULL on any advisory range → "no fix released yet" (simple, opt 2)
57-
-- b) latest_version falls inside an affected range → "currently vulnerable" (correct, needs semver comparison per ecosystem)
5858
-- has_critical_vulnerability bool NOT NULL DEFAULT FALSE,
59+
-- Deferred: semantics undecided between (a) any advisory with no fixed_version vs
60+
-- (b) latest_version falls inside an affected semver range. Lateral join against
61+
-- advisory_packages used in queries until this is resolved.
5962
criticality_score numeric(10, 4),
63+
-- is_critical and last_rank_pass_at are not in the original pckgs.md spec; added so
64+
-- the packages table can answer "is this package critical?" without joining packages_universe,
65+
-- which is an ephemeral ranking workspace and gets truncated on every weekly pass.
66+
is_critical bool NOT NULL DEFAULT FALSE,
67+
-- Set on every ranking pass (not just when rank changes) so queries can detect stale rows
68+
-- via last_rank_pass_at < NOW() - INTERVAL '8 days'.
69+
last_rank_pass_at timestamptz,
6070
ingestion_source text,
6171
last_synced_at timestamptz NOT NULL DEFAULT NOW()
6272
);
6373

6474
CREATE UNIQUE INDEX ON packages (ecosystem, COALESCE(namespace, ''), name);
6575

76+
CREATE INDEX ON packages (is_critical) WHERE is_critical;
77+
6678
CREATE INDEX ON packages (ecosystem, name);
6779

6880
CREATE INDEX ON packages USING gin (keywords);
@@ -107,8 +119,11 @@ CREATE TABLE versions (
107119
ecosystem text NOT NULL,
108120
number text NOT NULL,
109121
published_at timestamptz,
110-
is_latest bool NOT NULL DEFAULT FALSE,
111-
is_yanked bool NOT NULL DEFAULT FALSE,
122+
-- Nullable: deps.dev PackageVersions does not expose is_latest; set by the npm/maven enricher
123+
-- workers that have authoritative latest-version data. NULL = unknown (not yet enriched).
124+
is_latest bool,
125+
-- Nullable for same reason: yanked status comes from registry-specific workers, not deps.dev.
126+
is_yanked bool,
112127
is_prerelease bool NOT NULL DEFAULT FALSE,
113128
license text, -- SPDX where available; can differ per version
114129
download_count bigint, -- per-version where available (npm, crates)
@@ -472,14 +487,26 @@ CREATE TABLE repos (
472487
watchers int,
473488
open_issues int,
474489
last_commit_at timestamptz,
475-
archived bool NOT NULL DEFAULT FALSE,
476-
disabled bool NOT NULL DEFAULT FALSE,
477-
is_fork bool NOT NULL DEFAULT FALSE,
478-
created_at timestamptz,
490+
-- Nullable: deps.dev ProjectsLatest does not expose archived/disabled/is_fork.
491+
-- These are populated by the GitHub API enricher worker. NULL = not yet enriched.
492+
archived bool,
493+
disabled bool,
494+
is_fork bool,
495+
-- DEFAULT NOW() added: fallback when upstream source does not provide a creation timestamp.
496+
created_at timestamptz DEFAULT NOW(),
497+
homepage text,
498+
-- raw_project_type/raw_project_name preserve deps.dev's original project identity (e.g. "GITLAB",
499+
-- "github.com/owner/repo") so self-hosted GitLab instances can be detected later without backfill.
500+
-- canonicalRepoUrl() uses these to build the canonical url; they remain queryable for debugging.
501+
raw_project_type text,
502+
raw_project_name text,
479503
-- Scorecard aggregate; per-check detail in repo_scorecard_checks
480504
scorecard_score numeric(3, 1),
481505
scorecard_last_run_at timestamptz,
482-
last_synced_at timestamptz NOT NULL DEFAULT NOW()
506+
-- Nullable with no default: multiple enrichers (deps.dev, GitHub worker, Scorecard) each write
507+
-- different columns at different times. NOT NULL DEFAULT would stamp a "synced" timestamp on
508+
-- first insert even when most columns are still NULL, making freshness checks misleading.
509+
last_synced_at timestamptz
483510
);
484511

485512
CREATE INDEX ON repos (host, OWNER, name);
@@ -538,7 +565,9 @@ CREATE INDEX ON package_repos (repo_id);
538565
-- ============================================================
539566
CREATE TABLE advisories (
540567
id bigserial PRIMARY KEY,
541-
osv_id text UNIQUE NOT NULL,
568+
osv_id text UNIQUE NOT NULL, -- SourceID from deps.dev BQ (GHSA-xxx, CVE-xxx, OSV-xxx, etc.)
569+
source text, -- 'GHSA' | 'OSV' | 'NVD' | 'NSWG' etc. (BQ: Source)
570+
source_url text, -- upstream advisory URL (BQ: SourceURL)
542571
aliases text[], -- CVE-XXXX, GHSA-...
543572
severity text, -- 'LOW' | 'MEDIUM' | 'HIGH' | 'CRITICAL'
544573
cvss numeric(3, 1),
@@ -547,7 +576,7 @@ CREATE TABLE advisories (
547576
summary text,
548577
details text,
549578
published_at timestamptz,
550-
modified_at timestamptz
579+
modified_at timestamptz -- NULL for BQ-sourced rows; tracked in-house on re-sync
551580
);
552581

553582
-- osv_id index omitted: UNIQUE constraint above already creates one.
@@ -574,12 +603,16 @@ WHERE
574603

575604
-- Version ranges affected by an advisory per package.
576605
-- COALESCE prevents silent duplicates when introduced_version is NULL.
606+
-- BQ-sourced rows populate range_raw / unaffected_raw only; introduced/fixed/last_affected
607+
-- are populated by a future range-parsing workstream.
577608
CREATE TABLE advisory_affected_ranges (
    -- Surrogate key.
    id bigserial,
    -- The advisory_packages row this range belongs to.
    advisory_package_id bigint NOT NULL,
    -- Parsed lower bound. NULL = unknown start.
    introduced_version text,
    -- Parsed version carrying the fix. NULL = no fix yet.
    fixed_version text,
    -- Parsed last affected version. NULL = no known upper bound.
    last_affected text,
    -- Raw AffectedVersions string from deps.dev BQ.
    range_raw text,
    -- Raw UnaffectedVersions string from deps.dev BQ.
    unaffected_raw text,
    PRIMARY KEY (id),
    FOREIGN KEY (advisory_package_id) REFERENCES advisory_packages (id)
);
584617

585618
CREATE UNIQUE INDEX ON advisory_affected_ranges (advisory_package_id, COALESCE(introduced_version, ''));
@@ -650,3 +683,98 @@ CREATE TABLE downloads_daily (
650683
UNIQUE (package_id, date)
651684
)
652685
PARTITION BY RANGE (date);
686+
687+
-- ============================================================
688+
-- CRITICALITY RANKING FUNCTION
689+
-- ============================================================
690+
-- rank_packages_universe: runs one criticality-ranking pass.
--
-- Parameters:
--   weight_downloads             multiplier for the LN(downloads_30d) term
--   weight_dependent_repos       multiplier for the LN(dependent_repos_count) term
--   weight_dependent_packages    multiplier for the LN(dependent_packages_count) term
--   log_smoothing                added to each count before LN(); must be > 0 (typically 1.0)
--   critical_top_n_by_ecosystem  JSONB object mapping ecosystem -> top-N cutoff; an ecosystem
--                                with no entry (or a NULL argument) yields is_critical = FALSE
--
-- Returns one row:
--   scored_rows      packages_universe rows whose criticality_score changed
--   ranked_rows      packages_universe rows whose rank_in_ecosystem or is_critical changed
--   propagated_rows  packages rows whose criticality_score changed
--
-- Raises invalid_parameter_value when a weight or log_smoothing is NULL, or when
-- log_smoothing is not strictly positive.
CREATE OR REPLACE FUNCTION rank_packages_universe(
    weight_downloads numeric,
    weight_dependent_repos numeric,
    weight_dependent_packages numeric,
    log_smoothing numeric,
    critical_top_n_by_ecosystem jsonb
)
RETURNS TABLE (scored_rows int, ranked_rows int, propagated_rows int)
LANGUAGE plpgsql
AS $$
DECLARE
    n_scored int;
    n_ranked int;
    n_propagated int;
BEGIN
    -- Step 0: validate arguments before touching any table.
    -- A NULL weight or NULL log_smoothing makes every computed score NULL; the
    -- IS DISTINCT FROM update below would then overwrite all existing scores and
    -- the ranking would degrade to plain id order.
    IF weight_downloads IS NULL
        OR weight_dependent_repos IS NULL
        OR weight_dependent_packages IS NULL THEN
        RAISE EXCEPTION 'rank_packages_universe: weights must not be NULL'
            USING ERRCODE = 'invalid_parameter_value';
    END IF;

    -- Missing counts are coalesced to 0, so LN() receives log_smoothing itself on
    -- those rows; a non-positive value would fail inside LN() part-way through the pass.
    IF log_smoothing IS NULL OR log_smoothing <= 0 THEN
        RAISE EXCEPTION 'rank_packages_universe: log_smoothing must be > 0 (got %)', log_smoothing
            USING ERRCODE = 'invalid_parameter_value';
    END IF;

    -- Stamp every row on every pass. The missing WHERE clause is intentional:
    -- last_rank_pass_at records that a pass ran, not that a rank changed.
    UPDATE packages_universe SET last_rank_pass_at = NOW();

    -- Step 1: recompute scores; only write rows whose score changed.
    -- log_smoothing is added before LN() to avoid LN(0) on zero-count rows
    -- and to compress the gap between small and large values. Typically 1.0.
    --
    -- Until the npm-registry / Maven downloads enricher runs, downloads_30d
    -- is NULL on every row. The downloads term is then the same constant,
    -- LN(log_smoothing) * weight_downloads, on every row (0 when log_smoothing = 1),
    -- so ordering is decided by:
    --     LN(log_smoothing + dependent_repos_count)    * weight_dependent_repos
    --   + LN(log_smoothing + dependent_packages_count) * weight_dependent_packages
    WITH new_scores AS (
        SELECT
            id,
            (   LN(log_smoothing + COALESCE(downloads_30d, 0))            * weight_downloads
              + LN(log_smoothing + COALESCE(dependent_repos_count, 0))    * weight_dependent_repos
              + LN(log_smoothing + COALESCE(dependent_packages_count, 0)) * weight_dependent_packages
            )::numeric(10, 4) AS new_score
        FROM packages_universe
    )
    UPDATE packages_universe pu
    SET criticality_score = ns.new_score
    FROM new_scores ns
    WHERE pu.id = ns.id
        AND pu.criticality_score IS DISTINCT FROM ns.new_score;

    GET DIAGNOSTICS n_scored = ROW_COUNT;

    -- Step 2: rank within ecosystem; flag is_critical via JSONB lookup.
    -- Only rows with a purl are ranked (null purls can't propagate to packages).
    -- The id tie-break keeps ranks stable across runs, so rows with equal scores
    -- are not rewritten on every call.
    WITH ranked AS (
        SELECT
            id,
            ecosystem,
            ROW_NUMBER() OVER (
                PARTITION BY ecosystem
                ORDER BY criticality_score DESC NULLS LAST, id
            ) AS r
        FROM packages_universe
        WHERE purl IS NOT NULL
    ),
    with_flag AS (
        SELECT
            id,
            r,
            -- A missing ecosystem key makes the comparison NULL; treat that as not critical.
            COALESCE(
                r <= (critical_top_n_by_ecosystem ->> ecosystem)::int,
                FALSE
            ) AS new_is_critical
        FROM ranked
    )
    UPDATE packages_universe pu
    SET rank_in_ecosystem = wf.r,
        is_critical = wf.new_is_critical
    FROM with_flag wf
    WHERE pu.id = wf.id
        AND (   pu.rank_in_ecosystem IS DISTINCT FROM wf.r
             OR pu.is_critical IS DISTINCT FROM wf.new_is_critical );

    GET DIAGNOSTICS n_ranked = ROW_COUNT;

    -- Step 3: propagate criticality_score onto Tier-2 packages rows, matched by purl.
    -- NOTE(review): only criticality_score is copied here; packages.is_critical and
    -- packages.last_rank_pass_at are not written by this function. Presumably another
    -- writer maintains them — confirm.
    UPDATE packages p
    SET criticality_score = pu.criticality_score
    FROM packages_universe pu
    WHERE p.purl = pu.purl
        AND p.criticality_score IS DISTINCT FROM pu.criticality_score;

    GET DIAGNOSTICS n_propagated = ROW_COUNT;

    RETURN QUERY SELECT n_scored, n_ranked, n_propagated;
END;
$$;

backend/src/osspckgs/migrations/V1779799200__repos_default_created_and_last_synced.sql

Lines changed: 0 additions & 6 deletions
This file was deleted.

backend/src/osspckgs/migrations/V1779885600__criticality_ranking_function.sql

Lines changed: 0 additions & 84 deletions
This file was deleted.

0 commit comments

Comments
 (0)