@@ -13,7 +13,10 @@ CREATE TABLE packages_universe (
1313 criticality_score numeric (10 , 4 ),
1414 rank_in_ecosystem int ,
1515 is_critical bool NOT NULL DEFAULT FALSE,
16- last_ranked_at timestamptz
16+ -- Renamed from last_ranked_at (original pckgs.md spec) to last_rank_pass_at to make the
17+ -- every-pass update semantic explicit: updated unconditionally on each ranking run,
18+ -- not only when the rank changes. Mirrors the same column added to packages.
19+ last_rank_pass_at timestamptz
1720);
1821
1922CREATE INDEX ON packages_universe (ecosystem, rank_in_ecosystem);
@@ -52,17 +55,26 @@ CREATE TABLE packages (
5255 dependent_packages_count int ,
5356 dependent_repos_count int ,
5457 downloads_last_month bigint ,
55- -- TODO: define semantics before enabling. Options:
56- -- a) fixed_version IS NULL on any advisory range → "no fix released yet" (simple, opt 2)
57- -- b) latest_version falls inside an affected range → "currently vulnerable" (correct, needs semver comparison per ecosystem)
5858 -- has_critical_vulnerability bool NOT NULL DEFAULT FALSE,
59+ -- Deferred: semantics undecided between (a) any advisory with no fixed_version vs
60+ -- (b) latest_version falls inside an affected semver range. Lateral join against
61+ -- advisory_packages used in queries until this is resolved.
5962 criticality_score numeric (10 , 4 ),
63+ -- is_critical and last_rank_pass_at are not in the original pckgs.md spec; added so
64+ -- the packages table can answer "is this package critical?" without joining packages_universe,
65+ -- which is an ephemeral ranking workspace and gets truncated on every weekly pass.
66+ is_critical bool NOT NULL DEFAULT FALSE,
67+ -- Set on every ranking pass (not just when rank changes) so queries can detect stale rows
68+ -- via last_rank_pass_at < NOW() - INTERVAL '8 days'.
69+ last_rank_pass_at timestamptz ,
6070 ingestion_source text ,
6171 last_synced_at timestamptz NOT NULL DEFAULT NOW()
6272);
6373
6474CREATE UNIQUE INDEX ON packages (ecosystem, COALESCE(namespace, ' ' ), name);
6575
76+ CREATE INDEX ON packages (is_critical) WHERE is_critical;
77+
6678CREATE INDEX ON packages (ecosystem, name);
6779
6880CREATE INDEX ON packages USING gin (keywords);
@@ -107,8 +119,11 @@ CREATE TABLE versions (
107119 ecosystem text NOT NULL ,
108120 number text NOT NULL ,
109121 published_at timestamptz ,
110- is_latest bool NOT NULL DEFAULT FALSE,
111- is_yanked bool NOT NULL DEFAULT FALSE,
122+ -- Nullable: deps.dev PackageVersions does not expose is_latest; set by the npm/maven enricher
123+ -- workers that have authoritative latest-version data. NULL = unknown (not yet enriched).
124+ is_latest bool,
125+ -- Nullable for same reason: yanked status comes from registry-specific workers, not deps.dev.
126+ is_yanked bool,
112127 is_prerelease bool NOT NULL DEFAULT FALSE,
113128 license text , -- SPDX where available; can differ per version
114129 download_count bigint , -- per-version where available (npm, crates)
@@ -472,14 +487,26 @@ CREATE TABLE repos (
472487 watchers int ,
473488 open_issues int ,
474489 last_commit_at timestamptz ,
475- archived bool NOT NULL DEFAULT FALSE,
476- disabled bool NOT NULL DEFAULT FALSE,
477- is_fork bool NOT NULL DEFAULT FALSE,
478- created_at timestamptz ,
490+ -- Nullable: deps.dev ProjectsLatest does not expose archived/disabled/is_fork.
491+ -- These are populated by the GitHub API enricher worker. NULL = not yet enriched.
492+ archived bool,
493+ disabled bool,
494+ is_fork bool,
495+ -- DEFAULT NOW() added: fallback when upstream source does not provide a creation timestamp.
496+ created_at timestamptz DEFAULT NOW(),
497+ homepage text ,
498+ -- raw_project_type/raw_project_name preserve deps.dev's original project identity (e.g. "GITLAB",
499+ -- "github.com/owner/repo") so self-hosted GitLab instances can be detected later without backfill.
500+ -- canonicalRepoUrl() uses these to build the canonical url; they remain queryable for debugging.
501+ raw_project_type text ,
502+ raw_project_name text ,
479503 -- Scorecard aggregate; per-check detail in repo_scorecard_checks
480504 scorecard_score numeric (3 , 1 ),
481505 scorecard_last_run_at timestamptz ,
482- last_synced_at timestamptz NOT NULL DEFAULT NOW()
506+ -- Nullable with no default: multiple enrichers (deps.dev, GitHub worker, Scorecard) each write
507+ -- different columns at different times. NOT NULL DEFAULT would stamp a "synced" timestamp on
508+ -- first insert even when most columns are still NULL, making freshness checks misleading.
509+ last_synced_at timestamptz
483510);
484511
485512CREATE INDEX ON repos (host, OWNER, name);
@@ -538,7 +565,9 @@ CREATE INDEX ON package_repos (repo_id);
538565-- ============================================================
539566CREATE TABLE advisories (
540567 id bigserial PRIMARY KEY ,
541- osv_id text UNIQUE NOT NULL ,
568+ osv_id text UNIQUE NOT NULL , -- SourceID from deps.dev BQ (GHSA-xxx, CVE-xxx, OSV-xxx, etc.)
569+ source text , -- 'GHSA' | 'OSV' | 'NVD' | 'NSWG' etc. (BQ: Source)
570+ source_url text , -- upstream advisory URL (BQ: SourceURL)
542571 aliases text [], -- CVE-XXXX, GHSA-...
543572 severity text , -- 'LOW' | 'MEDIUM' | 'HIGH' | 'CRITICAL'
544573 cvss numeric (3 , 1 ),
@@ -547,7 +576,7 @@ CREATE TABLE advisories (
547576 summary text ,
548577 details text ,
549578 published_at timestamptz ,
550- modified_at timestamptz
579+ modified_at timestamptz -- NULL for BQ-sourced rows; tracked in-house on re-sync
551580);
552581
553582-- osv_id index omitted: UNIQUE constraint above already creates one.
@@ -574,12 +603,16 @@ WHERE
574603
575604-- Version ranges affected by an advisory per package.
576605-- COALESCE prevents silent duplicates when introduced_version is NULL.
606+ -- BQ-sourced rows populate range_raw / unaffected_raw only; introduced/fixed/last_affected
607+ -- are populated by a future range-parsing workstream.
-- One row per affected version range of an (advisory, package) pair.
-- The structured bounds (introduced_version / fixed_version / last_affected)
-- and the raw deps.dev strings (range_raw / unaffected_raw) are all nullable,
-- so a row may carry either representation or both.
CREATE TABLE advisory_affected_ranges (
    id bigserial PRIMARY KEY,
    advisory_package_id bigint NOT NULL REFERENCES advisory_packages (id),
    -- NOTE(review): the unique index declared after this table keys on
    -- (advisory_package_id, COALESCE(introduced_version, ...)), so rows that
    -- leave introduced_version NULL are limited to one per advisory_package_id.
    -- Confirm that ingestion paths which only fill range_raw / unaffected_raw
    -- never need more than one row per advisory_package_id.
    introduced_version text, -- NULL = unknown start
    fixed_version text, -- NULL = no fix yet
    last_affected text, -- NULL = no known upper bound
    range_raw text, -- raw AffectedVersions string from deps.dev BQ
    unaffected_raw text -- raw UnaffectedVersions string from deps.dev BQ
);
584617
585618CREATE UNIQUE INDEX ON advisory_affected_ranges (advisory_package_id, COALESCE(introduced_version, ' ' ));
@@ -650,3 +683,98 @@ CREATE TABLE downloads_daily (
650683 UNIQUE (package_id, date )
651684)
652685PARTITION BY RANGE (date );
686+
-- ============================================================
-- CRITICALITY RANKING FUNCTION
-- ============================================================
-- rank_packages_universe: runs one ranking pass over packages_universe and
-- copies the resulting scores onto matching packages rows.
--
-- Parameters:
--   weight_downloads            multiplier for LN(log_smoothing + downloads_30d)
--   weight_dependent_repos      multiplier for LN(log_smoothing + dependent_repos_count)
--   weight_dependent_packages   multiplier for LN(log_smoothing + dependent_packages_count)
--   log_smoothing               constant added inside LN(); must be > 0 because
--                               NULL counts are treated as 0. Typically 1.0.
--   critical_top_n_by_ecosystem jsonb object mapping ecosystem name -> N; a row
--                               is critical when its rank within the ecosystem
--                               is <= N. Ecosystems missing from the object
--                               (or a NULL argument) yield is_critical = FALSE.
--
-- Returns a single row:
--   scored_rows      packages_universe rows whose criticality_score changed
--   ranked_rows      packages_universe rows whose rank or is_critical changed
--   propagated_rows  packages rows whose criticality_score changed
--
-- Raises invalid_parameter_value when a weight is NULL or when log_smoothing
-- is NULL or not strictly positive.
CREATE OR REPLACE FUNCTION rank_packages_universe (
    weight_downloads numeric,
    weight_dependent_repos numeric,
    weight_dependent_packages numeric,
    log_smoothing numeric,
    critical_top_n_by_ecosystem jsonb
)
RETURNS TABLE (scored_rows int, ranked_rows int, propagated_rows int)
LANGUAGE plpgsql
AS $$
DECLARE
    n_scored int;
    n_ranked int;
    n_propagated int;
BEGIN
    -- Argument checks. A NULL weight or NULL log_smoothing would turn every
    -- computed score into NULL, and those NULLs would then be propagated
    -- onto packages; fail loudly before touching any row instead.
    IF weight_downloads IS NULL
        OR weight_dependent_repos IS NULL
        OR weight_dependent_packages IS NULL THEN
        RAISE EXCEPTION 'rank_packages_universe: weights must not be NULL'
            USING ERRCODE = 'invalid_parameter_value';
    END IF;

    -- NULL counts are coalesced to 0 below, so a non-positive log_smoothing
    -- would feed LN() a non-positive argument on zero-count rows.
    IF log_smoothing IS NULL OR log_smoothing <= 0 THEN
        RAISE EXCEPTION 'rank_packages_universe: log_smoothing must be > 0 (got %)', log_smoothing
            USING ERRCODE = 'invalid_parameter_value';
    END IF;

    -- Step 0: stamp every row with the time of this pass.
    -- Intentionally unfiltered: last_rank_pass_at records that a pass ran,
    -- independent of whether the row's score or rank changed.
    UPDATE packages_universe
    SET last_rank_pass_at = NOW();

    -- Step 1: recompute scores; only touch rows whose score changed.
    -- log_smoothing is added before LN() to avoid LN(0) on zero-count rows
    -- and to compress the gap between small and large values (e.g. LN(1)=0
    -- vs LN(2)≈0.69 gives a gentler floor than LN(0)=-∞).
    --
    -- Until the npm-registry / Maven downloads enricher runs, downloads_30d
    -- is NULL on every row, so the downloads term is the same constant
    -- LN(log_smoothing) * weight_downloads for every row (exactly 0 when
    -- log_smoothing = 1). It therefore does not affect ordering, and ranking
    -- effectively reduces to:
    --     LN(log_smoothing + dependent_repos_count)    * weight_dependent_repos
    --   + LN(log_smoothing + dependent_packages_count) * weight_dependent_packages
    WITH new_scores AS (
        SELECT
            id,
            (
                LN(log_smoothing + COALESCE(downloads_30d, 0)) * weight_downloads
                + LN(log_smoothing + COALESCE(dependent_repos_count, 0)) * weight_dependent_repos
                + LN(log_smoothing + COALESCE(dependent_packages_count, 0)) * weight_dependent_packages
            )::numeric(10, 4) AS new_score
        FROM packages_universe
    )
    UPDATE packages_universe pu
    SET criticality_score = ns.new_score
    FROM new_scores ns
    WHERE pu.id = ns.id
        AND pu.criticality_score IS DISTINCT FROM ns.new_score;

    GET DIAGNOSTICS n_scored = ROW_COUNT;

    -- Step 2: rank within ecosystem; flag is_critical via JSONB lookup.
    -- Only purl-having rows are ranked (null purls can't propagate to packages).
    -- Tie-break by id keeps ranks deterministic across runs so IS DISTINCT FROM
    -- doesn't rewrite equal-score rows on every call.
    WITH ranked AS (
        SELECT
            id,
            ecosystem,
            ROW_NUMBER() OVER (
                PARTITION BY ecosystem
                ORDER BY criticality_score DESC NULLS LAST, id
            ) AS r
        FROM packages_universe
        WHERE purl IS NOT NULL
    ),
    with_flag AS (
        SELECT
            id,
            r,
            -- ->> yields NULL for an ecosystem missing from the object (or a
            -- NULL argument); the comparison is then NULL and COALESCE maps
            -- it to FALSE.
            COALESCE(
                r <= (critical_top_n_by_ecosystem ->> ecosystem)::int,
                FALSE
            ) AS new_is_critical
        FROM ranked
    )
    UPDATE packages_universe pu
    SET rank_in_ecosystem = wf.r,
        is_critical = wf.new_is_critical
    FROM with_flag wf
    WHERE pu.id = wf.id
        AND (
            pu.rank_in_ecosystem IS DISTINCT FROM wf.r
            OR pu.is_critical IS DISTINCT FROM wf.new_is_critical
        );

    GET DIAGNOSTICS n_ranked = ROW_COUNT;

    -- Step 3: propagate criticality_score onto Tier-2 packages rows.
    -- NOTE(review): only criticality_score is copied here; packages.is_critical
    -- and packages.last_rank_pass_at are not written by this function.
    -- Presumably they are maintained elsewhere — confirm, otherwise they stay
    -- at their defaults.
    -- NOTE(review): the join assumes at most one packages_universe row per
    -- purl; with duplicates, UPDATE ... FROM picks an arbitrary match — confirm.
    UPDATE packages p
    SET criticality_score = pu.criticality_score
    FROM packages_universe pu
    WHERE p.purl = pu.purl
        AND p.criticality_score IS DISTINCT FROM pu.criticality_score;

    GET DIAGNOSTICS n_propagated = ROW_COUNT;

    RETURN QUERY SELECT n_scored, n_ranked, n_propagated;
END;
$$;
0 commit comments