Skip to content

Commit a3162dc

Browse files
authored
chore: deprecated packages_universe and moved the columns to packages (CM-1225) (#4182)
Signed-off-by: Uroš Marolt <uros@marolt.me>
1 parent 26e735a commit a3162dc

18 files changed

Lines changed: 297 additions & 173 deletions

File tree

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
-- Retire packages_universe (Tier 3 workspace). All signals are migrated onto
2+
-- packages, rank_packages_universe() is replaced by rank_packages() which operates on packages directly,
3+
-- and the table is dropped.
4+
--
5+
-- Columns migrated from packages_universe → packages:
6+
-- downloads_last_30d bigint (npm 30-day window cache for ranking)
7+
-- centrality_score numeric(10,8) (PageRank, stored for future formula use)
8+
-- rank_in_ecosystem int (computed by rank_packages)
9+
--
10+
-- rank_packages_universe() → rank_packages() changes:
11+
-- - Operates on packages directly; no more TRUNCATE/INSERT workspace.
12+
-- - Scope limited to ecosystems present in critical_top_n_by_ecosystem JSONB
13+
-- (dynamic — add an ecosystem to the JSONB to include it in ranking).
14+
-- - "Propagate to packages" step removed (packages IS the target now).
15+
-- - Return column propagated_rows removed (no propagation step).
16+
17+
-- ── 1. Add missing columns to packages ────────────────────────────────────────
18+
19+
-- Bring the three ranking signals that used to live on packages_universe onto
-- packages. Each ADD is guarded with IF NOT EXISTS, so columns that are
-- already present are left as they are.
ALTER TABLE packages
    ADD COLUMN IF NOT EXISTS downloads_last_30d bigint,         -- 30-day download count
    ADD COLUMN IF NOT EXISTS centrality_score   numeric(10, 8), -- PageRank score
    ADD COLUMN IF NOT EXISTS rank_in_ecosystem  integer;        -- written by rank_packages()
23+
24+
-- ── 2. Back-fill from packages_universe ───────────────────────────────────────
25+
26+
-- Copy the three signals from packages_universe onto the packages row that
-- shares the same purl. The row-wise IS DISTINCT FROM guard is NULL-safe and
-- skips rows whose three values already match, so no-op writes are avoided.
UPDATE packages AS pkg
SET (downloads_last_30d, centrality_score, rank_in_ecosystem)
    = (src.downloads_last_30d, src.centrality_score, src.rank_in_ecosystem)
FROM packages_universe AS src
WHERE pkg.purl = src.purl
  AND ROW(pkg.downloads_last_30d, pkg.centrality_score, pkg.rank_in_ecosystem)
      IS DISTINCT FROM
      ROW(src.downloads_last_30d, src.centrality_score, src.rank_in_ecosystem);
37+
38+
-- ── 3. Replace rank_packages_universe() ───────────────────────────────────────
39+
-- Two overloads exist in the schema:
40+
-- V1779710880 created (numeric, numeric, numeric, numeric, jsonb) — 5 params
41+
-- V1780589607 CREATE OR REPLACE'd with (numeric, numeric, numeric, jsonb) — different
42+
-- signature, so it added a second overload rather than replacing the first.
43+
-- Both must be dropped; only one new 4-param version is created.
44+
45+
-- Remove both overloads of the retired function in a single statement:
-- the 5-parameter signature first, then the 4-parameter one.
DROP FUNCTION IF EXISTS
    rank_packages_universe(numeric, numeric, numeric, numeric, jsonb),
    rank_packages_universe(numeric, numeric, numeric, jsonb);
47+
48+
-- Usage:
49+
-- -- with defaults (weights 0.25/0.25/0.50, built-in top-N budget)
50+
-- SELECT * FROM rank_packages();
51+
--
52+
-- -- with custom weights and/or a different top-N budget
53+
-- SELECT * FROM rank_packages(
54+
-- 0.20, 0.30, 0.50,
55+
-- '{"npm": 400000, "maven": 200000, "cargo": 75000}'::jsonb
56+
-- );
57+
58+
-- rank_packages() — score, rank, and flag packages in one pass.
59+
--
60+
-- Formula:
61+
-- impact = w_downloads * pct_rank( LOG(1 + downloads_last_30d) ) within ecosystem
62+
-- + w_dep_pkgs * pct_rank( LOG(1 + dependent_count) ) within ecosystem
63+
-- + w_transitive * pct_rank( LOG(1 + transitive_dependent_count) ) within ecosystem
64+
--
65+
-- Steps:
66+
-- 1. Score — compute impact via weighted PERCENT_RANK() (scoped to JSONB ecosystems)
67+
-- 2. Rank — ROW_NUMBER() per ecosystem, flag top-N as is_critical (scoped to JSONB ecosystems)
68+
-- 2.5 Spotlight — force is_critical = TRUE for rows in package_criticality_spotlight
69+
-- 3. Stamp — unconditionally set last_rank_pass_at on all scored rows (schema contract)
70+
--
71+
-- All weights and the top-N budget are call-time parameters.
72+
-- ROW_NUMBER() (not RANK()) keeps each ecosystem's critical set exactly at top-N.
73+
-- Only ecosystems present as keys in critical_top_n_by_ecosystem are scored/ranked;
74+
-- packages from other ecosystems are not touched.
75+
76+
-- rank_packages(): score, rank and flag rows of packages in one call.
--
-- Parameters:
--   weight_downloads            multiplier for PERCENT_RANK over downloads_last_30d
--   weight_dependent_packages   multiplier for PERCENT_RANK over dependent_count
--   weight_transitive           multiplier for PERCENT_RANK over transitive_dependent_count
--   critical_top_n_by_ecosystem JSONB object; its keys select the ecosystems that are
--                               scored/ranked/stamped, its values (cast to int) are the
--                               per-ecosystem top-N budget for is_critical.
--
-- Returns a single row:
--   scored_rows  number of rows whose impact value actually changed in Step 1
--   ranked_rows  number of rows whose rank_in_ecosystem or is_critical changed in Step 2
--   (both are ROW_COUNT of change-guarded UPDATEs, not the size of the scored set).
CREATE OR REPLACE FUNCTION rank_packages(
    weight_downloads            numeric DEFAULT 0.25,
    weight_dependent_packages   numeric DEFAULT 0.25,
    weight_transitive           numeric DEFAULT 0.50,
    critical_top_n_by_ecosystem jsonb   DEFAULT '{"npm":400000,"go":100000,"maven":200000,"pypi":100000,"nuget":50000,"cargo":75000}'::jsonb
)
RETURNS TABLE(scored_rows int, ranked_rows int)
LANGUAGE plpgsql AS $$
DECLARE
    n_scored int;  -- rows rewritten by Step 1
    n_ranked int;  -- rows rewritten by Step 2
BEGIN
    -- ── Step 1: score ──────────────────────────────────────────────────────────
    -- impact = weighted sum of three per-ecosystem PERCENT_RANK() values, rounded
    -- to numeric(10, 4). NULL signals are treated as 0. Every row of an in-scope
    -- ecosystem is scored (no purl filter here, unlike Step 2).
    WITH percentile_scores AS (
        SELECT
            id,
            (
                weight_downloads * PERCENT_RANK() OVER (
                    PARTITION BY ecosystem
                    ORDER BY LOG(1 + COALESCE(downloads_last_30d, 0)))

              + weight_dependent_packages * PERCENT_RANK() OVER (
                    PARTITION BY ecosystem
                    ORDER BY LOG(1 + COALESCE(dependent_count, 0)))

              + weight_transitive * PERCENT_RANK() OVER (
                    PARTITION BY ecosystem
                    ORDER BY LOG(1 + COALESCE(transitive_dependent_count, 0)))
            )::numeric(10, 4) AS new_impact
        FROM packages
        WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem))
    )
    UPDATE packages p
    SET impact = ps.new_impact
    FROM percentile_scores ps
    WHERE p.id = ps.id
      -- Skip rows whose score did not move, to avoid no-op writes.
      AND p.impact IS DISTINCT FROM ps.new_impact;

    GET DIAGNOSTICS n_scored = ROW_COUNT;

    -- ── Step 2: rank + flag ────────────────────────────────────────────────────
    -- ROW_NUMBER() (ties broken by id) gives every row a unique rank, so exactly
    -- top-N rows per ecosystem are flagged. Only rows with a purl are ranked.
    WITH ranked AS (
        SELECT
            id,
            ecosystem,
            ROW_NUMBER() OVER (
                PARTITION BY ecosystem
                ORDER BY impact DESC NULLS LAST, id
            ) AS r
        FROM packages
        WHERE purl IS NOT NULL
          AND ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem))
    ),
    flagged AS (
        SELECT
            id,
            r,
            -- A JSON null budget makes the comparison NULL; treat that as "not critical".
            COALESCE(
                r <= (critical_top_n_by_ecosystem ->> ecosystem)::int,
                FALSE
            ) AS new_is_critical
        FROM ranked
    )
    UPDATE packages p
    SET rank_in_ecosystem = f.r,
        is_critical       = f.new_is_critical
    FROM flagged f
    WHERE p.id = f.id
      AND (
            p.rank_in_ecosystem IS DISTINCT FROM f.r
         OR p.is_critical       IS DISTINCT FROM f.new_is_critical
      );

    GET DIAGNOSTICS n_ranked = ROW_COUNT;

    -- ── Step 2.5: spotlight overrides ─────────────────────────────────────────
    -- Force is_critical = TRUE for every package listed in
    -- package_criticality_spotlight, matched on (ecosystem, namespace, name) with
    -- a NULL-safe namespace comparison. This step is not restricted to the JSONB
    -- ecosystems. Step 2 may have just cleared the flag on a spotlighted row that
    -- falls outside the top-N; this step sets it back.
    --
    -- IS DISTINCT FROM TRUE (rather than "= FALSE") also catches rows whose
    -- is_critical is NULL, e.g. rows Step 2 never touched; "= FALSE" evaluates
    -- to NULL for those and would silently leave them unflagged.
    UPDATE packages p
    SET is_critical = TRUE
    FROM package_criticality_spotlight s
    WHERE p.ecosystem = s.ecosystem
      AND (p.namespace IS NOT DISTINCT FROM s.namespace)
      AND p.name = s.name
      AND p.is_critical IS DISTINCT FROM TRUE;

    -- ── Step 3: stamp last_rank_pass_at unconditionally ───────────────────────
    -- Schema contract: must be updated on every pass (not only when scores change)
    -- so stale-detection queries (last_rank_pass_at < NOW() - INTERVAL '8 days') work.
    UPDATE packages
    SET last_rank_pass_at = NOW()
    WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem));

    RETURN QUERY SELECT n_scored, n_ranked;
END;
$$;
165+
166+
-- ── 4. Drop packages_universe ─────────────────────────────────────────────────
167+
-- No FK constraints reference this table (npm_package_universe_state and
168+
-- downloads_last_30d use purl text, not FK).
169+
170+
-- Remove the retired workspace table. IF EXISTS matches the guarded
-- ADD COLUMN IF NOT EXISTS / DROP FUNCTION IF EXISTS statements used elsewhere
-- in this migration, so this statement does not error when the table is
-- already gone.
DROP TABLE IF EXISTS packages_universe;

0 commit comments

Comments
 (0)