Skip to content

Commit 3e8feca

Browse files
authored
chore: update downloads schema for tier 2/3 granularity (#4150)
Signed-off-by: Joana Maia <jmaia@contractor.linuxfoundation.org>
1 parent fe68bea commit 3e8feca

1 file changed

Lines changed: 79 additions & 12 deletions

File tree

backend/src/osspckgs/migrations/V1779710880__initial_schema.sql

Lines changed: 79 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@ CREATE TABLE packages_universe (
77
ecosystem text NOT NULL,
88
namespace text,
99
name text NOT NULL,
10-
downloads_30d bigint,
10+
-- Cached latest 30-day window count. Written by the same weekly ranking worker that upserts rows into
11+
-- the downloads_last_30d table (keyed by purl/end_date). This column is the denormalized latest value
12+
-- used directly by rank_packages_universe() to avoid a join; the downloads_last_30d table holds the
13+
-- full rolling-window timeline.
14+
downloads_last_30d bigint,
1115
dependent_packages_count int,
1216
dependent_repos_count int,
1317
criticality_score numeric(10, 4),
@@ -54,7 +58,6 @@ CREATE TABLE packages (
5458
latest_release_at timestamptz,
5559
dependent_packages_count int,
5660
dependent_repos_count int,
57-
downloads_last_month bigint,
5861
-- has_critical_vulnerability bool NOT NULL DEFAULT FALSE,
5962
-- Deferred: semantics undecided between (a) any advisory with no fixed_version vs
6063
-- (b) latest_version falls inside an affected semver range. Lateral join against
@@ -79,10 +82,6 @@ CREATE INDEX ON packages (ecosystem, name);
7982

8083
CREATE INDEX ON packages USING gin (keywords);
8184

82-
CREATE INDEX ON packages (downloads_last_month DESC)
83-
WHERE
84-
status = 'active';
85-
8685
-- INDEX on has_critical_vulnerability removed — column is commented out above.
8786
-- Uncomment both when semantics are decided.
8887

@@ -126,7 +125,6 @@ CREATE TABLE versions (
126125
is_yanked bool,
127126
is_prerelease bool NOT NULL DEFAULT FALSE,
128127
license text, -- SPDX where available; can differ per version
129-
download_count bigint, -- per-version where available (npm, crates)
130128
last_synced_at timestamptz NOT NULL DEFAULT NOW(),
131129
PRIMARY KEY (id, package_id),
132130
UNIQUE (package_id, number)
@@ -646,9 +644,27 @@ CREATE TABLE package_maintainers (
646644
);
647645

648646
-- ============================================================
649-
-- DOWNLOADS (time-series, partitioned by month via pg_partman)
647+
-- DOWNLOADS
648+
--
649+
-- Two tables track download volume at different tiers and granularities:
650+
--
651+
-- downloads_daily (tier 2 — packages)
652+
-- Source of truth for daily download counts. One row per package per day.
653+
-- No denormalized rollup on the packages table — consumers SUM over this
654+
-- table when they need a window (e.g. last 30 days).
655+
--
656+
-- downloads_last_30d (tier 3 — packages_universe)
657+
-- Rolling 30-day download timeline keyed by purl. Each row represents one
658+
-- 30-day window (start_date..end_date). Keyed by purl so rows survive the
659+
-- weekly truncation of packages_universe. The latest window's count is also
660+
-- cached in packages_universe.downloads_last_30d for fast access by the
661+
-- criticality-ranking function (no join needed).
662+
--
663+
-- ============================================================
664+
-- DOWNLOADS DAILY (tier 2 — packages, daily granularity)
650665
--
651-
-- pg_partman MUST be enabled in OCI config before this migration runs:
666+
-- Partitioned by month via pg_partman. pg_partman MUST be enabled in OCI
667+
-- config before this migration runs:
652668
-- OCI Console → Database → Configuration → Extensions → enable pg_partman
653669
--
654670
-- After enabling, run the setup below (once, outside Flyway or in a
@@ -676,14 +692,65 @@ CREATE TABLE package_maintainers (
676692
-- ============================================================
677693
-- Daily download counts for tier-2 packages: one row per package per day, range-partitioned on date.
CREATE TABLE downloads_daily (
678694
id bigserial,
679-
package_id bigint NOT NULL,
695+
package_id bigint NOT NULL REFERENCES packages (id), -- no ON DELETE clause, so the default NO ACTION applies. NOTE(review): requires a PRIMARY KEY/UNIQUE on packages (id) alone — confirm
680696
date date NOT NULL, -- calendar day the count applies to; also the partition key
681697
count bigint NOT NULL, -- downloads recorded for this package on this date
682698
PRIMARY KEY (id, date), -- date is included because it is the partition key
683699
UNIQUE (package_id, date) -- enforces one row per package per day
684700
)
685701
PARTITION BY RANGE (date);
686702

703+
-- ============================================================
704+
-- DOWNLOADS LAST 30D (tier 3 — packages_universe, rolling 30-day granularity)
705+
--
706+
-- Historical timeline of rolling 30-day download counts, keyed by purl.
707+
-- Each row captures one window: downloads from start_date to end_date (inclusive).
708+
-- Keyed by purl (not packages_universe.id) so rows survive the weekly
709+
-- truncation of packages_universe. The latest window is also written
710+
-- to packages_universe.downloads_last_30d column for fast access by the ranking function.
711+
--
712+
-- Writers should upsert: INSERT ... ON CONFLICT (purl, end_date) DO UPDATE SET count = EXCLUDED.count, start_date = EXCLUDED.start_date
713+
-- PK includes end_date because Postgres requires the partition key to be
714+
-- part of every PRIMARY KEY and UNIQUE constraint on a partitioned table.
715+
--
716+
-- Partitioned by month via pg_partman. pg_partman MUST be enabled in OCI
717+
-- config before this migration runs:
718+
-- OCI Console → Database → Configuration → Extensions → enable pg_partman
719+
--
720+
-- After enabling, run the setup below (once, outside Flyway or in a
721+
-- separate migration) to register pg_partman and create initial partitions:
722+
--
723+
-- CREATE SCHEMA IF NOT EXISTS partman; -- the target schema must exist before CREATE EXTENSION
-- CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
724+
--
725+
-- SELECT partman.create_parent(
726+
-- p_parent_table => 'public.downloads_last_30d',
727+
-- p_control => 'end_date',
728+
-- p_interval => '1 month',
729+
-- p_premake => 3 -- pre-creates 3 future monthly partitions
730+
-- );
731+
--
732+
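-- -- pg_cron job to maintain partitions (also needs pg_cron enabled in OCI).
-- -- run_maintenance_proc() covers every partman-managed table, so a single job is enough —
-- -- skip this one if a maintenance job was already scheduled for downloads_daily: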
733+
-- SELECT cron.schedule('partman-maintain-30d', '0 2 * * *',
734+
-- $$CALL partman.run_maintenance_proc()$$);
735+
--
736+
-- Without this setup, inserts into downloads_last_30d will fail with
737+
-- 'no partition of relation "downloads_last_30d" found for row'. The table structure below is correct;
738+
-- only the partition management setup is deferred.
739+
--
740+
-- ============================================================
741+
-- Rolling 30-day download windows keyed by purl: one row per (purl, end_date), range-partitioned on end_date.
-- NOTE(review): no CHECK relates start_date to end_date, so the 30-day span is a writer convention only.
CREATE TABLE downloads_last_30d (
742+
id bigserial, -- auto-assigned row id
743+
purl text NOT NULL, -- used instead of packages_universe.id so rows survive that table's weekly truncation
744+
start_date date NOT NULL, -- first day of the window (inclusive)
745+
end_date date NOT NULL, -- last day of the window (inclusive); also the partition key
746+
count bigint NOT NULL, -- downloads within start_date..end_date
747+
PRIMARY KEY (id, end_date), -- end_date is included because it is the partition key
748+
UNIQUE (purl, end_date) -- one row per purl per window end; conflict target for the writers' upsert
749+
)
750+
PARTITION BY RANGE (end_date);
751+
752+
-- B-tree over (purl ascending, end_date descending); presumably serves newest-window-first reads per purl.
-- NOTE(review): UNIQUE (purl, end_date) already creates a b-tree on the same columns, and a backward
-- scan of it satisfies single-purl "ORDER BY end_date DESC" lookups. Confirm this extra index is needed
-- (e.g. for a mixed-order sort such as ORDER BY purl, end_date DESC) before keeping both.
CREATE INDEX ON downloads_last_30d USING btree (purl ASC, end_date DESC);
753+
687754
-- ============================================================
688755
-- CRITICALITY RANKING FUNCTION
689756
-- ============================================================
@@ -707,7 +774,7 @@ BEGIN
707774
-- and to compress the gap between small and large values (e.g. LN(1)=0
708775
-- vs LN(2)≈0.69 gives a gentler floor than LN(0)=-∞). Typically 1.0.
709776
--
710-
-- Until the npm-registry / Maven downloads enricher runs, downloads_30d
777+
-- Until the npm-registry / Maven downloads enricher runs, downloads_last_30d
711778
-- is NULL on every row. weight_downloads contributes 0 to the score;
712779
-- ranking effectively reduces to:
713780
-- LN(1 + dependent_repos_count) * weight_dependent_repos
@@ -717,7 +784,7 @@ BEGIN
717784
WITH new_scores AS (
718785
SELECT
719786
id,
720-
( LN(log_smoothing + COALESCE(downloads_30d, 0)) * weight_downloads
787+
( LN(log_smoothing + COALESCE(downloads_last_30d, 0)) * weight_downloads
721788
+ LN(log_smoothing + COALESCE(dependent_repos_count, 0)) * weight_dependent_repos
722789
+ LN(log_smoothing + COALESCE(dependent_packages_count, 0)) * weight_dependent_packages
723790
)::numeric(10, 4) AS new_score

0 commit comments

Comments
 (0)