Skip to content

Commit 92c0b79

Browse files
epipavjoanagmaia
andauthored
feat: tracking NPM packages (#4159)
Signed-off-by: anilb <epipav@gmail.com> Signed-off-by: anil <epipav@gmail.com> Signed-off-by: Joana Maia <jmaia@contractor.linuxfoundation.org> Co-authored-by: Joana Maia <jmaia@contractor.linuxfoundation.org>
1 parent 17d1bfd commit 92c0b79

44 files changed

Lines changed: 3240 additions & 45 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

backend/.env.dist.local

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ CROWD_REDIS_PORT=6379
2222

2323
# S3 settings
2424
CROWD_S3_HOST=localhost
25-
CROWD_S3_PORT=9000
25+
CROWD_S3_PORT=9100
2626
CROWD_S3_INTEGRATION_ASSETS_BUCKET=crowd-integrations-assets
2727
CROWD_S3_MICROSERVICES_ASSETS_BUCKET=crowd-microservices-assets
2828
CROWD_S3_AWS_ACCOUNT_ID=000000000000
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
-- npm worker supporting tables and partition management for download tracking.
2+
3+
ALTER TABLE maintainers DROP COLUMN IF EXISTS email_hash; -- existing email_hash values are discarded here, not migrated
4+
ALTER TABLE maintainers ADD COLUMN IF NOT EXISTS email text; -- nullable text column; NOTE(review): looks like plain emails replace the hash — confirm privacy expectations
5+
6+
CREATE TABLE npm_worker_state ( -- key/value state rows for the npm worker, one row per name
7+
name text PRIMARY KEY, -- state key; unique per entry
8+
value text NOT NULL, -- state payload stored as text; format is writer-defined — TODO confirm
9+
updated_at timestamptz NOT NULL DEFAULT now() -- defaults to insert time; no trigger in this migration refreshes it, so writers presumably set it on update — verify
10+
);
11+
12+
CREATE TABLE npm_package_state ( -- per-package processing state for the npm worker, one row per purl
13+
purl text PRIMARY KEY, -- package identifier; presumably matches packages.purl — TODO confirm (no FK is declared here)
14+
metadata_first_scanned_at timestamptz NOT NULL DEFAULT now(), -- defaults to the row's insert time
15+
metadata_last_run_at timestamptz, -- nullable; presumably NULL until a metadata run is recorded — verify against the writer
16+
metadata_run_result jsonb, -- { status, attempts, httpStatus?, errorKind?, message? }
17+
daily_downloads_last_processed_at timestamptz, -- nullable; presumably NULL until daily downloads are first processed — verify
18+
daily_downloads_run_result jsonb -- { status, httpStatus?, errorKind?, message? }
19+
);
20+
21+
CREATE TABLE npm_package_universe_state ( -- per-package state for the 30-day download refresh, one row per purl
22+
purl text PRIMARY KEY, -- package identifier; no FK is declared in this migration
23+
downloads_30d_last_run_at timestamptz, -- breadth watermark: latest 30d window refreshed
24+
downloads_30d_history_backfilled_at timestamptz, -- depth watermark: NULL until full older history filled
25+
downloads_30d_run_result jsonb -- { status, httpStatus?, errorKind?, message? }
26+
);
27+
CREATE INDEX ON npm_package_universe_state (downloads_30d_last_run_at); -- unnamed index, so PostgreSQL generates the name; presumably supports picking least-recently refreshed rows — confirm
28+
CREATE INDEX ON npm_package_universe_state (downloads_30d_history_backfilled_at); -- unnamed index; presumably supports finding rows still awaiting history backfill — confirm
29+
30+
-- ============================================================
31+
-- pg_partman setup for downloads_daily (monthly partitions)
32+
-- ============================================================
33+
CREATE SCHEMA IF NOT EXISTS partman;
34+
CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
35+
36+
SELECT partman.create_parent( -- registers downloads_daily with pg_partman as a managed partition set
37+
p_parent_table => 'public.downloads_daily', -- schema-qualified parent; assumed to already exist as a partitioned table — TODO confirm
38+
p_control => 'date', -- column the partitions are split on
39+
p_interval => '1 month', -- one partition per month
40+
p_premake => 12 -- number of partitions pg_partman keeps created ahead of the current one
41+
);
42+
43+
-- Create all historical monthly partitions (2015-01 through last month).
-- Iterates month starts from 2015-01-01 up to and including the first day of the
-- previous month and creates one partition per month covering
-- [month start, next month start) — the upper bound of a range partition is exclusive.
-- Partitions are named downloads_daily_pYYYYMMDD (e.g. downloads_daily_p20150101);
-- IF NOT EXISTS skips names that already exist.
-- NOTE(review): "last month" is derived from now() in the session time zone — confirm that is intended.
-- NOTE(review): the name suffix is presumably chosen to match pg_partman's own child naming — verify
-- against the installed pg_partman version.
44+
DO $$
45+
DECLARE
46+
m date;
47+
BEGIN
48+
FOR m IN
49+
SELECT d::date
50+
FROM generate_series(
51+
'2015-01-01'::date,
52+
(date_trunc('month', now()) - interval '1 month')::date,
53+
'1 month'::interval
54+
) AS d
55+
LOOP
56+
EXECUTE format(
57+
'CREATE TABLE IF NOT EXISTS %I PARTITION OF downloads_daily FOR VALUES FROM (%L) TO (%L)',
58+
'downloads_daily_p' || to_char(m, 'YYYYMMDD'),
59+
m,
60+
(m + interval '1 month')::date
61+
);
62+
END LOOP;
63+
END
64+
$$;
65+
66+
-- ============================================================
67+
-- pg_partman setup for downloads_last_30d (yearly partitions)
68+
-- ============================================================
69+
SELECT partman.create_parent( -- registers downloads_last_30d with pg_partman as a managed partition set
70+
p_parent_table => 'public.downloads_last_30d', -- schema-qualified parent; assumed to already exist as a partitioned table — TODO confirm
71+
p_control => 'end_date', -- column the partitions are split on
72+
p_interval => '1 year', -- one partition per year
73+
p_premake => 3 -- number of partitions pg_partman keeps created ahead of the current one
74+
);
75+
76+
-- Create all historical yearly partitions (2015 through last year).
-- Iterates year starts from 2015-01-01 up to and including January 1st of the
-- previous year and creates one partition per year covering
-- [year start, next year start) — the upper bound of a range partition is exclusive.
-- Partitions are named downloads_last_30d_pYYYYMMDD (e.g. downloads_last_30d_p20150101);
-- IF NOT EXISTS skips names that already exist.
-- NOTE(review): "last year" is derived from now() in the session time zone — confirm that is intended.
-- NOTE(review): the name suffix is presumably chosen to match pg_partman's own child naming — verify
-- against the installed pg_partman version.
77+
DO $$
78+
DECLARE
79+
y date;
80+
BEGIN
81+
FOR y IN
82+
SELECT d::date
83+
FROM generate_series(
84+
'2015-01-01'::date,
85+
(date_trunc('year', now()) - interval '1 year')::date,
86+
'1 year'::interval
87+
) AS d
88+
LOOP
89+
EXECUTE format(
90+
'CREATE TABLE IF NOT EXISTS %I PARTITION OF downloads_last_30d FOR VALUES FROM (%L) TO (%L)',
91+
'downloads_last_30d_p' || to_char(y, 'YYYYMMDD'),
92+
y,
93+
(y + interval '1 year')::date
94+
);
95+
END LOOP;
96+
END
97+
$$;

docs/adr/0001-oss-packages-design-decisions.md

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,10 @@ Five sub-workers run concurrently (npm, Maven, OSV, GitHub, Docker Hub), all wri
226226
| `packages` | Upsert on `purl`. Each worker only writes columns it owns; ecosystem isolation means column-level conflicts cannot occur in practice. |
227227
| `packages_universe` | Incremental upsert keyed on `purl`. The deps.dev import only touches rows whose underlying deps.dev snapshot date has advanced since the previous import (initial run is a one-time full backfill). |
228228
| `versions` | Append-only via `INSERT … ON CONFLICT DO NOTHING`. Yanked/deprecated status is a separate targeted `UPDATE (is_yanked = true) WHERE …`. |
229-
| `repos` | Registry workers (npm, Maven) do **not** write directly to `repos`. They write `package_repos` rows. The GitHub enricher — triggered when `repos.last_synced_at IS NULL` — upserts `repos` with metadata. Docker Hub worker adds `docker_*` columns on top. |
229+
| `repos` | Registry workers (npm, Maven) do **not** write `repos` enrichment metadata. They INSERT a minimal `repos(url, host)` row — `url` (canonical) and `host` (coarse classification) are both derived from the declared repository URL — solely to create the FK target their `package_repos` link needs. `owner`/`name`/`stars`/`description` and all other metadata stay NULL and remain enricher-owned; existing rows are never updated by registry workers. The GitHub enricher — triggered when `repos.last_synced_at IS NULL` — upserts `repos` with metadata. Docker Hub worker adds `docker_*` columns on top. |
230230
| `package_repos` | Composite PK `(package_id, repo_url)`. Each `source` value ('declared', 'deps_dev', 'heuristic', 'manual') is a separate row — sources do not overwrite each other. |
231231
| `advisories` | Upsert on `osv_id`. OSV is the single source of truth; no other worker writes to this table. |
232-
| `maintainers` / `package_maintainers` | Upsert on `(ecosystem, username)`. Never delete — history is preserved. |
232+
| `maintainers` / `package_maintainers` | `maintainers`: upsert on `(ecosystem, username)`, never deleted — the identity history is preserved. `package_maintainers`: reflects the **current** link set — the npm worker replaces a package's links each ingest (delete + reinsert), so prior link rows are not retained. |
233233
| `downloads_daily` | Append-only time-series. Each `(package_id, date)` row is written once. npm and Maven workers own disjoint rows by ecosystem. Historical timelines are preserved — workers do not overwrite past dates. |
234234
| `downloads_last_30d` | Upsert on `(purl, end_date)`. Written by the weekly ranking worker only. The cached `packages_universe.downloads_last_30d` column must be updated in the same pass. |
235235

@@ -375,10 +375,13 @@ Local verification against the live OSV dataset (2026-05-28) showed the multi-ra
375375

376376
**Strategy**: daily delta poll via the CouchDB changes feed, not a full sync.
377377

378-
1. Call `replicate.npmjs.com/_changes?since=<last_seq>&limit=<batch>` to get package names changed in the last 24h.
379-
2. For each changed name, fetch the full document from `registry.npmjs.com/<package>`.
380-
3. Normalize into `packages`, `versions`, `maintainers`, and `package_maintainers` using the write rules above.
381-
4. Downloads (timeline): call `api.npmjs.org/downloads/range/<start>:<end>/<package>` in batches of **128 packages per call** against the critical list. Write per-day rows to `downloads_daily`. Update `packages_universe.downloads_last_30d` at the end of each pass.
378+
1. Call `replicate.npmjs.com/_changes?since=<last_seq>&limit=<batch>` to get package names changed since the last run.
379+
2. Filter changed names against the `packages` table (~700k packages). Only packages already tracked in Tier 2 are re-ingested — unknown packages in the changes feed are ignored.
380+
3. For each matching changed name, fetch the full document from `registry.npmjs.com/<package>`.
381+
4. Normalize into `packages`, `versions`, `maintainers`, and `package_maintainers` using the write rules above.
382+
5. Downloads: two Temporal workflows — `backfillDailyDownloads` (per-day rows into `downloads_daily`) and `refreshLast30dDownloads` (rolling 30-day windows into `downloads_last_30d`). Both are self-healing: they detect and fill missing windows on each run rather than assuming continuity. Both currently source packages from a static watch list. Once the deps.dev BQ import is operational, `backfillDailyDownloads` will source from `packages` (Tier 2 critical slice) and `refreshLast30dDownloads` will source from `packages_universe` (full Tier 3 population).
383+
384+
**Rolling-30-day window shape**: each `downloads_last_30d` window uses `end_date = 1st of calendar month, start_date = end_date − 30 days` — not a true calendar month. This ensures every window covers exactly 30 days, making download counts directly comparable across months for criticality scoring. Calendar months (28–31 days) would skew comparisons.
382385

383386
The `_changes` `seq` cursor is persisted between runs (in a `worker_state` row or env var) so each poll starts where the last one ended. If the cursor is lost the worker falls back to a full re-sync of the critical list.
384387

@@ -557,7 +560,7 @@ SET count = EXCLUDED.count,
557560
start_date = EXCLUDED.start_date;
558561
```
559562

560-
The weekly ranking worker must write both the `downloads_last_30d` row and the cached `packages_universe.downloads_last_30d` column in the same pass — if only one write happens, the ranking function silently uses a stale value. Code review must enforce this.
563+
The writer of a `downloads_last_30d` row must also update the cached `packages_universe.downloads_last_30d` column in the same pass when writing the latest window — if only one write happens, the ranking function silently uses a stale value. Code review must enforce this. For npm, this responsibility belongs to the npm worker's `refreshLast30dDownloads` workflow.
561564

562565
A package promoted from Tier 3 to Tier 2 (becomes critical) will have rolling-window history but no daily history; the daily timeline starts from the promotion date forward.
563566

pnpm-lock.yaml

Lines changed: 7 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/builders/packages.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
DOCKERFILE="./services/docker/Dockerfile.packages"
22
CONTEXT="../"
33
REPO="sjc.ocir.io/axbydjxa5zuh/packages"
4-
SERVICES="packages github-repos-enricher deps-dev-ingest"
4+
SERVICES="packages github-repos-enricher deps-dev-ingest npm-worker"

scripts/packages-db/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
FROM postgres:14
2+
RUN apt-get update \
3+
&& apt-get install -y --no-install-recommends postgresql-14-partman \
4+
&& rm -rf /var/lib/apt/lists/*

scripts/scaffold.yaml

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ services:
77
- POSTGRES_PASSWORD=example
88
- POSTGRES_DB=crowd-web
99
ports:
10-
- 5432:5432
10+
- '127.0.0.1:5432:5432'
1111
volumes:
1212
- pgdata-dev:/var/lib/postgresql/data
1313
- ./scaffold/sequin/postgres-docker-entrypoint-initdb.d/create-sequin-database.sql:/docker-entrypoint-initdb.d/create-sequin-database.sql
@@ -23,22 +23,24 @@ services:
2323
- POSTGRES_PASSWORD=example
2424
- POSTGRES_DB=product-db
2525
ports:
26-
- 5433:5432
26+
- '127.0.0.1:5433:5432'
2727
volumes:
2828
- pgdata-product-dev:/var/lib/postgresql/data
2929
shm_size: 1gb
3030
networks:
3131
- crowd-bridge
3232

3333
packages:
34-
image: postgres:14-alpine
34+
build:
35+
context: ./packages-db
36+
dockerfile: Dockerfile
3537
restart: unless-stopped
3638
command: -c 'max_connections=300'
3739
environment:
3840
- POSTGRES_PASSWORD=example
3941
- POSTGRES_DB=packages-db
4042
ports:
41-
- 5434:5432
43+
- '127.0.0.1:5434:5432'
4244
volumes:
4345
- pgdata-packages-dev:/var/lib/postgresql/data
4446
shm_size: 1gb
@@ -62,8 +64,8 @@ services:
6264
soft: 65536
6365
hard: 65536
6466
ports:
65-
- 9200:9200
66-
- 9600:9600
67+
- '127.0.0.1:9200:9200'
68+
- '127.0.0.1:9600:9600'
6769
volumes:
6870
- opensearch-dev:/usr/share/opensearch/data
6971
networks:
@@ -85,7 +87,7 @@ services:
8587
image: scireum/s3-ninja:8.0.0
8688
restart: unless-stopped
8789
ports:
88-
- 9000:9000
90+
- '127.0.0.1:9100:9000'
8991
volumes:
9092
- s3-dev:/home/sirius/data
9193
networks:
@@ -98,7 +100,7 @@ services:
98100
- ./scaffold/nginx/templates:/etc/nginx/templates
99101
- ./scaffold/nginx/ssl:/etc/nginx/ssl
100102
ports:
101-
- '443:443'
103+
- '127.0.0.1:443:443'
102104
environment:
103105
- NGINX_HOST=localhost
104106
- NGINX_PORT=443
@@ -115,7 +117,7 @@ services:
115117
volumes:
116118
- redis-dev:/data
117119
ports:
118-
- 6379:6379
120+
- '127.0.0.1:6379:6379'
119121
networks:
120122
- crowd-bridge
121123

@@ -133,7 +135,7 @@ services:
133135
- NANGO_SERVER_URL=http://localhost:3003
134136
- SERVER_PORT=3003
135137
ports:
136-
- '3003:3003'
138+
- '127.0.0.1:3003:3003'
137139
networks:
138140
- crowd-bridge
139141

@@ -154,9 +156,9 @@ services:
154156
- KAFKA_KRAFT_CLUSTER_ID=OTMwNzFhYTY1ODNiNGE5OT
155157
- KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true
156158
ports:
157-
- '9092:9092'
158-
- '9093:9093'
159-
- '9094:9094'
159+
- '127.0.0.1:9092:9092'
160+
- '127.0.0.1:9093:9093'
161+
- '127.0.0.1:9094:9094'
160162
networks:
161163
- crowd-bridge
162164

@@ -182,8 +184,8 @@ services:
182184
context: scaffold/temporal
183185
restart: unless-stopped
184186
ports:
185-
- '7233:7233'
186-
- '8233:8233'
187+
- '127.0.0.1:7233:7233'
188+
- '127.0.0.1:8233:8233'
187189
networks:
188190
- crowd-bridge
189191

@@ -194,8 +196,8 @@ services:
194196
environment:
195197
- COMPATIBILITY_MODE=1 # <— force Classic version
196198
ports:
197-
- '80:80'
198-
- '7181:7181'
199+
- '127.0.0.1:80:80'
200+
- '127.0.0.1:7181:7181'
199201
networks:
200202
- crowd-bridge
201203

scripts/services/npm-worker.yaml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
version: '3.1'
2+
3+
x-env-args: &env-args
4+
DOCKER_BUILDKIT: 1
5+
NODE_ENV: docker
6+
SERVICE: npm-worker
7+
CROWD_TEMPORAL_TASKQUEUE: npm-worker
8+
CROWD_TEMPORAL_NAMESPACE: ${CROWD_PACKAGES_TEMPORAL_NAMESPACE}
9+
SHELL: /bin/sh
10+
SUPPRESS_NO_CONFIG_WARNING: 'true'
11+
12+
services:
13+
npm-worker:
14+
build:
15+
context: ../../
16+
dockerfile: ./scripts/services/docker/Dockerfile.packages
17+
command: 'pnpm run start:npm-worker'
18+
working_dir: /usr/crowd/app/services/apps/packages_worker
19+
env_file:
20+
- ../../backend/.env.dist.local
21+
- ../../backend/.env.dist.composed
22+
- ../../backend/.env.override.local
23+
- ../../backend/.env.override.composed
24+
environment:
25+
<<: *env-args
26+
restart: always
27+
networks:
28+
- crowd-bridge
29+
30+
npm-worker-dev:
31+
build:
32+
context: ../../
33+
dockerfile: ./scripts/services/docker/Dockerfile.packages
34+
command: 'pnpm run dev:npm-worker'
35+
working_dir: /usr/crowd/app/services/apps/packages_worker
36+
# user: '${USER_ID}:${GROUP_ID}'
37+
env_file:
38+
- ../../backend/.env.dist.local
39+
- ../../backend/.env.dist.composed
40+
- ../../backend/.env.override.local
41+
- ../../backend/.env.override.composed
42+
environment:
43+
<<: *env-args
44+
hostname: npm-worker
45+
networks:
46+
- crowd-bridge
47+
volumes:
48+
- ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src
49+
- ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src
50+
- ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src
51+
- ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src
52+
- ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src
53+
- ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src
54+
- ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src
55+
- ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src
56+
- ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src
57+
- ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src
58+
- ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src
59+
- ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src
60+
- ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src
61+
- ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src
62+
- ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src
63+
- ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src
64+
65+
networks:
66+
crowd-bridge:
67+
external: true

0 commit comments

Comments
 (0)