Skip to content

Commit b23c298

Browse files
joanreyeroclaude
andcommitted
feat: dockerhub-sync worker for repo_docker pull counts (CM-1213)
Standalone loop worker (modeled on github-repos-enricher) that: - discovers Docker images for GitHub repos via Dockerfile-gated <owner>/<name> probing on hub.docker.com/v2 - refreshes pull/star counts daily into repo_docker - snapshots lifetime pull_count into repo_docker_pulls_daily for delta-at-query-time daily granularity Schema (V1779710880 edited in place, pre-prod): - repos.docker_checked_at + partial index for discovery backlog - repo_docker_pulls_daily partitioned by date (pg_partman, mirrors downloads_daily) - repo_docker_stale_idx on last_synced_at Tested against a 1000-repo random sample from prod public.repositories: 2.6% hit rate on Hub; 87% of repos have no Dockerfile; ghcr.io is the dominant registry for the remainder. CI-workflow parsing and ghcr/quay probes scoped as follow-ups. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Joan Reyero <joan@reyero.io>
1 parent af6a460 commit b23c298

14 files changed

Lines changed: 843 additions & 0 deletions

File tree

backend/.env.dist.local

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,10 @@ OSV_ECOSYSTEMS=npm,Maven
194194
OSV_TMP_DIR=/tmp/osv
195195
OSV_BATCH_SIZE=500
196196
OSV_DERIVE_BATCH_SIZE=1000
197+
198+
# dockerhub-sync (see services/apps/packages_worker/src/dockerhub/)
199+
DOCKERHUB_API_BASE_URL=https://hub.docker.com/v2
200+
DOCKERHUB_BATCH_SIZE=100
201+
DOCKERHUB_REFRESH_INTERVAL_HOURS=24
202+
DOCKERHUB_DISCOVERY_INTERVAL_DAYS=14
203+
DOCKERHUB_IDLE_SLEEP_SEC=60

backend/src/osspckgs/migrations/V1779710880__initial_schema.sql

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,10 @@ CREATE TABLE repos (
507507
-- Scorecard aggregate; per-check detail in repo_scorecard_checks
508508
scorecard_score numeric(3, 1),
509509
scorecard_last_run_at timestamptz,
510+
-- Last time dockerhub-sync probed this repo for a published Docker image (Dockerfile
511+
-- detection + Hub candidate lookup). NULL = never checked. Separate from last_synced_at
512+
-- because discovery cadence (weeks) differs from light-metadata refresh cadence (daily).
513+
docker_checked_at timestamptz,
510514
-- Nullable with no default: multiple enrichers (deps.dev, GitHub worker, Scorecard) each write
511515
-- different columns at different times. NOT NULL DEFAULT would stamp a "synced" timestamp on
512516
-- first insert even when most columns are still NULL, making freshness checks misleading.
@@ -521,6 +525,13 @@ CREATE INDEX ON repos (scorecard_score)
521525
WHERE
522526
scorecard_score IS NOT NULL;
523527

528+
-- Partial index for the dockerhub-sync discovery backlog query: pages repos that have
529+
-- never been probed for a Docker image. Once docker_checked_at is set the row drops out
530+
-- of the index, so this stays small even as the repos table grows.
531+
CREATE INDEX repos_docker_pending_idx ON repos (id)
532+
WHERE
533+
host = 'github' AND docker_checked_at IS NULL;
534+
524535
-- OpenSSF Scorecard per-check detail (~18 named checks)
525536
CREATE TABLE repo_scorecard_checks (
526537
id bigserial PRIMARY KEY,
@@ -547,6 +558,30 @@ CREATE INDEX ON repo_docker (repo_id)
547558
WHERE
548559
repo_id IS NOT NULL;
549560

561+
-- Supports the dockerhub-sync refresh query (WHERE last_synced_at < NOW() - interval).
562+
CREATE INDEX repo_docker_stale_idx ON repo_docker (last_synced_at);
563+
564+
-- ============================================================
565+
-- REPO DOCKER PULLS DAILY
566+
-- One row per image per day storing the *lifetime* pull_count as returned
567+
-- by hub.docker.com/v2/repositories/<image>. Docker Hub does not expose
568+
-- per-day download counts, so daily deltas are derived at query time:
569+
-- pulls_total - LAG(pulls_total) OVER (PARTITION BY image_name ORDER BY date)
570+
-- Keyed by image_name (matches repo_docker UNIQUE) so rows survive a
571+
-- repo_docker re-discovery without an FK cascade.
572+
--
573+
-- Partitioned monthly via pg_partman — same setup as downloads_daily; add a
574+
-- partman.create_parent('public.repo_docker_pulls_daily', 'date', '1 month', 3)
575+
-- call alongside the downloads_daily registration.
576+
-- ============================================================
577+
CREATE TABLE repo_docker_pulls_daily (
578+
image_name text NOT NULL,
579+
date date NOT NULL,
580+
pulls_total bigint NOT NULL,
581+
PRIMARY KEY (image_name, date)
582+
)
583+
PARTITION BY RANGE (date);
584+
550585
-- Package → repo provenance (monorepos publish N packages from one repo)
551586
CREATE TABLE package_repos (
552587
id bigserial PRIMARY KEY,
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
version: '3.1'
2+
3+
x-env-args: &env-args
4+
DOCKER_BUILDKIT: 1
5+
NODE_ENV: docker
6+
SERVICE: dockerhub-sync
7+
SHELL: /bin/sh
8+
SUPPRESS_NO_CONFIG_WARNING: 'true'
9+
DOCKERHUB_API_BASE_URL: 'https://hub.docker.com/v2'
10+
DOCKERHUB_BATCH_SIZE: '100'
11+
DOCKERHUB_REFRESH_INTERVAL_HOURS: '24'
12+
DOCKERHUB_DISCOVERY_INTERVAL_DAYS: '14'
13+
DOCKERHUB_IDLE_SLEEP_SEC: '60'
14+
15+
services:
16+
dockerhub-sync:
17+
build:
18+
context: ../../
19+
dockerfile: ./scripts/services/docker/Dockerfile.packages-worker
20+
command: 'pnpm run start:dockerhub-sync'
21+
working_dir: /usr/crowd/app/services/apps/packages_worker
22+
env_file:
23+
- ../../backend/.env.dist.local
24+
- ../../backend/.env.dist.composed
25+
- ../../backend/.env.override.local
26+
- ../../backend/.env.override.composed
27+
environment:
28+
<<: *env-args
29+
restart: always
30+
networks:
31+
- crowd-bridge
32+
33+
dockerhub-sync-dev:
34+
build:
35+
context: ../../
36+
dockerfile: ./scripts/services/docker/Dockerfile.packages-worker
37+
command: 'pnpm run dev:dockerhub-sync'
38+
working_dir: /usr/crowd/app/services/apps/packages_worker
39+
# user: '${USER_ID}:${GROUP_ID}'
40+
env_file:
41+
- ../../backend/.env.dist.local
42+
- ../../backend/.env.dist.composed
43+
- ../../backend/.env.override.local
44+
- ../../backend/.env.override.composed
45+
environment:
46+
<<: *env-args
47+
hostname: dockerhub-sync
48+
networks:
49+
- crowd-bridge
50+
volumes:
51+
- ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src
52+
- ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src
53+
- ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src
54+
- ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src
55+
- ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src
56+
- ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src
57+
- ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src
58+
- ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src
59+
- ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src
60+
- ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src
61+
- ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src
62+
- ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src
63+
- ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src
64+
- ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src
65+
- ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src
66+
- ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src
67+
68+
networks:
69+
crowd-bridge:
70+
external: true

services/apps/packages_worker/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
"dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts",
99
"dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts",
1010
"dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts",
11+
"start:dockerhub-sync": "SERVICE=dockerhub-sync tsx src/bin/dockerhub-sync.ts",
12+
"dev:dockerhub-sync": "SERVICE=dockerhub-sync LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/dockerhub-sync.ts",
13+
"dev:dockerhub-sync:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=dockerhub-sync LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/dockerhub-sync.ts",
1114
"lint": "npx eslint --ext .ts src --max-warnings=0",
1215
"format": "npx prettier --write \"src/**/*.ts\"",
1316
"format-check": "npx prettier --check .",
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import { getServiceLogger } from '@crowd/logging'
2+
3+
import { getDockerhubConfig } from '../config'
4+
import { getPackagesDb } from '../db'
5+
import { runDockerhubLoop } from '../dockerhub'
6+
7+
const log = getServiceLogger()
8+
9+
let shuttingDown = false
10+
11+
const shutdown = async () => {
12+
if (shuttingDown) return
13+
shuttingDown = true
14+
log.info('Shutting down dockerhub-sync...')
15+
}
16+
17+
process.on('SIGINT', shutdown)
18+
process.on('SIGTERM', shutdown)
19+
20+
const main = async () => {
21+
log.info('dockerhub-sync starting...')
22+
23+
const config = getDockerhubConfig()
24+
25+
if (config.tokens.length === 0) {
26+
log.error('ENRICHER_GITHUB_TOKENS is required (comma-separated PATs)')
27+
process.exit(1)
28+
}
29+
30+
const qx = await getPackagesDb()
31+
await qx.selectOne('SELECT 1')
32+
log.info('Connected to packages-db.')
33+
34+
log.info(
35+
{ tokens: config.tokens.length, batchSize: config.batchSize, hubBaseUrl: config.hubBaseUrl },
36+
'Starting dockerhub loop',
37+
)
38+
39+
await runDockerhubLoop(qx, config, () => shuttingDown)
40+
41+
log.info('dockerhub-sync stopped.')
42+
process.exit(0)
43+
}
44+
45+
main().catch((err) => {
46+
log.error({ err }, 'dockerhub-sync fatal error')
47+
process.exit(1)
48+
})

services/apps/packages_worker/src/config.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,21 @@ export function getEnricherConfig() {
3232
idleSleepSec: requireEnvInt('ENRICHER_IDLE_SLEEP_SEC'),
3333
}
3434
}
35+
36+
export function getDockerhubConfig() {
37+
const rawTokens = process.env.ENRICHER_GITHUB_TOKENS ?? ''
38+
const tokens = rawTokens
39+
.split(',')
40+
.map((t) => t.trim())
41+
.filter(Boolean)
42+
43+
return {
44+
// GitHub PATs reused from the enricher for Dockerfile detection (GraphQL).
45+
tokens,
46+
hubBaseUrl: requireEnv('DOCKERHUB_API_BASE_URL'),
47+
batchSize: requireEnvInt('DOCKERHUB_BATCH_SIZE'),
48+
refreshIntervalHours: requireEnvInt('DOCKERHUB_REFRESH_INTERVAL_HOURS'),
49+
discoveryIntervalDays: requireEnvInt('DOCKERHUB_DISCOVERY_INTERVAL_DAYS'),
50+
idleSleepSec: requireEnvInt('DOCKERHUB_IDLE_SLEEP_SEC'),
51+
}
52+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import { describe, expect, it } from 'vitest'
2+
3+
import { buildCandidates } from '../candidates'
4+
5+
describe('buildCandidates', () => {
6+
it('lowercases owner and repo into a single <owner>/<repo> candidate', () => {
7+
expect(buildCandidates('Grafana', 'Grafana')).toEqual(['grafana/grafana'])
8+
})
9+
10+
it('passes through already-valid lowercase slugs', () => {
11+
expect(buildCandidates('prometheus', 'node_exporter')).toEqual(['prometheus/node_exporter'])
12+
})
13+
14+
it('rejects components with characters Docker Hub does not accept', () => {
15+
// GitHub allows '+' in org names via renames; Hub would 400.
16+
expect(buildCandidates('foo+bar', 'baz')).toEqual([])
17+
})
18+
19+
it('rejects components that start or end with a separator', () => {
20+
expect(buildCandidates('-leading', 'repo')).toEqual([])
21+
expect(buildCandidates('owner', 'trailing.')).toEqual([])
22+
})
23+
24+
it('does not emit a library/<repo> candidate', () => {
25+
// Guard against accidental reintroduction — see comment in candidates.ts.
26+
expect(buildCandidates('nodejs', 'node')).toEqual(['nodejs/node'])
27+
})
28+
})
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import { afterEach, describe, expect, it, vi } from 'vitest'
2+
3+
import { fetchDockerhub } from '../fetchDockerhub'
4+
import { FetchError } from '../types'
5+
6+
const BASE = 'https://hub.docker.com/v2'
7+
8+
function mockFetch(status: number, body: unknown, headers: Record<string, string> = {}) {
9+
return vi.spyOn(globalThis, 'fetch').mockResolvedValue(
10+
new Response(typeof body === 'string' ? body : JSON.stringify(body), {
11+
status,
12+
headers: { 'Content-Type': 'application/json', ...headers },
13+
}),
14+
)
15+
}
16+
17+
afterEach(() => {
18+
vi.restoreAllMocks()
19+
})
20+
21+
describe('fetchDockerhub', () => {
22+
it('returns pull/star counts on 200', async () => {
23+
mockFetch(200, {
24+
name: 'grafana',
25+
namespace: 'grafana',
26+
pull_count: 12345,
27+
star_count: 678,
28+
last_updated: '2026-05-01T00:00:00Z',
29+
})
30+
31+
const r = await fetchDockerhub(BASE, 'grafana/grafana')
32+
expect(r).toEqual({
33+
imageName: 'grafana/grafana',
34+
pulls: 12345,
35+
stars: 678,
36+
lastUpdated: '2026-05-01T00:00:00Z',
37+
})
38+
})
39+
40+
it('appends trailing slash to the request URL', async () => {
41+
const spy = mockFetch(200, { pull_count: 1, star_count: 0 })
42+
await fetchDockerhub(BASE, 'a/b')
43+
expect(spy).toHaveBeenCalledWith(`${BASE}/repositories/a/b/`, expect.anything())
44+
})
45+
46+
it('classifies 404 as NOT_FOUND', async () => {
47+
mockFetch(404, { message: 'object not found' })
48+
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'NOT_FOUND' })
49+
})
50+
51+
it('classifies 400 as NOT_FOUND (Hub 400s on malformed slugs)', async () => {
52+
mockFetch(400, { message: 'bad request' })
53+
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'NOT_FOUND' })
54+
})
55+
56+
it('classifies 429 as RATE_LIMIT with resetAt from header', async () => {
57+
const resetSec = Math.floor(Date.now() / 1000) + 120
58+
mockFetch(429, { message: 'too many' }, { 'x-ratelimit-reset': String(resetSec) })
59+
60+
expect.assertions(3)
61+
try {
62+
await fetchDockerhub(BASE, 'a/b')
63+
} catch (err) {
64+
expect(err).toBeInstanceOf(FetchError)
65+
const fe = err as FetchError
66+
expect(fe.kind).toBe('RATE_LIMIT')
67+
expect(fe.resetAt).toBeGreaterThan(Date.now())
68+
}
69+
})
70+
71+
it('classifies x-ratelimit-remaining: 0 as RATE_LIMIT even on 200', async () => {
72+
mockFetch(200, { pull_count: 1 }, { 'x-ratelimit-remaining': '0' })
73+
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'RATE_LIMIT' })
74+
})
75+
76+
it('classifies 5xx as TRANSIENT', async () => {
77+
mockFetch(503, 'Service Unavailable')
78+
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'TRANSIENT' })
79+
})
80+
81+
it('classifies network failure as TRANSIENT', async () => {
82+
vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNRESET'))
83+
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'TRANSIENT' })
84+
})
85+
86+
it('classifies non-JSON 200 body as MALFORMED', async () => {
87+
mockFetch(200, '<html>not json</html>')
88+
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'MALFORMED' })
89+
})
90+
91+
it('classifies missing pull_count as MALFORMED', async () => {
92+
mockFetch(200, { name: 'x', star_count: 0 })
93+
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'MALFORMED' })
94+
})
95+
})
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Docker Hub repository slugs are lowercase and limited to [a-z0-9._-].
2+
// GitHub allows uppercase and a few characters Hub rejects, so we lowercase
3+
// and validate before probing — anything that fails the regex would 400 on
4+
// Hub anyway and isn't worth an HTTP round-trip.
5+
const HUB_COMPONENT = /^[a-z0-9](?:[a-z0-9._-]*[a-z0-9])?$/
6+
7+
export function buildCandidates(owner: string, name: string): string[] {
8+
const ns = owner.toLowerCase()
9+
const repo = name.toLowerCase()
10+
11+
if (!HUB_COMPONENT.test(ns) || !HUB_COMPONENT.test(repo)) {
12+
return []
13+
}
14+
15+
// v1 deliberately omits `library/<repo>`: a random github.com/foo/node with a
16+
// dev Dockerfile would false-positive onto the official library/node image.
17+
// Official images (~150) to be seeded via allowlist in a follow-up.
18+
return [`${ns}/${repo}`]
19+
}

0 commit comments

Comments
 (0)