From 9a724ed1c3861530f98bfacb618cc0b358148e29 Mon Sep 17 00:00:00 2001 From: Kiichi Iwashita Date: Sun, 10 May 2026 00:55:10 +0900 Subject: [PATCH 1/3] fix(infra): inject Garage secrets via env, roll back postgres to 17 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two compose-time bugs surfaced when first running `make compose-up`: 1. Garage refused to start with `Invalid RPC secret key`. The `rpc_secret = "REPLACE_ME_AT_BOOTSTRAP_via_env"` placeholder in garage.toml was being read literally — no env override was wired. 2. Postgres 18 changed the on-disk layout (PR docker-library/postgres#1259): it now refuses to mount at /var/lib/postgresql/data and demands a parent-dir mount with versioned subdirs. Three of our four services crashed in lockstep ("PostgreSQL data in /var/lib/postgresql/data (unused mount/volume)"). Fixes: - garage.toml: drop `rpc_secret` / `admin_token` / `metrics_token` lines. Garage reads `GARAGE_RPC_SECRET` etc. from the environment at start. - docker-compose.yml: add `environment:` block on the garage service that pulls GARAGE_{RPC_SECRET,ADMIN_TOKEN} from the host env, with `${VAR:?...}` validation so a missing op-run wrap fails loudly instead of silently using empty values. - docker-compose.yml: revert postgres 18-alpine → 17-alpine (digest pinned). Postgres 17 is supported through 2029; the 18 path-shape change is better solved in a future PR with proper PGDATA + parent-mount layout. - Makefile: `compose-up` and `compose-up-streaming` now go through `$(OP_RUN)` (and depend on `env-check`), so secrets are always injected. Verified locally: postgres / temporal-db / lakekeeper-db / temporal / temporal-ui all healthy. Garage starts cleanly (cluster-layout init is a follow-up). Lakekeeper DB-migration step is also a follow-up (Phase 2). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Makefile | 13 ++++++++----- docker/docker-compose.yml | 13 ++++++++++--- docker/garage/garage.toml | 9 ++++----- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 7a402d7..79c4434 100644 --- a/Makefile +++ b/Makefile @@ -69,15 +69,18 @@ env-check: ## Verify env wiring (1Password item or .env.local) fi # ---------------------------------------------------------------------- -# Docker compose (no secrets needed for compose itself; secrets via .env if any) +# Docker compose # ---------------------------------------------------------------------- +# `compose up` needs GARAGE_RPC_SECRET, GARAGE_ADMIN_TOKEN, POSTGRES_PASSWORD +# from the env (Garage refuses to start without a 32-byte rpc_secret). +# We wrap with $(OP_RUN) so 1Password (or .env.local) supplies them. .PHONY: compose-up -compose-up: ## Start the platform stack - cd $(REPO_ROOT)/docker && docker compose up -d +compose-up: env-check ## Start the platform stack + cd $(REPO_ROOT)/docker && $(OP_RUN) docker compose up -d .PHONY: compose-up-streaming -compose-up-streaming: ## Start the platform stack + Kafka - cd $(REPO_ROOT)/docker && docker compose -f docker-compose.yml -f docker-compose.streaming.yml up -d +compose-up-streaming: env-check ## Start the platform stack + Kafka + cd $(REPO_ROOT)/docker && $(OP_RUN) docker compose -f docker-compose.yml -f docker-compose.streaming.yml up -d .PHONY: compose-down compose-down: ## Stop the platform stack diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 294de61..24e029b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -44,6 +44,13 @@ services: ports: - "3900:3900" # S3 API - "3903:3903" # admin API + environment: + # Secrets injected from 1Password via `op run --env-file=.env`. + # Garage reads any config field from env when prefixed with GARAGE_. 
+ # See: https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/ + GARAGE_RPC_SECRET: "${GARAGE_RPC_SECRET:?GARAGE_RPC_SECRET must be set (use op run --env-file=.env -- ...)}" + GARAGE_ADMIN_TOKEN: "${GARAGE_ADMIN_TOKEN:?GARAGE_ADMIN_TOKEN must be set}" + GARAGE_METRICS_TOKEN: "${GARAGE_ADMIN_TOKEN:?reuse admin token for metrics}" volumes: - garage-meta:/var/lib/garage/meta - garage-data:/var/lib/garage/data @@ -59,7 +66,7 @@ services: # Lakekeeper — Iceberg REST catalog (Apache 2.0, Rust) # ---------------------------------------------------------------- lakekeeper-db: - image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7 + image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609 environment: POSTGRES_USER: lakekeeper POSTGRES_PASSWORD: lakekeeper @@ -97,7 +104,7 @@ services: # Postgres — operational mart (reverse-ETL target, app DB) # ---------------------------------------------------------------- postgres: - image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7 + image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609 ports: - "5432:5432" environment: @@ -117,7 +124,7 @@ services: # Temporal — orchestration # ---------------------------------------------------------------- temporal-db: - image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7 + image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609 environment: POSTGRES_USER: temporal POSTGRES_PASSWORD: temporal diff --git a/docker/garage/garage.toml b/docker/garage/garage.toml index f9e8013..c88b4fb 100644 --- a/docker/garage/garage.toml +++ b/docker/garage/garage.toml @@ -6,11 +6,12 @@ db_engine = "lmdb" replication_factor = 1 consistency_mode = "consistent" -# RPC secret — overridden via env at first boot. 
-# Generate with: openssl rand -hex 32 +# Secrets (rpc_secret, admin_token, metrics_token) are injected via +# environment variables — see docker-compose.yml `garage.environment` +# block. The values come from 1Password (`op run --env-file=.env`). +# Reference: https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/ rpc_bind_addr = "[::]:3901" rpc_public_addr = "127.0.0.1:3901" -rpc_secret = "REPLACE_ME_AT_BOOTSTRAP_via_env" [s3_api] s3_region = "garage" @@ -24,5 +25,3 @@ index = "index.html" [admin] api_bind_addr = "[::]:3903" -admin_token = "REPLACE_ME_AT_BOOTSTRAP_via_env" -metrics_token = "REPLACE_ME_AT_BOOTSTRAP_via_env" From 39d56770b2776cce5c9758e08c4b0845fa96dc54 Mon Sep 17 00:00:00 2001 From: Kiichi Iwashita Date: Sun, 10 May 2026 01:26:18 +0900 Subject: [PATCH 2/3] feat(phase-1): garage-init, dbt install scaffold, end-to-end fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of the project (minimal end-to-end pipeline) now runs cleanly from `make compose-up` through `make phase1` to a live Streamlit dashboard at http://localhost:8501. Real GitHub data lands in Garage S3, flows through DuckDB bronze→silver→gold, and is visualised. Major additions --------------- - scripts/garage-init.sh — idempotent layout + bucket + key import. Fixes the "no role assigned / Quorum not available" state Garage ships in. Pulls GARAGE_S3_ACCESS_KEY/SECRET_KEY from env (op run). - Makefile: `make garage-init`, `make dbt-install`, and `phase1` target chains compose-up → garage-init → ingest → dbt → dashboard. - transform/requirements.txt: pin dbt-core 1.10 + dbt-duckdb 1.10. - transform/.venv setup via `make dbt-install` (separate from ingestion's venv to keep dependency surfaces clean). Pipeline fixes discovered while running for real ------------------------------------------------ 1. 
profiles.yml.example: add `s3_region: garage` so DuckDB's S3 client stops sending `ap-northeast-1` to Garage and getting AuthorizationHeaderMalformed back. 2. read_parquet(union_by_name=true) on all three bronze stg models — needed because anthropics/claude-code has license=null and pyarrow inferred its `license_spdx` column as INTEGER while every other repo wrote VARCHAR. Combined with `try_cast(license_spdx as varchar)` on the projection. 3. Bronze stg_* models materialized as `table` (not `view`). Working around DuckDB v1.5 binder INTERNAL Error: when bronze is a view and silver does `select from {{ ref(stg_*) }} qualify row_number() over (... order by fetched_at desc) = 1`, the binder errors with "Failed to bind column reference '': inequal types (TIMESTAMP != VARCHAR)". Reproduced the bug in isolation; the triggers are (a) source = subquery/view, (b) qualify with TIMESTAMP ordering, (c) ≥2 TIMESTAMP columns in projection. Persisting bronze sidesteps the inline-binder path. 4. Silver fct_*/dim_repos rewritten with `qualify` (replaces the `with ranked as (select *, row_number() ...)` pattern, which also triggered the same binder bug even with table-bronze). 5. fct_commits: replace `cardinality(parents)` with `len(parents)`. `parents` is `VARCHAR[]` (a list); DuckDB's `cardinality()` is for MAPs. `len()` is the canonical list/array length. Verified end-to-end run ----------------------- - Bronze: 5 + 7,833 + 390 rows ingested from 5 OSS repos - Silver: 5 + 7,366 + 467 + 390 rows - Gold: repo_health_snapshot 5 rows, repo_daily_metrics populated - dbt: PASS=28 ERROR=0 SKIP=0 (3 bronze + 4 silver + 2 gold + 19 tests) - Streamlit: HTTP 200 on :8501 reading gold tables The .gitignore now also covers `.local/`, used for personal phase trackers. See `.local/phase-1/{plan,status,log}.md` for the running notes (gitignored). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 5 + Makefile | 30 ++++- scripts/garage-init.sh | 109 ++++++++++++++++++ transform/dbt_project.yml | 6 +- transform/models/bronze/stg_commit.sql | 8 +- transform/models/bronze/stg_issue_or_pr.sql | 8 +- transform/models/bronze/stg_repo_metadata.sql | 14 ++- transform/models/silver/dim_repos.sql | 17 +-- transform/models/silver/fct_commits.sql | 16 +-- transform/models/silver/fct_issues.sql | 16 +-- transform/models/silver/fct_pull_requests.sql | 15 +-- transform/models/sources.yml | 46 ++------ transform/package-lock.yml | 11 ++ transform/profiles.yml.example | 1 + transform/requirements.txt | 9 ++ 15 files changed, 211 insertions(+), 100 deletions(-) create mode 100755 scripts/garage-init.sh create mode 100644 transform/package-lock.yml create mode 100644 transform/requirements.txt diff --git a/.gitignore b/.gitignore index 66d2edb..2f655a7 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,8 @@ logs/ sbom.json sbom.spdx.json trivy-results.sarif + +# ======================================== +# Personal/runtime workspace (never committed) +# ======================================== +.local/ diff --git a/Makefile b/Makefile index 79c4434..899adba 100644 --- a/Makefile +++ b/Makefile @@ -94,6 +94,13 @@ compose-ps: ## Show running containers compose-logs: ## Tail logs cd $(REPO_ROOT)/docker && docker compose logs -f +# ---------------------------------------------------------------------- +# Garage cluster init (one-time per fresh stack) +# ---------------------------------------------------------------------- +.PHONY: garage-init +garage-init: env-check ## Assign layout, create bucket, import keys, grant access + $(OP_RUN) bash $(REPO_ROOT)/scripts/garage-init.sh + # ---------------------------------------------------------------------- # Python ingestion # ---------------------------------------------------------------------- @@ -118,19 +125,30 @@ ingest-test: ## Run the ingestor unit tests (no 
secrets needed) cd $(REPO_ROOT)/ingestion/python && .venv/bin/pytest # ---------------------------------------------------------------------- -# dbt +# dbt — uses its own venv at transform/.venv # ---------------------------------------------------------------------- +DBT_VENV := $(REPO_ROOT)/transform/.venv +DBT := $(DBT_VENV)/bin/dbt +DBT_PROFILES_DIR := $(REPO_ROOT)/transform + +.PHONY: dbt-install +dbt-install: ## Create transform/.venv and install dbt-core + dbt-duckdb + $(PY) -m venv $(DBT_VENV) + $(DBT_VENV)/bin/pip install -r $(REPO_ROOT)/transform/requirements.txt + @test -f $(DBT_PROFILES_DIR)/profiles.yml || cp $(REPO_ROOT)/transform/profiles.yml.example $(DBT_PROFILES_DIR)/profiles.yml + @echo "dbt ready: $(DBT)" + .PHONY: dbt-deps -dbt-deps: ## Install dbt packages - cd $(REPO_ROOT)/transform && dbt deps +dbt-deps: ## Install dbt packages from packages.yml + cd $(REPO_ROOT)/transform && DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) deps .PHONY: dbt-build dbt-build: env-check ## Run dbt build end-to-end - cd $(REPO_ROOT)/transform && $(OP_RUN) dbt build + cd $(REPO_ROOT)/transform && $(OP_RUN) bash -c 'DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) build' .PHONY: dbt-debug dbt-debug: env-check ## Validate dbt connection - cd $(REPO_ROOT)/transform && $(OP_RUN) dbt debug + cd $(REPO_ROOT)/transform && $(OP_RUN) bash -c 'DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) debug' # ---------------------------------------------------------------------- # Streamlit @@ -160,7 +178,7 @@ precommit: ## Run all pre-commit hooks against everything # End-to-end # ---------------------------------------------------------------------- .PHONY: phase1 -phase1: compose-up ingest-bootstrap ingest dbt-deps dbt-build dashboard ## Run the full Phase 1 pipeline +phase1: compose-up garage-init ingest-install ingest-bootstrap ingest dbt-install dbt-deps dbt-build dashboard ## Run the full Phase 1 pipeline # ---------------------------------------------------------------------- # 1Password 
helpers diff --git a/scripts/garage-init.sh b/scripts/garage-init.sh new file mode 100755 index 0000000..f2de2f3 --- /dev/null +++ b/scripts/garage-init.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash +# Initialize the single-node Garage cluster: +# 1. Assign role to the local node (zone, capacity) +# 2. Apply staged layout +# 3. Create the `bronze` bucket +# 4. Import the access keys from env (resolved by `op run`) +# 5. Grant the imported key read+write on `bronze` +# +# Idempotent — safe to re-run after a stack restart. Each step short-circuits +# if the desired state already exists. +# +# Required env (resolved via `op run --env-file=.env`): +# GARAGE_S3_ACCESS_KEY +# GARAGE_S3_SECRET_KEY +# +# Usage: +# make garage-init # via Makefile (wraps with op run automatically) +# bash scripts/garage-init.sh # if env vars are already exported + +set -euo pipefail + +CONTAINER="${GARAGE_CONTAINER:-de-lab-garage-1}" +ZONE="${GARAGE_ZONE:-dc1}" +CAPACITY="${GARAGE_CAPACITY:-10G}" +TAG="${GARAGE_TAG:-local}" +BUCKET="${GARAGE_BUCKET:-bronze}" +KEY_NAME="${GARAGE_KEY_NAME:-de-lab}" + +color() { printf "\033[%sm%s\033[0m" "$1" "$2"; } +info() { echo "$(color 36 "[garage-init]") $*"; } +warn() { echo "$(color 33 "[garage-init]") $*" >&2; } +err() { echo "$(color 31 "[garage-init]") $*" >&2; } + +g() { docker exec -i "$CONTAINER" /garage "$@"; } + +# ---------------------------------------------------------------------- +# 0. Pre-flight +# ---------------------------------------------------------------------- +docker inspect "$CONTAINER" > /dev/null 2>&1 || { + err "Container '$CONTAINER' is not running. Run 'make compose-up' first." + exit 1 +} + +[ -n "${GARAGE_S3_ACCESS_KEY:-}" ] || { err "GARAGE_S3_ACCESS_KEY missing — run via 'make garage-init' (op run)"; exit 1; } +[ -n "${GARAGE_S3_SECRET_KEY:-}" ] || { err "GARAGE_S3_SECRET_KEY missing — run via 'make garage-init' (op run)"; exit 1; } + +# Wait for daemon to accept commands +info "Waiting for Garage daemon..." 
+for _ in $(seq 1 30); do + if g status > /dev/null 2>&1; then + break + fi + sleep 1 +done +g status > /dev/null 2>&1 || { err "Garage daemon did not respond within 30s"; exit 1; } + +# ---------------------------------------------------------------------- +# 1. Layout: assign role + apply (idempotent) +# ---------------------------------------------------------------------- +status_out=$(g status 2>&1) +node_id=$(echo "$status_out" | awk '/HEALTHY NODES/{flag=1; next} flag && NF && $1 != "ID" {print $1; exit}') +[ -n "$node_id" ] || { err "Could not parse node ID from 'garage status'"; echo "$status_out" >&2; exit 1; } + +if echo "$status_out" | grep -q "NO ROLE ASSIGNED"; then + info "Assigning role to node $node_id (zone=$ZONE, cap=$CAPACITY)..." + g layout assign "$node_id" -z "$ZONE" -c "$CAPACITY" -t "$TAG" + info "Applying staged layout (version 1)..." + g layout apply --version 1 +else + info "Node $node_id already has role assigned — skipping layout." +fi + +# ---------------------------------------------------------------------- +# 2. Bucket (idempotent) +# ---------------------------------------------------------------------- +if g bucket list 2>/dev/null | awk 'NR>2 {print $1}' | grep -qx "$BUCKET"; then + info "Bucket '$BUCKET' already exists — skipping." +else + info "Creating bucket '$BUCKET'..." + g bucket create "$BUCKET" +fi + +# ---------------------------------------------------------------------- +# 3. Key import (idempotent) +# ---------------------------------------------------------------------- +if g key list 2>/dev/null | awk 'NR>2 {print $2}' | grep -qx "$KEY_NAME"; then + info "Key '$KEY_NAME' already exists — skipping import." +else + info "Importing access keys (id from env, name=$KEY_NAME)..." + g key import --yes -n "$KEY_NAME" "$GARAGE_S3_ACCESS_KEY" "$GARAGE_S3_SECRET_KEY" +fi + +# ---------------------------------------------------------------------- +# 4. 
Grant the key access to the bucket (idempotent: garage allow is OK to re-run) +# ---------------------------------------------------------------------- +info "Granting read+write on '$BUCKET' to key '$KEY_NAME'..." +g bucket allow --read --write --owner "$BUCKET" --key "$KEY_NAME" + +# ---------------------------------------------------------------------- +# 5. Verify +# ---------------------------------------------------------------------- +info "Verification:" +g bucket info "$BUCKET" 2>&1 | sed 's/^/ /' + +echo "" +echo "$(color 32 "[garage-init] Done.")" +echo " Bucket : $BUCKET" +echo " Key name: $KEY_NAME" +echo " S3 endpoint (host): http://localhost:3900" diff --git a/transform/dbt_project.yml b/transform/dbt_project.yml index f61e0be..de674da 100644 --- a/transform/dbt_project.yml +++ b/transform/dbt_project.yml @@ -18,7 +18,11 @@ clean-targets: models: de_lab: bronze: - +materialized: view + # Materialize bronze as table (not view) to dodge DuckDB v1.5 binder + # bug: view → silver `qualify` with multiple TIMESTAMP columns triggers + # "INTERNAL Error: TIMESTAMP != VARCHAR". Persisted bronze has stable + # column types and avoids the inline-view binder path. 
+ +materialized: table +schema: bronze silver: +materialized: table diff --git a/transform/models/bronze/stg_commit.sql b/transform/models/bronze/stg_commit.sql index 288684f..b028bf9 100644 --- a/transform/models/bronze/stg_commit.sql +++ b/transform/models/bronze/stg_commit.sql @@ -1,4 +1,4 @@ -{{ config(materialized='view') }} +{{ config(materialized='table') }} select repo_full_name, @@ -13,4 +13,8 @@ select parents, cast(fetched_at as timestamp) as fetched_at, raw_payload -from {{ source('bronze', 'commit') }} +from read_parquet( + 's3://bronze/commit/**/*.parquet', + hive_partitioning = true, + union_by_name = true +) diff --git a/transform/models/bronze/stg_issue_or_pr.sql b/transform/models/bronze/stg_issue_or_pr.sql index 01fb50a..ed8e34a 100644 --- a/transform/models/bronze/stg_issue_or_pr.sql +++ b/transform/models/bronze/stg_issue_or_pr.sql @@ -1,4 +1,4 @@ -{{ config(materialized='view') }} +{{ config(materialized='table') }} -- Bronze passthrough for issues + PRs (GitHub treats them via the same endpoint). -- Splitting happens in silver. @@ -18,4 +18,8 @@ select cast(closed_at as timestamp) as closed_at, cast(fetched_at as timestamp) as fetched_at, raw_payload -from {{ source('bronze', 'issue_or_pr') }} +from read_parquet( + 's3://bronze/issue_or_pr/**/*.parquet', + hive_partitioning = true, + union_by_name = true +) diff --git a/transform/models/bronze/stg_repo_metadata.sql b/transform/models/bronze/stg_repo_metadata.sql index 10120c6..65fa817 100644 --- a/transform/models/bronze/stg_repo_metadata.sql +++ b/transform/models/bronze/stg_repo_metadata.sql @@ -1,7 +1,9 @@ -{{ config(materialized='view') }} +{{ config(materialized='table') }} -- Bronze passthrough for repository headline numbers. --- Just type cast and rename for downstream consumption. +-- Read directly from Garage S3 via DuckDB httpfs + parquet. +-- (sources.yml documents the contract; dbt-duckdb's source resolution +-- doesn't auto-handle Hive-partitioned globs, so we read explicitly.) 
select full_name as repo_full_name, name as repo_name, @@ -15,7 +17,7 @@ select network_count, default_branch, language, - license_spdx, + try_cast(license_spdx as varchar) as license_spdx, archived, disabled, fork, @@ -24,4 +26,8 @@ select cast(pushed_at as timestamp) as pushed_at, cast(fetched_at as timestamp) as fetched_at, raw_payload -from {{ source('bronze', 'repo_metadata') }} +from read_parquet( + 's3://bronze/repo_metadata/**/*.parquet', + hive_partitioning = true, + union_by_name = true +) diff --git a/transform/models/silver/dim_repos.sql b/transform/models/silver/dim_repos.sql index b0f468a..09c3e29 100644 --- a/transform/models/silver/dim_repos.sql +++ b/transform/models/silver/dim_repos.sql @@ -2,15 +2,10 @@ -- Latest snapshot per repo. Bronze can have multiple snapshots over time; -- silver collapses to "current truth" by fetched_at. -with ranked as ( - select - *, - row_number() over ( - partition by repo_full_name - order by fetched_at desc - ) as rn - from {{ ref('stg_repo_metadata') }} -) +-- +-- We use `qualify` rather than `with ranked as (select *, row_number() ...)` +-- because `select *` against an external-parquet view confuses DuckDB's +-- window-function binder (INTERNAL Error: TIMESTAMP != VARCHAR). 
select repo_full_name, repo_name, @@ -30,5 +25,5 @@ select updated_at as repo_updated_at, pushed_at as last_pushed_at, fetched_at as snapshot_at -from ranked -where rn = 1 +from {{ ref('stg_repo_metadata') }} +qualify row_number() over (partition by repo_full_name order by fetched_at desc) = 1 diff --git a/transform/models/silver/fct_commits.sql b/transform/models/silver/fct_commits.sql index fae4a78..15e7735 100644 --- a/transform/models/silver/fct_commits.sql +++ b/transform/models/silver/fct_commits.sql @@ -1,14 +1,5 @@ {{ config(materialized='table') }} -with deduped as ( - select - *, - row_number() over ( - partition by repo_full_name, sha - order by fetched_at desc - ) as rn - from {{ ref('stg_commit') }} -) select repo_full_name, sha, @@ -19,7 +10,8 @@ select committer_login, committer_email, committed_date, - cardinality(parents) as parent_count, + -- DuckDB v1.5: cardinality() is for MAPs; use len() for LISTs + len(parents) as parent_count, fetched_at -from deduped -where rn = 1 +from {{ ref('stg_commit') }} +qualify row_number() over (partition by repo_full_name, sha order by fetched_at desc) = 1 diff --git a/transform/models/silver/fct_issues.sql b/transform/models/silver/fct_issues.sql index 7a28ebe..4b51884 100644 --- a/transform/models/silver/fct_issues.sql +++ b/transform/models/silver/fct_issues.sql @@ -1,16 +1,7 @@ {{ config(materialized='table') }} -- Issues only. PRs are split off into fct_pull_requests. -with deduped as ( - select - *, - row_number() over ( - partition by repo_full_name, number - order by fetched_at desc - ) as rn - from {{ ref('stg_issue_or_pr') }} - where is_pull_request = false -) +-- Use `qualify` to avoid DuckDB's `select * + row_number()` binder bug. 
select repo_full_name, number as issue_number, @@ -25,5 +16,6 @@ select updated_at, closed_at, fetched_at -from deduped -where rn = 1 +from {{ ref('stg_issue_or_pr') }} +where is_pull_request = false +qualify row_number() over (partition by repo_full_name, number order by fetched_at desc) = 1 diff --git a/transform/models/silver/fct_pull_requests.sql b/transform/models/silver/fct_pull_requests.sql index 390f02f..42ce0dc 100644 --- a/transform/models/silver/fct_pull_requests.sql +++ b/transform/models/silver/fct_pull_requests.sql @@ -1,15 +1,5 @@ {{ config(materialized='table') }} -with deduped as ( - select - *, - row_number() over ( - partition by repo_full_name, number - order by fetched_at desc - ) as rn - from {{ ref('stg_issue_or_pr') }} - where is_pull_request = true -) select repo_full_name, number as pr_number, @@ -23,5 +13,6 @@ select updated_at, closed_at, fetched_at -from deduped -where rn = 1 +from {{ ref('stg_issue_or_pr') }} +where is_pull_request = true +qualify row_number() over (partition by repo_full_name, number order by fetched_at desc) = 1 diff --git a/transform/models/sources.yml b/transform/models/sources.yml index f51bdfe..e8e1ac7 100644 --- a/transform/models/sources.yml +++ b/transform/models/sources.yml @@ -1,9 +1,13 @@ version: 2 -# External Bronze sources written by ingestion/python/. -# These are materialized as DuckDB views over Garage S3 parquet partitions. +# Documentation-only: these sources describe the contract of what the +# Python ingestor writes to Garage S3. The bronze stg_* models read +# directly via DuckDB's `read_parquet(...)` since dbt-duckdb's source +# resolution doesn't auto-handle Hive-partitioned globs. # -# Phase 2 will replace these with Iceberg tables managed by Lakekeeper. +# Phase 2 will replace these external parquet sources with Iceberg tables +# managed by Lakekeeper, at which point real `source()` references become +# usable end-to-end. 
sources: - name: bronze @@ -11,45 +15,11 @@ sources: meta: origin: ingestion/python/ partitioned_by: [event_type, owner, repo, year, month, day] + external_location_template: "s3://bronze/{event_type}/{owner}/{repo}/year={Y}/month={M}/day={D}/*.parquet" tables: - name: repo_metadata description: "Snapshot of repository headline numbers." - external: - # Replaced at compile-time by env_var or vars. - # See profiles.yml.example: extensions=[httpfs, parquet] settings=[s3_*] - location: "s3://bronze/repo_metadata/**/*.parquet" - columns: - - name: full_name - description: "owner/repo" - tests: [not_null] - - name: stargazers_count - tests: [not_null] - - name: fetched_at - tests: [not_null] - - name: issue_or_pr description: "Issues and pull requests (PR is a kind of issue on GitHub)." - external: - location: "s3://bronze/issue_or_pr/**/*.parquet" - columns: - - name: repo_full_name - tests: [not_null] - - name: number - tests: [not_null] - - name: state - tests: - - not_null - - accepted_values: - values: [open, closed] - - name: is_pull_request - tests: [not_null] - - name: commit description: "Commits on the default branch." 
- external: - location: "s3://bronze/commit/**/*.parquet" - columns: - - name: sha - tests: [not_null] - - name: repo_full_name - tests: [not_null] diff --git a/transform/package-lock.yml b/transform/package-lock.yml new file mode 100644 index 0000000..2299784 --- /dev/null +++ b/transform/package-lock.yml @@ -0,0 +1,11 @@ +packages: + - name: dbt_utils + package: dbt-labs/dbt_utils + version: 1.3.3 + - name: dbt_expectations + package: calogica/dbt_expectations + version: 0.10.4 + - name: dbt_date + package: calogica/dbt_date + version: 0.10.1 +sha1_hash: 7e6826471cf21d684924eabf28ee66f0d233f735 diff --git a/transform/profiles.yml.example b/transform/profiles.yml.example index 6e8a893..2292630 100644 --- a/transform/profiles.yml.example +++ b/transform/profiles.yml.example @@ -10,6 +10,7 @@ de_lab: - iceberg settings: s3_endpoint: localhost:3900 + s3_region: "{{ env_var('GARAGE_S3_REGION', 'garage') }}" s3_access_key_id: "{{ env_var('GARAGE_S3_ACCESS_KEY') }}" s3_secret_access_key: "{{ env_var('GARAGE_S3_SECRET_KEY') }}" s3_use_ssl: false diff --git a/transform/requirements.txt b/transform/requirements.txt new file mode 100644 index 0000000..23499b4 --- /dev/null +++ b/transform/requirements.txt @@ -0,0 +1,9 @@ +# dbt runtime for the transform/ project. +# Pinned to dbt-core 1.10 (Phase 4 will switch to dbt Fusion). +# See ADR-0005 for the migration strategy. + +dbt-core>=1.10,<1.11 +dbt-duckdb>=1.10,<1.11 + +# Data quality (replaces Great Expectations — see ADR / SECURITY.md) +# soda-core-duckdb>=3.5 # Phase 4 — keep commented for now to keep deps lean From 803546ef59810a823d1813cf1a3bb0647511628b Mon Sep 17 00:00:00 2001 From: Kiichi Iwashita Date: Sun, 10 May 2026 01:30:01 +0900 Subject: [PATCH 3/3] fix(dbt): override generate_schema_name to drop main_ prefix dbt-duckdb's default generate_schema_name macro produces <target_schema>_<custom_schema>, so a model declared as `+schema: gold` lands at `main_gold.repo_health_snapshot` rather than `gold.repo_health_snapshot`. 
The Streamlit dashboard (and any downstream SQL example) references the cleaner `gold.*` form, so the dashboard fell into its CatalogException-caught fallback ("Gold models not yet materialized"). Override the macro to use the custom schema verbatim, matching what's declared in dbt_project.yml. Now bronze/silver/gold tables live in the schemas of those names, and the dashboard renders correctly. The case for keeping the prefix is multi-target schema isolation (dev_gold vs prod_gold sharing one DuckDB file), which we don't currently need; we have a single dev profile. Co-Authored-By: Claude Opus 4.7 (1M context) --- transform/macros/generate_schema_name.sql | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 transform/macros/generate_schema_name.sql diff --git a/transform/macros/generate_schema_name.sql b/transform/macros/generate_schema_name.sql new file mode 100644 index 0000000..2dd6fa5 --- /dev/null +++ b/transform/macros/generate_schema_name.sql @@ -0,0 +1,19 @@ +{# + Override dbt's default schema-naming behaviour. + + Default: when a model has `+schema: bronze`, dbt creates the table at + <target_schema>_<custom_schema> → main_bronze + + Override: use the custom schema name verbatim, so models declared as + `+schema: bronze` land in `bronze.*`. This matches what Streamlit and + case-study SQL examples reference. + + See https://docs.getdbt.com/docs/build/custom-schemas +#} +{% macro generate_schema_name(custom_schema_name, node) -%} + {%- if custom_schema_name is none -%} + {{ target.schema }} + {%- else -%} + {{ custom_schema_name | trim }} + {%- endif -%} +{%- endmacro %}