From 9a724ed1c3861530f98bfacb618cc0b358148e29 Mon Sep 17 00:00:00 2001
From: Kiichi Iwashita <kiichi.iwashita@info-box.jp>
Date: Sun, 10 May 2026 00:55:10 +0900
Subject: [PATCH 1/3] fix(infra): inject Garage secrets via env, roll back
 postgres to 17
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two compose-time bugs surfaced when first running `make compose-up`:

1. Garage refused to start with `Invalid RPC secret key`. The
   `rpc_secret = "REPLACE_ME_AT_BOOTSTRAP_via_env"` placeholder in
   garage.toml was being read literally — no env override was wired.

2. The official postgres:18 Docker image changed its data-directory layout
   (PR docker-library/postgres#1259): it now refuses a volume mounted at
   /var/lib/postgresql/data and demands a parent-dir mount with versioned
   subdirs. Three of our four services crashed in lockstep ("PostgreSQL
   data in /var/lib/postgresql/data (unused mount/volume)").

Fixes:
- garage.toml: drop `rpc_secret` / `admin_token` / `metrics_token` lines.
  Garage reads `GARAGE_RPC_SECRET` etc. from the environment at start.
- docker-compose.yml: add `environment:` block on the garage service that
  pulls GARAGE_{RPC_SECRET,ADMIN_TOKEN} from the host env (the admin token
  is also reused as GARAGE_METRICS_TOKEN), with `${VAR:?...}` validation so
  a missing op-run wrap fails loudly instead of silently using empty values.
- docker-compose.yml: revert postgres 18-alpine → 17-alpine (digest pinned).
  Postgres 17 is supported through 2029; the 18 path-shape change is
  better solved in a future PR with proper PGDATA + parent-mount layout.
- Makefile: `compose-up` and `compose-up-streaming` now go through
  `$(OP_RUN)` (and depend on `env-check`), so secrets are always injected.

Verified locally: postgres / temporal-db / lakekeeper-db / temporal /
temporal-ui all healthy. Garage starts cleanly (cluster-layout init is a
follow-up). Lakekeeper DB-migration step is also a follow-up (Phase 2).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Makefile                  | 13 ++++++++-----
 docker/docker-compose.yml | 13 ++++++++++---
 docker/garage/garage.toml |  9 ++++-----
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 7a402d7..79c4434 100644
--- a/Makefile
+++ b/Makefile
@@ -69,15 +69,18 @@ env-check: ## Verify env wiring (1Password item or .env.local)
 	fi
 
 # ----------------------------------------------------------------------
-# Docker compose (no secrets needed for compose itself; secrets via .env if any)
+# Docker compose
 # ----------------------------------------------------------------------
+# `compose up` needs GARAGE_RPC_SECRET, GARAGE_ADMIN_TOKEN, POSTGRES_PASSWORD
+# from the env (Garage refuses to start without a 32-byte rpc_secret).
+# We wrap with $(OP_RUN) so 1Password (or .env.local) supplies them.
 .PHONY: compose-up
-compose-up: ## Start the platform stack
-	cd $(REPO_ROOT)/docker && docker compose up -d
+compose-up: env-check ## Start the platform stack
+	cd $(REPO_ROOT)/docker && $(OP_RUN) docker compose up -d
 
 .PHONY: compose-up-streaming
-compose-up-streaming: ## Start the platform stack + Kafka
-	cd $(REPO_ROOT)/docker && docker compose -f docker-compose.yml -f docker-compose.streaming.yml up -d
+compose-up-streaming: env-check ## Start the platform stack + Kafka
+	cd $(REPO_ROOT)/docker && $(OP_RUN) docker compose -f docker-compose.yml -f docker-compose.streaming.yml up -d
 
 .PHONY: compose-down
 compose-down: ## Stop the platform stack
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 294de61..24e029b 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -44,6 +44,13 @@ services:
     ports:
       - "3900:3900"  # S3 API
       - "3903:3903"  # admin API
+    environment:
+      # Secrets injected from 1Password via `op run --env-file=.env`; the metrics token reuses the admin token.
+      # Garage reads these secrets (rpc_secret, admin_token, metrics_token) from env; other fields stay in garage.toml.
+      # See: https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/
+      GARAGE_RPC_SECRET: "${GARAGE_RPC_SECRET:?GARAGE_RPC_SECRET must be set (use op run --env-file=.env -- ...)}"
+      GARAGE_ADMIN_TOKEN: "${GARAGE_ADMIN_TOKEN:?GARAGE_ADMIN_TOKEN must be set}"
+      GARAGE_METRICS_TOKEN: "${GARAGE_ADMIN_TOKEN:?GARAGE_ADMIN_TOKEN must be set (also reused as the metrics token)}"
     volumes:
       - garage-meta:/var/lib/garage/meta
       - garage-data:/var/lib/garage/data
@@ -59,7 +66,7 @@ services:
   # Lakekeeper — Iceberg REST catalog (Apache 2.0, Rust)
   # ----------------------------------------------------------------
   lakekeeper-db:
-    image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7
+    image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609
     environment:
       POSTGRES_USER: lakekeeper
       POSTGRES_PASSWORD: lakekeeper
@@ -97,7 +104,7 @@ services:
   # Postgres — operational mart (reverse-ETL target, app DB)
   # ----------------------------------------------------------------
   postgres:
-    image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7
+    image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609
     ports:
       - "5432:5432"
     environment:
@@ -117,7 +124,7 @@ services:
   # Temporal — orchestration
   # ----------------------------------------------------------------
   temporal-db:
-    image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7
+    image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609
     environment:
       POSTGRES_USER: temporal
       POSTGRES_PASSWORD: temporal
diff --git a/docker/garage/garage.toml b/docker/garage/garage.toml
index f9e8013..c88b4fb 100644
--- a/docker/garage/garage.toml
+++ b/docker/garage/garage.toml
@@ -6,11 +6,12 @@ db_engine = "lmdb"
 replication_factor = 1
 consistency_mode = "consistent"
 
-# RPC secret — overridden via env at first boot.
-# Generate with: openssl rand -hex 32
+# Secrets (rpc_secret, admin_token, metrics_token) are injected via
+# environment variables — see docker-compose.yml `garage.environment`
+# block. The values come from 1Password (`op run --env-file=.env`).
+# Reference: https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/
 rpc_bind_addr = "[::]:3901"
 rpc_public_addr = "127.0.0.1:3901"
-rpc_secret = "REPLACE_ME_AT_BOOTSTRAP_via_env"
 
 [s3_api]
 s3_region = "garage"
@@ -24,5 +25,3 @@ index = "index.html"
 
 [admin]
 api_bind_addr = "[::]:3903"
-admin_token = "REPLACE_ME_AT_BOOTSTRAP_via_env"
-metrics_token = "REPLACE_ME_AT_BOOTSTRAP_via_env"

From 39d56770b2776cce5c9758e08c4b0845fa96dc54 Mon Sep 17 00:00:00 2001
From: Kiichi Iwashita <kiichi.iwashita@info-box.jp>
Date: Sun, 10 May 2026 01:26:18 +0900
Subject: [PATCH 2/3] feat(phase-1): garage-init, dbt install scaffold,
 end-to-end fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 of the project (minimal end-to-end pipeline) now runs cleanly
from `make compose-up` through `make phase1` to a live Streamlit
dashboard at http://localhost:8501. Real GitHub data lands in Garage S3,
flows through DuckDB bronze→silver→gold, and is visualised.

Major additions
---------------
- scripts/garage-init.sh — idempotent layout + bucket + key import.
  Fixes the "no role assigned / Quorum not available" state Garage
  ships in. Pulls GARAGE_S3_ACCESS_KEY/SECRET_KEY from env (op run).
- Makefile: `make garage-init`, `make dbt-install`, and `phase1` target
  chains compose-up → garage-init → ingest → dbt → dashboard.
- transform/requirements.txt: pin dbt-core 1.10 + dbt-duckdb 1.10.
- transform/.venv setup via `make dbt-install` (separate from
  ingestion's venv to keep dependency surfaces clean).

Pipeline fixes discovered while running for real
------------------------------------------------
1. profiles.yml.example: add `s3_region: garage` so DuckDB's S3 client
   stops sending `ap-northeast-1` to Garage and getting
   AuthorizationHeaderMalformed back.
2. read_parquet(union_by_name=true) on all three bronze stg models —
   needed because anthropics/claude-code has license=null and pyarrow
   inferred its `license_spdx` column as INTEGER while every other repo
   wrote VARCHAR. Combined with `try_cast(license_spdx as varchar)` on
   the projection.
3. Bronze stg_* models materialized as `table` (not `view`). Working
   around DuckDB v1.5 binder INTERNAL Error: when bronze is a view and
   silver does `select <many cols incl. timestamps> from {{ ref(stg_*) }}
   qualify row_number() over (... order by fetched_at desc) = 1`, the
   binder errors with "Failed to bind column reference '': inequal
   types (TIMESTAMP != VARCHAR)". Reproduced the bug in isolation; the
   triggers are (a) source = subquery/view, (b) qualify with TIMESTAMP
   ordering, (c) ≥2 TIMESTAMP columns in projection. Persisting bronze
   sidesteps the inline-binder path.
4. Silver fct_*/dim_repos rewritten with `qualify` (replaces the
   `with ranked as (select *, row_number() ...)` pattern, which also
   triggered the same binder bug even with table-bronze).
5. fct_commits: replace `cardinality(parents)` with `len(parents)`.
   `parents` is `VARCHAR[]` (a list); DuckDB's `cardinality()` is for
   MAPs. `len()` is the canonical list/array length.

Verified end-to-end run
-----------------------
- Bronze: 5 + 7,833 + 390 rows ingested from 5 OSS repos
- Silver: 5 + 7,366 + 467 + 390 rows
- Gold:   repo_health_snapshot 5 rows, repo_daily_metrics populated
- dbt:    PASS=28 ERROR=0 SKIP=0 (3 bronze + 4 silver + 2 gold + 19 tests)
- Streamlit: HTTP 200 on :8501 reading gold tables

The .gitignore now also covers `.local/`, used for personal phase
trackers. See `.local/phase-1/{plan,status,log}.md` for the running
notes (gitignored).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore                                    |   5 +
 Makefile                                      |  30 ++++-
 scripts/garage-init.sh                        | 109 ++++++++++++++++++
 transform/dbt_project.yml                     |   6 +-
 transform/models/bronze/stg_commit.sql        |   8 +-
 transform/models/bronze/stg_issue_or_pr.sql   |   8 +-
 transform/models/bronze/stg_repo_metadata.sql |  14 ++-
 transform/models/silver/dim_repos.sql         |  17 +--
 transform/models/silver/fct_commits.sql       |  16 +--
 transform/models/silver/fct_issues.sql        |  16 +--
 transform/models/silver/fct_pull_requests.sql |  15 +--
 transform/models/sources.yml                  |  46 ++------
 transform/package-lock.yml                    |  11 ++
 transform/profiles.yml.example                |   1 +
 transform/requirements.txt                    |   9 ++
 15 files changed, 211 insertions(+), 100 deletions(-)
 create mode 100755 scripts/garage-init.sh
 create mode 100644 transform/package-lock.yml
 create mode 100644 transform/requirements.txt

diff --git a/.gitignore b/.gitignore
index 66d2edb..2f655a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,8 @@ logs/
 sbom.json
 sbom.spdx.json
 trivy-results.sarif
+
+# ========================================
+# Personal/runtime workspace (never committed)
+# ========================================
+.local/
diff --git a/Makefile b/Makefile
index 79c4434..899adba 100644
--- a/Makefile
+++ b/Makefile
@@ -94,6 +94,13 @@ compose-ps: ## Show running containers
 compose-logs: ## Tail logs
 	cd $(REPO_ROOT)/docker && docker compose logs -f
 
+# ----------------------------------------------------------------------
+# Garage cluster init (one-time per fresh stack)
+# ----------------------------------------------------------------------
+.PHONY: garage-init
+garage-init: env-check ## Assign layout, create bucket, import keys, grant access
+	$(OP_RUN) bash $(REPO_ROOT)/scripts/garage-init.sh
+
 # ----------------------------------------------------------------------
 # Python ingestion
 # ----------------------------------------------------------------------
@@ -118,19 +125,30 @@ ingest-test: ## Run the ingestor unit tests (no secrets needed)
 	cd $(REPO_ROOT)/ingestion/python && .venv/bin/pytest
 
 # ----------------------------------------------------------------------
-# dbt
+# dbt — uses its own venv at transform/.venv
 # ----------------------------------------------------------------------
+DBT_VENV := $(REPO_ROOT)/transform/.venv
+DBT := $(DBT_VENV)/bin/dbt
+DBT_PROFILES_DIR := $(REPO_ROOT)/transform
+
+.PHONY: dbt-install
+dbt-install: ## Create transform/.venv and install dbt-core + dbt-duckdb
+	$(PY) -m venv $(DBT_VENV)
+	$(DBT_VENV)/bin/pip install -r $(REPO_ROOT)/transform/requirements.txt
+	@test -f $(DBT_PROFILES_DIR)/profiles.yml || cp $(REPO_ROOT)/transform/profiles.yml.example $(DBT_PROFILES_DIR)/profiles.yml
+	@echo "dbt ready: $(DBT)"
+
 .PHONY: dbt-deps
-dbt-deps: ## Install dbt packages
-	cd $(REPO_ROOT)/transform && dbt deps
+dbt-deps: ## Install dbt packages from packages.yml
+	cd $(REPO_ROOT)/transform && DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) deps
 
 .PHONY: dbt-build
 dbt-build: env-check ## Run dbt build end-to-end
-	cd $(REPO_ROOT)/transform && $(OP_RUN) dbt build
+	cd $(REPO_ROOT)/transform && $(OP_RUN) bash -c 'DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) build'
 
 .PHONY: dbt-debug
 dbt-debug: env-check ## Validate dbt connection
-	cd $(REPO_ROOT)/transform && $(OP_RUN) dbt debug
+	cd $(REPO_ROOT)/transform && $(OP_RUN) bash -c 'DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) debug'
 
 # ----------------------------------------------------------------------
 # Streamlit
@@ -160,7 +178,19 @@ precommit: ## Run all pre-commit hooks against everything
 # End-to-end
 # ----------------------------------------------------------------------
 .PHONY: phase1
-phase1: compose-up ingest-bootstrap ingest dbt-deps dbt-build dashboard ## Run the full Phase 1 pipeline
+# Each step runs as a sequential sub-make rather than as a prerequisite:
+# the steps must execute in exactly this order, and prerequisite order is
+# not guaranteed when make runs with -j.
+phase1: ## Run the full Phase 1 pipeline
+	$(MAKE) compose-up
+	$(MAKE) garage-init
+	$(MAKE) ingest-install
+	$(MAKE) ingest-bootstrap
+	$(MAKE) ingest
+	$(MAKE) dbt-install
+	$(MAKE) dbt-deps
+	$(MAKE) dbt-build
+	$(MAKE) dashboard
 
 # ----------------------------------------------------------------------
 # 1Password helpers
diff --git a/scripts/garage-init.sh b/scripts/garage-init.sh
new file mode 100755
index 0000000..f2de2f3
--- /dev/null
+++ b/scripts/garage-init.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+# Initialize the single-node Garage cluster:
+#   1. Assign role to the local node (zone, capacity)
+#   2. Apply staged layout
+#   3. Create the `bronze` bucket
+#   4. Import the access keys from env (resolved by `op run`)
+#   5. Grant the imported key read+write+owner on `bronze`
+#
+# Idempotent — safe to re-run after a stack restart. Each step short-circuits
+# if the desired state already exists.
+#
+# Required env (resolved via `op run --env-file=.env`):
+#   GARAGE_S3_ACCESS_KEY
+#   GARAGE_S3_SECRET_KEY
+#
+# Usage:
+#   make garage-init        # via Makefile (wraps with op run automatically)
+#   bash scripts/garage-init.sh  # if env vars are already exported
+
+set -euo pipefail
+
+CONTAINER="${GARAGE_CONTAINER:-de-lab-garage-1}"
+ZONE="${GARAGE_ZONE:-dc1}"
+CAPACITY="${GARAGE_CAPACITY:-10G}"
+TAG="${GARAGE_TAG:-local}"
+BUCKET="${GARAGE_BUCKET:-bronze}"
+KEY_NAME="${GARAGE_KEY_NAME:-de-lab}"
+
+color() { printf "\033[%sm%s\033[0m" "$1" "$2"; }
+info()  { echo "$(color 36 "[garage-init]") $*"; }
+warn()  { echo "$(color 33 "[garage-init]") $*" >&2; }
+err()   { echo "$(color 31 "[garage-init]") $*" >&2; }
+
+g() { docker exec -i "$CONTAINER" /garage "$@"; }
+
+# ----------------------------------------------------------------------
+# 0. Pre-flight — container must be *running*; a bare `docker inspect` also succeeds for stopped ones
+# ----------------------------------------------------------------------
+[ "$(docker inspect -f '{{.State.Running}}' "$CONTAINER" 2>/dev/null)" = "true" ] || {
+  err "Container '$CONTAINER' is not running. Run 'make compose-up' first."
+  exit 1
+}
+
+[ -n "${GARAGE_S3_ACCESS_KEY:-}" ] || { err "GARAGE_S3_ACCESS_KEY missing — run via 'make garage-init' (op run)"; exit 1; }
+[ -n "${GARAGE_S3_SECRET_KEY:-}" ] || { err "GARAGE_S3_SECRET_KEY missing — run via 'make garage-init' (op run)"; exit 1; }
+
+# Wait for daemon to accept commands
+info "Waiting for Garage daemon..."
+for _ in $(seq 1 30); do
+  if g status > /dev/null 2>&1; then
+    break
+  fi
+  sleep 1
+done
+g status > /dev/null 2>&1 || { err "Garage daemon did not respond within 30s"; exit 1; }
+
+# ----------------------------------------------------------------------
+# 1. Layout: assign role + apply (idempotent)
+# ----------------------------------------------------------------------
+status_out=$(g status 2>&1)
+node_id=$(echo "$status_out" | awk '/HEALTHY NODES/{flag=1; next} flag && NF && $1 != "ID" {print $1; exit}')
+[ -n "$node_id" ] || { err "Could not parse node ID from 'garage status'"; echo "$status_out" >&2; exit 1; }
+
+if echo "$status_out" | grep -q "NO ROLE ASSIGNED"; then
+  info "Assigning role to node $node_id (zone=$ZONE, cap=$CAPACITY)..."
+  g layout assign "$node_id" -z "$ZONE" -c "$CAPACITY" -t "$TAG"
+  info "Applying staged layout (version 1)..."
+  g layout apply --version 1
+else
+  info "Node $node_id already has role assigned — skipping layout."
+fi
+
+# ----------------------------------------------------------------------
+# 2. Bucket (idempotent)
+# ----------------------------------------------------------------------
+if g bucket list 2>/dev/null | awk 'NR>2 {print $1}' | grep -qx "$BUCKET"; then
+  info "Bucket '$BUCKET' already exists — skipping."
+else
+  info "Creating bucket '$BUCKET'..."
+  g bucket create "$BUCKET"
+fi
+
+# ----------------------------------------------------------------------
+# 3. Key import (idempotent)
+# ----------------------------------------------------------------------
+if g key list 2>/dev/null | awk 'NR>2 {print $2}' | grep -qx "$KEY_NAME"; then
+  info "Key '$KEY_NAME' already exists — skipping import."
+else
+  info "Importing access keys (id from env, name=$KEY_NAME)..."
+  g key import --yes -n "$KEY_NAME" "$GARAGE_S3_ACCESS_KEY" "$GARAGE_S3_SECRET_KEY"
+fi
+
+# ----------------------------------------------------------------------
+# 4. Grant the key read+write+owner on the bucket (idempotent: garage allow is OK to re-run)
+# ----------------------------------------------------------------------
+info "Granting read+write+owner on '$BUCKET' to key '$KEY_NAME'..."
+g bucket allow --read --write --owner "$BUCKET" --key "$KEY_NAME"
+
+# ----------------------------------------------------------------------
+# 5. Verify
+# ----------------------------------------------------------------------
+info "Verification:"
+g bucket info "$BUCKET" 2>&1 | sed 's/^/  /'
+
+echo ""
+echo "$(color 32 "[garage-init] Done.")"
+echo "  Bucket  : $BUCKET"
+echo "  Key name: $KEY_NAME"
+echo "  S3 endpoint (host): http://localhost:3900"
diff --git a/transform/dbt_project.yml b/transform/dbt_project.yml
index f61e0be..de674da 100644
--- a/transform/dbt_project.yml
+++ b/transform/dbt_project.yml
@@ -18,7 +18,11 @@ clean-targets:
 models:
   de_lab:
     bronze:
-      +materialized: view
+      # Materialize bronze as table (not view) to dodge DuckDB v1.5 binder
+      # bug: view → silver `qualify` with multiple TIMESTAMP columns triggers
+      # "INTERNAL Error: TIMESTAMP != VARCHAR". Persisted bronze has stable
+      # column types and avoids the inline-view binder path.
+      +materialized: table
       +schema: bronze
     silver:
       +materialized: table
diff --git a/transform/models/bronze/stg_commit.sql b/transform/models/bronze/stg_commit.sql
index 288684f..b028bf9 100644
--- a/transform/models/bronze/stg_commit.sql
+++ b/transform/models/bronze/stg_commit.sql
@@ -1,4 +1,4 @@
-{{ config(materialized='view') }}
+{{ config(materialized='table') }}
 
 select
     repo_full_name,
@@ -13,4 +13,8 @@ select
     parents,
     cast(fetched_at as timestamp) as fetched_at,
     raw_payload
-from {{ source('bronze', 'commit') }}
+from read_parquet(
+    's3://bronze/commit/**/*.parquet',
+    hive_partitioning = true,
+    union_by_name = true
+)
diff --git a/transform/models/bronze/stg_issue_or_pr.sql b/transform/models/bronze/stg_issue_or_pr.sql
index 01fb50a..ed8e34a 100644
--- a/transform/models/bronze/stg_issue_or_pr.sql
+++ b/transform/models/bronze/stg_issue_or_pr.sql
@@ -1,4 +1,4 @@
-{{ config(materialized='view') }}
+{{ config(materialized='table') }}
 
 -- Bronze passthrough for issues + PRs (GitHub treats them via the same endpoint).
 -- Splitting happens in silver.
@@ -18,4 +18,8 @@ select
     cast(closed_at as timestamp) as closed_at,
     cast(fetched_at as timestamp) as fetched_at,
     raw_payload
-from {{ source('bronze', 'issue_or_pr') }}
+from read_parquet(
+    's3://bronze/issue_or_pr/**/*.parquet',
+    hive_partitioning = true,
+    union_by_name = true
+)
diff --git a/transform/models/bronze/stg_repo_metadata.sql b/transform/models/bronze/stg_repo_metadata.sql
index 10120c6..65fa817 100644
--- a/transform/models/bronze/stg_repo_metadata.sql
+++ b/transform/models/bronze/stg_repo_metadata.sql
@@ -1,7 +1,9 @@
-{{ config(materialized='view') }}
+{{ config(materialized='table') }}
 
 -- Bronze passthrough for repository headline numbers.
--- Just type cast and rename for downstream consumption.
+-- Read directly from Garage S3 via DuckDB httpfs + parquet.
+-- (sources.yml documents the contract; dbt-duckdb's source resolution
+-- doesn't auto-handle Hive-partitioned globs, so we read explicitly.)
 select
     full_name as repo_full_name,
     name as repo_name,
@@ -15,7 +17,7 @@ select
     network_count,
     default_branch,
     language,
-    license_spdx,
+    try_cast(license_spdx as varchar) as license_spdx,
     archived,
     disabled,
     fork,
@@ -24,4 +26,8 @@ select
     cast(pushed_at as timestamp) as pushed_at,
     cast(fetched_at as timestamp) as fetched_at,
     raw_payload
-from {{ source('bronze', 'repo_metadata') }}
+from read_parquet(
+    's3://bronze/repo_metadata/**/*.parquet',
+    hive_partitioning = true,
+    union_by_name = true
+)
diff --git a/transform/models/silver/dim_repos.sql b/transform/models/silver/dim_repos.sql
index b0f468a..09c3e29 100644
--- a/transform/models/silver/dim_repos.sql
+++ b/transform/models/silver/dim_repos.sql
@@ -2,15 +2,10 @@
 
 -- Latest snapshot per repo. Bronze can have multiple snapshots over time;
 -- silver collapses to "current truth" by fetched_at.
-with ranked as (
-    select
-        *,
-        row_number() over (
-            partition by repo_full_name
-            order by fetched_at desc
-        ) as rn
-    from {{ ref('stg_repo_metadata') }}
-)
+--
+-- We use `qualify` rather than `with ranked as (select *, row_number() ...)`
+-- because the `select *` + row_number() CTE form trips a DuckDB binder bug
+-- (INTERNAL Error: TIMESTAMP != VARCHAR), even with table-materialized bronze.
 select
     repo_full_name,
     repo_name,
@@ -30,5 +25,5 @@ select
     updated_at as repo_updated_at,
     pushed_at as last_pushed_at,
     fetched_at as snapshot_at
-from ranked
-where rn = 1
+from {{ ref('stg_repo_metadata') }}
+qualify row_number() over (partition by repo_full_name order by fetched_at desc) = 1
diff --git a/transform/models/silver/fct_commits.sql b/transform/models/silver/fct_commits.sql
index fae4a78..15e7735 100644
--- a/transform/models/silver/fct_commits.sql
+++ b/transform/models/silver/fct_commits.sql
@@ -1,14 +1,5 @@
 {{ config(materialized='table') }}
 
-with deduped as (
-    select
-        *,
-        row_number() over (
-            partition by repo_full_name, sha
-            order by fetched_at desc
-        ) as rn
-    from {{ ref('stg_commit') }}
-)
 select
     repo_full_name,
     sha,
@@ -19,7 +10,8 @@ select
     committer_login,
     committer_email,
     committed_date,
-    cardinality(parents) as parent_count,
+    -- DuckDB v1.5: cardinality() is for MAPs; use len() for LISTs
+    len(parents) as parent_count,
     fetched_at
-from deduped
-where rn = 1
+from {{ ref('stg_commit') }}
+qualify row_number() over (partition by repo_full_name, sha order by fetched_at desc) = 1
diff --git a/transform/models/silver/fct_issues.sql b/transform/models/silver/fct_issues.sql
index 7a28ebe..4b51884 100644
--- a/transform/models/silver/fct_issues.sql
+++ b/transform/models/silver/fct_issues.sql
@@ -1,16 +1,7 @@
 {{ config(materialized='table') }}
 
 -- Issues only. PRs are split off into fct_pull_requests.
-with deduped as (
-    select
-        *,
-        row_number() over (
-            partition by repo_full_name, number
-            order by fetched_at desc
-        ) as rn
-    from {{ ref('stg_issue_or_pr') }}
-    where is_pull_request = false
-)
+-- Use `qualify` to avoid DuckDB's `select * + row_number()` binder bug.
 select
     repo_full_name,
     number as issue_number,
@@ -25,5 +16,6 @@ select
     updated_at,
     closed_at,
     fetched_at
-from deduped
-where rn = 1
+from {{ ref('stg_issue_or_pr') }}
+where is_pull_request = false
+qualify row_number() over (partition by repo_full_name, number order by fetched_at desc) = 1
diff --git a/transform/models/silver/fct_pull_requests.sql b/transform/models/silver/fct_pull_requests.sql
index 390f02f..42ce0dc 100644
--- a/transform/models/silver/fct_pull_requests.sql
+++ b/transform/models/silver/fct_pull_requests.sql
@@ -1,15 +1,5 @@
 {{ config(materialized='table') }}
 
-with deduped as (
-    select
-        *,
-        row_number() over (
-            partition by repo_full_name, number
-            order by fetched_at desc
-        ) as rn
-    from {{ ref('stg_issue_or_pr') }}
-    where is_pull_request = true
-)
 select
     repo_full_name,
     number as pr_number,
@@ -23,5 +13,6 @@ select
     updated_at,
     closed_at,
     fetched_at
-from deduped
-where rn = 1
+from {{ ref('stg_issue_or_pr') }}
+where is_pull_request = true
+qualify row_number() over (partition by repo_full_name, number order by fetched_at desc) = 1
diff --git a/transform/models/sources.yml b/transform/models/sources.yml
index f51bdfe..e8e1ac7 100644
--- a/transform/models/sources.yml
+++ b/transform/models/sources.yml
@@ -1,9 +1,13 @@
 version: 2
 
-# External Bronze sources written by ingestion/python/.
-# These are materialized as DuckDB views over Garage S3 parquet partitions.
+# Documentation-only: these sources describe the contract of what the
+# Python ingestor writes to Garage S3. The bronze stg_* models read
+# directly via DuckDB's `read_parquet(...)` since dbt-duckdb's source
+# resolution doesn't auto-handle Hive-partitioned globs.
 #
-# Phase 2 will replace these with Iceberg tables managed by Lakekeeper.
+# Phase 2 will replace these external parquet sources with Iceberg tables
+# managed by Lakekeeper, at which point real `source()` references become
+# usable end-to-end.
 
 sources:
   - name: bronze
@@ -11,45 +15,11 @@ sources:
     meta:
       origin: ingestion/python/
       partitioned_by: [event_type, owner, repo, year, month, day]
+      external_location_template: "s3://bronze/{event_type}/{owner}/{repo}/year={Y}/month={M}/day={D}/*.parquet"
     tables:
       - name: repo_metadata
         description: "Snapshot of repository headline numbers."
-        external:
-          # Replaced at compile-time by env_var or vars.
-          # See profiles.yml.example: extensions=[httpfs, parquet] settings=[s3_*]
-          location: "s3://bronze/repo_metadata/**/*.parquet"
-        columns:
-          - name: full_name
-            description: "owner/repo"
-            tests: [not_null]
-          - name: stargazers_count
-            tests: [not_null]
-          - name: fetched_at
-            tests: [not_null]
-
       - name: issue_or_pr
         description: "Issues and pull requests (PR is a kind of issue on GitHub)."
-        external:
-          location: "s3://bronze/issue_or_pr/**/*.parquet"
-        columns:
-          - name: repo_full_name
-            tests: [not_null]
-          - name: number
-            tests: [not_null]
-          - name: state
-            tests:
-              - not_null
-              - accepted_values:
-                  values: [open, closed]
-          - name: is_pull_request
-            tests: [not_null]
-
       - name: commit
         description: "Commits on the default branch."
-        external:
-          location: "s3://bronze/commit/**/*.parquet"
-        columns:
-          - name: sha
-            tests: [not_null]
-          - name: repo_full_name
-            tests: [not_null]
diff --git a/transform/package-lock.yml b/transform/package-lock.yml
new file mode 100644
index 0000000..2299784
--- /dev/null
+++ b/transform/package-lock.yml
@@ -0,0 +1,11 @@
+packages:
+  - name: dbt_utils
+    package: dbt-labs/dbt_utils
+    version: 1.3.3
+  - name: dbt_expectations
+    package: calogica/dbt_expectations
+    version: 0.10.4
+  - name: dbt_date
+    package: calogica/dbt_date
+    version: 0.10.1
+sha1_hash: 7e6826471cf21d684924eabf28ee66f0d233f735
diff --git a/transform/profiles.yml.example b/transform/profiles.yml.example
index 6e8a893..2292630 100644
--- a/transform/profiles.yml.example
+++ b/transform/profiles.yml.example
@@ -10,6 +10,7 @@ de_lab:
         - iceberg
       settings:
         s3_endpoint: localhost:3900
+        s3_region: "{{ env_var('GARAGE_S3_REGION', 'garage') }}"
         s3_access_key_id: "{{ env_var('GARAGE_S3_ACCESS_KEY') }}"
         s3_secret_access_key: "{{ env_var('GARAGE_S3_SECRET_KEY') }}"
         s3_use_ssl: false
diff --git a/transform/requirements.txt b/transform/requirements.txt
new file mode 100644
index 0000000..23499b4
--- /dev/null
+++ b/transform/requirements.txt
@@ -0,0 +1,9 @@
+# dbt runtime for the transform/ project.
+# Constrained to the 1.10.x series of dbt-core and dbt-duckdb (Phase 4 will switch to dbt Fusion).
+# See ADR-0005 for the migration strategy.
+
+dbt-core>=1.10,<1.11
+dbt-duckdb>=1.10,<1.11
+
+# Data quality (replaces Great Expectations — see ADR / SECURITY.md)
+# soda-core-duckdb>=3.5  # Phase 4 — keep commented for now to keep deps lean

From 803546ef59810a823d1813cf1a3bb0647511628b Mon Sep 17 00:00:00 2001
From: Kiichi Iwashita <kiichi.iwashita@info-box.jp>
Date: Sun, 10 May 2026 01:30:01 +0900
Subject: [PATCH 3/3] fix(dbt): override generate_schema_name to drop main_
 prefix

dbt's default generate_schema_name macro (used as-is with dbt-duckdb)
produces <target_schema>_<custom_schema>, so a model declared as
`+schema: gold` lands at `main_gold.repo_health_snapshot` rather than
`gold.repo_health_snapshot`. The Streamlit dashboard (and any
downstream SQL example) references the cleaner `gold.*` form, so the
dashboard fell into its CatalogException-caught fallback ("Gold models
not yet materialized").

Override the macro to use the custom schema verbatim, matching what's
declared in dbt_project.yml. Now bronze/silver/gold tables live in the
schemas of those names, and the dashboard renders correctly.

The case for keeping the prefix is multi-target schema isolation
(dev_gold vs prod_gold sharing one DuckDB file), which we don't
currently need; we have a single dev profile.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 transform/macros/generate_schema_name.sql | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 transform/macros/generate_schema_name.sql

diff --git a/transform/macros/generate_schema_name.sql b/transform/macros/generate_schema_name.sql
new file mode 100644
index 0000000..2dd6fa5
--- /dev/null
+++ b/transform/macros/generate_schema_name.sql
@@ -0,0 +1,19 @@
+{#
+    Override dbt's default schema naming, which would put a model declared
+    with `+schema: bronze` at <target_schema>_<custom_schema> (main_bronze).
+
+    Behaviour here:
+      - no custom schema  → `target.schema`
+      - custom schema set → that name verbatim, whitespace-trimmed, so
+        `+schema: bronze` lands in `bronze.*` — the form Streamlit and the
+        case-study SQL examples reference.
+    `node` is part of dbt's macro signature; this override does not read it.
+    See https://docs.getdbt.com/docs/build/custom-schemas
+#}
+{% macro generate_schema_name(custom_schema_name, node) -%}
+    {%- if custom_schema_name is none -%}
+        {{ target.schema }}
+    {%- else -%}
+        {{ custom_schema_name | trim }}
+    {%- endif -%}
+{%- endmacro %}