Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,8 @@ logs/
sbom.json
sbom.spdx.json
trivy-results.sarif

# ========================================
# Personal/runtime workspace (never committed)
# ========================================
.local/
43 changes: 32 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,18 @@ env-check: ## Verify env wiring (1Password item or .env.local)
fi

# ----------------------------------------------------------------------
# Docker compose (no secrets needed for compose itself; secrets via .env if any)
# Docker compose
# ----------------------------------------------------------------------
# `compose up` needs GARAGE_RPC_SECRET, GARAGE_ADMIN_TOKEN, POSTGRES_PASSWORD
# from the env (Garage refuses to start without a 32-byte rpc_secret).
# We wrap with $(OP_RUN) so 1Password (or .env.local) supplies them.
.PHONY: compose-up
compose-up: ## Start the platform stack
cd $(REPO_ROOT)/docker && docker compose up -d
compose-up: env-check ## Start the platform stack
cd $(REPO_ROOT)/docker && $(OP_RUN) docker compose up -d

.PHONY: compose-up-streaming
compose-up-streaming: ## Start the platform stack + Kafka
cd $(REPO_ROOT)/docker && docker compose -f docker-compose.yml -f docker-compose.streaming.yml up -d
compose-up-streaming: env-check ## Start the platform stack + Kafka
cd $(REPO_ROOT)/docker && $(OP_RUN) docker compose -f docker-compose.yml -f docker-compose.streaming.yml up -d

.PHONY: compose-down
compose-down: ## Stop the platform stack
Expand All @@ -91,6 +94,13 @@ compose-ps: ## Show running containers
compose-logs: ## Tail logs
cd $(REPO_ROOT)/docker && docker compose logs -f

# ----------------------------------------------------------------------
# Garage cluster init (one-time per fresh stack)
# ----------------------------------------------------------------------
.PHONY: garage-init
garage-init: env-check ## Assign layout, create bucket, import keys, grant access
$(OP_RUN) bash $(REPO_ROOT)/scripts/garage-init.sh

# ----------------------------------------------------------------------
# Python ingestion
# ----------------------------------------------------------------------
Expand All @@ -115,19 +125,30 @@ ingest-test: ## Run the ingestor unit tests (no secrets needed)
cd $(REPO_ROOT)/ingestion/python && .venv/bin/pytest

# ----------------------------------------------------------------------
# dbt
# dbt — uses its own venv at transform/.venv
# ----------------------------------------------------------------------
DBT_VENV := $(REPO_ROOT)/transform/.venv
DBT := $(DBT_VENV)/bin/dbt
DBT_PROFILES_DIR := $(REPO_ROOT)/transform

.PHONY: dbt-install
dbt-install: ## Create transform/.venv and install dbt-core + dbt-duckdb
$(PY) -m venv $(DBT_VENV)
$(DBT_VENV)/bin/pip install -r $(REPO_ROOT)/transform/requirements.txt
@test -f $(DBT_PROFILES_DIR)/profiles.yml || cp $(REPO_ROOT)/transform/profiles.yml.example $(DBT_PROFILES_DIR)/profiles.yml
@echo "dbt ready: $(DBT)"

.PHONY: dbt-deps
dbt-deps: ## Install dbt packages
cd $(REPO_ROOT)/transform && dbt deps
dbt-deps: ## Install dbt packages from packages.yml
cd $(REPO_ROOT)/transform && DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) deps

.PHONY: dbt-build
dbt-build: env-check ## Run dbt build end-to-end
cd $(REPO_ROOT)/transform && $(OP_RUN) dbt build
cd $(REPO_ROOT)/transform && $(OP_RUN) bash -c 'DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) build'

.PHONY: dbt-debug
dbt-debug: env-check ## Validate dbt connection
cd $(REPO_ROOT)/transform && $(OP_RUN) dbt debug
cd $(REPO_ROOT)/transform && $(OP_RUN) bash -c 'DBT_PROFILES_DIR=$(DBT_PROFILES_DIR) $(DBT) debug'

# ----------------------------------------------------------------------
# Streamlit
Expand Down Expand Up @@ -157,7 +178,7 @@ precommit: ## Run all pre-commit hooks against everything
# End-to-end
# ----------------------------------------------------------------------
.PHONY: phase1
phase1: compose-up ingest-bootstrap ingest dbt-deps dbt-build dashboard ## Run the full Phase 1 pipeline
phase1: compose-up garage-init ingest-install ingest-bootstrap ingest dbt-install dbt-deps dbt-build dashboard ## Run the full Phase 1 pipeline

# ----------------------------------------------------------------------
# 1Password helpers
Expand Down
13 changes: 10 additions & 3 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ services:
ports:
- "3900:3900" # S3 API
- "3903:3903" # admin API
environment:
# Secrets injected from 1Password via `op run --env-file=.env`.
# Garage reads any config field from env when prefixed with GARAGE_.
# See: https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/
GARAGE_RPC_SECRET: "${GARAGE_RPC_SECRET:?GARAGE_RPC_SECRET must be set (use op run --env-file=.env -- ...)}"
GARAGE_ADMIN_TOKEN: "${GARAGE_ADMIN_TOKEN:?GARAGE_ADMIN_TOKEN must be set}"
GARAGE_METRICS_TOKEN: "${GARAGE_ADMIN_TOKEN:?reuse admin token for metrics}"
volumes:
- garage-meta:/var/lib/garage/meta
- garage-data:/var/lib/garage/data
Expand All @@ -59,7 +66,7 @@ services:
# Lakekeeper — Iceberg REST catalog (Apache 2.0, Rust)
# ----------------------------------------------------------------
lakekeeper-db:
image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7
image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609
environment:
POSTGRES_USER: lakekeeper
POSTGRES_PASSWORD: lakekeeper
Expand Down Expand Up @@ -97,7 +104,7 @@ services:
# Postgres — operational mart (reverse-ETL target, app DB)
# ----------------------------------------------------------------
postgres:
image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7
image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609
ports:
- "5432:5432"
environment:
Expand All @@ -117,7 +124,7 @@ services:
# Temporal — orchestration
# ----------------------------------------------------------------
temporal-db:
image: postgres:18-alpine@sha256:54451ecb8ab38c24c3ec123f2fd501303a3a1856a5c66e98cecf2460d5e1e9d7
image: postgres:17-alpine@sha256:c7526c0f6c3f30260a563d7bcf8ad778effac59a44f8ffa86678c35418338609
environment:
POSTGRES_USER: temporal
POSTGRES_PASSWORD: temporal
Expand Down
9 changes: 4 additions & 5 deletions docker/garage/garage.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ db_engine = "lmdb"
replication_factor = 1
consistency_mode = "consistent"

# RPC secret — overridden via env at first boot.
# Generate with: openssl rand -hex 32
# Secrets (rpc_secret, admin_token, metrics_token) are injected via
# environment variables — see docker-compose.yml `garage.environment`
# block. The values come from 1Password (`op run --env-file=.env`).
# Reference: https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/
rpc_bind_addr = "[::]:3901"
rpc_public_addr = "127.0.0.1:3901"
rpc_secret = "REPLACE_ME_AT_BOOTSTRAP_via_env"

[s3_api]
s3_region = "garage"
Expand All @@ -24,5 +25,3 @@ index = "index.html"

[admin]
api_bind_addr = "[::]:3903"
admin_token = "REPLACE_ME_AT_BOOTSTRAP_via_env"
metrics_token = "REPLACE_ME_AT_BOOTSTRAP_via_env"
109 changes: 109 additions & 0 deletions scripts/garage-init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env bash
# Initialize the single-node Garage cluster:
# 1. Assign role to the local node (zone, capacity)
# 2. Apply staged layout
# 3. Create the `bronze` bucket
# 4. Import the access keys from env (resolved by `op run`)
# 5. Grant the imported key read+write+owner on `bronze`
#
# Idempotent — safe to re-run after a stack restart. The layout, bucket and
# key steps short-circuit if the desired state already exists; the access
# grant is simply re-applied on every run.
#
# Required env (resolved via `op run --env-file=.env`):
# GARAGE_S3_ACCESS_KEY
# GARAGE_S3_SECRET_KEY
#
# Usage:
# make garage-init # via Makefile (wraps with op run automatically)
# bash scripts/garage-init.sh # if env vars are already exported

# Strict mode: abort on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# Tunables. Each takes its value from the matching GARAGE_* environment
# variable when that is set and non-empty, otherwise the default shown here.
CONTAINER=${GARAGE_CONTAINER:-de-lab-garage-1}   # container targeted by `docker exec`
ZONE=${GARAGE_ZONE:-dc1}                         # passed to `layout assign -z`
CAPACITY=${GARAGE_CAPACITY:-10G}                 # passed to `layout assign -c`
TAG=${GARAGE_TAG:-local}                         # passed to `layout assign -t`
BUCKET=${GARAGE_BUCKET:-bronze}                  # bucket to create and grant access on
KEY_NAME=${GARAGE_KEY_NAME:-de-lab}              # name given to the imported access key

# Wrap the text ($2) in the ANSI SGR colour code ($1). No trailing newline.
color() {
  local sgr=$1 text=$2
  printf '\033[%sm%s\033[0m' "$sgr" "$text"
}

# Log helpers: a coloured "[garage-init]" tag followed by the message.
# info writes to stdout; warn and err write to stderr.
info() { printf '%s %s\n' "$(color 36 '[garage-init]')" "$*"; }
warn() { printf '%s %s\n' "$(color 33 '[garage-init]')" "$*" >&2; }
err()  { printf '%s %s\n' "$(color 31 '[garage-init]')" "$*" >&2; }

# Run the /garage binary inside $CONTAINER, forwarding all arguments
# (stdin is kept attached via -i).
g() {
  docker exec -i "$CONTAINER" /garage "$@"
}

# ----------------------------------------------------------------------
# 0. Pre-flight
# ----------------------------------------------------------------------
# The container must actually be running. A bare `docker inspect` also
# succeeds for a container that exists but is stopped, in which case every
# later `docker exec` would fail with a less helpful error. Read the
# State.Running flag instead; a missing container yields an empty string
# (the `|| true` keeps errexit from aborting before we can report it).
container_running=$(docker container inspect --format '{{.State.Running}}' "$CONTAINER" 2>/dev/null || true)
if [ "$container_running" != "true" ]; then
  err "Container '$CONTAINER' is not running. Run 'make compose-up' first."
  exit 1
fi

# Both halves of the S3 credential must be present and non-empty; they are
# handed to `garage key import` later in the script.
for required_var in GARAGE_S3_ACCESS_KEY GARAGE_S3_SECRET_KEY; do
  if [ -z "${!required_var:-}" ]; then
    err "$required_var missing — run via 'make garage-init' (op run)"
    exit 1
  fi
done

# Wait for daemon to accept commands: probe `garage status` once per second,
# up to 30 attempts, and remember whether any probe succeeded.
info "Waiting for Garage daemon..."
daemon_ready=false
for _ in $(seq 1 30); do
  if g status > /dev/null 2>&1; then
    daemon_ready=true
    break
  fi
  sleep 1
done
if [ "$daemon_ready" != true ]; then
  err "Garage daemon did not respond within 30s"
  exit 1
fi

# ----------------------------------------------------------------------
# 1. Layout: assign role + apply (idempotent)
# ----------------------------------------------------------------------
# Version number handed to `garage layout apply`. The default of 1 is what
# this script has always applied; set GARAGE_LAYOUT_VERSION to apply a
# different version without editing the script.
layout_version="${GARAGE_LAYOUT_VERSION:-1}"

status_out=$(g status 2>&1)

# Node ID = first field of the first non-blank row after the "HEALTHY NODES"
# banner, ignoring the column-header row whose first field is "ID".
node_id=$(awk '/HEALTHY NODES/{flag=1; next} flag && NF && $1 != "ID" {print $1; exit}' <<<"$status_out")
if [ -z "$node_id" ]; then
  err "Could not parse node ID from 'garage status'"
  echo "$status_out" >&2
  exit 1
fi

# Only touch the layout when the status output reports a node without a role;
# otherwise a previous run already assigned one.
if grep -q "NO ROLE ASSIGNED" <<<"$status_out"; then
  info "Assigning role to node $node_id (zone=$ZONE, cap=$CAPACITY)..."
  g layout assign "$node_id" -z "$ZONE" -c "$CAPACITY" -t "$TAG"
  info "Applying staged layout (version $layout_version)..."
  g layout apply --version "$layout_version"
else
  info "Node $node_id already has role assigned — skipping layout."
fi

# ----------------------------------------------------------------------
# 2. Bucket (idempotent)
# ----------------------------------------------------------------------
# Look for a `bucket list` row whose first field is exactly $BUCKET.
#  - The comparison is a literal string match (awk `==` against ENVIRON), so
#    characters such as "." in a bucket name are not treated as regex
#    wildcards.
#  - Every row is examined rather than skipping a fixed number of header
#    lines, so the check does not depend on how many banner lines the CLI
#    prints.
# If the list command itself fails, pipefail makes the condition false and we
# fall through to `bucket create`, which then reports the real error.
if g bucket list 2>/dev/null \
  | WANT="$BUCKET" awk '$1 == ENVIRON["WANT"] {found = 1} END {exit !found}'; then
  info "Bucket '$BUCKET' already exists — skipping."
else
  info "Creating bucket '$BUCKET'..."
  g bucket create "$BUCKET"
fi

# ----------------------------------------------------------------------
# 3. Key import (idempotent)
# ----------------------------------------------------------------------
# Look for a `key list` row whose second field is exactly $KEY_NAME.
# The match is a literal string comparison (no regex interpretation of the
# name) and examines every row instead of assuming a fixed header height.
# NOTE(review): existence is decided by key *name* only — a key with this
# name but a different ID/secret than the one in the environment is left
# untouched. Confirm that is the intended behaviour.
if g key list 2>/dev/null \
  | WANT="$KEY_NAME" awk '$2 == ENVIRON["WANT"] {found = 1} END {exit !found}'; then
  info "Key '$KEY_NAME' already exists — skipping import."
else
  info "Importing access keys (id from env, name=$KEY_NAME)..."
  # The key ID and secret are passed as positional arguments to the CLI.
  g key import --yes -n "$KEY_NAME" "$GARAGE_S3_ACCESS_KEY" "$GARAGE_S3_SECRET_KEY"
fi

# ----------------------------------------------------------------------
# 4. Grant the key access to the bucket (re-applied on every run)
# ----------------------------------------------------------------------
# Note that the grant carries --owner in addition to --read and --write.
info "Granting read+write on '$BUCKET' to key '$KEY_NAME'..."
g bucket allow \
  --read --write --owner \
  "$BUCKET" --key "$KEY_NAME"

# ----------------------------------------------------------------------
# 5. Verify
# ----------------------------------------------------------------------
# Show the bucket details (stdout and stderr merged), each line prefixed by sed.
info "Verification:"
{ g bucket info "$BUCKET" 2>&1; } | sed -e 's/^/ /'

# Final summary: blank line, green banner, then the key facts.
printf '\n%s\n' "$(color 32 '[garage-init] Done.')"
printf '%s\n' \
  " Bucket : $BUCKET" \
  " Key name: $KEY_NAME" \
  " S3 endpoint (host): http://localhost:3900"
6 changes: 5 additions & 1 deletion transform/dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ clean-targets:
models:
de_lab:
bronze:
+materialized: view
# Materialize bronze as table (not view) to dodge DuckDB v1.5 binder
# bug: view → silver `qualify` with multiple TIMESTAMP columns triggers
# "INTERNAL Error: TIMESTAMP != VARCHAR". Persisted bronze has stable
# column types and avoids the inline-view binder path.
+materialized: table
+schema: bronze
silver:
+materialized: table
Expand Down
19 changes: 19 additions & 0 deletions transform/macros/generate_schema_name.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{#
    generate_schema_name — replaces dbt's built-in schema-naming macro.

    Built-in behaviour: a model configured with `+schema: bronze` is built in
        <target_schema>_<custom_schema>      e.g. main_bronze

    This override: the custom schema name is used as-is (whitespace trimmed),
    so `+schema: bronze` produces objects in `bronze.*` — the names that
    Streamlit and the case-study SQL examples refer to. Models without a
    custom schema still land in the target's default schema.

    Reference: https://docs.getdbt.com/docs/build/custom-schemas
#}
{% macro generate_schema_name(custom_schema_name, node) -%}
    {%- if custom_schema_name is not none -%}
        {{ custom_schema_name | trim }}
    {%- else -%}
        {{ target.schema }}
    {%- endif -%}
{%- endmacro %}
8 changes: 6 additions & 2 deletions transform/models/bronze/stg_commit.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{ config(materialized='view') }}
{{ config(materialized='table') }}

select
repo_full_name,
Expand All @@ -13,4 +13,8 @@ select
parents,
cast(fetched_at as timestamp) as fetched_at,
raw_payload
from {{ source('bronze', 'commit') }}
from read_parquet(
's3://bronze/commit/**/*.parquet',
hive_partitioning = true,
union_by_name = true
)
8 changes: 6 additions & 2 deletions transform/models/bronze/stg_issue_or_pr.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{ config(materialized='view') }}
{{ config(materialized='table') }}

-- Bronze passthrough for issues + PRs (GitHub treats them via the same endpoint).
-- Splitting happens in silver.
Expand All @@ -18,4 +18,8 @@ select
cast(closed_at as timestamp) as closed_at,
cast(fetched_at as timestamp) as fetched_at,
raw_payload
from {{ source('bronze', 'issue_or_pr') }}
from read_parquet(
's3://bronze/issue_or_pr/**/*.parquet',
hive_partitioning = true,
union_by_name = true
)
14 changes: 10 additions & 4 deletions transform/models/bronze/stg_repo_metadata.sql
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
{{ config(materialized='view') }}
{{ config(materialized='table') }}

-- Bronze passthrough for repository headline numbers.
-- Just type cast and rename for downstream consumption.
-- Read directly from Garage S3 via DuckDB httpfs + parquet.
-- (sources.yml documents the contract; dbt-duckdb's source resolution
-- doesn't auto-handle Hive-partitioned globs, so we read explicitly.)
select
full_name as repo_full_name,
name as repo_name,
Expand All @@ -15,7 +17,7 @@ select
network_count,
default_branch,
language,
license_spdx,
try_cast(license_spdx as varchar) as license_spdx,
archived,
disabled,
fork,
Expand All @@ -24,4 +26,8 @@ select
cast(pushed_at as timestamp) as pushed_at,
cast(fetched_at as timestamp) as fetched_at,
raw_payload
from {{ source('bronze', 'repo_metadata') }}
from read_parquet(
's3://bronze/repo_metadata/**/*.parquet',
hive_partitioning = true,
union_by_name = true
)
17 changes: 6 additions & 11 deletions transform/models/silver/dim_repos.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,10 @@

-- Latest snapshot per repo. Bronze can have multiple snapshots over time;
-- silver collapses to "current truth" by fetched_at.
with ranked as (
select
*,
row_number() over (
partition by repo_full_name
order by fetched_at desc
) as rn
from {{ ref('stg_repo_metadata') }}
)
--
-- We use `qualify` rather than `with ranked as (select *, row_number() ...)`
-- because `select *` against an external-parquet view confuses DuckDB's
-- window-function binder (INTERNAL Error: TIMESTAMP != VARCHAR).
select
repo_full_name,
repo_name,
Expand All @@ -30,5 +25,5 @@ select
updated_at as repo_updated_at,
pushed_at as last_pushed_at,
fetched_at as snapshot_at
from ranked
where rn = 1
from {{ ref('stg_repo_metadata') }}
qualify row_number() over (partition by repo_full_name order by fetched_at desc) = 1
Loading
Loading