Skip to content

Commit 7587a09

Browse files
author
kipruto45
committed
feat: restore runner, add Docker execution, and harden segmentation
1 parent 1333272 commit 7587a09

5 files changed

Lines changed: 230 additions & 40 deletions

File tree

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@ chmod +x run_all.sh
3838
./run_all.sh
3939
```
4040

41+
Run with Docker (PostgreSQL + pipeline):
42+
43+
```bash
44+
docker compose up -d postgres
45+
docker compose --profile run run --rm pipeline
46+
```
47+
4148
Manual execution:
4249

4350
1. Create database (example):
@@ -79,6 +86,7 @@ psql "$DATABASE_URL" -f analytics/revenue_analysis.sql
7986
psql "$DATABASE_URL" -f tests/test_data_load.sql
8087
psql "$DATABASE_URL" -f tests/test_scd_logic.sql
8188
psql "$DATABASE_URL" -f tests/test_quality_checks.sql
89+
psql "$DATABASE_URL" -f tests/test_customer_segmentation.sql
8290
```
8391

8492
## Pipeline Flow
Lines changed: 71 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,80 @@
1-
-- RFM-based segmentation using quartiles.
2-
WITH rfm_base AS (
1+
-- Rule-based segmentation with explicit thresholds.
2+
-- Optional deterministic date for tests:
3+
-- SET app.segment_as_of_date = '2026-02-19';
4+
5+
CREATE OR REPLACE VIEW warehouse.v_customer_segmentation AS
6+
WITH params AS (
7+
SELECT COALESCE(
8+
NULLIF(current_setting('app.segment_as_of_date', true), '')::DATE,
9+
CURRENT_DATE
10+
) AS as_of_date
11+
), thresholds AS (
12+
SELECT
13+
30::INT AS champion_max_recency_days,
14+
3::INT AS champion_min_paid_orders,
15+
400::NUMERIC(12,2) AS champion_min_revenue,
16+
60::INT AS high_value_max_recency_days,
17+
300::NUMERIC(12,2) AS high_value_min_revenue,
18+
3::INT AS loyal_min_paid_orders,
19+
90::INT AS loyal_max_recency_days,
20+
75::INT AS at_risk_min_recency_days
21+
), customer_kpis AS (
322
SELECT
423
c.customer_id,
5-
MAX(o.order_date) AS last_order_date,
6-
COUNT(DISTINCT o.order_id) FILTER (WHERE o.status = 'PAID') AS frequency,
7-
COALESCE(SUM(t.amount) FILTER (WHERE o.status = 'PAID' AND t.status = 'SUCCESS'), 0) AS monetary
24+
c.signup_date,
25+
MAX(o.order_date) FILTER (WHERE o.status = 'PAID') AS last_paid_order_date,
26+
COUNT(DISTINCT o.order_id) FILTER (WHERE o.status = 'PAID') AS paid_orders,
27+
COALESCE(SUM(t.amount) FILTER (WHERE o.status = 'PAID' AND t.status = 'SUCCESS'), 0) AS lifetime_revenue
828
FROM oltp.customers c
9-
LEFT JOIN oltp.orders o ON o.customer_id = c.customer_id
10-
LEFT JOIN oltp.transactions t ON t.order_id = o.order_id
11-
GROUP BY c.customer_id
12-
), scored AS (
29+
LEFT JOIN oltp.orders o
30+
ON o.customer_id = c.customer_id
31+
LEFT JOIN oltp.transactions t
32+
ON t.order_id = o.order_id
33+
GROUP BY c.customer_id, c.signup_date
34+
), segmentation_base AS (
1335
SELECT
14-
customer_id,
15-
CURRENT_DATE - COALESCE(last_order_date, CURRENT_DATE) AS recency_days,
16-
frequency,
17-
monetary,
18-
NTILE(4) OVER (ORDER BY CURRENT_DATE - COALESCE(last_order_date, CURRENT_DATE) DESC) AS recency_score,
19-
NTILE(4) OVER (ORDER BY frequency ASC) AS frequency_score,
20-
NTILE(4) OVER (ORDER BY monetary ASC) AS monetary_score
21-
FROM rfm_base
36+
ck.customer_id,
37+
p.as_of_date,
38+
(p.as_of_date - COALESCE(ck.last_paid_order_date, ck.signup_date, p.as_of_date))::INT AS recency_days,
39+
ck.paid_orders,
40+
ck.lifetime_revenue,
41+
ROUND(ck.lifetime_revenue / NULLIF(ck.paid_orders, 0), 2) AS avg_paid_order_value
42+
FROM customer_kpis ck
43+
CROSS JOIN params p
2244
)
2345
SELECT
24-
customer_id,
25-
recency_days,
26-
frequency,
27-
monetary,
28-
recency_score,
29-
frequency_score,
30-
monetary_score,
46+
sb.customer_id,
47+
sb.as_of_date,
48+
sb.recency_days,
49+
sb.paid_orders,
50+
sb.lifetime_revenue,
51+
sb.avg_paid_order_value,
3152
CASE
32-
WHEN recency_score = 1 AND frequency_score = 4 AND monetary_score = 4 THEN 'Champions'
33-
WHEN recency_score <= 2 AND monetary_score >= 3 THEN 'High Value'
34-
WHEN recency_score >= 3 AND frequency_score <= 2 THEN 'At Risk'
53+
WHEN sb.paid_orders = 0 THEN 'Inactive'
54+
WHEN sb.recency_days <= t.champion_max_recency_days
55+
AND sb.paid_orders >= t.champion_min_paid_orders
56+
AND sb.lifetime_revenue >= t.champion_min_revenue
57+
THEN 'Champions'
58+
WHEN sb.recency_days <= t.high_value_max_recency_days
59+
AND sb.lifetime_revenue >= t.high_value_min_revenue
60+
THEN 'High Value'
61+
WHEN sb.paid_orders >= t.loyal_min_paid_orders
62+
AND sb.recency_days <= t.loyal_max_recency_days
63+
THEN 'Loyal'
64+
WHEN sb.recency_days >= t.at_risk_min_recency_days
65+
THEN 'At Risk'
3566
ELSE 'Regular'
3667
END AS segment
37-
FROM scored
38-
ORDER BY monetary DESC;
68+
FROM segmentation_base sb
69+
CROSS JOIN thresholds t;
70+
71+
-- Report query: list every segmented customer, highest lifetime revenue first.
-- Ties are broken by paid order count, then by customer_id, so the output order is stable.
SELECT
    seg.customer_id,
    seg.as_of_date,
    seg.recency_days,
    seg.paid_orders,
    seg.lifetime_revenue,
    seg.avg_paid_order_value,
    seg.segment
FROM warehouse.v_customer_segmentation AS seg
ORDER BY
    seg.lifetime_revenue DESC,
    seg.paid_orders DESC,
    seg.customer_id ASC;

docker-compose.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Local stack: a PostgreSQL 16 server plus an on-demand job that executes run_all.sh against it.
services:
  # Database server. Data lives in the named volume declared at the bottom of this file.
  postgres:
    image: postgres:16
    container_name: sql_data_postgres
    restart: unless-stopped
    environment:
      POSTGRES_DB: sql_data_engineering
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
    ports:
      # Published host port is PG_HOST_PORT when set, otherwise 5433.
      - "${PG_HOST_PORT:-5433}:5432"
    healthcheck:
      test:
        - CMD-SHELL
        - pg_isready -U postgres -d sql_data_engineering
      interval: 10s
      timeout: 5s
      retries: 10
    volumes:
      - postgres_data:/var/lib/postgresql/data

  # Pipeline runner. Only started when the "run" profile is selected, and only
  # after the postgres service reports healthy.
  pipeline:
    image: postgres:16
    depends_on:
      postgres:
        condition: service_healthy
    working_dir: /workspace
    volumes:
      # The project directory is mounted so run_all.sh and the SQL files are visible.
      - ./:/workspace
    environment:
      DB_HOST: postgres
      DB_PORT: 5432
      DB_NAME: sql_data_engineering
      DB_USER: postgres
      DB_PASSWORD: postgres
      DATABASE_URL: postgresql://postgres:postgres@postgres:5432/sql_data_engineering
    command:
      - bash
      - "-lc"
      - chmod +x ./run_all.sh && ./run_all.sh
    profiles:
      - run

volumes:
  postgres_data:

run_all.sh

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,45 @@ set -euo pipefail
44
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
55
cd "$ROOT_DIR"
66

7-
if [[ ! -f ".env" ]]; then
8-
echo "Missing .env in $ROOT_DIR"
9-
exit 1
10-
fi
7+
# Load KEY=VALUE pairs from a dotenv-style file into the environment.
#
# Arguments:
#   $1 - path to the env file; it must exist and be readable.
#
# Behavior:
#   - Blank lines, comment lines (starting with #) and lines without '=' are skipped.
#   - An optional leading "export " is accepted, as it was when the file was sourced.
#   - One pair of matching surrounding quotes ("..." or '...') is removed from the value.
#   - CRLF line endings are tolerated.
#   - Keys that are not valid shell identifiers are skipped with a warning on stderr.
#   - Variables that are already set in the environment are never overwritten.
load_env_file() {
  local env_file="$1"
  local line key value

  # `|| [[ -n "$line" ]]` keeps the final line when the file has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"
    [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
    [[ "$line" != *=* ]] && continue

    key="${line%%=*}"
    value="${line#*=}"

    # Accept the "export KEY=value" form.
    if [[ "$key" =~ ^[[:space:]]*export[[:space:]]+(.*)$ ]]; then
      key="${BASH_REMATCH[1]}"
    fi
    key="${key//[[:space:]]/}"

    if [[ -z "$key" ]]; then
      continue
    fi

    # An invalid name would make the indirect expansion below a fatal error.
    if [[ ! "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      echo "Skipping invalid variable name in $env_file: $key" >&2
      continue
    fi

    # Drop one pair of matching surrounding quotes so quoted values do not
    # carry literal quote characters.
    case "$value" in
      \"*\" | \'*\') value="${value:1:${#value}-2}" ;;
    esac

    # Keep externally provided values (useful for Docker/CI overrides).
    if [[ -z "${!key+x}" ]]; then
      export "$key=$value"
    fi
  done < "$env_file"
}
1127

12-
set -a
13-
source .env
14-
set +a
28+
if [[ -f ".env" ]]; then
29+
load_env_file ".env"
30+
else
31+
echo "No .env found. Using environment variables only."
32+
fi
1533

1634
if [[ -z "${DATABASE_URL:-}" ]]; then
17-
echo "DATABASE_URL is not set in .env"
18-
exit 1
35+
if [[ -n "${DB_HOST:-}" && -n "${DB_PORT:-}" && -n "${DB_NAME:-}" && -n "${DB_USER:-}" && -n "${DB_PASSWORD:-}" ]]; then
36+
export DATABASE_URL="postgresql://${DB_USER}:${DB_PASSWORD}@${DB_HOST}:${DB_PORT}/${DB_NAME}"
37+
else
38+
echo "DATABASE_URL is not set. Provide it directly or via DB_* variables."
39+
exit 1
40+
fi
1941
fi
2042

21-
export PGPASSWORD="${DB_PASSWORD:-}"
43+
export PGPASSWORD="${DB_PASSWORD:-${PGPASSWORD:-}}"
2244

23-
ADMIN_URL="postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-}@${DB_HOST:-localhost}:${DB_PORT:-5432}/postgres"
45+
ADMIN_URL="postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-localhost}:${DB_PORT:-5432}/postgres"
2446
TARGET_DB="${DB_NAME:-sql_data_engineering}"
2547

2648
echo "Checking target database: $TARGET_DB"
@@ -69,5 +91,6 @@ run_sql "performance/optimization_examples.sql"
6991
run_sql "tests/test_data_load.sql"
7092
run_sql "tests/test_scd_logic.sql"
7193
run_sql "tests/test_quality_checks.sql"
94+
run_sql "tests/test_customer_segmentation.sql"
7295

7396
echo "Pipeline completed successfully."
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
-- Deterministic segmentation tests.
2+
SET app.segment_as_of_date = '2026-02-19';
3+
4+
-- Test 1: segmentation returns one row per customer and no NULL segment.
5+
-- Fails when the view does not yield exactly as many rows as oltp.customers,
-- or when any row carries a NULL segment.
DO $$
DECLARE
    n_source_customers INT;
    n_view_rows        INT;
    n_missing_segment  INT;
BEGIN
    SELECT COUNT(*)
    INTO n_source_customers
    FROM oltp.customers;

    -- One pass over the view yields both the total and the NULL-segment count.
    SELECT
        COUNT(*),
        COUNT(*) FILTER (WHERE seg.segment IS NULL)
    INTO
        n_view_rows,
        n_missing_segment
    FROM warehouse.v_customer_segmentation AS seg;

    IF n_view_rows <> n_source_customers THEN
        RAISE EXCEPTION 'Segmentation row mismatch: expected %, got %.', n_source_customers, n_view_rows;
    END IF;

    IF n_missing_segment > 0 THEN
        RAISE EXCEPTION 'Segmentation contains % NULL segments.', n_missing_segment;
    END IF;
END $$;
25+
26+
-- Test 2: segment values are within approved taxonomy.
27+
-- Fails when any non-NULL segment label falls outside the approved list.
DO $$
DECLARE
    n_unknown_labels INT;
BEGIN
    SELECT COUNT(*)
    INTO n_unknown_labels
    FROM warehouse.v_customer_segmentation AS seg
    WHERE seg.segment <> ALL (ARRAY[
        'Champions',
        'High Value',
        'Loyal',
        'At Risk',
        'Regular',
        'Inactive'
    ]);

    IF n_unknown_labels > 0 THEN
        RAISE EXCEPTION 'Found % rows with invalid segment labels.', n_unknown_labels;
    END IF;
END $$;
39+
40+
-- Test 3: inactive customers must have zero paid orders and vice versa.
41+
-- Fails when 'Inactive' and "zero paid orders" do not coincide:
-- a zero-order customer with another label, or a paying customer labelled 'Inactive'.
DO $$
DECLARE
    n_rule_violations INT;
BEGIN
    SELECT COUNT(*)
    INTO n_rule_violations
    FROM warehouse.v_customer_segmentation AS seg
    WHERE CASE
              WHEN seg.paid_orders = 0 THEN seg.segment <> 'Inactive'
              WHEN seg.paid_orders > 0 THEN seg.segment = 'Inactive'
              ELSE FALSE
          END;

    IF n_rule_violations > 0 THEN
        RAISE EXCEPTION 'Found % rows violating inactive segmentation rule.', n_rule_violations;
    END IF;
END $$;
54+
55+
-- Test 4: regression checks for known sample customers.
56+
-- Regression check: customer 4 must be 'High Value' and customer 6 must be 'Inactive'.
-- Raises an exception when a label differs or when the customer has no row in the view.
DO $$
DECLARE
    v_customer_4 TEXT;
    v_customer_6 TEXT;
BEGIN
    -- SELECT ... INTO leaves the variable NULL when no row matches.
    SELECT segment INTO v_customer_4
    FROM warehouse.v_customer_segmentation
    WHERE customer_id = 4;

    SELECT segment INTO v_customer_6
    FROM warehouse.v_customer_segmentation
    WHERE customer_id = 6;

    -- IS DISTINCT FROM is NULL-safe. A plain <> evaluates to NULL for a missing
    -- customer, which would skip the RAISE and let the test pass silently.
    IF v_customer_4 IS DISTINCT FROM 'High Value' THEN
        RAISE EXCEPTION 'Expected customer 4 to be High Value, got %.', COALESCE(v_customer_4, 'NULL');
    END IF;

    IF v_customer_6 IS DISTINCT FROM 'Inactive' THEN
        RAISE EXCEPTION 'Expected customer 6 to be Inactive, got %.', COALESCE(v_customer_6, 'NULL');
    END IF;
END $$;
77+
78+
RESET app.segment_as_of_date;

0 commit comments

Comments
 (0)