Victor-Kipruto-Rop
diff --git a/‎.gitignore‎
Lines changed: 16 additions & 0 deletions b/‎.gitignore‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 96 additions & 0 deletions b/‎README.md‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎analytics/customer_segmentation.sql‎
Lines changed: 38 additions & 0 deletions b/‎analytics/customer_segmentation.sql‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎analytics/fraud_detection.sql‎
Lines changed: 35 additions & 0 deletions b/‎analytics/fraud_detection.sql‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎analytics/retention_analysis.sql‎
Lines changed: 31 additions & 0 deletions b/‎analytics/retention_analysis.sql‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎analytics/revenue_analysis.sql‎
Lines changed: 29 additions & 0 deletions b/‎analytics/revenue_analysis.sql‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎analytics/window_functions.sql‎
Lines changed: 33 additions & 0 deletions b/‎analytics/window_functions.sql‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎data/processed/cleaned_data.csv‎
Lines changed: 6 additions & 0 deletions b/‎data/processed/cleaned_data.csv‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎data/raw/customers.csv‎
Lines changed: 9 additions & 0 deletions b/‎data/raw/customers.csv‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎data/raw/orders.csv‎
Lines changed: 10 additions & 0 deletions b/‎data/raw/orders.csv‎
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,16 @@
+# Environment
+.env
+.venv/
+
+# Python
+__pycache__/
+*.pyc
+.pytest_cache/
+
+# Editors / OS
+.DS_Store
+.idea/
+.vscode/
+
+# Build / logs
+*.log
@@ -0,0 +1,96 @@
+# SQL Data Engineering Project
+
+End-to-end SQL project for ingesting raw CSV data, transforming it into an OLTP model, loading a dimensional warehouse, and running analytics, quality checks, performance tests, and monitoring.
+
+## Project Layout
+
+```text
+sql-data-engineering-project/
+├── database/
+├── data/
+├── etl/
+├── warehouse/
+├── analytics/
+├── data_quality/
+├── performance/
+├── monitoring/
+├── tests/
+└── docs/
+```
+
+## Tech Stack
+
+- PostgreSQL 14+
+- SQL (psql-compatible scripts)
+- Optional Python tools (linting/automation)
+
+## Quick Start
+
+One command end-to-end (recommended):
+
+```bash
+chmod +x run_all.sh
+./run_all.sh
+```
+
+Manual execution:
+
+1. Create database (example):
+
+```bash
+createdb sql_data_engineering
+```
+
+2. Set environment values in `.env`, and make sure `DATABASE_URL` is exported in your shell — every `psql` command below reads it.
+
+3. Initialize core schemas and tables:
+
+```bash
+psql "$DATABASE_URL" -f database/schema.sql
+psql "$DATABASE_URL" -f database/tables.sql
+psql "$DATABASE_URL" -f database/constraints.sql
+psql "$DATABASE_URL" -f database/indexes.sql
+```
+
+4. Load source data and run ETL:
+
+```bash
+psql "$DATABASE_URL" -f etl/extract.sql
+psql "$DATABASE_URL" -f etl/transform.sql
+psql "$DATABASE_URL" -f etl/load.sql
+```
+
+5. Build warehouse model (includes SCD Type 2 update + fact load):
+
+```bash
+psql "$DATABASE_URL" -f warehouse/star_schema.sql
+```
+
+6. Run quality checks, analytics, and tests:
+
+```bash
+psql "$DATABASE_URL" -f data_quality/validation_queries.sql
+psql "$DATABASE_URL" -f analytics/revenue_analysis.sql
+psql "$DATABASE_URL" -f tests/test_data_load.sql
+psql "$DATABASE_URL" -f tests/test_scd_logic.sql
+psql "$DATABASE_URL" -f tests/test_quality_checks.sql
+```
+
+## Pipeline Flow
+
+1. **Extract** CSVs into staging raw tables.
+2. **Transform** records by standardizing data types and removing duplicates.
+3. **Load** cleaned data into OLTP tables with upserts.
+4. **Load the warehouse** dimensions and facts, applying SCD Type 2 for customers.
+5. **Analyze** KPIs and run fraud/retention/segmentation logic.
+6. **Monitor** row counts and anomaly signals.
+
+## Notes
+
+- SQL is written for PostgreSQL.
+- `etl/extract.sql` uses `\copy`, so run with `psql` from project root.
+- Example data is included in `data/raw/`.
+- Architecture and data model diagrams are generated from DOT sources:
+  - `docs/architecture_diagram.dot`
+  - `docs/data_model.dot`
+  - Regenerate with `./docs/generate_diagrams.sh`
@@ -0,0 +1,38 @@
+-- RFM (recency / frequency / monetary) customer segmentation using quartiles.
+--
+-- Score convention, as consumed by the segment CASE at the bottom:
+--   recency_score   1 = most recently ordered ... 4 = least recent or never ordered
+--   frequency_score 4 = most PAID orders      ... 1 = fewest
+--   monetary_score  4 = highest paid revenue  ... 1 = lowest
+WITH rfm_base AS (
+    -- One row per customer; the LEFT JOINs keep customers that have no orders.
+    SELECT
+        c.customer_id,
+        -- Latest order of any status; NULL when the customer has never ordered.
+        MAX(o.order_date) AS last_order_date,
+        COUNT(DISTINCT o.order_id) FILTER (WHERE o.status = 'PAID') AS frequency,
+        COALESCE(SUM(t.amount) FILTER (WHERE o.status = 'PAID' AND t.status = 'SUCCESS'), 0) AS monetary
+    FROM oltp.customers AS c
+    LEFT JOIN oltp.orders AS o ON o.customer_id = c.customer_id
+    LEFT JOIN oltp.transactions AS t ON t.order_id = o.order_id
+    GROUP BY c.customer_id
+), scored AS (
+    SELECT
+        customer_id,
+        -- Left NULL for customers who never ordered: reporting 0 days would make
+        -- them look like the most recent buyers.
+        CURRENT_DATE - last_order_date AS recency_days,
+        frequency,
+        monetary,
+        -- Most recent order first, so quartile 1 is the freshest customers and
+        -- never-ordered customers (NULL) land in the last quartile. This matches
+        -- the CASE below, which treats a low recency_score as "recent".
+        -- customer_id breaks ties so quartile assignment is deterministic.
+        NTILE(4) OVER (ORDER BY last_order_date DESC NULLS LAST, customer_id) AS recency_score,
+        NTILE(4) OVER (ORDER BY frequency ASC, customer_id) AS frequency_score,
+        NTILE(4) OVER (ORDER BY monetary ASC, customer_id) AS monetary_score
+    FROM rfm_base
+)
+SELECT
+    customer_id,
+    recency_days,
+    frequency,
+    monetary,
+    recency_score,
+    frequency_score,
+    monetary_score,
+    -- First matching rule wins, so the rules are listed from most to least specific.
+    CASE
+        WHEN recency_score = 1 AND frequency_score = 4 AND monetary_score = 4 THEN 'Champions'
+        WHEN recency_score <= 2 AND monetary_score >= 3 THEN 'High Value'
+        WHEN recency_score >= 3 AND frequency_score <= 2 THEN 'At Risk'
+        ELSE 'Regular'
+    END AS segment
+FROM scored
+ORDER BY monetary DESC, customer_id;
@@ -0,0 +1,35 @@
+-- Transactions flagged for review: a risk score of 80 or more, or a failed /
+-- charged-back payment of at least 200. Only transactions whose order exists
+-- are returned (inner join), which is also where customer_id comes from.
+SELECT
+    txn.transaction_id,
+    txn.order_id,
+    ord.customer_id,
+    txn.transaction_date,
+    txn.amount,
+    txn.status,
+    txn.risk_score
+FROM oltp.orders AS ord
+INNER JOIN oltp.transactions AS txn
+    ON txn.order_id = ord.order_id
+WHERE (txn.amount >= 200 AND txn.status IN ('FAILED', 'CHARGEBACK'))
+   OR txn.risk_score >= 80
+-- Riskiest first; rows without a score sink to the bottom.
+ORDER BY
+    txn.risk_score DESC NULLS LAST,
+    txn.transaction_date DESC;
+
+-- Failed attempts that follow the previous failed attempt on the same order
+-- by no more than 15 minutes.
+WITH failed_attempts AS (
+    -- Only FAILED rows are considered, so "previous" means the previous failure,
+    -- regardless of any non-failed transactions in between.
+    SELECT
+        txn.transaction_id,
+        txn.order_id,
+        txn.transaction_date,
+        LAG(txn.transaction_date) OVER attempts_by_order AS prev_txn_time
+    FROM oltp.transactions AS txn
+    WHERE txn.status = 'FAILED'
+    WINDOW attempts_by_order AS (PARTITION BY txn.order_id ORDER BY txn.transaction_date)
+), attempt_gaps AS (
+    SELECT
+        transaction_id,
+        order_id,
+        transaction_date,
+        prev_txn_time,
+        transaction_date - prev_txn_time AS gap
+    FROM failed_attempts
+)
+SELECT
+    transaction_id,
+    order_id,
+    transaction_date,
+    prev_txn_time,
+    EXTRACT(EPOCH FROM gap) / 60.0 AS minutes_since_previous
+FROM attempt_gaps
+-- The first failure of each order has a NULL gap and is dropped by this comparison.
+WHERE gap <= INTERVAL '15 minutes'
+ORDER BY order_id, transaction_date;
@@ -0,0 +1,31 @@
+-- Cohort retention by month: for each signup month, how many distinct customers
+-- placed a PAID order N months after signing up.
+WITH cohorts AS (
+    -- Each customer is assigned to the first day of their signup month.
+    SELECT
+        cust.customer_id,
+        CAST(DATE_TRUNC('month', cust.signup_date) AS DATE) AS cohort_month
+    FROM oltp.customers AS cust
+), paid_months AS (
+    -- One row per customer per calendar month containing at least one PAID order.
+    SELECT
+        ord.customer_id,
+        CAST(DATE_TRUNC('month', ord.order_date) AS DATE) AS activity_month
+    FROM oltp.orders AS ord
+    WHERE ord.status = 'PAID'
+    GROUP BY
+        ord.customer_id,
+        CAST(DATE_TRUNC('month', ord.order_date) AS DATE)
+), cohort_activity AS (
+    -- Activity dated before the signup month is excluded by the join condition.
+    SELECT
+        co.cohort_month,
+        co.customer_id,
+        AGE(pm.activity_month, co.cohort_month) AS elapsed
+    FROM cohorts AS co
+    INNER JOIN paid_months AS pm
+        ON pm.customer_id = co.customer_id
+       AND pm.activity_month >= co.cohort_month
+)
+SELECT
+    cohort_month,
+    -- Whole months between the cohort month and the activity month (0 = signup month).
+    CAST(EXTRACT(YEAR FROM elapsed) * 12 + EXTRACT(MONTH FROM elapsed) AS INT) AS month_number,
+    COUNT(DISTINCT customer_id) AS active_customers
+FROM cohort_activity
+GROUP BY
+    cohort_month,
+    CAST(EXTRACT(YEAR FROM elapsed) * 12 + EXTRACT(MONTH FROM elapsed) AS INT)
+ORDER BY cohort_month, month_number;
@@ -0,0 +1,29 @@
+-- Monthly order volume, paid revenue, and average order value (AOV).
+--   orders          : every order placed in the month, whatever its status.
+--   revenue         : SUCCESS transaction amounts on PAID orders only.
+--   avg_order_value : revenue divided by the number of PAID orders. Dividing by
+--                     all orders would understate AOV whenever the month contains
+--                     orders in any non-PAID status, which contribute no revenue.
+--                     NULL when the month has no PAID orders.
+SELECT
+    DATE_TRUNC('month', o.order_date)::DATE AS month_start,
+    COUNT(DISTINCT o.order_id) AS orders,
+    SUM(CASE WHEN t.status = 'SUCCESS' AND o.status = 'PAID' THEN t.amount ELSE 0 END) AS revenue,
+    ROUND(
+        SUM(CASE WHEN t.status = 'SUCCESS' AND o.status = 'PAID' THEN t.amount ELSE 0 END)
+        -- DISTINCT because an order with several transactions appears once per
+        -- transaction after the join; NULLIF guards against division by zero.
+        / NULLIF(COUNT(DISTINCT o.order_id) FILTER (WHERE o.status = 'PAID'), 0),
+        2
+    ) AS avg_order_value
+FROM oltp.orders AS o
+-- LEFT JOIN keeps orders that have no transactions so they still count as orders.
+LEFT JOIN oltp.transactions AS t
+    ON t.order_id = o.order_id
+GROUP BY DATE_TRUNC('month', o.order_date)::DATE
+ORDER BY month_start;
+
+-- Customers ordered by lifetime paid revenue, highest first.
+-- Only customers with at least one order appear (inner join to orders);
+-- revenue counts SUCCESS transactions on PAID orders and is 0 otherwise.
+SELECT
+    cust.customer_id,
+    cust.first_name,
+    cust.last_name,
+    COALESCE(
+        SUM(txn.amount) FILTER (WHERE txn.status = 'SUCCESS' AND ord.status = 'PAID'),
+        0
+    ) AS lifetime_revenue
+FROM oltp.customers AS cust
+INNER JOIN oltp.orders AS ord
+    ON ord.customer_id = cust.customer_id
+LEFT JOIN oltp.transactions AS txn
+    ON txn.order_id = ord.order_id
+GROUP BY
+    cust.customer_id,
+    cust.first_name,
+    cust.last_name
+ORDER BY lifetime_revenue DESC;
@@ -0,0 +1,33 @@
+-- Month-by-month paid revenue with a cumulative total, a trailing three-row
+-- average, and the previous row's revenue.
+WITH revenue_by_month AS (
+    -- Revenue = SUCCESS transaction amounts on PAID orders, bucketed by order month.
+    SELECT
+        CAST(DATE_TRUNC('month', ord.order_date) AS DATE) AS month_start,
+        COALESCE(
+            SUM(txn.amount) FILTER (WHERE ord.status = 'PAID' AND txn.status = 'SUCCESS'),
+            0
+        ) AS monthly_revenue
+    FROM oltp.orders AS ord
+    LEFT JOIN oltp.transactions AS txn
+        ON txn.order_id = ord.order_id
+    GROUP BY CAST(DATE_TRUNC('month', ord.order_date) AS DATE)
+)
+SELECT
+    month_start,
+    monthly_revenue,
+    SUM(monthly_revenue) OVER chronological AS running_revenue,
+    -- The frame is row-based: a month with no orders has no row here, so the
+    -- "3 month" window and the LAG below step over such gaps.
+    AVG(monthly_revenue) OVER (chronological ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS rolling_3_month_avg,
+    LAG(monthly_revenue) OVER chronological AS previous_month_revenue
+FROM revenue_by_month
+WINDOW chronological AS (ORDER BY month_start)
+ORDER BY month_start;
+
+-- Every customer ranked by lifetime spend; equal spend shares a rank with no gaps.
+WITH customer_spend AS (
+    -- LEFT JOINs keep customers without orders or transactions; their spend is 0.
+    SELECT
+        cust.customer_id,
+        cust.first_name,
+        cust.last_name,
+        COALESCE(
+            SUM(txn.amount) FILTER (WHERE ord.status = 'PAID' AND txn.status = 'SUCCESS'),
+            0
+        ) AS spend
+    FROM oltp.customers AS cust
+    LEFT JOIN oltp.orders AS ord
+        ON ord.customer_id = cust.customer_id
+    LEFT JOIN oltp.transactions AS txn
+        ON txn.order_id = ord.order_id
+    GROUP BY
+        cust.customer_id,
+        cust.first_name,
+        cust.last_name
+)
+SELECT
+    customer_id,
+    first_name,
+    last_name,
+    spend,
+    DENSE_RANK() OVER (ORDER BY spend DESC) AS spend_rank
+FROM customer_spend
+ORDER BY spend_rank, customer_id;
@@ -0,0 +1,6 @@
+customer_id,order_id,transaction_id,order_date,transaction_date,revenue_amount,order_status,transaction_status,payment_method,risk_score
+1,1001,5001,2025-12-15,2025-12-15T12:10:00Z,120.50,PAID,SUCCESS,CARD,12.50
+2,1002,5002,2025-12-16,2025-12-16T16:00:00Z,89.99,PAID,SUCCESS,BANK_TRANSFER,20.00
+3,1004,5004,2025-12-20,2025-12-20T20:45:00Z,215.00,PAID,SUCCESS,WALLET,18.00
+4,1005,5006,2026-01-05,2026-01-05T14:25:00Z,560.00,PAID,SUCCESS,CARD,75.00
+5,1006,5007,2026-01-08,2026-01-08T09:50:00Z,45.25,PAID,SUCCESS,CARD,10.00
@@ -0,0 +1,9 @@
+customer_id,first_name,last_name,email,phone,city,country,signup_date,updated_at
+1,Alice,Johnson,alice.johnson@example.com,+1-555-1001,Austin,US,2024-01-15,2026-01-01T10:00:00Z
+2,Bob,Smith,bob.smith@example.com,+1-555-1002,Seattle,US,2024-02-11,2026-01-02T09:30:00Z
+2,Bob,Smith,bob.smith@example.com,+1-555-1002,Portland,US,2024-02-11,2026-01-12T09:30:00Z
+3,Carol,Nguyen,carol.nguyen@example.com,+1-555-1003,Denver,US,2024-03-05,2026-01-03T11:45:00Z
+4,David,Lopez,david.lopez@example.com,+1-555-1004,Miami,US,2024-05-20,2026-01-04T14:15:00Z
+5,Eva,Brown,eva.brown@example.com,+1-555-1005,Boston,US,2024-07-09,2026-01-05T08:10:00Z
+6,Frank,Kim,frank.kim@example.com,+1-555-1006,Chicago,US,2025-01-17,2026-01-06T12:00:00Z
+7,Gina,Hall,gina.hall_at_example.com,+1-555-1007,Phoenix,US,2025-03-20,2026-01-07T07:50:00Z
@@ -0,0 +1,10 @@
+order_id,customer_id,order_date,status,total_amount,currency,updated_at
+1001,1,2025-12-15,PAID,120.50,USD,2026-01-10T10:20:00Z
+1002,2,2025-12-16,PAID,89.99,USD,2026-01-10T10:25:00Z
+1003,2,2025-12-18,CANCELLED,49.00,USD,2026-01-11T09:10:00Z
+1004,3,2025-12-20,PAID,215.00,USD,2026-01-11T10:00:00Z
+1005,4,2026-01-05,PENDING,560.00,USD,2026-01-12T08:00:00Z
+1005,4,2026-01-05,PAID,560.00,USD,2026-01-15T08:00:00Z
+1006,5,2026-01-08,PAID,45.25,USD,2026-01-12T09:00:00Z
+1007,6,2026-01-10,REFUNDED,199.99,USD,2026-01-12T10:00:00Z
+1008,99,2026-01-11,PAID,75.00,USD,2026-01-12T11:00:00Z