Skip to content

Commit 4d1bf18

Browse files
committed
docs: add documentation, examples, demo data, and infra configs
- Add docs/extending.md: step-by-step guide for adding new services
- Add examples/pipelines/: HR, e-commerce, weather YAML pipeline examples
- Add data/demo/: sample CSV datasets for quick start
- Add templates/new_service/: scaffold with checklist for new services
- Add .env.example with all configuration variables
- Add Makefile with 20+ targets (quickstart, test, lint, benchmark, etc.)
- Update docker-compose.yml: add Streamlit, healthchecks, dependency ordering
- Update prometheus.yml with all scrape targets
- Update README.md with project overview and quick start guide
- Update copilot-instructions.md with comprehensive AI agent context
- Update .gitignore for Python, Docker, IDE artifacts
1 parent 5349a16 commit 4d1bf18

21 files changed

Lines changed: 2914 additions & 304 deletions

.env.example

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# ── PostgreSQL (Airflow metadata) & Grafana admin ──
2+
POSTGRES_USER=airflow
3+
POSTGRES_PASSWORD=change-me-strong-password
4+
POSTGRES_DB=airflow
5+
GF_SECURITY_ADMIN_PASSWORD=change-me-strong-password
6+
7+
# ── Security / Path Resolution ──
8+
ETL_DATA_ROOT=/app/data
9+
ALLOW_PRIVATE_API_URLS=false
10+
11+
# ── HuggingFace Models ──
12+
HF_MODELS_PATH=./hf_models
13+
14+
# ── AI Agent ──
15+
LLM_PROVIDER=openai
16+
OPENAI_API_KEY=
17+
OPENAI_MODEL=gpt-4o-mini

.github/copilot-instructions.md

Lines changed: 515 additions & 200 deletions
Large diffs are not rendered by default.

.gitignore

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,48 @@
1-
airflow/logs/
2-
airflow/dags/__pycache__/
3-
preparator/__pycache__/
4-
5-
services/clean-nan-service/app/__pycache__/
6-
services/delete-columns-service/app/__pycache__/
7-
services/extract-csv-service/app/__pycache__/
8-
services/extract-sql-service/app/__pycache__/
9-
services/extract-excel-service/app/__pycache__/
10-
services/extract-api-service/app/__pycache__/
11-
services/data-quality-service/app/__pycache__/
12-
services/join-datasets-service/app/__pycache__/
13-
services/load-data-service/app/__pycache__/
14-
services/outlier-detection-service/app/__pycache__/
1+
# ── Python ──
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
*.egg-info/
7+
dist/
8+
build/
9+
*.egg
1510

16-
otherfiles/
11+
# ── Environment ──
12+
.env
13+
.venv/
14+
venv/
15+
env/
16+
17+
# ── IDE ──
18+
.vscode/
19+
.idea/
20+
*.swp
21+
*.swo
22+
23+
# ── Testing & Coverage ──
24+
.pytest_cache/
25+
htmlcov/
26+
coverage.xml
27+
.coverage
28+
29+
# ── Linting ──
30+
.ruff_cache/
31+
32+
# ── Model cache ──
33+
hf_models/
1734

35+
# ── Benchmark artifacts ──
36+
benchmark/data/
37+
benchmark/results/
38+
39+
# ── Airflow ──
40+
airflow/logs/
41+
42+
# ── Data files (should not be committed) ──
1843
*.csv
19-
*.xlsx
44+
*.xlsx
45+
!data/demo/*.csv
46+
47+
# ── Legacy/misc ──
48+
otherfiles/

Makefile

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Every target in this file is a command, not a file on disk. Declare them all
# phony so that a stray file or directory with the same name (e.g. `build`,
# `test`, `clean`) can never make Make report "up to date" and skip the recipe.
.PHONY: help quickstart demo-data up down build rebuild logs logs-service ps \
        test test-unit test-integration test-coverage lint lint-fix format \
        benchmark-data benchmark-mono benchmark-micro benchmark-all \
        streamlit clean
2+
3+
help: ## Show this help
4+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \
5+
awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
6+
7+
# ── Quick Start ──
8+
quickstart: ## One-command setup: env → build → start → load demo data
9+
@echo "══════════════════════════════════════════════════════"
10+
@echo " ETL Microservices — Quick Start"
11+
@echo "══════════════════════════════════════════════════════"
12+
@if test -f .env; then echo "✓ .env already exists"; \
13+
else cp .env.example .env && echo "✓ Created .env from .env.example"; fi
14+
docker compose build
15+
docker compose up -d
16+
@echo "⏳ Waiting for services to be healthy..."
17+
@sleep 10
18+
@echo "📦 Loading demo datasets into shared volume..."
19+
@$(MAKE) demo-data
20+
@echo ""
21+
@echo "══════════════════════════════════════════════════════"
22+
@echo " ✓ All services running!"
23+
@echo ""
24+
@echo " Streamlit UI: http://localhost:8501"
25+
@echo " Airflow: http://localhost:8080"
26+
@echo " Prometheus: http://localhost:9090"
27+
@echo " Grafana: http://localhost:3000"
28+
@echo ""
29+
@echo " Demo datasets loaded at /app/data/ in containers."
30+
@echo " Try the 'ecommerce_pipeline' or"
31+
@echo " 'hr_analytics_pipeline' DAG in Airflow!"
32+
@echo "══════════════════════════════════════════════════════"
33+
34+
# Copies the two demo CSVs from data/demo/ into each listed service container.
# The loop aborts at the first container that is not running or the first
# failed copy, so the final "loaded" message is printed only when every copy
# succeeded (a bare `&&` chain inside the loop would let later iterations mask
# an earlier failure).
# NOTE(review): `docker cp` does not create missing parent directories, hence
# the `mkdir -p` via `docker exec`; this assumes the service images provide a
# `mkdir` binary — confirm against the Dockerfiles.
demo-data: ## Copy demo datasets into the shared Docker volume
35+
@mkdir -p data/demo
36+
@echo "Copying demo datasets to running containers..."
37+
@for svc in extract-csv-service clean-nan-service delete-columns-service \
38+
data-quality-service outlier-detection-service load-data-service; do \
39+
if ! docker ps --format '{{.Names}}' | grep -q "^$$svc$$"; then \
40+
echo "$$svc is not running — start services first with 'make up'"; \
41+
exit 1; \
42+
fi; \
43+
docker exec "$$svc" mkdir -p /app/data/hr_demo /app/data/ecommerce_demo && \
44+
docker cp data/demo/hr_sample.csv "$$svc":/app/data/hr_demo/data.csv && \
45+
docker cp data/demo/ecommerce_orders.csv "$$svc":/app/data/ecommerce_demo/data.csv && \
46+
echo "$$svc" || { echo "$$svc: failed to copy demo data" >&2; exit 1; }; \
47+
done
48+
@echo "✓ Demo data loaded: hr_demo/data.csv, ecommerce_demo/data.csv"
49+
50+
# ── Docker ──
51+
up: ## Start all services (detached)
52+
docker compose up -d
53+
54+
down: ## Stop all services
55+
docker compose down
56+
57+
build: ## Build all Docker images
58+
docker compose build
59+
60+
rebuild: ## Rebuild all images (no cache)
61+
docker compose build --no-cache
62+
63+
logs: ## Tail logs for all services
64+
docker compose logs -f
65+
66+
logs-service: ## Tail logs for a specific service (usage: make logs-service SVC=clean-nan-service)
67+
docker compose logs -f $(SVC)
68+
69+
ps: ## Show running containers
70+
docker compose ps
71+
72+
# ── Testing ──
73+
test: ## Run all tests
74+
python -m pytest tests/ -v
75+
76+
test-unit: ## Run unit tests only
77+
python -m pytest tests/unit/ -v
78+
79+
test-integration: ## Run integration tests only
80+
python -m pytest tests/integration/ -v
81+
82+
test-coverage: ## Run tests with coverage report
83+
python -m pytest tests/ --cov=services --cov-report=html --cov-report=term
84+
85+
# ── Code Quality ──
86+
lint: ## Run ruff linter
87+
python -m ruff check .
88+
89+
lint-fix: ## Auto-fix lint issues
90+
python -m ruff check . --fix
91+
92+
format: ## Format code with ruff
93+
python -m ruff format .
94+
95+
# ── Benchmark ──
96+
benchmark-data: ## Generate benchmark datasets (all scales)
97+
python benchmark/generate_hr_dataset.py --all-scales
98+
99+
benchmark-mono: ## Run monolith benchmark
100+
python benchmark/run_benchmark.py --mode monolith --plot
101+
102+
benchmark-micro: ## Run microservices benchmark (services must be running)
103+
python benchmark/run_benchmark.py --mode microservices --plot
104+
105+
benchmark-all: ## Run full comparison benchmark
106+
python benchmark/run_benchmark.py --mode both --plot
107+
108+
# ── Streamlit ──
109+
streamlit: ## Run Streamlit app locally
110+
streamlit run streamlit_app/app.py
111+
112+
# ── Cleanup ──
113+
clean: ## Remove generated files
114+
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
115+
find . -type f -name "*.pyc" -delete 2>/dev/null || true
116+
rm -rf benchmark/results/*.png benchmark/results/*.html benchmark/results/*.json
117+
rm -rf benchmark/data/*.csv
118+
rm -rf htmlcov/ .coverage .pytest_cache/

0 commit comments

Comments
 (0)