Skip to content

Commit 005e49c

Browse files
committed
Repo update
1 parent a589981 commit 005e49c

12 files changed

Lines changed: 1817 additions & 1 deletion

File tree

?

Whitespace-only changes.

MlFlow/Dockerfile

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# ─────────────────────────────────────────────────────────
# OASIS MLflow — Hugging Face Space
# MLflow Tracking Server
# Backend : NeonDB (PostgreSQL)
# Artifacts: HuggingFace Dataset (hf://datasets/Dreipfelt/oasis-mlflow-artifacts)
# ─────────────────────────────────────────────────────────
FROM continuumio/miniconda3

LABEL maintainer="Dreipfelt" \
      project="OASIS Security" \
      component="MLflow Tracking Server" \
      version="2.0"

# System dependencies (no recommended extras, apt lists removed to keep the layer small)
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    unzip \
    nano \
    && rm -rf /var/lib/apt/lists/*

# HuggingFace Spaces: non-root user required
RUN useradd -m -u 1000 user
USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Install Python dependencies first, from requirements.txt alone, so this
# layer is reused as long as the pinned requirements do not change.
COPY --chown=user requirements.txt $HOME/app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application with correct ownership
COPY --chown=user . $HOME/app

# NOTE: the former build-time "RUN --mount=type=secret,id=HF_TOKEN" step was
# removed: it never read the mounted secret and only re-installed
# huggingface_hub, which requirements.txt already pins. Authentication to the
# HuggingFace Hub happens at container start, in CMD below.

# Launch MLflow tracking server
# All env vars are injected as HF Space Secrets — never hardcoded
# - every expansion is quoted so URIs are passed as a single argument
# - PORT falls back to 7860 (default app_port of Docker Spaces) when unset,
#   instead of letting "--port" swallow the next flag
# - "exec" replaces the shell so mlflow receives container signals directly
CMD huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential && \
    exec mlflow server \
      --host 0.0.0.0 \
      --port "${PORT:-7860}" \
      --backend-store-uri "$BACKEND_STORE_URI" \
      --default-artifact-root "$ARTIFACT_STORE_URI" \
      --serve-artifacts \
      --workers 2

MlFlow/README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
---
2+
title: Oasis MLflow
3+
emoji: 📊
4+
colorFrom: blue
5+
colorTo: indigo
6+
sdk: docker
7+
pinned: false
8+
---
9+
10+
# OASIS MLflow — Experiment Tracking Server
11+
12+
MLflow tracking server for the OASIS Security project.
13+
14+
| Component | Technology |
15+
|---|---|
16+
| Metadata backend | NeonDB (PostgreSQL) |
17+
| Artefact store | HuggingFace Dataset |
18+
| UI | MLflow 2.12 |
19+
20+
## Tracked experiments
21+
22+
Gradient Boosting · XGBoost · Random Forest · Ridge · LightGBM · Prophet · Holt-Winters
23+
24+
## Champion model
25+
26+
**Gradient Boosting — R² = 0.979 · RMSE = 48.84 · MAE = 29.95**
27+
28+
*CDSD Certification Project — RNCP35288*

MlFlow/requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
mlflow==2.12.1
2+
psycopg2-binary==2.9.9
3+
huggingface_hub==0.23.0
4+
hf-transfer==0.1.6

MlFlow/train.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
# models/crime_predictor/src/train.py
2+
# Training pipeline with MLflow tracking
3+
# Artefacts pushed to hf://datasets/Dreipfelt/oasis-mlflow-artifacts
4+
5+
import os
6+
import json
7+
import joblib
8+
import numpy as np
9+
import pandas as pd
10+
import mlflow
11+
import mlflow.sklearn
12+
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
13+
from sklearn.linear_model import Ridge
14+
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
15+
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
16+
from pathlib import Path
17+
18+
# ── MLflow remote tracking server (HF Space) ──────────────
# The environment variable wins; the literal below is only the fallback.
_DEFAULT_TRACKING_URI = "https://dreipfelt-oasis-mlflow.hf.space"
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", _DEFAULT_TRACKING_URI)

# Both calls are executed when the module is imported.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("oasis-security-crime-prediction")

# ── Data source ────────────────────────────────────────────
# Fallback CSV location, assembled from its path segments; DATA_URL in the
# environment overrides it.
_DEFAULT_DATA_URL = "".join(
    (
        "https://static.data.gouv.fr/resources/",
        "bases-statistiques-communale-departementale-et-regionale-de-la-delinquance-",
        "enregistree-par-la-police-et-la-gendarmerie-nationales/",
        "20260129-160256/donnee-reg-data.gouv-2025-geographie2025-produit-le2026-01-22.csv",
    )
)
DATA_URL = os.environ.get("DATA_URL", _DEFAULT_DATA_URL)

# ── Models to benchmark ────────────────────────────────────
# Display name → unfitted estimator. The names double as MLflow run names.
_SEED = 42
MODELS = dict(
    GradientBoosting=GradientBoostingRegressor(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.08,
        random_state=_SEED,
    ),
    RandomForest=RandomForestRegressor(
        n_estimators=200,
        max_depth=6,
        random_state=_SEED,
        n_jobs=-1,
    ),
    Ridge=Ridge(alpha=1.0),
)
46+
47+
48+
def load_data(url: str) -> pd.DataFrame:
    """Load the crime statistics CSV and derive the rate per 100,000 inhabitants.

    Args:
        url: Location of a ``;``-separated, UTF-8 encoded CSV (anything
            ``pandas.read_csv`` accepts as a path or URL string).

    Returns:
        The rows whose ``unite_de_compte`` equals ``"nombre"``, with an added
        ``taux_100k`` column (``nombre / insee_pop * 100_000``). Rows where
        that rate is missing or not finite are dropped.
    """
    print(f"📥 Loading data from {url[:60]}...")
    df = pd.read_csv(url, sep=";", encoding="utf-8", low_memory=False)

    # NOTE(review): confirm that "nombre" really is a value taken by the
    # unite_de_compte column in the published file; if it is not, this filter
    # silently returns an empty frame.
    df = df[df["unite_de_compte"] == "nombre"].copy()

    rate = df["nombre"] / df["insee_pop"] * 100_000
    # A zero population produces +/-inf, which dropna() does not remove;
    # map it to NaN so such rows cannot reach training as infinite targets.
    df["taux_100k"] = rate.replace([np.inf, -np.inf], np.nan)
    return df.dropna(subset=["taux_100k"])
55+
56+
57+
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature matrix from the cleaned crime data.

    Args:
        df: Frame containing at least the columns ``annee``, ``indicateur``,
            ``Code_region`` and ``taux_100k``. It is not modified.

    Returns:
        A frame with the nine feature columns followed by ``taux_100k``,
        ordered chronologically by ``annee`` (original index labels are kept)
        and with rows containing missing values removed.
    """
    # Lags and rolling windows are positional, so rows must be in time order.
    # A stable sort keeps the incoming order of rows that share a year.
    df = df.sort_values("annee", kind="stable").copy()
    year = df["annee"]

    # Cyclic temporal encoding (period of 10 years)
    df["year_sin"] = np.sin(2 * np.pi * year / 10)
    df["year_cos"] = np.cos(2 * np.pi * year / 10)

    # Linear trend scaled to [0, 1]; a single-year dataset has a zero span,
    # which would otherwise turn the whole column into NaN (0 / 0).
    first_year = year.min()
    span = year.max() - first_year
    df["year_trend"] = (year - first_year) / span if span else 0.0

    # Lag features (per indicator × region group); missing history is
    # filled with the group's mean rate.
    grp = df.groupby(["indicateur", "Code_region"])["taux_100k"]
    group_mean = grp.transform("mean")
    df["lag1"] = grp.shift(1).fillna(group_mean)
    df["lag2"] = grp.shift(2).fillna(group_mean)

    # Mean of up to the three PREVIOUS observations of the group. The value
    # is shifted first so the current row's target is never part of its own
    # feature, and transform() returns a result aligned on the frame's index
    # (the former rolling().reset_index(0) left a MultiIndex behind).
    df["roll_mean_3"] = grp.transform(
        lambda s: s.shift(1).rolling(3, min_periods=1).mean()
    ).fillna(group_mean)

    # Regional aggregate
    df["region_mean"] = df.groupby("Code_region")["taux_100k"].transform("mean")

    # Categorical encoding (integer codes)
    df["ind_code"] = pd.Categorical(df["indicateur"]).codes
    df["reg_code"] = pd.Categorical(df["Code_region"]).codes

    feature_cols = [
        "year_sin", "year_cos", "year_trend",
        "lag1", "lag2", "roll_mean_3",
        "region_mean", "ind_code", "reg_code",
    ]
    return df[feature_cols + ["taux_100k"]].dropna()
90+
91+
92+
def evaluate(model, X_test, y_test) -> dict:
    """Score a fitted model on the held-out set.

    Args:
        model: Object exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values compared against the predictions.

    Returns:
        Dict with the keys ``r2_test``, ``rmse_test`` and ``mae_test``,
        each rounded to four decimals.
    """
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    scores = {}
    scores["r2_test"] = round(r2, 4)
    scores["rmse_test"] = round(rmse, 4)
    scores["mae_test"] = round(mae, 4)
    return scores
100+
101+
102+
def train_and_log(model_name: str, model, X_train, X_test, y_train, y_test):
    """Fit one model inside an MLflow run and record its results.

    Within a run named after ``model_name`` this cross-validates the model
    on the training data (3-fold ``TimeSeriesSplit``, R² scoring), fits it
    on the full training set, evaluates it on the test set, then logs the
    model name as a parameter, the metrics, and the fitted model, which is
    registered as ``crime_predictor_<model_name lowercased>``.

    Args:
        model_name: Run name and suffix of the registered model name.
        model: Estimator to cross-validate and fit (fitted in place).
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        Dict of metrics: ``r2_test``, ``rmse_test``, ``mae_test``,
        ``r2_train``, ``cv_r2_mean`` and ``cv_r2_std``.
    """
    print(f"\n🔧 Training {model_name}...")

    registry_name = f"crime_predictor_{model_name.lower()}"

    with mlflow.start_run(run_name=model_name):
        # Cross-validation on the training portion only
        splitter = TimeSeriesSplit(n_splits=3)
        fold_r2 = cross_val_score(
            model,
            X_train,
            y_train,
            cv=splitter,
            scoring="r2",
            n_jobs=-1,
        )

        # Final fit on the whole training set, then its in-sample R²
        model.fit(X_train, y_train)
        train_r2 = model.score(X_train, y_train)

        # Test metrics first, then training / cross-validation scores
        metrics = {
            **evaluate(model, X_test, y_test),
            "r2_train": round(train_r2, 4),
            "cv_r2_mean": round(fold_r2.mean(), 4),
            "cv_r2_std": round(fold_r2.std(), 4),
        }

        # Log to MLflow
        mlflow.log_param("model", model_name)
        mlflow.log_metrics(metrics)

        # Log the fitted model as a run artefact and register it
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            registered_model_name=registry_name,
        )

        summary = (
            f" R² test={metrics['r2_test']} · "
            f"RMSE={metrics['rmse_test']} · "
            f"CV R²={metrics['cv_r2_mean']}±{metrics['cv_r2_std']}"
        )
        print(summary)

    return metrics
140+
141+
142+
def main():
    """Run the full benchmark: load data, train every model, keep the best.

    Steps: load the CSV from ``DATA_URL``, build features, split the rows
    80/20 in chronological order, train and log each entry of ``MODELS``,
    pick the model with the highest test R², refit it on all rows and write
    ``crime_predictor.pkl`` and ``metrics.json`` to the ``artifacts``
    directory located one level above this file's directory.

    Raises:
        ValueError: If too few rows remain after feature engineering to
            form a non-empty training set.
    """
    # ── Load & prepare data ────────────────────────────────
    df = load_data(DATA_URL)
    # The split below is positional, so it is only "temporal" if the rows
    # are in chronological order; enforce that instead of relying on the
    # order of the CSV. A stable sort keeps the file order within a year.
    df = df.sort_values("annee", kind="stable")
    df_features = engineer_features(df)

    FEATURE_COLS = [
        "year_sin", "year_cos", "year_trend",
        "lag1", "lag2", "roll_mean_3",
        "region_mean", "ind_code", "reg_code",
    ]
    X = df_features[FEATURE_COLS]
    y = df_features["taux_100k"]

    # Temporal train/test split — last 20% as test
    split = int(len(X) * 0.8)
    if split == 0:
        raise ValueError(
            f"Not enough rows to train on after feature engineering: {len(X)}"
        )
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    print(f"📊 Train: {len(X_train)} rows · Test: {len(X_test)} rows")

    # ── Benchmark all models ───────────────────────────────
    all_metrics = {}
    for name, model in MODELS.items():
        all_metrics[name] = train_and_log(
            name, model, X_train, X_test, y_train, y_test
        )

    # ── Select champion ────────────────────────────────────
    best_name = max(all_metrics, key=lambda k: all_metrics[k]["r2_test"])
    best_metrics = all_metrics[best_name]
    print(f"\n🏆 Champion: {best_name} (R²={best_metrics['r2_test']})")

    # ── Save champion locally ──────────────────────────────
    champion = MODELS[best_name]
    champion.fit(X, y)  # Retrain on full dataset

    artifacts_dir = Path(__file__).parent.parent / "artifacts"
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    joblib.dump(champion, artifacts_dir / "crime_predictor.pkl")

    metrics_out = {"best_model": best_name, **best_metrics, "all_models": all_metrics}
    # Explicit encoding so the file does not depend on the platform default.
    with open(artifacts_dir / "metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics_out, f, indent=2)

    print(f"✅ Artefacts saved to {artifacts_dir}")
    print(f"📊 metrics.json: R²={best_metrics['r2_test']}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)