-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdrift_analysis.py
More file actions
75 lines (63 loc) · 2.91 KB
/
drift_analysis.py
File metadata and controls
75 lines (63 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pandas as pd
import mlflow
import json
# Updated imports as per your request
from evidently import Report
from evidently.presets import DataDriftPreset, DataSummaryPreset
import os
# --- Dataset locations ---
# Baseline data: what the model was trained on.
REFERENCE_FILE = "data/train.parquet"
# New data that is compared against the baseline to detect drift.
CURRENT_FILE = "dataset/processed/transactions_v2.parquet"
# Label column name (not referenced by the analysis code visible in this file).
TARGET_COLUMN = "Class"

# --- MLflow setup ---
# NOTE: both mlflow calls below execute at import time, so merely importing
# this module points MLflow at the tracking server and selects the experiment.
EXPERIMENT_NAME = "Fraud Detection - Data Drift Analysis"
mlflow.set_tracking_uri("http://127.0.0.1:8000")
mlflow.set_experiment(EXPERIMENT_NAME)
def _drifted_columns_count(drift_results):
    """Return the drifted-column count from an Evidently result dictionary.

    The metric is located by its label rather than by list position, so a
    change in the order of the metrics produced by the presets does not make
    the caller log an unrelated value.

    Args:
        drift_results: Dictionary produced by calling ``.dict()`` on the
            object returned from ``Report.run``.

    Returns:
        The ``count`` field of the drifted-columns metric.

    Raises:
        KeyError, IndexError, TypeError: If no labelled match is found and the
            positional fallback does not have the expected
            ``metrics[0]['value']['count']`` shape.
    """
    metrics = drift_results['metrics']
    for entry in metrics:
        if not isinstance(entry, dict):
            continue
        # NOTE(review): the label is assumed to live under 'metric_id' (or
        # 'metric_name') and to start with 'DriftedColumnsCount' — confirm
        # against the installed Evidently version.
        label = str(entry.get('metric_id') or entry.get('metric_name') or '')
        value = entry.get('value')
        if (
            label.startswith('DriftedColumnsCount')
            and isinstance(value, dict)
            and 'count' in value
        ):
            return value['count']
    # No labelled match: keep the previous behaviour of trusting that the
    # first metric in the list is the drifted-columns count.
    return metrics[0]['value']['count']


def run_drift_analysis():
    """Run a data drift and data summary report and log the results to MLflow.

    Loads the reference and current datasets from ``REFERENCE_FILE`` and
    ``CURRENT_FILE``, runs an Evidently report built from ``DataDriftPreset``
    and ``DataSummaryPreset``, and inside a single MLflow run logs:

    * the HTML rendering of the report as
      ``reports/data_drift_and_summary_report.html``;
    * the full result dictionary as the artifact ``my_eval.json`` (the file is
      also left in the current working directory);
    * the metric ``num_drifted_features``.

    Returns:
        None. If either parquet file is missing, an error is printed and the
        function returns before any MLflow run is started.
    """
    print("--- Starting Data Drift Analysis ---")

    # 1. Load Data
    print("Loading reference and current datasets...")
    try:
        reference_df = pd.read_parquet(REFERENCE_FILE)
        current_df = pd.read_parquet(CURRENT_FILE)
    except FileNotFoundError as e:
        # Best-effort script behaviour: report the missing file and stop
        # instead of propagating the exception.
        print("Error: Data file not found. Please ensure files exist.")
        print(e)
        return

    with mlflow.start_run(run_name="Drift_Analysis_V1_vs_V2"):
        # 2. Perform Data Drift and Summary Analysis
        print("Generating Data Drift and Summary report...")

        # Create the report configuration
        data_drift_report_config = Report(metrics=[
            DataDriftPreset(),
            DataSummaryPreset(),
        ])

        # .run() returns the evaluated result object; the configuration
        # object itself does not hold the results.
        report_result = data_drift_report_config.run(
            reference_data=reference_df,
            current_data=current_df,
        )

        # Log the HTML rendering straight to MLflow without writing a
        # local .html file first.
        html_content = report_result._repr_html_()
        mlflow.log_text(html_content, "reports/data_drift_and_summary_report.html")
        print(" -> Data drift and summary report logged directly to MLflow.")

        # 3. Extract and log key drift metrics to MLflow for easy comparison
        print("Extracting and logging key drift metrics...")
        drift_results = report_result.dict()

        # Keep a local copy of the full report for inspection, then attach
        # the same file to the run.
        with open("my_eval.json", "w", encoding="utf-8") as f:
            json.dump(drift_results, f, indent=4)
        print(" -> Saved full drift report to my_eval.json")
        mlflow.log_artifact("my_eval.json")

        # Look the metric up by label instead of relying on it being the
        # first entry of the metrics list.
        num_drifted_features = _drifted_columns_count(drift_results)
        mlflow.log_metric("num_drifted_features", num_drifted_features)
        print(f" -> Logged 'num_drifted_features': {num_drifted_features}")

    print("\n✅ Drift analysis experiment complete.")
    print("📊 Check MLflow UI for the full HTML reports and metrics.")
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_drift_analysis()