Skip to content

Commit f155bf9

Browse files
wfr-data-acquisitionhaoruizhou
authored andcommitted
vector-agent: AI code generation, RAG observability, and data downloader UI (#63)
* Update Data Downloader * Refactor AI Code Generation to use MiniMax SDK and update dependencies * Increase max_tokens for ChatAnthropic and enhance output file collection in sandbox server * Succesful cases RAG * Fix RAG * RAG vectors * !aistats report * Dashboard color improvement
1 parent ab0fba8 commit f155bf9

18 files changed

Lines changed: 1525 additions & 358 deletions

rag_vectors.png

157 KB
Loading

rag_viz.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
"""
2+
RAG Vector Visualizer — PCA projection of ChromaDB collections.
3+
Run: uv run --with scikit-learn --with matplotlib python rag_viz.py
4+
"""
5+
import json
6+
import subprocess
7+
import numpy as np
8+
import matplotlib.pyplot as plt
9+
from sklearn.decomposition import PCA
10+
11+
COLORS = {
12+
"sensors": "#6c7ee1",
13+
"runs": "#e1735e",
14+
"verified_solutions": "#4ecb71",
15+
}
16+
LABEL_NAMES = {
17+
"sensors": "Sensors",
18+
"runs": "Runs",
19+
"verified_solutions": "Verified Solutions",
20+
}
21+
22+
23+
def fetch_collection(name: str) -> tuple[np.ndarray, list]:
24+
"""Return (embeddings, labels) from a ChromaDB collection inside code-generator container."""
25+
script = f"""
26+
import chromadb, json
27+
_chroma = chromadb.PersistentClient(path='/app/chroma_db')
28+
col = _chroma.get_collection('{name}')
29+
all_data = col.get(include=['documents', 'metadatas', 'embeddings'])
30+
embs_raw = all_data.get('embeddings', [])
31+
embs = [list(e) for e in embs_raw]
32+
ids_ = all_data.get('ids', [])
33+
docs = all_data.get('documents', [])
34+
metas = all_data.get('metadatas', [])
35+
labels = []
36+
for id_, meta, doc in zip(ids_, metas, docs):
37+
if '{name}' == 'sensors':
38+
labels.append(meta.get('name', id_) if meta else id_)
39+
elif '{name}' == 'runs':
40+
labels.append(f"{{meta.get('season','')}}:{{meta.get('key', id_[:6])}}" if meta else id_[:12])
41+
else:
42+
labels.append((doc or id_)[:80].replace('\\n', ' '))
43+
result = json.dumps({{'embs': embs, 'labels': labels}})
44+
print('START' + result + 'END')
45+
"""
46+
result = subprocess.run(
47+
["docker", "exec", "code-generator", "python", "-c", script],
48+
capture_output=True, text=True, check=True,
49+
)
50+
raw = result.stdout.strip()
51+
start = raw.index("START") + 5
52+
end = raw.rindex("END")
53+
data = json.loads(raw[start:end])
54+
return np.array(data["embs"], dtype=np.float32), data["labels"]
55+
56+
57+
def main():
58+
all_X, all_labels, all_names = [], [], []
59+
60+
for name in ["sensors", "runs", "verified_solutions"]:
61+
X, labels = fetch_collection(name)
62+
n = len(X)
63+
print(f"{name}: {n} vectors, dim={X.shape[1]}")
64+
all_X.append(X)
65+
all_labels.extend(labels)
66+
all_names.extend([name] * n)
67+
68+
X_all = np.vstack(all_X)
69+
print(f"Combined: {len(X_all)} vectors, dim={X_all.shape[1]}")
70+
71+
# PCA on combined space
72+
pca = PCA(n_components=2)
73+
X2_all = pca.fit_transform(X_all)
74+
75+
# Plot
76+
fig, ax = plt.subplots(figsize=(14, 10))
77+
cumulative = 0
78+
79+
for name in ["sensors", "runs", "verified_solutions"]:
80+
n = sum(1 for x in all_names if x == name)
81+
if n == 0:
82+
continue
83+
start = cumulative
84+
cumulative += n
85+
X2 = X2_all[start:cumulative]
86+
labels_slice = all_labels[start:cumulative]
87+
color = COLORS[name]
88+
89+
# Subsample sensors for visual clarity
90+
if name == "sensors" and n > 200:
91+
np.random.seed(42)
92+
idx = np.random.choice(n, 200, replace=False)
93+
X2 = X2[idx]
94+
labels_slice = [labels_slice[i] for i in idx]
95+
count_label = f"{LABEL_NAMES[name]} (200/{n})"
96+
else:
97+
count_label = f"{LABEL_NAMES[name]} ({n})"
98+
99+
ax.scatter(X2[:, 0], X2[:, 1], c=color, label=count_label,
100+
alpha=0.7, s=60,
101+
edgecolors="white" if name != "sensors" else "none",
102+
linewidths=0.5)
103+
104+
# Annotate runs and verified_solutions
105+
if name in ("runs", "verified_solutions"):
106+
for (x, y), lbl in zip(X2, labels_slice):
107+
short = lbl[:32].replace("\n", " ")
108+
ax.annotate(
109+
short, (x, y), fontsize=8, alpha=0.95,
110+
xytext=(6, 6), textcoords="offset points",
111+
bbox=dict(boxstyle="round,pad=0.3", facecolor="white",
112+
alpha=0.75, edgecolor="none"),
113+
)
114+
115+
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)", fontsize=12)
116+
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)", fontsize=12)
117+
ax.set_title("RAG Vector Space — ChromaDB Collections (PCA projection)", fontsize=14, fontweight="bold")
118+
ax.legend(fontsize=11, loc="upper right")
119+
ax.grid(True, alpha=0.25)
120+
plt.tight_layout()
121+
out = "rag_vectors.png"
122+
plt.savefig(out, dpi=150)
123+
print(f"Saved {out}")
124+
125+
126+
if __name__ == "__main__":
127+
main()

server/installer/.env.example

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -69,25 +69,19 @@ DEBUG=0
6969
# Data Downloader configuration
7070
# ------------------------------------------------------------
7171
POSTGRES_DSN=postgresql://wfr:wfr_password@timescaledb:5432/wfr
72-
# Comma-separated season list for data-downloader season switcher.
73-
# Format: NAME:YEAR[:COLOR_RGB], newest first.
74-
# Add previous seasons here to keep them accessible in the UI.
75-
# Example with two seasons: SEASONS=WFR25:2025,WFR26:2026
76-
SEASONS=WFR26:2026
72+
# Optional: explicitly list seasons (NAME:YEAR[:COLOR], newest first).
73+
# If omitted, the scanner auto-discovers all season tables from TimescaleDB.
74+
# Example: SEASONS=WFR25:2025,WFR26:2026
75+
# SEASONS=
7776

7877
DATA_DIR=/app/data
7978

80-
SCANNER_YEAR=2025
8179
SCANNER_BIN=hour
8280
SCANNER_INCLUDE_COUNTS=true
8381
SCANNER_INITIAL_CHUNK_DAYS=31
8482

8583
SENSOR_WINDOW_DAYS=7
86-
# How far back to look for sensor data when scanning
8784
SENSOR_LOOKBACK_DAYS=365
88-
# If no sensor data is found in the lookback period, use this fallback range
89-
SENSOR_FALLBACK_START=2025-06-10T00:00:00
90-
SENSOR_FALLBACK_END=2025-07-10T00:00:00
9185

9286
SCAN_INTERVAL_SECONDS=3600
9387

@@ -97,13 +91,11 @@ ALLOWED_ORIGINS=http://localhost:3000,http://localhost:5173
9791
# End Data Downloader configuration
9892

9993
# ------------------------------------------------------------
100-
# AI Code Generation (Sandbox) configuration
94+
# AI Code Generation — MiniMax (Anthropic-compatible SDK)
10195
# ------------------------------------------------------------
102-
# Cohere API key for AI-powered code generation
103-
COHERE_API_KEY=your-cohere-api-key-here
104-
105-
# Cohere model to use (default: command-r-plus)
106-
COHERE_MODEL=command-r-plus
96+
ANTHROPIC_API_KEY=your-minimax-api-key-here
97+
ANTHROPIC_BASE_URL=https://api.minimaxi.com/anthropic
98+
ANTHROPIC_MODEL=MiniMax-M2.7
10799

108100
# Maximum number of retries when generated code fails (default: 2)
109101
MAX_RETRIES=2

server/installer/data-downloader/backend/app.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,12 @@ class CanFramesBatchPayload(BaseModel):
5858

5959
@app.get("/api/health")
6060
def healthcheck() -> dict:
61-
return {"status": "ok"}
61+
ok, detail = service.check_db_connectivity()
62+
return {
63+
"status": "ok" if ok else "degraded",
64+
"timescaledb": ok,
65+
"timescaledb_detail": detail,
66+
}
6267

6368

6469
def _docker_container_running(container_name: str) -> bool:
@@ -129,13 +134,19 @@ def trigger_scan(background_tasks: BackgroundTasks, season: str | None = None) -
129134
@app.post("/api/query")
130135
def query_signal(payload: DataQueryPayload, season: str | None = None) -> dict:
131136
limit = None if payload.no_limit else (payload.limit or 2000)
132-
return service.query_signal_series(
133-
payload.signal,
134-
payload.start,
135-
payload.end,
136-
limit,
137-
season=season
138-
)
137+
try:
138+
return service.query_signal_series(
139+
payload.signal,
140+
payload.start,
141+
payload.end,
142+
limit,
143+
season=season,
144+
)
145+
except ValueError as exc:
146+
raise HTTPException(status_code=400, detail=str(exc))
147+
except Exception as exc:
148+
logger.exception("Query failed for signal %s", payload.signal)
149+
raise HTTPException(status_code=503, detail=f"Database query failed: {exc}")
139150

140151

141152
# ── CAN frames batch ingest (for flight-recorder sync) ─────────────────────────
@@ -185,7 +196,7 @@ def ingest_can_frames(payload: CanFramesBatchPayload) -> dict:
185196
if not payload.frames:
186197
return {"ingested": 0}
187198

188-
table = f"{payload.season.lower()}_base"
199+
table = payload.season.lower()
189200

190201
# Deduplicate within batch: last frame wins for (time, message_name)
191202
seen: dict = {}
@@ -238,13 +249,9 @@ def ingest_can_frames(payload: CanFramesBatchPayload) -> dict:
238249
@app.get("/", response_class=HTMLResponse)
239250
def index():
240251
"""Simple status page for debugging."""
241-
try:
242-
service._log_db_connectivity()
243-
timescale_status = "Connected"
244-
timescale_color = "green"
245-
except Exception as e:
246-
timescale_status = f"Error: {e}"
247-
timescale_color = "red"
252+
db_ok, db_detail = service.check_db_connectivity()
253+
timescale_status = db_detail if db_ok else f"Error: {db_detail}"
254+
timescale_color = "green" if db_ok else "red"
248255

249256
# Default to first season for overview
250257
runs = service.get_runs()

server/installer/data-downloader/backend/config.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,14 @@ class SeasonConfig(BaseModel):
2525

2626

2727
def _parse_seasons(raw: str | None) -> List[SeasonConfig]:
28-
"""Parse SEASONS env var: \"WFR25:2025:colour,WFR26:2026\"."""
28+
"""
29+
Parse SEASONS env var: \"WFR25:2025:colour,WFR26:2026\".
30+
31+
Returns an empty list when SEASONS is not set — the service will then
32+
auto-discover season tables directly from TimescaleDB.
33+
"""
2934
if not raw:
30-
return [SeasonConfig(name="WFR25", year=2025, table="wfr25")]
35+
return []
3136

3237
seasons = []
3338
for part in raw.split(","):
@@ -50,9 +55,6 @@ def _parse_seasons(raw: str | None) -> List[SeasonConfig]:
5055
except ValueError:
5156
continue
5257

53-
if not seasons:
54-
return [SeasonConfig(name="WFR25", year=2025, table="wfr25")]
55-
5658
seasons.sort(key=lambda s: s.year, reverse=True)
5759
return seasons
5860

server/installer/data-downloader/backend/periodic_worker.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,15 @@ async def run_worker():
2424

2525
while True:
2626
try:
27-
active_season = settings.seasons[0] # sorted descending by year; first = active
28-
logging.info(f"Running scheduled scan for active season: {active_season.name}")
29-
service.run_full_scan(source="periodic", season_names=[active_season.name])
27+
# scan all known seasons (auto-discovered or explicitly configured)
28+
seasons = service._seasons
29+
if seasons:
30+
active_season = seasons[0] # newest first
31+
logging.info(f"Running scheduled scan for active season: {active_season.name}")
32+
service.run_full_scan(source="periodic", season_names=[active_season.name])
33+
else:
34+
logging.info("No seasons available yet, running full discovery scan")
35+
service.run_full_scan(source="periodic")
3036
logging.info("Finished scheduled scan.")
3137

3238
if daily_time:

0 commit comments

Comments
 (0)