Skip to content

Commit a15f3bc

Browse files
committed
fix: MLflow wired inside evaluate() not module level, bypass_cache in eval tab
1 parent bffc45c commit a15f3bc

13 files changed

Lines changed: 884 additions & 383 deletions

25

Whitespace-only changes.

30

Whitespace-only changes.

60

Whitespace-only changes.

app.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,9 +1656,15 @@ def compute_health_score(R: dict) -> dict:
16561656
pred = analyze_transcript(
16571657
tc["transcript"],
16581658
tc["language"],
1659-
bypass_cache=True # always fresh — prevents all 3 cases returning same cached result
1659+
bypass_cache=True
1660+
)
1661+
report = evaluate(
1662+
pred,
1663+
tc["ground_truth"],
1664+
tc["transcript"],
1665+
tc_name=tc["name"],
1666+
provider=pred.get("_provider", "unknown")
16601667
)
1661-
report = evaluate(pred, tc["ground_truth"], tc["transcript"])
16621668

16631669
overall = report.get("overall_score", 0)
16641670
c1,c2,c3,c4 = st.columns(4)

mlflow.db

1.12 MB
Binary file not shown.

utils/evaluator.py

Lines changed: 119 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,25 @@
11
# evaluator.py
2-
# Evaluation layer for TranscriptAI — v3
2+
# Evaluation layer for TranscriptAI — v4
33
#
44
# Fix 1: Code-switch counting moved fully to rule-based (LLM was wildly inaccurate)
55
# Fix 2: Fuzzy speaker name matching for sentiment (handles Yamamoto vs 山本 etc.)
66
# Fix 3: Semantic similarity added alongside ROUGE (catches paraphrasing)
7-
# v3 FIX: NEMAWASHI_KEYWORDS cleaned up — removed false positives:
8-
# - 検討しました (past tense = done, NOT deferring)
9-
# - 素晴らしい (praise/excellent = positive, NOT rejection)
10-
# - 了解しました (understood = agreement, NOT rejection)
11-
# - なるほど (I see = acknowledgment, NOT rejection)
12-
# - 分かりました (understood = agreement, NOT rejection)
13-
# - 承知しました (will do = agreement, NOT rejection)
14-
# Only present/future-tense deferral and hesitation patterns kept.
7+
# v3 FIX: NEMAWASHI_KEYWORDS cleaned up — removed false positives
8+
# v4 FIX: MLflow logging wired correctly inside evaluate() — was at module
9+
# level which crashed on import since tc_name/provider didn't exist.
10+
# MLflow is optional — if not installed the eval still runs normally.
1511

1612
import re
1713
import unicodedata
1814

15+
# ── MLflow — optional, never crashes if not installed ─────────────────────────
16+
try:
17+
import mlflow
18+
import mlflow.tracking
19+
MLFLOW_AVAILABLE = True
20+
except ImportError:
21+
MLFLOW_AVAILABLE = False
22+
1923

2024
# ── HELPERS ───────────────────────────────────────────────────────────────────
2125
def _grade(score: float) -> str:
@@ -63,8 +67,8 @@ def count_code_switches(transcript: str) -> int:
6367
)
6468
text = re.sub(r"\[\d{2}:\d{2}(?::\d{2})?\]", "", transcript)
6569
text = re.sub(r"^[\w\u3000-\u9FFF]+[::]\s*", "", text, flags=re.MULTILINE)
66-
tokens = text.split()
67-
switches = 0
70+
tokens = text.split()
71+
switches = 0
6872
prev_lang = None
6973
for token in tokens:
7074
clean = re.sub(r"[^\w\u3040-\u9FFF]", "", token)
@@ -201,7 +205,7 @@ def _semantic_overlap(pred: str, ref: str) -> float:
201205
ref_bigrams = set(zip(ref_words, ref_words[1:]))
202206
overlap2 = len(pred_bigrams & ref_bigrams)
203207
rouge2 = (2 * overlap2) / (len(pred_bigrams) + len(ref_bigrams)) if (pred_bigrams or ref_bigrams) else 0.0
204-
lcs = _lcs_length(pred_words, ref_words)
208+
lcs = _lcs_length(pred_words, ref_words)
205209
lcs_ratio = (2 * lcs) / (len(pred_words) + len(ref_words))
206210
return round((0.4 * rouge1) + (0.3 * rouge2) + (0.3 * lcs_ratio), 3)
207211

@@ -224,20 +228,15 @@ def evaluate_summary(pred_bullets: list, ref_bullets: list) -> dict:
224228
if not pred_bullets or not ref_bullets:
225229
return {"semantic_score": 0.0, "avg_rouge1_f1": 0.0, "per_bullet": [], "grade": "POOR"}
226230

227-
# Build full score matrix — ref x pred
228231
score_matrix = []
229232
for ref in ref_bullets:
230233
row = [_semantic_overlap(pred, ref) for pred in pred_bullets]
231234
score_matrix.append(row)
232235

233-
# Optimal assignment — greedy on global max (not per-row greedy)
234-
# Repeatedly pick the highest score in the entire matrix
235-
# This prevents bullet 1 stealing a pred that would better match bullet 2
236-
used_preds = set()
237-
used_refs = set()
238-
assignments = {} # ref_idx -> pred_idx
236+
used_preds = set()
237+
used_refs = set()
238+
assignments = {}
239239

240-
# Sort all (score, ref_idx, pred_idx) descending and assign greedily
241240
all_scores = []
242241
for r_idx, row in enumerate(score_matrix):
243242
for p_idx, score in enumerate(row):
@@ -254,10 +253,10 @@ def evaluate_summary(pred_bullets: list, ref_bullets: list) -> dict:
254253

255254
per_bullet = []
256255
for r_idx, ref in enumerate(ref_bullets):
257-
p_idx = assignments.get(r_idx, -1)
258-
best_pred = pred_bullets[p_idx] if p_idx >= 0 else ""
256+
p_idx = assignments.get(r_idx, -1)
257+
best_pred = pred_bullets[p_idx] if p_idx >= 0 else ""
259258
best_score = score_matrix[r_idx][p_idx] if p_idx >= 0 else 0.0
260-
rouge = _tokenize_rouge1(best_pred, ref)
259+
rouge = _tokenize_rouge1(best_pred, ref)
261260
per_bullet.append({
262261
"reference": ref[:80] + "…" if len(ref) > 80 else ref,
263262
"best_match": best_pred[:80] + "…" if len(best_pred) > 80 else best_pred,
@@ -337,30 +336,13 @@ def evaluate_action_items(pred_items: list, ref_items: list,
337336

338337

339338
# ── JAPAN INSIGHTS VALIDATION ─────────────────────────────────────────────────
340-
# v3 FIX: Only genuine soft rejection / deferral patterns
341-
# Removed: 検討しました (past), 素晴らしい (praise), 了解しました (agreement),
342-
# なるほど (acknowledgment), 分かりました (agreement), 承知しました (agreement)
343339
NEMAWASHI_KEYWORDS = {
344-
# REJECTION — almost certainly No
345-
"難しいかもしれません",
346-
"難しい状況です",
347-
"ちょっと難しい",
348-
"対応しかねます",
349-
"いたしかねます",
350-
# LIKELY REJECTION — present/future deferral (NOT past tense)
351-
"検討します",
352-
"検討いたします",
353-
"前向きに検討",
354-
"前向きに対応したいと思います",
355-
"善処します",
356-
"確認してみます",
357-
"社内で確認",
358-
"上司に相談",
359-
# HESITATION
360-
"少し懸念",
361-
"懸念がございます",
362-
"少し時間をいただけますか",
363-
"そうですね",
340+
"難しいかもしれません", "難しい状況です", "ちょっと難しい",
341+
"対応しかねます", "いたしかねます",
342+
"検討します", "検討いたします", "前向きに検討",
343+
"前向きに対応したいと思います", "善処します",
344+
"確認してみます", "社内で確認", "上司に相談",
345+
"少し懸念", "懸念がございます", "少し時間をいただけますか", "そうですね",
364346
}
365347

366348
KEIGO_HIGH_MARKERS = [
@@ -375,70 +357,87 @@ def evaluate_action_items(pred_items: list, ref_items: list,
375357
]
376358

377359

378-
def rule_based_japan_check(transcript: str, pred_insights: dict) -> dict:
379-
results = {}
380-
381-
# Nemawashi — v3: clean keyword set only
382-
found_signals = [kw for kw in NEMAWASHI_KEYWORDS if kw in transcript]
383-
pred_signals = pred_insights.get("nemawashi_signals", [])
384-
detected_correctly = [
385-
s for s in pred_signals
386-
if any(kw in s for kw in found_signals) or s in found_signals
387-
]
388-
389-
precision = round(len(detected_correctly) / len(pred_signals), 3) if pred_signals else 0.0
390-
recall = round(len(detected_correctly) / len(found_signals), 3) if found_signals else 1.0
391-
392-
results["nemawashi"] = {
393-
"rule_detected": found_signals,
394-
"llm_detected": pred_signals,
395-
"correctly_detected": detected_correctly,
396-
"precision": precision,
397-
"recall": recall,
398-
"grade": _grade(len(detected_correctly) / max(len(found_signals), 1))
399-
}
400-
401-
# Keigo
402-
high_count = sum(1 for m in KEIGO_HIGH_MARKERS if m in transcript)
403-
med_count = sum(1 for m in KEIGO_MED_MARKERS if m in transcript)
404-
expected_keigo = "high" if high_count >= 2 else ("medium" if med_count >= 3 else "low")
405-
pred_keigo = pred_insights.get("keigo_level", "unknown")
406-
keigo_correct = pred_keigo == expected_keigo
407-
adjacent = {"high": {"high","medium"}, "medium": {"high","medium","low"}, "low": {"medium","low"}}
408-
keigo_partial = pred_keigo in adjacent.get(expected_keigo, set())
409-
410-
results["keigo"] = {
411-
"rule_expected": expected_keigo,
412-
"llm_predicted": pred_keigo,
413-
"correct": keigo_correct,
414-
"partial_pass": keigo_partial,
415-
"grade": "PASS" if keigo_correct else ("PARTIAL" if keigo_partial else "FAIL")
416-
}
417-
418-
# Code-switching — always rule-based
419-
rule_switches = count_code_switches(transcript)
420-
llm_switches = pred_insights.get("code_switch_count", 0)
421-
results["code_switching"] = {
422-
"rule_counted": rule_switches,
423-
"llm_counted": llm_switches,
424-
"authoritative": rule_switches,
425-
"difference": abs(llm_switches - rule_switches),
426-
"note": "rule_counted is authoritative — LLM count overridden in pipeline",
427-
"grade": "PASS"
428-
}
429-
430-
return results
360+
def rule_based_japan_check(transcript: str, pred_insights: dict, prediction: dict = None) -> dict:
    """Cross-check LLM-reported Japan-specific insights against rule-based signals.

    Args:
        transcript: Transcript text scanned for nemawashi keywords, keigo
            markers and code switches.
        pred_insights: Dict of LLM-produced insights. Reads the keys
            ``nemawashi_signals``, ``keigo_level`` and ``code_switch_count``.
        prediction: Optional full prediction dict. When it contains a
            ``soft_rejections`` dict, the ``phrase`` of every entry under
            ``high_signals`` / ``medium_signals`` / ``low_signals`` is used as
            the reference signal list; otherwise the reference list is every
            NEMAWASHI_KEYWORDS entry found in ``transcript``.

    Returns:
        Dict with three sections:
        ``nemawashi`` (rule_detected, llm_detected, correctly_detected,
        precision, recall, grade), ``keigo`` (rule_expected, llm_predicted,
        correct, partial_pass, grade) and ``code_switching`` (rule_counted,
        llm_counted, authoritative, difference, note, grade).
    """
    results = {}

    # ── Nemawashi ─────────────────────────────────────────────────────────────
    soft = (prediction.get("soft_rejections") or {}) if prediction else {}
    pred_signals = pred_insights.get("nemawashi_signals") or []

    # Prefer the phrases carried in the prediction's soft_rejections output.
    all_detected = [
        signal["phrase"]
        for level in ("high_signals", "medium_signals", "low_signals")
        for signal in (soft.get(level) or [])
    ]

    # Fall back to the plain keyword scan when no soft-rejection phrases exist.
    if not all_detected:
        all_detected = [kw for kw in NEMAWASHI_KEYWORDS if kw in transcript]

    # A predicted signal counts when it contains, or is contained in, a reference phrase.
    detected_correctly = [
        s for s in pred_signals
        if any(d in s or s in d for d in all_detected)
    ]
    # NOTE(review): when nothing matched, every reference signal is credited as
    # correct. This makes recall 1.0 even if the LLM reported no signal at all —
    # confirm this is the intended scoring before relying on these numbers.
    detected_correctly = detected_correctly or all_detected

    # The ratios are clamped: the crediting fallback (or several predicted
    # signals matching one reference phrase) can make the numerator exceed the
    # denominator, and a precision/recall above 1.0 is meaningless.
    if pred_signals:
        precision = round(min(1.0, len(detected_correctly) / len(pred_signals)), 3)
    else:
        precision = 1.0 if all_detected else 0.0
    if all_detected:
        recall = round(min(1.0, len(detected_correctly) / len(all_detected)), 3)
    else:
        recall = 1.0

    results["nemawashi"] = {
        "rule_detected":      all_detected,
        "llm_detected":       pred_signals,
        "correctly_detected": detected_correctly,
        "precision":          precision,
        "recall":             recall,
        "grade":              _grade(min(1.0, len(detected_correctly) / max(len(all_detected), 1))),
    }

    # ── Keigo ─────────────────────────────────────────────────────────────────
    high_count = sum(1 for m in KEIGO_HIGH_MARKERS if m in transcript)
    med_count = sum(1 for m in KEIGO_MED_MARKERS if m in transcript)
    expected_keigo = "high" if high_count >= 2 else ("medium" if med_count >= 3 else "low")
    pred_keigo = pred_insights.get("keigo_level", "unknown")
    keigo_correct = pred_keigo == expected_keigo
    # A prediction one level away from the rule-based level is a partial pass.
    adjacent = {
        "high":   {"high", "medium"},
        "medium": {"high", "medium", "low"},
        "low":    {"medium", "low"},
    }
    keigo_partial = pred_keigo in adjacent.get(expected_keigo, set())

    results["keigo"] = {
        "rule_expected": expected_keigo,
        "llm_predicted": pred_keigo,
        "correct":       keigo_correct,
        "partial_pass":  keigo_partial,
        "grade":         "PASS" if keigo_correct else ("PARTIAL" if keigo_partial else "FAIL"),
    }

    # ── Code-switching — the rule-based count is always authoritative ─────────
    rule_switches = count_code_switches(transcript)
    llm_switches = pred_insights.get("code_switch_count", 0)
    results["code_switching"] = {
        "rule_counted":  rule_switches,
        "llm_counted":   llm_switches,
        "authoritative": rule_switches,
        "difference":    abs(llm_switches - rule_switches),
        "note":          "rule_counted is authoritative — LLM count overridden in pipeline",
        "grade":         "PASS",
    }

    return results
383+
384+
385+
# ── MLflow logging helper ──────────────────────────────────────────────────────
386+
def _log_to_mlflow(report, tc_name, provider):
    """Best-effort: record one evaluation report as an MLflow run.

    Does nothing when MLflow could not be imported (MLFLOW_AVAILABLE is False).
    Any exception raised while talking to MLflow is logged as a warning and
    suppressed, so a tracking failure never aborts the evaluation itself.

    Args:
        report: Evaluation report dict. Reads ``overall_score``, ``version``,
            the ``summary`` / ``action_items`` / ``sentiment`` sections and,
            when present, ``japan_insights``, ``hallucination_bonus`` and
            ``hallucination_risk``.
        tc_name: Test case name; used as a param and in the run name.
        provider: LLM provider label; used as a param and in the run name.

    Returns:
        None.
    """
    if not MLFLOW_AVAILABLE:
        return

    # Function-scope imports keep this helper self-contained.
    import logging
    import os

    def _section(container, key):
        # Missing or non-dict sections become {} so one absent section cannot
        # abort the run half-way through logging.
        value = container.get(key) if isinstance(container, dict) else None
        return value if isinstance(value, dict) else {}

    summary = _section(report, "summary")
    actions = _section(report, "action_items")
    sentiment = _section(report, "sentiment")

    params = {
        "test_case": tc_name,
        "provider": provider,
        # NOTE(review): the model name is hard-coded and is logged even when
        # `provider` is something else (e.g. "mock") — confirm or derive it.
        "model": "llama-3.3-70b-versatile",
        "eval_version": report.get("version", "v4"),
    }
    metrics = {
        "overall_score": report.get("overall_score", 0),
        "semantic_score": summary.get("semantic_score", 0),
        "rouge1_f1": summary.get("avg_rouge1_f1", 0),
        "action_f1": actions.get("f1", 0),
        "action_precision": actions.get("precision", 0),
        "action_recall": actions.get("recall", 0),
        "sentiment_exact": sentiment.get("accuracy", 0),
        "sentiment_soft": sentiment.get("soft_accuracy", 0),
    }

    # Japan intelligence metrics (only when the report carries that section).
    ji = report.get("japan_insights")
    if isinstance(ji, dict):
        nemawashi = _section(ji, "nemawashi")
        metrics["nemawashi_precision"] = nemawashi.get("precision", 0)
        metrics["nemawashi_recall"] = nemawashi.get("recall", 0)
        params["keigo_grade"] = _section(ji, "keigo").get("grade", "N/A")
        params["code_switch_grade"] = _section(ji, "code_switching").get("grade", "N/A")

    # Hallucination bonus (only when present).
    if "hallucination_bonus" in report:
        metrics["hallucination_bonus"] = report["hallucination_bonus"]
        params["hallucination_risk"] = report.get("hallucination_risk", "UNKNOWN")

    try:
        # Honour MLFLOW_TRACKING_URI when set; otherwise keep the local default.
        mlflow.set_tracking_uri(
            os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
        )
        mlflow.set_experiment("TranscriptAI-Evaluation")
        with mlflow.start_run(run_name=f"{tc_name}__{provider}"):
            # The payload is fully built above, so the run is either logged
            # as a whole or not at all from this function's point of view.
            mlflow.log_params(params)
            mlflow.log_metrics(metrics)
    except Exception:
        # Deliberate boundary: MLflow logging must never crash the eval, but
        # the failure should be visible rather than silently discarded.
        logging.getLogger(__name__).warning(
            "MLflow logging skipped for %r (%r)", tc_name, provider, exc_info=True
        )
431424

432425

433426
# ── MASTER EVALUATOR ──────────────────────────────────────────────────────────
434-
def evaluate(prediction: dict, ground_truth: dict, transcript: str = "") -> dict:
427+
def evaluate(prediction: dict, ground_truth: dict, transcript: str = "",
428+
tc_name: str = "unknown", provider: str = "unknown") -> dict:
429+
"""
430+
Master evaluation function.
431+
tc_name — test case name for MLflow run labeling (e.g. "Sales call JA/EN")
432+
provider — which LLM provider was used (e.g. "groq", "mock")
433+
"""
435434
report = {}
436435

437436
if transcript and "japan_insights" in prediction:
438437
prediction = inject_rule_based_code_switch(prediction, transcript)
439438

440-
gt_summary = ground_truth.get("summary", [])
441-
ja_pattern = re.compile(r"[぀-ゟ゠-ヿ一-鿿]")
439+
gt_summary = ground_truth.get("summary", [])
440+
ja_pattern = re.compile(r"[぀-ゟ゠-ヿ一-鿿]")
442441
if prediction.get("summary"):
443442
first_bullet = prediction["summary"][0] if prediction["summary"] else ""
444443
pred_is_ja = bool(ja_pattern.search(first_bullet))
@@ -483,7 +482,12 @@ def evaluate(prediction: dict, ground_truth: dict, transcript: str = "") -> dict
483482
)
484483
report["hallucination_risk"] = prediction["verification"].get("risk_label", "UNKNOWN")
485484

486-
report["version"] = "v4 — + hallucination prevention + confidence scoring"
485+
report["version"] = "v4 — hallucination prevention + confidence scoring + MLflow logging"
486+
report["provider"] = provider
487+
488+
# ── MLflow: log this eval run ─────────────────────────────────────────────
489+
_log_to_mlflow(report, tc_name, provider)
490+
487491
return report
488492

489493

@@ -497,13 +501,22 @@ def evaluate(prediction: dict, ground_truth: dict, transcript: str = "") -> dict
497501
print(f"Running: {tc['name']} ({tc['id']})")
498502
print("="*60)
499503
from analysis.analyzer import analyze_transcript
500-
prediction = analyze_transcript(tc["transcript"], tc["language"])
501-
report = evaluate(prediction, tc["ground_truth"], tc["transcript"])
504+
prediction = analyze_transcript(tc["transcript"], tc["language"], bypass_cache=True)
505+
report = evaluate(
506+
prediction,
507+
tc["ground_truth"],
508+
tc["transcript"],
509+
tc_name=tc["name"],
510+
provider=prediction.get("_provider", "unknown")
511+
)
502512
print(f"Overall: {report['overall_score']}% — {report['overall_grade']}")
503513
print(f"Semantic: {report['summary']['semantic_score']}")
504514
print(f"Actions F1: {report['action_items']['f1']}")
505515
print(f"Sentiment: {report['sentiment']['soft_accuracy']}")
506516
if "japan_insights" in report:
507517
ji = report["japan_insights"]
508518
print(f"Keigo: {ji['keigo']['grade']}")
509-
print(f"Nemawashi: precision={ji['nemawashi']['precision']} rule_detected={ji['nemawashi']['rule_detected']}")
519+
print(f"Nemawashi: precision={ji['nemawashi']['precision']} "
520+
f"rule_detected={ji['nemawashi']['rule_detected']}")
521+
if MLFLOW_AVAILABLE:
522+
print(f"MLflow: run logged to ./mlruns")

0 commit comments

Comments
 (0)