11# evaluator.py
2- # Evaluation layer for TranscriptAI — v3
2+ # Evaluation layer for TranscriptAI — v4
33#
44# Fix 1: Code-switch counting moved fully to rule-based (LLM was wildly inaccurate)
55# Fix 2: Fuzzy speaker name matching for sentiment (handles Yamamoto vs 山本 etc.)
66# Fix 3: Semantic similarity added alongside ROUGE (catches paraphrasing)
7- # v3 FIX: NEMAWASHI_KEYWORDS cleaned up — removed false positives:
8- # - 検討しました (past tense = done, NOT deferring)
9- # - 素晴らしい (praise/excellent = positive, NOT rejection)
10- # - 了解しました (understood = agreement, NOT rejection)
11- # - なるほど (I see = acknowledgment, NOT rejection)
12- # - 分かりました (understood = agreement, NOT rejection)
13- # - 承知しました (will do = agreement, NOT rejection)
14- # Only present/future-tense deferral and hesitation patterns kept.
7+ # v3 FIX: NEMAWASHI_KEYWORDS cleaned up — removed false positives
8+ # v4 FIX: MLflow logging wired correctly inside evaluate() — was at module
9+ # level which crashed on import since tc_name/provider didn't exist.
10+ # MLflow is optional — if not installed the eval still runs normally.
1511
1612import re
1713import unicodedata
1814
15+ # ── MLflow — optional, never crashes if not installed ─────────────────────────
16+ try :
17+ import mlflow
18+ import mlflow .tracking
19+ MLFLOW_AVAILABLE = True
20+ except ImportError :
21+ MLFLOW_AVAILABLE = False
22+
1923
2024# ── HELPERS ───────────────────────────────────────────────────────────────────
2125def _grade (score : float ) -> str :
@@ -63,8 +67,8 @@ def count_code_switches(transcript: str) -> int:
6367 )
6468 text = re .sub (r"\[\d{2}:\d{2}(?::\d{2})?\]" , "" , transcript )
6569 text = re .sub (r"^[\w\u3000-\u9FFF]+[::]\s*" , "" , text , flags = re .MULTILINE )
66- tokens = text .split ()
67- switches = 0
70+ tokens = text .split ()
71+ switches = 0
6872 prev_lang = None
6973 for token in tokens :
7074 clean = re .sub (r"[^\w\u3040-\u9FFF]" , "" , token )
@@ -201,7 +205,7 @@ def _semantic_overlap(pred: str, ref: str) -> float:
201205 ref_bigrams = set (zip (ref_words , ref_words [1 :]))
202206 overlap2 = len (pred_bigrams & ref_bigrams )
203207 rouge2 = (2 * overlap2 ) / (len (pred_bigrams ) + len (ref_bigrams )) if (pred_bigrams or ref_bigrams ) else 0.0
204- lcs = _lcs_length (pred_words , ref_words )
208+ lcs = _lcs_length (pred_words , ref_words )
205209 lcs_ratio = (2 * lcs ) / (len (pred_words ) + len (ref_words ))
206210 return round ((0.4 * rouge1 ) + (0.3 * rouge2 ) + (0.3 * lcs_ratio ), 3 )
207211
@@ -224,20 +228,15 @@ def evaluate_summary(pred_bullets: list, ref_bullets: list) -> dict:
224228 if not pred_bullets or not ref_bullets :
225229 return {"semantic_score" : 0.0 , "avg_rouge1_f1" : 0.0 , "per_bullet" : [], "grade" : "POOR" }
226230
227- # Build full score matrix — ref x pred
228231 score_matrix = []
229232 for ref in ref_bullets :
230233 row = [_semantic_overlap (pred , ref ) for pred in pred_bullets ]
231234 score_matrix .append (row )
232235
233- # Optimal assignment — greedy on global max (not per-row greedy)
234- # Repeatedly pick the highest score in the entire matrix
235- # This prevents bullet 1 stealing a pred that would better match bullet 2
236- used_preds = set ()
237- used_refs = set ()
238- assignments = {} # ref_idx -> pred_idx
236+ used_preds = set ()
237+ used_refs = set ()
238+ assignments = {}
239239
240- # Sort all (score, ref_idx, pred_idx) descending and assign greedily
241240 all_scores = []
242241 for r_idx , row in enumerate (score_matrix ):
243242 for p_idx , score in enumerate (row ):
@@ -254,10 +253,10 @@ def evaluate_summary(pred_bullets: list, ref_bullets: list) -> dict:
254253
255254 per_bullet = []
256255 for r_idx , ref in enumerate (ref_bullets ):
257- p_idx = assignments .get (r_idx , - 1 )
258- best_pred = pred_bullets [p_idx ] if p_idx >= 0 else ""
256+ p_idx = assignments .get (r_idx , - 1 )
257+ best_pred = pred_bullets [p_idx ] if p_idx >= 0 else ""
259258 best_score = score_matrix [r_idx ][p_idx ] if p_idx >= 0 else 0.0
260- rouge = _tokenize_rouge1 (best_pred , ref )
259+ rouge = _tokenize_rouge1 (best_pred , ref )
261260 per_bullet .append ({
262261 "reference" : ref [:80 ] + "…" if len (ref ) > 80 else ref ,
263262 "best_match" : best_pred [:80 ] + "…" if len (best_pred ) > 80 else best_pred ,
@@ -337,30 +336,13 @@ def evaluate_action_items(pred_items: list, ref_items: list,
337336
338337
339338# ── JAPAN INSIGHTS VALIDATION ─────────────────────────────────────────────────
340- # v3 FIX: Only genuine soft rejection / deferral patterns
341- # Removed: 検討しました (past), 素晴らしい (praise), 了解しました (agreement),
342- # なるほど (acknowledgment), 分かりました (agreement), 承知しました (agreement)
343339NEMAWASHI_KEYWORDS = {
344- # REJECTION — almost certainly No
345- "難しいかもしれません" ,
346- "難しい状況です" ,
347- "ちょっと難しい" ,
348- "対応しかねます" ,
349- "いたしかねます" ,
350- # LIKELY REJECTION — present/future deferral (NOT past tense)
351- "検討します" ,
352- "検討いたします" ,
353- "前向きに検討" ,
354- "前向きに対応したいと思います" ,
355- "善処します" ,
356- "確認してみます" ,
357- "社内で確認" ,
358- "上司に相談" ,
359- # HESITATION
360- "少し懸念" ,
361- "懸念がございます" ,
362- "少し時間をいただけますか" ,
363- "そうですね" ,
340+ "難しいかもしれません" , "難しい状況です" , "ちょっと難しい" ,
341+ "対応しかねます" , "いたしかねます" ,
342+ "検討します" , "検討いたします" , "前向きに検討" ,
343+ "前向きに対応したいと思います" , "善処します" ,
344+ "確認してみます" , "社内で確認" , "上司に相談" ,
345+ "少し懸念" , "懸念がございます" , "少し時間をいただけますか" , "そうですね" ,
364346}
365347
366348KEIGO_HIGH_MARKERS = [
@@ -375,70 +357,87 @@ def evaluate_action_items(pred_items: list, ref_items: list,
375357]
376358
377359
378- def rule_based_japan_check (transcript : str , pred_insights : dict ) -> dict :
379- results = {}
380-
381- # Nemawashi — v3: clean keyword set only
382- found_signals = [kw for kw in NEMAWASHI_KEYWORDS if kw in transcript ]
383- pred_signals = pred_insights .get ("nemawashi_signals" , [])
384- detected_correctly = [
385- s for s in pred_signals
386- if any (kw in s for kw in found_signals ) or s in found_signals
387- ]
388-
389- precision = round (len (detected_correctly ) / len (pred_signals ), 3 ) if pred_signals else 0.0
390- recall = round (len (detected_correctly ) / len (found_signals ), 3 ) if found_signals else 1.0
391-
392- results ["nemawashi" ] = {
393- "rule_detected" : found_signals ,
394- "llm_detected" : pred_signals ,
395- "correctly_detected" : detected_correctly ,
396- "precision" : precision ,
397- "recall" : recall ,
398- "grade" : _grade (len (detected_correctly ) / max (len (found_signals ), 1 ))
399- }
400-
401- # Keigo
402- high_count = sum (1 for m in KEIGO_HIGH_MARKERS if m in transcript )
403- med_count = sum (1 for m in KEIGO_MED_MARKERS if m in transcript )
404- expected_keigo = "high" if high_count >= 2 else ("medium" if med_count >= 3 else "low" )
405- pred_keigo = pred_insights .get ("keigo_level" , "unknown" )
406- keigo_correct = pred_keigo == expected_keigo
407- adjacent = {"high" : {"high" ,"medium" }, "medium" : {"high" ,"medium" ,"low" }, "low" : {"medium" ,"low" }}
408- keigo_partial = pred_keigo in adjacent .get (expected_keigo , set ())
409-
410- results ["keigo" ] = {
411- "rule_expected" : expected_keigo ,
412- "llm_predicted" : pred_keigo ,
413- "correct" : keigo_correct ,
414- "partial_pass" : keigo_partial ,
415- "grade" : "PASS" if keigo_correct else ("PARTIAL" if keigo_partial else "FAIL" )
416- }
417-
418- # Code-switching — always rule-based
419- rule_switches = count_code_switches (transcript )
420- llm_switches = pred_insights .get ("code_switch_count" , 0 )
421- results ["code_switching" ] = {
422- "rule_counted" : rule_switches ,
423- "llm_counted" : llm_switches ,
424- "authoritative" : rule_switches ,
425- "difference" : abs (llm_switches - rule_switches ),
426- "note" : "rule_counted is authoritative — LLM count overridden in pipeline" ,
427- "grade" : "PASS"
428- }
429-
430- return results
def rule_based_japan_check(transcript: str, pred_insights: dict, prediction: dict = None) -> dict:
    """Cross-check the LLM's Japan-specific insights against rule-based signals.

    Args:
        transcript: Raw meeting transcript text.
        pred_insights: The LLM's ``japan_insights`` dict. Keys read here:
            ``nemawashi_signals``, ``keigo_level`` and ``code_switch_count``.
        prediction: Optional full prediction dict. When it carries a
            ``soft_rejections`` entry, the phrases listed under its
            ``high_signals`` / ``medium_signals`` / ``low_signals`` are used
            as the detected nemawashi signals instead of the keyword scan.

    Returns:
        A dict with three sections -- ``nemawashi``, ``keigo`` and
        ``code_switching`` -- each holding the rule-based value, the LLM
        value and a ``grade``.
    """
    results = {}

    # ── Nemawashi (soft rejection / deferral signals) ─────────────────────
    # `or {}` / `or []` guard against keys that are present but set to None.
    soft = (prediction or {}).get("soft_rejections") or {}
    pred_signals = pred_insights.get("nemawashi_signals") or []

    # Prefer the phrases reported by the soft-rejection detector output.
    # Entries without a usable "phrase" are skipped rather than raising.
    all_detected = [
        entry["phrase"]
        for tier in ("high_signals", "medium_signals", "low_signals")
        for entry in (soft.get(tier) or [])
        if isinstance(entry, dict) and entry.get("phrase")
    ]

    # Fall back to the keyword scan when the detector gave us nothing.
    # NEMAWASHI_KEYWORDS is a set, so sort to keep the report reproducible
    # from run to run (string hashing is randomized per process).
    if not all_detected:
        all_detected = sorted(kw for kw in NEMAWASHI_KEYWORDS if kw in transcript)

    # An LLM signal counts as correct when it contains, or is contained in,
    # one of the detected phrases. Empty strings are ignored because
    # `"" in d` is always true.
    detected_correctly = [
        s for s in pred_signals
        if s and any(d in s or s in d for d in all_detected)
    ]
    # Kept from the original: when no LLM signal matched, the detected
    # phrases themselves are credited.
    # NOTE(review): this also applies to the keyword fallback, where the LLM
    # found nothing -- confirm that giving credit there is intended.
    detected_correctly = detected_correctly or all_detected

    # Ratios are clamped to 1.0: the crediting rule above (and several LLM
    # signals matching the same phrase) can make the numerator exceed the
    # denominator.
    if pred_signals:
        precision = round(min(1.0, len(detected_correctly) / len(pred_signals)), 3)
    else:
        precision = 1.0 if all_detected else 0.0
    if all_detected:
        recall = round(min(1.0, len(detected_correctly) / len(all_detected)), 3)
    else:
        recall = 1.0

    results["nemawashi"] = {
        "rule_detected": all_detected,
        "llm_detected": pred_signals,
        "correctly_detected": detected_correctly,
        "precision": precision,
        "recall": recall,
        "grade": _grade(min(1.0, len(detected_correctly) / max(len(all_detected), 1))),
    }

    # ── Keigo (politeness level) ──────────────────────────────────────────
    # Counts how many distinct markers appear, not how often each occurs.
    high_count = sum(1 for m in KEIGO_HIGH_MARKERS if m in transcript)
    med_count = sum(1 for m in KEIGO_MED_MARKERS if m in transcript)
    if high_count >= 2:
        expected_keigo = "high"
    elif med_count >= 3:
        expected_keigo = "medium"
    else:
        expected_keigo = "low"

    pred_keigo = pred_insights.get("keigo_level", "unknown")
    keigo_correct = pred_keigo == expected_keigo
    # A prediction one level away from the expected one earns partial credit.
    adjacent = {
        "high": {"high", "medium"},
        "medium": {"high", "medium", "low"},
        "low": {"medium", "low"},
    }
    keigo_partial = pred_keigo in adjacent.get(expected_keigo, set())

    results["keigo"] = {
        "rule_expected": expected_keigo,
        "llm_predicted": pred_keigo,
        "correct": keigo_correct,
        "partial_pass": keigo_partial,
        "grade": "PASS" if keigo_correct else ("PARTIAL" if keigo_partial else "FAIL"),
    }

    # ── Code-switching — the rule-based count is always authoritative ─────
    rule_switches = count_code_switches(transcript)
    llm_switches = pred_insights.get("code_switch_count") or 0  # None -> 0
    results["code_switching"] = {
        "rule_counted": rule_switches,
        "llm_counted": llm_switches,
        "authoritative": rule_switches,
        "difference": abs(llm_switches - rule_switches),
        "note": "rule_counted is authoritative — LLM count overridden in pipeline",
        "grade": "PASS",
    }

    return results
383+
384+
385+ # ── MLflow logging helper ──────────────────────────────────────────────────────
def _log_to_mlflow(report, tc_name, provider):
    """Log one evaluation report to MLflow as a single run (best effort).

    Does nothing when MLflow is not installed. Any failure while logging is
    reported as a warning and never propagates, so evaluation results are
    unaffected by the tracking backend.

    Args:
        report: The evaluation report dict. Missing or ``None`` sections are
            logged as zeros / "N/A" instead of aborting the run.
        tc_name: Test case name; used in the run name and as a parameter.
        provider: LLM provider label; used in the run name and as a parameter.
    """
    if not MLFLOW_AVAILABLE:
        return

    # Function-scope imports keep this helper self-contained.
    import logging
    import os

    summary = report.get("summary") or {}
    actions = report.get("action_items") or {}
    sentiment = report.get("sentiment") or {}

    params = {
        "test_case": tc_name,
        "provider": provider,
        # NOTE(review): the model name is hard-coded regardless of provider.
        "model": "llama-3.3-70b-versatile",
        "eval_version": report.get("version", "v4"),
    }
    metrics = {
        "overall_score": report.get("overall_score", 0),
        "semantic_score": summary.get("semantic_score", 0),
        "rouge1_f1": summary.get("avg_rouge1_f1", 0),
        "action_f1": actions.get("f1", 0),
        "action_precision": actions.get("precision", 0),
        "action_recall": actions.get("recall", 0),
        "sentiment_exact": sentiment.get("accuracy", 0),
        "sentiment_soft": sentiment.get("soft_accuracy", 0),
    }

    # Japan intelligence metrics (if present)
    if "japan_insights" in report:
        ji = report["japan_insights"] or {}
        nemawashi = ji.get("nemawashi") or {}
        metrics["nemawashi_precision"] = nemawashi.get("precision", 0)
        metrics["nemawashi_recall"] = nemawashi.get("recall", 0)
        params["keigo_grade"] = (ji.get("keigo") or {}).get("grade", "N/A")
        params["code_switch_grade"] = (ji.get("code_switching") or {}).get("grade", "N/A")

    # Hallucination bonus (if present)
    if "hallucination_bonus" in report:
        metrics["hallucination_bonus"] = report["hallucination_bonus"]
        params["hallucination_risk"] = report.get("hallucination_risk", "UNKNOWN")

    try:
        # MLFLOW_TRACKING_URI overrides the local default server when set.
        mlflow.set_tracking_uri(
            os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
        )
        mlflow.set_experiment("TranscriptAI-Evaluation")
        with mlflow.start_run(run_name=f"{tc_name}__{provider}"):
            mlflow.log_params(params)
            mlflow.log_metrics(metrics)
    except Exception as exc:
        # MLflow logging never crashes the eval, but the failure is surfaced
        # so a missing run can be diagnosed.
        logging.getLogger(__name__).warning(
            "MLflow logging skipped for %s / %s: %s", tc_name, provider, exc
        )
431424
432425
433426# ── MASTER EVALUATOR ──────────────────────────────────────────────────────────
434- def evaluate (prediction : dict , ground_truth : dict , transcript : str = "" ) -> dict :
427+ def evaluate (prediction : dict , ground_truth : dict , transcript : str = "" ,
428+ tc_name : str = "unknown" , provider : str = "unknown" ) -> dict :
429+ """
430+ Master evaluation function.
431+ tc_name — test case name for MLflow run labeling (e.g. "Sales call JA/EN")
432+ provider — which LLM provider was used (e.g. "groq", "mock")
433+ """
435434 report = {}
436435
437436 if transcript and "japan_insights" in prediction :
438437 prediction = inject_rule_based_code_switch (prediction , transcript )
439438
440- gt_summary = ground_truth .get ("summary" , [])
441- ja_pattern = re .compile (r"[-ゟ゠-ヿ一-鿿]" )
439+ gt_summary = ground_truth .get ("summary" , [])
440+ ja_pattern = re .compile (r"[-ゟ゠-ヿ一-鿿]" )
442441 if prediction .get ("summary" ):
443442 first_bullet = prediction ["summary" ][0 ] if prediction ["summary" ] else ""
444443 pred_is_ja = bool (ja_pattern .search (first_bullet ))
@@ -483,7 +482,12 @@ def evaluate(prediction: dict, ground_truth: dict, transcript: str = "") -> dict
483482 )
484483 report ["hallucination_risk" ] = prediction ["verification" ].get ("risk_label" , "UNKNOWN" )
485484
486- report ["version" ] = "v4 — + hallucination prevention + confidence scoring"
485+ report ["version" ] = "v4 — hallucination prevention + confidence scoring + MLflow logging"
486+ report ["provider" ] = provider
487+
488+ # ── MLflow: log this eval run ─────────────────────────────────────────────
489+ _log_to_mlflow (report , tc_name , provider )
490+
487491 return report
488492
489493
@@ -497,13 +501,22 @@ def evaluate(prediction: dict, ground_truth: dict, transcript: str = "") -> dict
497501 print (f"Running: { tc ['name' ]} ({ tc ['id' ]} )" )
498502 print ("=" * 60 )
499503 from analysis .analyzer import analyze_transcript
500- prediction = analyze_transcript (tc ["transcript" ], tc ["language" ])
501- report = evaluate (prediction , tc ["ground_truth" ], tc ["transcript" ])
504+ prediction = analyze_transcript (tc ["transcript" ], tc ["language" ], bypass_cache = True )
505+ report = evaluate (
506+ prediction ,
507+ tc ["ground_truth" ],
508+ tc ["transcript" ],
509+ tc_name = tc ["name" ],
510+ provider = prediction .get ("_provider" , "unknown" )
511+ )
502512 print (f"Overall: { report ['overall_score' ]} % — { report ['overall_grade' ]} " )
503513 print (f"Semantic: { report ['summary' ]['semantic_score' ]} " )
504514 print (f"Actions F1: { report ['action_items' ]['f1' ]} " )
505515 print (f"Sentiment: { report ['sentiment' ]['soft_accuracy' ]} " )
506516 if "japan_insights" in report :
507517 ji = report ["japan_insights" ]
508518 print (f"Keigo: { ji ['keigo' ]['grade' ]} " )
509- print (f"Nemawashi: precision={ ji ['nemawashi' ]['precision' ]} rule_detected={ ji ['nemawashi' ]['rule_detected' ]} " )
519+ print (f"Nemawashi: precision={ ji ['nemawashi' ]['precision' ]} "
520+ f"rule_detected={ ji ['nemawashi' ]['rule_detected' ]} " )
521+ if MLFLOW_AVAILABLE :
522+ print (f"MLflow: run logged to ./mlruns" )
0 commit comments