@@ -552,7 +552,7 @@ def _count_trace_metrics(trace):
552552 for span in trace .spans :
553553 if span .event_type == "USER_MESSAGE_RECEIVED" :
554554 user_turns += 1
555- elif span .event_type == "TOOL_COMPLETED" :
555+ elif span .event_type in ( "TOOL_COMPLETED" , "TOOL_ERROR" ) :
556556 tool_calls += 1
557557 return user_turns , tool_calls
558558
@@ -1767,7 +1767,26 @@ def _build_agent_stats(report, resolved_map):
17671767 },
17681768}
17691769
1770- _DIMENSION_NAMES = list (_DIMENSION_SCORES .keys ())
1770+ _DIMENSION_NAMES = list (_DIMENSION_SCORES .keys ()) # order matters for rendering
1771+
1772+ _PRIMARY_METRICS = {"response_usefulness" , "task_grounding" }
1773+
1774+ _SCORECARD_ICONS = {
1775+ "correct" : "✅" ,
1776+ "mostly_correct" : "⚠️" ,
1777+ "incorrect" : "❌" ,
1778+ "proper" : "✅" ,
1779+ "partial" : "⚠️" ,
1780+ "none" : "❌" ,
1781+ "specific" : "✅" ,
1782+ "somewhat_specific" : "⚠️" ,
1783+ "vague" : "❌" ,
1784+ "compliant" : "✅" ,
1785+ "partially_compliant" : "⚠️" ,
1786+ "non_compliant" : "❌" ,
1787+ "clarification_needed" : "⚠️" ,
1788+ "correction_needed" : "❌" ,
1789+ }
17711790
17721791# Maps dimension → (lowest category, section title) for "Low X" report sections.
17731792_DIMENSION_LOW_CATEGORIES = {
@@ -1792,8 +1811,9 @@ def _compute_dimension_averages(report):
17921811 for mr in sr .metrics :
17931812 if mr .metric_name in _DIMENSION_SCORES :
17941813 score_map = _DIMENSION_SCORES [mr .metric_name ]
1795- score = score_map .get (mr .category , 0 )
1796- dim_totals [mr .metric_name ].append (score )
1814+ if mr .parse_error or mr .category not in score_map :
1815+ continue
1816+ dim_totals [mr .metric_name ].append (score_map [mr .category ])
17971817 return {
17981818 d : round (sum (scores ) / len (scores ), 2 ) if scores else 0
17991819 for d , scores in dim_totals .items ()
@@ -1808,7 +1828,11 @@ def _compute_multiturn_stats(resolved_map):
18081828 verifications = [r .get ("verifications" , 0 ) for r in resolved_map .values ()]
18091829 total = len (user_turns )
18101830 if not total :
1811- return {}
1831+ return {
1832+ "avg_user_turns" : 0 ,
1833+ "avg_tool_calls" : 0 ,
1834+ "multi_turn_sessions" : 0 ,
1835+ }
18121836 mt_count = sum (1 for t in user_turns if t > 1 )
18131837 stats = {
18141838 "avg_user_turns" : round (sum (user_turns ) / total , 1 ),
@@ -1876,7 +1900,7 @@ def _print_eval_results(
18761900
18771901 # Primary metrics with justifications
18781902 for mr in sr .metrics :
1879- if mr .metric_name not in ( "response_usefulness" , "task_grounding" ) :
1903+ if mr .metric_name not in _PRIMARY_METRICS :
18801904 continue
18811905 mr_label = _category_label (mr .category )
18821906 if mr .parse_error :
@@ -1892,7 +1916,7 @@ def _print_eval_results(
18921916 # Compact scorecard for quality dimensions
18931917 dim_parts = []
18941918 for mr in sr .metrics :
1895- if mr .metric_name in ( "response_usefulness" , "task_grounding" ) :
1919+ if mr .metric_name in _PRIMARY_METRICS :
18961920 continue
18971921 display_name = _METRIC_LABELS .get (mr .metric_name , mr .metric_name )
18981922 mr_label = _category_label (mr .category )
@@ -2055,7 +2079,7 @@ def _print_eval_results(
20552079
20562080 print ("\n Category Distributions:" )
20572081 for metric_name , dist in report .category_distributions .items ():
2058- if metric_name not in ( "response_usefulness" , "task_grounding" ) :
2082+ if metric_name not in _PRIMARY_METRICS :
20592083 continue
20602084 print (f"\n [{ metric_name } ]" )
20612085 dist_total = sum (dist .values ())
@@ -2378,28 +2402,12 @@ def _md_write_trajectory_section(w, trajectories, resolved_map):
23782402
23792403def _md_dimension_scorecard (sr ):
23802404 """Build a compact one-line scorecard for the 5 quality dimensions."""
2381- _SCORECARD_ICONS = {
2382- "correct" : "\u2705 " ,
2383- "mostly_correct" : "\u26a0 \ufe0f " ,
2384- "incorrect" : "\u274c " ,
2385- "proper" : "\u2705 " ,
2386- "partial" : "\u26a0 \ufe0f " ,
2387- "none" : "\u274c " ,
2388- "specific" : "\u2705 " ,
2389- "somewhat_specific" : "\u26a0 \ufe0f " ,
2390- "vague" : "\u274c " ,
2391- "compliant" : "\u2705 " ,
2392- "partially_compliant" : "\u26a0 \ufe0f " ,
2393- "non_compliant" : "\u274c " ,
2394- "clarification_needed" : "\u26a0 \ufe0f " ,
2395- "correction_needed" : "\u274c " ,
2396- }
23972405 parts = []
23982406 for mr in sr .metrics :
2399- if mr .metric_name in ( "response_usefulness" , "task_grounding" ) :
2407+ if mr .metric_name in _PRIMARY_METRICS :
24002408 continue
24012409 label = _METRIC_LABELS .get (mr .metric_name , mr .metric_name )
2402- icon = _SCORECARD_ICONS .get (mr .category , "\u2705 " )
2410+ icon = _SCORECARD_ICONS .get (mr .category , "\u2753 " )
24032411 parts .append (f"{ label } { icon } " )
24042412 return " | " .join (parts )
24052413
@@ -2458,7 +2466,7 @@ def _md_write_session_section(
24582466 w (f"- **Response:** { r_display } " )
24592467
24602468 for mr in sr .metrics :
2461- if mr .metric_name not in ( "response_usefulness" , "task_grounding" ) :
2469+ if mr .metric_name not in _PRIMARY_METRICS :
24622470 continue
24632471 label = _category_label (mr .category )
24642472 display = _METRIC_LABELS .get (mr .metric_name , mr .metric_name )
@@ -2868,7 +2876,6 @@ def _write_md_report(
28682876 sessions = _md_find_low_dimension_sessions (report , dim , low_cat )
28692877 if sessions :
28702878 low_dims [dim ] = sessions
2871- _PRIMARY_METRICS = {"response_usefulness" , "task_grounding" }
28722879
28732880 # --- TOC ---
28742881 w ("# Quality Evaluation Report" )
0 commit comments