Skip to content

Commit 041b978

Browse files
committed
Address PR #174 review: fix dimension scoring, TOOL_ERROR counting, and module constants
- H1: Skip parse errors and unknown categories in _compute_dimension_averages instead of scoring them as 0 (which inflated averages downward) - H2: Default unknown categories to ❓ instead of ✅ in scorecard icons - H3: Count TOOL_ERROR spans as tool attempts in _count_trace_metrics - L3: Lift _SCORECARD_ICONS to module level (was duplicated in function) - L7: Extract _PRIMARY_METRICS constant, replace 5 inline references - M2: _compute_multiturn_stats returns stable shape on empty input - Update tests: add parse_error attr to _FakeMetric, test TOOL_ERROR counting, test parse error/unknown category skipping, fix empty map assertion
1 parent 6369ae9 commit 041b978

2 files changed

Lines changed: 83 additions & 30 deletions

File tree

scripts/quality_report.py

Lines changed: 35 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ def _count_trace_metrics(trace):
552552
for span in trace.spans:
553553
if span.event_type == "USER_MESSAGE_RECEIVED":
554554
user_turns += 1
555-
elif span.event_type == "TOOL_COMPLETED":
555+
elif span.event_type in ("TOOL_COMPLETED", "TOOL_ERROR"):
556556
tool_calls += 1
557557
return user_turns, tool_calls
558558

@@ -1767,7 +1767,26 @@ def _build_agent_stats(report, resolved_map):
17671767
},
17681768
}
17691769

1770-
_DIMENSION_NAMES = list(_DIMENSION_SCORES.keys())
1770+
_DIMENSION_NAMES = list(_DIMENSION_SCORES.keys()) # order matters for rendering
1771+
1772+
_PRIMARY_METRICS = {"response_usefulness", "task_grounding"}
1773+
1774+
_SCORECARD_ICONS = {
1775+
"correct": "✅",
1776+
"mostly_correct": "⚠️",
1777+
"incorrect": "❌",
1778+
"proper": "✅",
1779+
"partial": "⚠️",
1780+
"none": "❌",
1781+
"specific": "✅",
1782+
"somewhat_specific": "⚠️",
1783+
"vague": "❌",
1784+
"compliant": "✅",
1785+
"partially_compliant": "⚠️",
1786+
"non_compliant": "❌",
1787+
"clarification_needed": "⚠️",
1788+
"correction_needed": "❌",
1789+
}
17711790

17721791
# Maps dimension → (lowest category, section title) for "Low X" report sections.
17731792
_DIMENSION_LOW_CATEGORIES = {
@@ -1792,8 +1811,9 @@ def _compute_dimension_averages(report):
17921811
for mr in sr.metrics:
17931812
if mr.metric_name in _DIMENSION_SCORES:
17941813
score_map = _DIMENSION_SCORES[mr.metric_name]
1795-
score = score_map.get(mr.category, 0)
1796-
dim_totals[mr.metric_name].append(score)
1814+
if mr.parse_error or mr.category not in score_map:
1815+
continue
1816+
dim_totals[mr.metric_name].append(score_map[mr.category])
17971817
return {
17981818
d: round(sum(scores) / len(scores), 2) if scores else 0
17991819
for d, scores in dim_totals.items()
@@ -1808,7 +1828,11 @@ def _compute_multiturn_stats(resolved_map):
18081828
verifications = [r.get("verifications", 0) for r in resolved_map.values()]
18091829
total = len(user_turns)
18101830
if not total:
1811-
return {}
1831+
return {
1832+
"avg_user_turns": 0,
1833+
"avg_tool_calls": 0,
1834+
"multi_turn_sessions": 0,
1835+
}
18121836
mt_count = sum(1 for t in user_turns if t > 1)
18131837
stats = {
18141838
"avg_user_turns": round(sum(user_turns) / total, 1),
@@ -1876,7 +1900,7 @@ def _print_eval_results(
18761900

18771901
# Primary metrics with justifications
18781902
for mr in sr.metrics:
1879-
if mr.metric_name not in ("response_usefulness", "task_grounding"):
1903+
if mr.metric_name not in _PRIMARY_METRICS:
18801904
continue
18811905
mr_label = _category_label(mr.category)
18821906
if mr.parse_error:
@@ -1892,7 +1916,7 @@ def _print_eval_results(
18921916
# Compact scorecard for quality dimensions
18931917
dim_parts = []
18941918
for mr in sr.metrics:
1895-
if mr.metric_name in ("response_usefulness", "task_grounding"):
1919+
if mr.metric_name in _PRIMARY_METRICS:
18961920
continue
18971921
display_name = _METRIC_LABELS.get(mr.metric_name, mr.metric_name)
18981922
mr_label = _category_label(mr.category)
@@ -2055,7 +2079,7 @@ def _print_eval_results(
20552079

20562080
print("\n Category Distributions:")
20572081
for metric_name, dist in report.category_distributions.items():
2058-
if metric_name not in ("response_usefulness", "task_grounding"):
2082+
if metric_name not in _PRIMARY_METRICS:
20592083
continue
20602084
print(f"\n [{metric_name}]")
20612085
dist_total = sum(dist.values())
@@ -2378,28 +2402,12 @@ def _md_write_trajectory_section(w, trajectories, resolved_map):
23782402

23792403
def _md_dimension_scorecard(sr):
23802404
"""Build a compact one-line scorecard for the 5 quality dimensions."""
2381-
_SCORECARD_ICONS = {
2382-
"correct": "\u2705",
2383-
"mostly_correct": "\u26a0\ufe0f",
2384-
"incorrect": "\u274c",
2385-
"proper": "\u2705",
2386-
"partial": "\u26a0\ufe0f",
2387-
"none": "\u274c",
2388-
"specific": "\u2705",
2389-
"somewhat_specific": "\u26a0\ufe0f",
2390-
"vague": "\u274c",
2391-
"compliant": "\u2705",
2392-
"partially_compliant": "\u26a0\ufe0f",
2393-
"non_compliant": "\u274c",
2394-
"clarification_needed": "\u26a0\ufe0f",
2395-
"correction_needed": "\u274c",
2396-
}
23972405
parts = []
23982406
for mr in sr.metrics:
2399-
if mr.metric_name in ("response_usefulness", "task_grounding"):
2407+
if mr.metric_name in _PRIMARY_METRICS:
24002408
continue
24012409
label = _METRIC_LABELS.get(mr.metric_name, mr.metric_name)
2402-
icon = _SCORECARD_ICONS.get(mr.category, "\u2705")
2410+
icon = _SCORECARD_ICONS.get(mr.category, "\u2753")
24032411
parts.append(f"{label} {icon}")
24042412
return " | ".join(parts)
24052413

@@ -2458,7 +2466,7 @@ def _md_write_session_section(
24582466
w(f"- **Response:** {r_display}")
24592467

24602468
for mr in sr.metrics:
2461-
if mr.metric_name not in ("response_usefulness", "task_grounding"):
2469+
if mr.metric_name not in _PRIMARY_METRICS:
24622470
continue
24632471
label = _category_label(mr.category)
24642472
display = _METRIC_LABELS.get(mr.metric_name, mr.metric_name)
@@ -2868,7 +2876,6 @@ def _write_md_report(
28682876
sessions = _md_find_low_dimension_sessions(report, dim, low_cat)
28692877
if sessions:
28702878
low_dims[dim] = sessions
2871-
_PRIMARY_METRICS = {"response_usefulness", "task_grounding"}
28722879

28732880
# --- TOC ---
28742881
w("# Quality Evaluation Report")

tests/test_quality_report_helpers.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,10 @@ def __init__(self, spans):
6262

6363
class _FakeMetric:
6464

65-
def __init__(self, metric_name, category):
65+
def __init__(self, metric_name, category, parse_error=False):
6666
self.metric_name = metric_name
6767
self.category = category
68+
self.parse_error = parse_error
6869

6970

7071
class _FakeSession:
@@ -655,6 +656,18 @@ def test_tool_starting_not_counted(self):
655656
_, tool_calls = _count_trace_metrics(trace)
656657
assert tool_calls == 1
657658

659+
def test_tool_error_counted(self):
660+
trace = _FakeTrace(
661+
[
662+
_FakeSpan("TOOL_STARTING", {"tool": "search"}),
663+
_FakeSpan("TOOL_ERROR", {"error": "timeout"}),
664+
_FakeSpan("TOOL_STARTING", {"tool": "lookup"}),
665+
_FakeSpan("TOOL_COMPLETED", {"tool": "lookup"}),
666+
]
667+
)
668+
_, tool_calls = _count_trace_metrics(trace)
669+
assert tool_calls == 2
670+
658671

659672
# ================================================================== #
660673
# _compute_dimension_averages #
@@ -726,6 +739,34 @@ def test_missing_dimensions(self):
726739
# Non-dimension metrics should not contribute
727740
assert avgs["correctness"] == 0
728741

742+
def test_parse_error_skipped(self):
743+
sessions = [
744+
_FakeSession(
745+
"s1",
746+
[
747+
_FakeMetric("correctness", "correct"),
748+
_FakeMetric("correctness", "incorrect", parse_error=True),
749+
],
750+
),
751+
]
752+
report = _FakeReport(sessions)
753+
avgs = _compute_dimension_averages(report)
754+
assert avgs["correctness"] == 2.0
755+
756+
def test_unknown_category_skipped(self):
757+
sessions = [
758+
_FakeSession(
759+
"s1",
760+
[
761+
_FakeMetric("correctness", "correct"),
762+
_FakeMetric("correctness", "bogus_value"),
763+
],
764+
),
765+
]
766+
report = _FakeReport(sessions)
767+
avgs = _compute_dimension_averages(report)
768+
assert avgs["correctness"] == 2.0
769+
729770

730771
# ================================================================== #
731772
# _compute_multiturn_stats #
@@ -745,7 +786,12 @@ def test_basic_stats(self):
745786
assert stats["multi_turn_sessions"] == 1
746787

747788
def test_empty_map(self):
748-
assert _compute_multiturn_stats({}) == {}
789+
result = _compute_multiturn_stats({})
790+
assert result == {
791+
"avg_user_turns": 0,
792+
"avg_tool_calls": 0,
793+
"multi_turn_sessions": 0,
794+
}
749795

750796
def test_all_single_turn(self):
751797
resolved = {

0 commit comments

Comments
 (0)