Skip to content

Commit af6a5f2

Browse files
committed
feat(deep_finance): Integrates the FinanceCompositionEvaluator based on OpenJudge
- Refactored reward_metric_helper, optimizing the data structure and statistical logic of OpenJudge and Finance Evaluator - Added the DeepFinanceJudgeByOpenJudge class to achieve unified calls and weighted fusion across multiple Graders - Supports both RM Gallery and Finance Evaluator as evaluation sources, enhancing evaluation dimensions - Asynchronously calls OpenJudge Runner, adding retry and error handling mechanisms - Implements cached loading of reference answers, improving RM Gallery evaluation efficiency - Added tool call penalty calculation, fusing step_reward and scores from each Grader - Added automatic saving of debug information when OpenJudge scores for each Grader are zero - Log recording and time consumption statistics cover the entire evaluation process, facilitating performance monitoring and troubleshooting
1 parent f785b22 commit af6a5f2

File tree

13 files changed

+1223
-2356
lines changed

13 files changed

+1223
-2356
lines changed

ajet/utils/metric_helper/reward_metric_helper.py

Lines changed: 51 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
deep_finance Reward Metrics Helper
33
44
Provides standalone utility functions for reward_stats extraction and SwanLab metrics formatting.
5-
Decouples deep_finance-specific logic from core code, reducing intrusion into native_compat_trainer.
5+
6+
Data sources:
7+
1. Finance Evaluator (finance_raw, finance_contribution)
8+
2. OpenJudge Graders (openjudge_xxx_raw, openjudge_xxx_contribution)
69
710
SwanLab metrics directory structure:
811
- rewards/ Top-level aggregated scores
9-
- rewards/dimensions/ Raw scores (unweighted)
10-
- rewards/contribution/ Weighted contributions
12+
- rewards/dimensions/ Raw scores (unweighted): finance_raw, openjudge_*_raw
13+
- rewards/contribution/ Weighted contributions: finance_contribution, openjudge_*_contribution
14+
- rewards/openjudge/ OpenJudge grader specific metrics
1115
- judge_time/ Judge time consumption statistics
1216
"""
1317

@@ -41,9 +45,9 @@ def compute_reward_metrics(reward_stats_list: List[Dict[str, Any]], prefix: str
4145
"""
4246
Compute SwanLab metrics from reward_stats list.
4347
44-
Supports two data sources:
45-
1. RM Gallery RewardStats fields (rm_raw, etc.)
46-
2. OpenJudge fields (openjudge_xxx_raw, openjudge_xxx_contribution, etc.)
48+
Data sources:
49+
1. Finance Evaluator (finance_raw, finance_contribution)
50+
2. OpenJudge Graders (openjudge_xxx_raw, openjudge_xxx_contribution)
4751
4852
Args:
4953
reward_stats_list: List of reward_stats dictionaries
@@ -72,61 +76,47 @@ def compute_reward_metrics(reward_stats_list: List[Dict[str, Any]], prefix: str
7276
metrics[f"{prefix}rewards/fused_reward_mean"] = float(np.mean(fused_reward_list))
7377
metrics[f"{prefix}rewards/penalty_mean"] = float(np.mean(penalty_list))
7478
metrics[f"{prefix}rewards/step_reward_mean"] = float(np.mean(step_reward_list))
75-
metrics[f"{prefix}rewards/penalty_count"] = len(non_zero_penalties)
76-
metrics[f"{prefix}rewards/penalty_rate"] = len(non_zero_penalties) / n * 100 if n > 0 else 0.0
77-
78-
# ========== OpenJudge Metrics (PresentationQualityGrader, GroundingGrader) ==========
79-
openjudge_enabled_count = sum(1 for rs in reward_stats_list if rs.get('openjudge_enabled', False))
80-
81-
if openjudge_enabled_count > 0:
82-
# OpenJudge graders: presentation_quality, grounding
83-
openjudge_graders = [
84-
"presentation_quality",
85-
"grounding",
86-
"planning",
87-
"audit",
88-
"traceability",
89-
"cgcv"
90-
]
91-
92-
for grader_name in openjudge_graders:
93-
raw_key = f"openjudge_{grader_name}_raw"
94-
contrib_key = f"openjudge_{grader_name}_contribution"
95-
96-
raw_list = [rs.get(raw_key, 0.0) for rs in reward_stats_list]
97-
contrib_list = [rs.get(contrib_key, 0.0) for rs in reward_stats_list]
98-
99-
# Only report when non-zero values exist
100-
if any(v != 0.0 for v in raw_list):
101-
metrics[f"{prefix}rewards/openjudge/{grader_name}_raw_mean"] = float(np.mean(raw_list))
102-
if any(v != 0.0 for v in contrib_list):
103-
metrics[f"{prefix}rewards/openjudge/{grader_name}_contribution_mean"] = float(np.mean(contrib_list))
104-
105-
# OpenJudge time consumption statistics
106-
grading_time_list = [rs.get('grading_time', 0.0) for rs in reward_stats_list]
107-
if any(v != 0.0 for v in grading_time_list):
108-
metrics[f"{prefix}judge_time/openjudge_grading_time_mean"] = float(np.mean(grading_time_list))
109-
metrics[f"{prefix}judge_time/openjudge_grading_time_max"] = float(np.max(grading_time_list))
110-
111-
# ========== RM Gallery Metrics ==========
112-
113-
# RM Gallery
114-
rm_raw_list = [rs.get('rm_raw', 0.0) for rs in reward_stats_list]
115-
rm_contribution_list = [rs.get('rm_contribution', 0.0) for rs in reward_stats_list]
116-
117-
# dimensions/ raw scores
118-
metrics[f"{prefix}rewards/dimensions/rm_raw_mean"] = float(np.mean(rm_raw_list))
119-
120-
# contribution/ weighted contributions
121-
metrics[f"{prefix}rewards/contribution/rm_contribution_mean"] = float(np.mean(rm_contribution_list))
122-
123-
124-
# Time consumption statistics
125-
rm_time_list = [rs.get('rm_time', 0.0) for rs in reward_stats_list]
126-
metrics[f"{prefix}judge_time/rm_time_mean"] = float(np.mean(rm_time_list))
127-
128-
if rm_time_list:
129-
metrics[f"{prefix}judge_time/rm_time_max"] = float(np.max(rm_time_list))
79+
metrics[f"{prefix}rewards/penalty_count"] = float(len(non_zero_penalties))
80+
metrics[f"{prefix}rewards/penalty_rate"] = float(len(non_zero_penalties) / n * 100) if n > 0 else 0.0
81+
82+
# ========== OpenJudge Metrics ==========
83+
# OpenJudge graders: presentation_quality, grounding, planning, audit, ebtu
84+
openjudge_graders = [
85+
"presentation_quality",
86+
"grounding",
87+
"planning",
88+
"audit",
89+
"ebtu",
90+
]
91+
92+
for grader_name in openjudge_graders:
93+
raw_key = f"openjudge_{grader_name}_raw"
94+
contrib_key = f"openjudge_{grader_name}_contribution"
95+
96+
raw_list = [rs.get(raw_key, 0.0) for rs in reward_stats_list]
97+
contrib_list = [rs.get(contrib_key, 0.0) for rs in reward_stats_list]
98+
99+
# Only report when non-zero values exist
100+
if any(v != 0.0 for v in raw_list):
101+
metrics[f"{prefix}rewards/openjudge/{grader_name}_raw_mean"] = float(np.mean(raw_list))
102+
if any(v != 0.0 for v in contrib_list):
103+
metrics[f"{prefix}rewards/openjudge/{grader_name}_contribution_mean"] = float(np.mean(contrib_list))
104+
105+
# OpenJudge time consumption statistics
106+
grading_time_list = [rs.get('grading_time', 0.0) for rs in reward_stats_list]
107+
if any(v != 0.0 for v in grading_time_list):
108+
metrics[f"{prefix}judge_time/openjudge_grading_time_mean"] = float(np.mean(grading_time_list))
109+
metrics[f"{prefix}judge_time/openjudge_grading_time_max"] = float(np.max(grading_time_list))
110+
111+
# ========== Finance Evaluator Metrics ==========
112+
finance_raw_list = [rs.get('finance_raw', 0.0) for rs in reward_stats_list]
113+
finance_contribution_list = [rs.get('finance_contribution', 0.0) for rs in reward_stats_list]
114+
115+
if any(v != 0.0 for v in finance_raw_list):
116+
metrics[f"{prefix}rewards/dimensions/finance_raw_mean"] = float(np.mean(finance_raw_list))
117+
118+
if any(v != 0.0 for v in finance_contribution_list):
119+
metrics[f"{prefix}rewards/contribution/finance_contribution_mean"] = float(np.mean(finance_contribution_list))
130120

131121
# ========== General Time Consumption Statistics ==========
132122
judge_total_time_list = [rs.get('judge_total_time', 0.0) for rs in reward_stats_list]

0 commit comments

Comments
 (0)