|
2 | 2 | deep_finance Reward Metrics Helper |
3 | 3 |
|
4 | 4 | Provides standalone utility functions for reward_stats extraction and SwanLab metrics formatting. |
5 | | -Decouples deep_finance-specific logic from core code, reducing intrusion into native_compat_trainer. |
| 5 | +
|
| 6 | +Data sources: |
| 7 | +1. Finance Evaluator (finance_raw, finance_contribution) |
| 8 | +2. OpenJudge Graders (openjudge_xxx_raw, openjudge_xxx_contribution) |
6 | 9 |
|
7 | 10 | SwanLab metrics directory structure: |
8 | 11 | - rewards/ Top-level aggregated scores |
9 | | -- rewards/dimensions/ Raw scores (unweighted) |
10 | | -- rewards/contribution/ Weighted contributions |
| 12 | +- rewards/dimensions/ Raw scores (unweighted): finance_raw, openjudge_*_raw |
| 13 | +- rewards/contribution/ Weighted contributions: finance_contribution, openjudge_*_contribution |
| 14 | +- rewards/openjudge/ OpenJudge grader specific metrics |
11 | 15 | - judge_time/ Judge time consumption statistics |
12 | 16 | """ |
13 | 17 |
|
@@ -41,9 +45,9 @@ def compute_reward_metrics(reward_stats_list: List[Dict[str, Any]], prefix: str |
41 | 45 | """ |
42 | 46 | Compute SwanLab metrics from reward_stats list. |
43 | 47 |
|
44 | | - Supports two data sources: |
45 | | - 1. RM Gallery RewardStats fields (rm_raw, etc.) |
46 | | - 2. OpenJudge fields (openjudge_xxx_raw, openjudge_xxx_contribution, etc.) |
| 48 | + Data sources: |
| 49 | + 1. Finance Evaluator (finance_raw, finance_contribution) |
| 50 | + 2. OpenJudge Graders (openjudge_xxx_raw, openjudge_xxx_contribution) |
47 | 51 |
|
48 | 52 | Args: |
49 | 53 | reward_stats_list: List of reward_stats dictionaries |
@@ -72,61 +76,47 @@ def compute_reward_metrics(reward_stats_list: List[Dict[str, Any]], prefix: str |
72 | 76 | metrics[f"{prefix}rewards/fused_reward_mean"] = float(np.mean(fused_reward_list)) |
73 | 77 | metrics[f"{prefix}rewards/penalty_mean"] = float(np.mean(penalty_list)) |
74 | 78 | metrics[f"{prefix}rewards/step_reward_mean"] = float(np.mean(step_reward_list)) |
75 | | - metrics[f"{prefix}rewards/penalty_count"] = len(non_zero_penalties) |
76 | | - metrics[f"{prefix}rewards/penalty_rate"] = len(non_zero_penalties) / n * 100 if n > 0 else 0.0 |
77 | | - |
78 | | - # ========== OpenJudge Metrics (PresentationQualityGrader, GroundingGrader) ========== |
79 | | - openjudge_enabled_count = sum(1 for rs in reward_stats_list if rs.get('openjudge_enabled', False)) |
80 | | - |
81 | | - if openjudge_enabled_count > 0: |
82 | | - # OpenJudge graders: presentation_quality, grounding |
83 | | - openjudge_graders = [ |
84 | | - "presentation_quality", |
85 | | - "grounding", |
86 | | - "planning", |
87 | | - "audit", |
88 | | - "traceability", |
89 | | - "cgcv" |
90 | | - ] |
91 | | - |
92 | | - for grader_name in openjudge_graders: |
93 | | - raw_key = f"openjudge_{grader_name}_raw" |
94 | | - contrib_key = f"openjudge_{grader_name}_contribution" |
95 | | - |
96 | | - raw_list = [rs.get(raw_key, 0.0) for rs in reward_stats_list] |
97 | | - contrib_list = [rs.get(contrib_key, 0.0) for rs in reward_stats_list] |
98 | | - |
99 | | - # Only report when non-zero values exist |
100 | | - if any(v != 0.0 for v in raw_list): |
101 | | - metrics[f"{prefix}rewards/openjudge/{grader_name}_raw_mean"] = float(np.mean(raw_list)) |
102 | | - if any(v != 0.0 for v in contrib_list): |
103 | | - metrics[f"{prefix}rewards/openjudge/{grader_name}_contribution_mean"] = float(np.mean(contrib_list)) |
104 | | - |
105 | | - # OpenJudge time consumption statistics |
106 | | - grading_time_list = [rs.get('grading_time', 0.0) for rs in reward_stats_list] |
107 | | - if any(v != 0.0 for v in grading_time_list): |
108 | | - metrics[f"{prefix}judge_time/openjudge_grading_time_mean"] = float(np.mean(grading_time_list)) |
109 | | - metrics[f"{prefix}judge_time/openjudge_grading_time_max"] = float(np.max(grading_time_list)) |
110 | | - |
111 | | - # ========== RM Gallery Metrics ========== |
112 | | - |
113 | | - # RM Gallery |
114 | | - rm_raw_list = [rs.get('rm_raw', 0.0) for rs in reward_stats_list] |
115 | | - rm_contribution_list = [rs.get('rm_contribution', 0.0) for rs in reward_stats_list] |
116 | | - |
117 | | - # dimensions/ raw scores |
118 | | - metrics[f"{prefix}rewards/dimensions/rm_raw_mean"] = float(np.mean(rm_raw_list)) |
119 | | - |
120 | | - # contribution/ weighted contributions |
121 | | - metrics[f"{prefix}rewards/contribution/rm_contribution_mean"] = float(np.mean(rm_contribution_list)) |
122 | | - |
123 | | - |
124 | | - # Time consumption statistics |
125 | | - rm_time_list = [rs.get('rm_time', 0.0) for rs in reward_stats_list] |
126 | | - metrics[f"{prefix}judge_time/rm_time_mean"] = float(np.mean(rm_time_list)) |
127 | | - |
128 | | - if rm_time_list: |
129 | | - metrics[f"{prefix}judge_time/rm_time_max"] = float(np.max(rm_time_list)) |
| 79 | + metrics[f"{prefix}rewards/penalty_count"] = float(len(non_zero_penalties)) |
| 80 | + metrics[f"{prefix}rewards/penalty_rate"] = float(len(non_zero_penalties) / n * 100) if n > 0 else 0.0 |
| 81 | + |
| 82 | + # ========== OpenJudge Metrics ========== |
|  83 | +    # OpenJudge graders: presentation_quality, grounding, planning, audit, ebtu |
| 84 | + openjudge_graders = [ |
| 85 | + "presentation_quality", |
| 86 | + "grounding", |
| 87 | + "planning", |
| 88 | + "audit", |
| 89 | + "ebtu", |
| 90 | + ] |
| 91 | + |
| 92 | + for grader_name in openjudge_graders: |
| 93 | + raw_key = f"openjudge_{grader_name}_raw" |
| 94 | + contrib_key = f"openjudge_{grader_name}_contribution" |
| 95 | + |
| 96 | + raw_list = [rs.get(raw_key, 0.0) for rs in reward_stats_list] |
| 97 | + contrib_list = [rs.get(contrib_key, 0.0) for rs in reward_stats_list] |
| 98 | + |
| 99 | + # Only report when non-zero values exist |
| 100 | + if any(v != 0.0 for v in raw_list): |
| 101 | + metrics[f"{prefix}rewards/openjudge/{grader_name}_raw_mean"] = float(np.mean(raw_list)) |
| 102 | + if any(v != 0.0 for v in contrib_list): |
| 103 | + metrics[f"{prefix}rewards/openjudge/{grader_name}_contribution_mean"] = float(np.mean(contrib_list)) |
| 104 | + |
| 105 | + # OpenJudge time consumption statistics |
| 106 | + grading_time_list = [rs.get('grading_time', 0.0) for rs in reward_stats_list] |
| 107 | + if any(v != 0.0 for v in grading_time_list): |
| 108 | + metrics[f"{prefix}judge_time/openjudge_grading_time_mean"] = float(np.mean(grading_time_list)) |
| 109 | + metrics[f"{prefix}judge_time/openjudge_grading_time_max"] = float(np.max(grading_time_list)) |
| 110 | + |
| 111 | + # ========== Finance Evaluator Metrics ========== |
| 112 | + finance_raw_list = [rs.get('finance_raw', 0.0) for rs in reward_stats_list] |
| 113 | + finance_contribution_list = [rs.get('finance_contribution', 0.0) for rs in reward_stats_list] |
| 114 | + |
| 115 | + if any(v != 0.0 for v in finance_raw_list): |
| 116 | + metrics[f"{prefix}rewards/dimensions/finance_raw_mean"] = float(np.mean(finance_raw_list)) |
| 117 | + |
| 118 | + if any(v != 0.0 for v in finance_contribution_list): |
| 119 | + metrics[f"{prefix}rewards/contribution/finance_contribution_mean"] = float(np.mean(finance_contribution_list)) |
130 | 120 |
|
131 | 121 | # ========== General Time Consumption Statistics ========== |
132 | 122 | judge_total_time_list = [rs.get('judge_total_time', 0.0) for rs in reward_stats_list] |
|
0 commit comments