Skip to content

Commit 4722a79

Browse files
committed
chore(deepfinance): 增加 EBTU 证据优先可追溯性审计支持
- 新增 EBTUTraceabilityGrader 并集成入 DeepFinanceJudge 权重配置 - deep_finance.yaml 配置最大模型长度调整为 40960 - 脚本 deep_finance.sh 和 deep_finance_single.sh 中增加 EBTU 及相关权重配置 - 完善 deep_finance_single.sh 单机调试日志及目录结构 - 深度完善 audit、cgcv、traceability json 解析,增加对常见 JSON 格式错误的自动修复 - audit grader 中移除对模型输出 integrity_score 的依赖,采用手动计算方式 - 禁用 ExampleDeepResearchProtocol 中部分工具统计日志输出,增加线程信号量限制 - 调整提示和 yaml 模板,新增 EBTU 权重占位符,完善配置文件生成日志显示
1 parent eb6e2af commit 4722a79

19 files changed

Lines changed: 1208 additions & 47 deletions

tutorial/example_deep_finance/deep_finance.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010

1111
# 创建信号量,限制同时运行的线程数(上限即下方 Semaphore 的参数值)
12-
sem = threading.Semaphore(30)
12+
sem = threading.Semaphore(60)
1313

1414
class ExampleDeepResearchProtocol(Workflow):
1515

@@ -125,9 +125,9 @@ async def execute(
125125
if info:
126126
if 'tool_stats' in info:
127127
latest_tool_stats = info['tool_stats']
128-
if latest_tool_stats.get('total_calls', 0) > 0:
129-
logger.info(f"步骤 {step + 1} 工具统计: 调用={latest_tool_stats.get('total_calls', 0)}, "
130-
f"成功率={latest_tool_stats.get('success_rate', 0):.1f}%")
128+
# if latest_tool_stats.get('total_calls', 0) > 0:
129+
# logger.info(f"步骤 {step + 1} 工具统计: 调用={latest_tool_stats.get('total_calls', 0)}, "
130+
# f"成功率={latest_tool_stats.get('success_rate', 0):.1f}%")
131131
if 'reward_stats' in info:
132132
latest_reward_stats = info['reward_stats']
133133
# 累加工具调用时间

tutorial/example_deep_finance/deep_finance.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ JUDGE_CONCURRENCY=10
1515
RM_WEIGHT=0.5
1616
PRESENTATION_QUALITY_WEIGHT=0.25
1717
GROUNDING_WEIGHT=0.25
18+
CGCV_WEIGHT=0.0 # 不使用 CGCV,设为 0
19+
AUDIT_WEIGHT=0.0 # 不使用 Audit,设为 0
20+
TRACEABILITY_WEIGHT=0.0 # 不使用 Traceability,设为 0
21+
EBTU_WEIGHT=0.0 # 不使用 EBTU,设为 0
1822

1923
# 训练参数配置
2024
NUM_REPEAT=4 # group size,每个query rollout NUM_REPEAT次
@@ -60,6 +64,10 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
6064
-e "s|{{RM_WEIGHT}}|${RM_WEIGHT}|g" \
6165
-e "s|{{PRESENTATION_QUALITY_WEIGHT}}|${PRESENTATION_QUALITY_WEIGHT}|g" \
6266
-e "s|{{GROUNDING_WEIGHT}}|${GROUNDING_WEIGHT}|g" \
67+
-e "s|{{CGCV_WEIGHT}}|${CGCV_WEIGHT}|g" \
68+
-e "s|{{AUDIT_WEIGHT}}|${AUDIT_WEIGHT}|g" \
69+
-e "s|{{TRACEABILITY_WEIGHT}}|${TRACEABILITY_WEIGHT}|g" \
70+
-e "s|{{EBTU_WEIGHT}}|${EBTU_WEIGHT}|g" \
6371
-e "s|{{OPENJUDGE_LLM}}|${OPENJUDGE_LLM}|g" \
6472
-e "s|{{RM_LLM}}|${RM_LLM}|g" \
6573
-e "s|{{JUDGE_CONCURRENCY}}|${JUDGE_CONCURRENCY}|g" \
@@ -75,7 +83,7 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
7583
${AJET_ROOT}/${CONFIG_TEMPLATE} > ${CONFIG_FILE}
7684

7785
echo "配置文件已生成: ${CONFIG_FILE}"
78-
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
86+
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, CGCV=${CGCV_WEIGHT}, Audit=${AUDIT_WEIGHT}, Traceability=${TRACEABILITY_WEIGHT}, EBTU=${EBTU_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
7987

8088
#===============================================================================
8189
# 3. 环境配置

tutorial/example_deep_finance/deep_finance.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ ajet:
3838
max_env_worker: 64 # 增加环境并行数
3939
max_num_seqs: 64 # 增加VLLM并发序列数
4040
max_response_length_in_one_turn: 8000
41-
max_model_len: 50000
41+
max_model_len: 40960
4242
agent_madness_reward: 0.0
4343
compute_madness_checklist: None
4444
multi_turn:

tutorial/example_deep_finance/deep_finance_judge.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from openjudge.models.openai_chat_model import OpenAIChatModel
1717
from openjudge.runner.grading_runner import GraderConfig, GradingRunner
18-
from tutorial.example_deep_finance.judge import PresentationQualityGrader, GroundingGrader, CGCVGrader, AuditGrader, TraceabilityRewardGrader
18+
from tutorial.example_deep_finance.judge import PresentationQualityGrader, GroundingGrader, CGCVGrader, AuditGrader, TraceabilityRewardGrader, EBTUTraceabilityGrader
1919

2020

2121

@@ -105,8 +105,9 @@ def _setup_weights(self):
105105
"presentation_quality": getattr(cfg, "presentation_quality_weight", 0.25) if cfg else 0.25,
106106
"grounding": getattr(cfg, "grounding_weight", 0.0) if cfg else 0.0, # 引用规范性评估
107107
"cgcv": getattr(cfg, "cgcv_weight", 0.25) if cfg else 0.25, # Citation-Grounded Claim Verification
108-
"audit": getattr(cfg, "audit_weight", 0.0) if cfg else 0.0, # 引用逻辑审计
108+
"audit": getattr(cfg, "audit_weight", 0.0) if cfg else 0.0, # Audit Grader: audit reward 引用逻辑审计
109109
"traceability": getattr(cfg, "traceability_weight", 0.0) if cfg else 0.0, # 可追溯性/可核验性审计 (TVR)
110+
"ebtu": getattr(cfg, "ebtu_weight", 0.0) if cfg else 0.0, # Audit Grader: audit reward EBTU证据优先可追溯性审计
110111
}
111112

112113
# 归一化(注意:action_loop 是惩罚项,不参与归一化;rm 需要参与归一化)
@@ -274,6 +275,11 @@ def extract_report_content(data: Dict) -> str:
274275
grader=TraceabilityRewardGrader(model=model),
275276
mapper=lambda data: {"traj": data},
276277
),
278+
# EBTU Grader: 证据优先可追溯性审计 (Evidence-Backed Trace Units)
279+
"ebtu": GraderConfig(
280+
grader=EBTUTraceabilityGrader(model=model),
281+
mapper=lambda data: {"traj": data},
282+
),
277283
}
278284

279285
def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowOutput) -> Tuple[float, bool]:

tutorial/example_deep_finance/deep_finance_single.sh

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ JUDGE_CONCURRENCY=10
1515
RM_WEIGHT=0.5
1616
PRESENTATION_QUALITY_WEIGHT=0.25
1717
GROUNDING_WEIGHT=0.25
18+
CGCV_WEIGHT=0.0 # 不使用 CGCV,设为 0
19+
AUDIT_WEIGHT=0.0 # 不使用 Audit,设为 0
20+
TRACEABILITY_WEIGHT=0.0 # 不使用 Traceability,设为 0
21+
EBTU_WEIGHT=0.0 # 不使用 EBTU,设为 0
1822

1923
# 训练参数配置
2024
NUM_REPEAT=4 # group size,每个query rollout NUM_REPEAT次
@@ -28,7 +32,13 @@ ENV_SERVICE_URL="http://127.0.0.1:8080" # 环境服务地址
2832
# 主目录(需要更改)
2933
export AJET_ROOT="/mnt/data_cpfs/taoshuchang.tsc/deepresearch/AgentJet_new"
3034

31-
NNODES=${WORLD_SIZE}
35+
# 单机调试配置(默认值)
36+
NNODES=${WORLD_SIZE:-1}
37+
GPUS_PER_NODE=8
38+
CURRENT_TIME=$(date "+%Y%m%d_%H%M%S")
39+
LOG_DIR="${AJET_ROOT}/logs/${PREFIX}"
40+
TRAIN_LOG="${LOG_DIR}/train_${SUFFIX}_${CURRENT_TIME}.log"
41+
mkdir -p ${LOG_DIR}
3242

3343
# 涉密的配置(API_KEY以及模型、数据位置)从.env读取
3444
cd ${AJET_ROOT}
@@ -45,6 +55,9 @@ else
4555
echo -e "\033[31m警告: 找不到 .env 文件: $ENV_FILE\033[0m"
4656
fi
4757

58+
export MODEL_PATH="/mnt/data_cpfs/taoshuchang.tsc/models/Qwen3-8B"
59+
60+
4861
#===============================================================================
4962
# 2. 动态生成配置文件 (从yaml template生成yaml)
5063
#===============================================================================
@@ -60,6 +73,10 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
6073
-e "s|{{RM_WEIGHT}}|${RM_WEIGHT}|g" \
6174
-e "s|{{PRESENTATION_QUALITY_WEIGHT}}|${PRESENTATION_QUALITY_WEIGHT}|g" \
6275
-e "s|{{GROUNDING_WEIGHT}}|${GROUNDING_WEIGHT}|g" \
76+
-e "s|{{CGCV_WEIGHT}}|${CGCV_WEIGHT}|g" \
77+
-e "s|{{AUDIT_WEIGHT}}|${AUDIT_WEIGHT}|g" \
78+
-e "s|{{TRACEABILITY_WEIGHT}}|${TRACEABILITY_WEIGHT}|g" \
79+
-e "s|{{EBTU_WEIGHT}}|${EBTU_WEIGHT}|g" \
6380
-e "s|{{OPENJUDGE_LLM}}|${OPENJUDGE_LLM}|g" \
6481
-e "s|{{RM_LLM}}|${RM_LLM}|g" \
6582
-e "s|{{JUDGE_CONCURRENCY}}|${JUDGE_CONCURRENCY}|g" \
@@ -75,7 +92,7 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
7592
${AJET_ROOT}/${CONFIG_TEMPLATE} > ${CONFIG_FILE}
7693

7794
echo "配置文件已生成: ${CONFIG_FILE}"
78-
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
95+
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, CGCV=${CGCV_WEIGHT}, Audit=${AUDIT_WEIGHT}, Traceability=${TRACEABILITY_WEIGHT}, EBTU=${EBTU_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
7996

8097

8198
#===============================================================================
@@ -119,15 +136,16 @@ export RAY_CLUSTER_MODE="multi_node"
119136
#===============================================================================
120137
# 6. 主流程
121138
#===============================================================================
122-
log "节点数: ${NNODES}, 每节点GPU数: ${GPUS_PER_NODE}"
123-
mkdir -p ${LOG_DIR}
124-
mkdir -p $(dirname ${CONFIG_FILE})
139+
log "单机调试模式: NNODES=${NNODES}, GPUS_PER_NODE=${GPUS_PER_NODE}"
125140

126141
#===============================================================================
127142
# 6.1 Master 节点启动流程
128143
#===============================================================================
129144
# 启动训练任务(最核心)
145+
# 请注意:只有单节点运行时才需要 --with-ray;多节点运行时应删除该参数
130146
python ajet/launcher.py \
131147
--conf ${CONFIG_FILE} \
148+
--with-deepfinance \
149+
--with-ray \
132150
--backbone="debug" \
133151
2>&1 | tee ${TRAIN_LOG}

tutorial/example_deep_finance/judge/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
from .cgcv.grader import CGCVGrader
55
from .audit.grader import AuditGrader
66
from .traceability.grader import TraceabilityRewardGrader
7+
from .ebtu.grader import EBTUTraceabilityGrader
78
# from .research_depth.grader import ResearchDepthGrader
89
# from .research_breadth.grader import ResearchBreadthGrader
910

1011
# 以后添加了其他 grader 也可以加在这里
1112
# from .grounding.grader import GroundingGrader
1213
# from .research_breadth.grader import ResearchBreadthGrader
1314
# __all__ = ["PresentationQualityGrader", "GroundingGrader", "ResearchDepthGrader", "ResearchBreadthGrader"]
14-
__all__ = ["PresentationQualityGrader", "GroundingGrader", "CGCVGrader", "AuditGrader", "TraceabilityRewardGrader"]
15+
__all__ = ["PresentationQualityGrader", "GroundingGrader", "CGCVGrader", "AuditGrader", "TraceabilityRewardGrader", "EBTUTraceabilityGrader"]

tutorial/example_deep_finance/judge/audit/grader.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,11 +184,11 @@ def _compute_scores(self, obj: Dict[str, Any]) -> Tuple[float, str]:
184184
supported_count = verdict_counts["Supported"]
185185

186186
# 不再依赖模型输出的 integrity_score,统一手动计算:Supported 数 / 引用总数
187-
model_score = obj.get("integrity_score")
188-
if isinstance(model_score, (float, int)) and 0.0 <= model_score <= 1.0:
189-
final_score = float(model_score)
190-
else:
191-
final_score = supported_count / total_citations if total_citations > 0 else 0.0
187+
# model_score = obj.get("integrity_score")
188+
# if isinstance(model_score, (float, int)) and 0.0 <= model_score <= 1.0:
189+
# final_score = float(model_score)
190+
# else:
191+
final_score = supported_count / total_citations if total_citations > 0 else 0.0
192192

193193
# 构建 Reason
194194
# 格式: Score: 0.80 | Total: 10 | Supp: 8, Over: 1, Hallu: 1 | Summary: ...

tutorial/example_deep_finance/judge/audit/json_utils.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,102 @@ def extract_first_json_object(text: str) -> str | None:
1515
return None
1616
return m.group(0)
1717

18+
19+
def _repair_json(js: str) -> str:
    """Best-effort repair of common formatting mistakes in LLM-produced JSON.

    The text is scanned once while tracking whether the cursor is inside a
    string literal, so structural characters that appear inside string
    values are never counted or rewritten. The following repairs are applied:

    1. Raw control characters inside strings (newline, carriage return, tab
       and any other character below U+0020) are replaced by their JSON
       escape sequences.
    2. Trailing commas directly before a closing ``}`` or ``]`` are removed.
    3. Truncated input is completed: an unterminated string is closed, a
       dangling comma is dropped, and every still-open container is closed
       in correct nesting order.

    Missing commas between elements are NOT repaired.

    Args:
        js: Candidate JSON text that failed to parse.

    Returns:
        The repaired text. It is not guaranteed to be valid JSON; callers
        must still parse it and handle failure.
    """
    control_escapes = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}
    closers = {'{': '}', '[': ']'}

    def drop_trailing_comma(buf: list) -> None:
        # Remove a comma (plus any whitespace after it) at the end of buf.
        # Only called while outside a string, so the comma is structural.
        idx = len(buf) - 1
        while idx >= 0 and buf[idx].isspace():
            idx -= 1
        if idx >= 0 and buf[idx] == ',':
            del buf[idx:]

    out = []
    pending = []  # closers still owed, innermost container last
    in_string = False
    escape_next = False

    for ch in js:
        if in_string:
            if escape_next:
                # Character following a backslash is copied verbatim.
                out.append(ch)
                escape_next = False
            elif ch == '\\':
                out.append(ch)
                escape_next = True
            elif ch == '"':
                out.append(ch)
                in_string = False
            elif ch in control_escapes:
                out.append(control_escapes[ch])
            elif ch < ' ':
                # JSON forbids every raw control character inside strings.
                out.append(f'\\u{ord(ch):04x}')
            else:
                out.append(ch)
        elif ch == '"':
            out.append(ch)
            in_string = True
        elif ch in closers:
            pending.append(closers[ch])
            out.append(ch)
        elif ch in '}]':
            drop_trailing_comma(out)
            # A mismatched closer is left alone; parsing will report it.
            if pending and pending[-1] == ch:
                pending.pop()
            out.append(ch)
        else:
            out.append(ch)

    if in_string:
        if escape_next:
            # A lone backslash at the cut would escape the quote we add.
            out.pop()
        out.append('"')

    if pending:
        drop_trailing_comma(out)
        out.extend(reversed(pending))

    return ''.join(out)
90+
91+
1892
def strict_load_json(text: str) -> Tuple[Dict[str, Any] | None, str | None]:
    """Extract the first JSON object from ``text`` and parse it into a dict.

    Parsing is attempted twice: first on the extracted text as-is, then on
    the output of ``_repair_json`` if the first attempt fails to decode.

    Args:
        text: Raw model output that should contain a JSON object.

    Returns:
        ``(obj, None)`` on success, otherwise ``(None, error_message)``.
        The message is ``"No JSON object found"``, ``"Root is not dict: ..."``
        or ``"JSONDecodeError: ..."`` (carrying the error of the last attempt).
    """
    js = extract_first_json_object(text)
    if js is None:
        return None, "No JSON object found"

    last_error = None
    # Candidates are produced lazily so repair only runs if the raw parse fails.
    for prepare in (lambda raw: raw, _repair_json):
        try:
            obj = json.loads(prepare(js))
        except (json.JSONDecodeError, RecursionError) as e:
            # RecursionError: pathologically nested input must yield an error
            # tuple rather than propagate to the caller.
            last_error = e
            continue
        if not isinstance(obj, dict):
            return None, f"Root is not dict: {type(obj)}"
        return obj, None

    return None, f"JSONDecodeError: {str(last_error)}"
29115

30116
def validate_integrity_shape(obj: Dict[str, Any]) -> Tuple[Dict[str, Any] | None, str | None]:

0 commit comments

Comments
 (0)