Skip to content

Commit 3d23e1c

Browse files
authored
Merge pull request #10 from modelscope/dev/shuchang_newjudge
# Pull Request: Deep Finance Judge System Enhancement
2 parents 6133583 + f785b22 commit 3d23e1c

33 files changed

+4268
-111
lines changed

.gitattributes

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
*.7z filter=lfs diff=lfs merge=lfs -text
2+
*.arrow filter=lfs diff=lfs merge=lfs -text
3+
*.bin filter=lfs diff=lfs merge=lfs -text
4+
*.bin.* filter=lfs diff=lfs merge=lfs -text
5+
*.bz2 filter=lfs diff=lfs merge=lfs -text
6+
*.ftz filter=lfs diff=lfs merge=lfs -text
7+
*.gz filter=lfs diff=lfs merge=lfs -text
8+
*.h5 filter=lfs diff=lfs merge=lfs -text
9+
*.joblib filter=lfs diff=lfs merge=lfs -text
10+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
11+
*.model filter=lfs diff=lfs merge=lfs -text
12+
*.msgpack filter=lfs diff=lfs merge=lfs -text
13+
*.onnx filter=lfs diff=lfs merge=lfs -text
14+
*.ot filter=lfs diff=lfs merge=lfs -text
15+
*.parquet filter=lfs diff=lfs merge=lfs -text
16+
*.pb filter=lfs diff=lfs merge=lfs -text
17+
*.pt filter=lfs diff=lfs merge=lfs -text
18+
*.pth filter=lfs diff=lfs merge=lfs -text
19+
*.rar filter=lfs diff=lfs merge=lfs -text
20+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21+
*.tar.* filter=lfs diff=lfs merge=lfs -text
22+
*.tflite filter=lfs diff=lfs merge=lfs -text
23+
*.tgz filter=lfs diff=lfs merge=lfs -text
24+
*.wasm filter=lfs diff=lfs merge=lfs -text
25+
*.xz filter=lfs diff=lfs merge=lfs -text
26+
*.zip filter=lfs diff=lfs merge=lfs -text
27+
*.zstandard filter=lfs diff=lfs merge=lfs -text
28+
*tfevents* filter=lfs diff=lfs merge=lfs -text
29+
# Audio files - uncompressed
30+
*.pcm filter=lfs diff=lfs merge=lfs -text
31+
*.sam filter=lfs diff=lfs merge=lfs -text
32+
*.raw filter=lfs diff=lfs merge=lfs -text
33+
# Audio files - compressed
34+
*.aac filter=lfs diff=lfs merge=lfs -text
35+
*.flac filter=lfs diff=lfs merge=lfs -text
36+
*.mp3 filter=lfs diff=lfs merge=lfs -text
37+
*.ogg filter=lfs diff=lfs merge=lfs -text
38+
*.wav filter=lfs diff=lfs merge=lfs -text

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ tutorial/example_deep_finance/scripts/*
160160
flash_attn-2.8.*.whl
161161
tutorial/example_deep_finance/prepare_data/*
162162
tutorial/example_deep_finance/judge/analytical_sufficiency/*
163+
tutorial/example_deep_finance/output_report/*
164+
dataset_gsm8k/*
163165

164166
.dockerignore
165167
benchmark_datasets

ajet/utils/metric_helper/reward_metric_helper.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,10 @@ def compute_reward_metrics(reward_stats_list: List[Dict[str, Any]], prefix: str
8383
openjudge_graders = [
8484
"presentation_quality",
8585
"grounding",
86-
"planning"
86+
"planning",
87+
"audit",
88+
"traceability",
89+
"cgcv"
8790
]
8891

8992
for grader_name in openjudge_graders:

tutorial/example_deep_finance/deep_finance.md

Lines changed: 358 additions & 1 deletion
Large diffs are not rendered by default.

tutorial/example_deep_finance/deep_finance.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010

1111
# 创建信号量,允许同时60个线程运行
12-
sem = threading.Semaphore(30)
12+
sem = threading.Semaphore(60)
1313

1414
class ExampleDeepResearchProtocol(Workflow):
1515

@@ -125,9 +125,9 @@ async def execute(
125125
if info:
126126
if 'tool_stats' in info:
127127
latest_tool_stats = info['tool_stats']
128-
if latest_tool_stats.get('total_calls', 0) > 0:
129-
logger.info(f"步骤 {step + 1} 工具统计: 调用={latest_tool_stats.get('total_calls', 0)}, "
130-
f"成功率={latest_tool_stats.get('success_rate', 0):.1f}%")
128+
# if latest_tool_stats.get('total_calls', 0) > 0:
129+
# logger.info(f"步骤 {step + 1} 工具统计: 调用={latest_tool_stats.get('total_calls', 0)}, "
130+
# f"成功率={latest_tool_stats.get('success_rate', 0):.1f}%")
131131
if 'reward_stats' in info:
132132
latest_reward_stats = info['reward_stats']
133133
# 累加工具调用时间

tutorial/example_deep_finance/deep_finance.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,20 @@ JUDGE_CONCURRENCY=10
1515
RM_WEIGHT=0.5
1616
PRESENTATION_QUALITY_WEIGHT=0.25
1717
GROUNDING_WEIGHT=0.25
18+
CGCV_WEIGHT=0.0 # 不使用 CGCV,设为 0
19+
AUDIT_WEIGHT=0.0 # 不使用 Audit,设为 0
20+
TRACEABILITY_WEIGHT=0.0 # 不使用 Traceability,设为 0
21+
EBTU_WEIGHT=0.0 # 不使用 EBTU,设为 0
1822

1923
# 训练参数配置
2024
NUM_REPEAT=4 # group size,每个query rollout NUM_REPEAT次
2125
TRAIN_BATCH_SIZE=32 # 训练batchsize
2226
NUM_STEPS=6 # 每个样本step轮数
2327
DEEPFINANCE_TOOL_RESULT_MAX_CHARS=10000
2428

29+
# Env Service URL 配置
30+
ENV_SERVICE_URL="http://127.0.0.1:8080" # 环境服务地址
31+
2532
# 主目录(需要更改)
2633
export AJET_ROOT="/mnt/data_cpfs/taoshuchang.tsc/deepresearch/AgentJet_new"
2734

@@ -57,6 +64,10 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
5764
-e "s|{{RM_WEIGHT}}|${RM_WEIGHT}|g" \
5865
-e "s|{{PRESENTATION_QUALITY_WEIGHT}}|${PRESENTATION_QUALITY_WEIGHT}|g" \
5966
-e "s|{{GROUNDING_WEIGHT}}|${GROUNDING_WEIGHT}|g" \
67+
-e "s|{{CGCV_WEIGHT}}|${CGCV_WEIGHT}|g" \
68+
-e "s|{{AUDIT_WEIGHT}}|${AUDIT_WEIGHT}|g" \
69+
-e "s|{{TRACEABILITY_WEIGHT}}|${TRACEABILITY_WEIGHT}|g" \
70+
-e "s|{{EBTU_WEIGHT}}|${EBTU_WEIGHT}|g" \
6071
-e "s|{{OPENJUDGE_LLM}}|${OPENJUDGE_LLM}|g" \
6172
-e "s|{{RM_LLM}}|${RM_LLM}|g" \
6273
-e "s|{{JUDGE_CONCURRENCY}}|${JUDGE_CONCURRENCY}|g" \
@@ -68,10 +79,11 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
6879
-e "s|{{TRAIN_REF_ANS_PATH}}|${TRAIN_REF_ANS_PATH}|g" \
6980
-e "s|{{VAL_REF_ANS_PATH}}|${VAL_REF_ANS_PATH}|g" \
7081
-e "s|{{CKPT_SAVE_PATH}}|${CKPT_SAVE_PATH}|g" \
82+
-e "s|{{ENV_SERVICE_URL}}|${ENV_SERVICE_URL}|g" \
7183
${AJET_ROOT}/${CONFIG_TEMPLATE} > ${CONFIG_FILE}
7284

7385
echo "配置文件已生成: ${CONFIG_FILE}"
74-
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
86+
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, CGCV=${CGCV_WEIGHT}, Audit=${AUDIT_WEIGHT}, Traceability=${TRACEABILITY_WEIGHT}, EBTU=${EBTU_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
7587

7688
#===============================================================================
7789
# 3. 环境配置

tutorial/example_deep_finance/deep_finance.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ ajet:
3737
max_env_worker: 64 # 增加环境并行数
3838
max_num_seqs: 64 # 增加VLLM并发序列数
3939
max_response_length_in_one_turn: 8000
40-
max_model_len: 50000
40+
max_model_len: 40960
4141
agent_madness_reward: 0.0
4242
compute_madness_checklist: None
4343
multi_turn:

tutorial/example_deep_finance/deep_finance_judge.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from openjudge.models.openai_chat_model import OpenAIChatModel
1717
from openjudge.runner.grading_runner import GraderConfig, GradingRunner
18-
from tutorial.example_deep_finance.judge import PresentationQualityGrader, GroundingGrader
18+
from tutorial.example_deep_finance.judge import PresentationQualityGrader, GroundingGrader, CGCVGrader, AuditGrader, TraceabilityRewardGrader, EBTUTraceabilityGrader
1919

2020

2121

@@ -103,7 +103,11 @@ def _setup_weights(self):
103103
self.w = {
104104
"rm": getattr(cfg, "rm_weight", 1.0) if cfg else 1.0, # RM Gallery 权重
105105
"presentation_quality": getattr(cfg, "presentation_quality_weight", 0.25) if cfg else 0.25,
106-
"grounding": getattr(cfg, "grounding_weight", 0.25) if cfg else 0.25,
106+
"grounding": getattr(cfg, "grounding_weight", 0.0) if cfg else 0.0, # 引用规范性评估
107+
"cgcv": getattr(cfg, "cgcv_weight", 0.25) if cfg else 0.25, # Citation-Grounded Claim Verification
108+
"audit": getattr(cfg, "audit_weight", 0.0) if cfg else 0.0, # Audit Grader: audit reward 引用逻辑审计
109+
"traceability": getattr(cfg, "traceability_weight", 0.0) if cfg else 0.0, # 可追溯性/可核验性审计 (TVR)
110+
"ebtu": getattr(cfg, "ebtu_weight", 0.0) if cfg else 0.0, # Audit Grader: audit reward EBTU证据优先可追溯性审计
107111
}
108112

109113
# 归一化(注意:action_loop 是惩罚项,不参与归一化;rm 需要参与归一化)
@@ -256,6 +260,26 @@ def extract_report_content(data: Dict) -> str:
256260
grader=GroundingGrader(model=model),
257261
mapper=lambda data: {"traj": data},
258262
),
263+
# CGCV: Citation-Grounded Claim Verification - 引用锚定的断言验证
264+
"cgcv": GraderConfig(
265+
grader=CGCVGrader(model=model),
266+
mapper=lambda data: {"traj": data},
267+
),
268+
# Audit: 引用逻辑审计 - 验证引用是否严格符合逻辑蕴含原则
269+
"audit": GraderConfig(
270+
grader=AuditGrader(model=model),
271+
mapper=lambda data: {"traj": data},
272+
),
273+
# Traceability: 可追溯性/可核验性审计 - 验证报告断言是否有证据锚点支撑
274+
"traceability": GraderConfig(
275+
grader=TraceabilityRewardGrader(model=model),
276+
mapper=lambda data: {"traj": data},
277+
),
278+
# EBTU: 证据优先可追溯性审计 - Evidence-Backed Trace Units
279+
"ebtu": GraderConfig(
280+
grader=EBTUTraceabilityGrader(model=model),
281+
mapper=lambda data: {"traj": data},
282+
),
259283
}
260284

261285
def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowOutput) -> Tuple[float, bool]:

tutorial/example_deep_finance/deep_finance_single.sh

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,30 @@ JUDGE_CONCURRENCY=10
1515
RM_WEIGHT=0.5
1616
PRESENTATION_QUALITY_WEIGHT=0.25
1717
GROUNDING_WEIGHT=0.25
18+
CGCV_WEIGHT=0.0 # 不使用 CGCV,设为 0
19+
AUDIT_WEIGHT=0.0 # 不使用 Audit,设为 0
20+
TRACEABILITY_WEIGHT=0.0 # 不使用 Traceability,设为 0
21+
EBTU_WEIGHT=0.0 # 不使用 EBTU,设为 0
1822

1923
# 训练参数配置
2024
NUM_REPEAT=4 # group size,每个query rollout NUM_REPEAT次
2125
TRAIN_BATCH_SIZE=32 # 训练batchsize
2226
NUM_STEPS=6 # 每个样本step轮数
2327
DEEPFINANCE_TOOL_RESULT_MAX_CHARS=10000
2428

29+
# Env Service URL 配置
30+
ENV_SERVICE_URL="http://127.0.0.1:8080" # 环境服务地址
31+
2532
# 主目录(需要更改)
2633
export AJET_ROOT="/mnt/data_cpfs/taoshuchang.tsc/deepresearch/AgentJet_new"
2734

28-
NNODES=${WORLD_SIZE}
35+
# 单机调试配置(默认值)
36+
NNODES=${WORLD_SIZE:-1}
37+
GPUS_PER_NODE=8
38+
CURRENT_TIME=$(date "+%Y%m%d_%H%M%S")
39+
LOG_DIR="${AJET_ROOT}/logs/${PREFIX}"
40+
TRAIN_LOG="${LOG_DIR}/train_${SUFFIX}_${CURRENT_TIME}.log"
41+
mkdir -p ${LOG_DIR}
2942

3043
# 涉密的配置(API_KEY以及模型、数据位置)从.env读取
3144
cd ${AJET_ROOT}
@@ -42,6 +55,9 @@ else
4255
echo -e "\033[31m警告: 找不到 .env 文件: $ENV_FILE\033[0m"
4356
fi
4457

58+
export MODEL_PATH="/mnt/data_cpfs/taoshuchang.tsc/models/Qwen3-8B"
59+
60+
4561
#===============================================================================
4662
# 2. 动态生成配置文件 (从yaml template生成yaml)
4763
#===============================================================================
@@ -57,6 +73,10 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
5773
-e "s|{{RM_WEIGHT}}|${RM_WEIGHT}|g" \
5874
-e "s|{{PRESENTATION_QUALITY_WEIGHT}}|${PRESENTATION_QUALITY_WEIGHT}|g" \
5975
-e "s|{{GROUNDING_WEIGHT}}|${GROUNDING_WEIGHT}|g" \
76+
-e "s|{{CGCV_WEIGHT}}|${CGCV_WEIGHT}|g" \
77+
-e "s|{{AUDIT_WEIGHT}}|${AUDIT_WEIGHT}|g" \
78+
-e "s|{{TRACEABILITY_WEIGHT}}|${TRACEABILITY_WEIGHT}|g" \
79+
-e "s|{{EBTU_WEIGHT}}|${EBTU_WEIGHT}|g" \
6080
-e "s|{{OPENJUDGE_LLM}}|${OPENJUDGE_LLM}|g" \
6181
-e "s|{{RM_LLM}}|${RM_LLM}|g" \
6282
-e "s|{{JUDGE_CONCURRENCY}}|${JUDGE_CONCURRENCY}|g" \
@@ -68,10 +88,11 @@ sed -e "s|{{SUFFIX}}|${SUFFIX}|g" \
6888
-e "s|{{TRAIN_REF_ANS_PATH}}|${TRAIN_REF_ANS_PATH}|g" \
6989
-e "s|{{VAL_REF_ANS_PATH}}|${VAL_REF_ANS_PATH}|g" \
7090
-e "s|{{CKPT_SAVE_PATH}}|${CKPT_SAVE_PATH}|g" \
91+
-e "s|{{ENV_SERVICE_URL}}|${ENV_SERVICE_URL}|g" \
7192
${AJET_ROOT}/${CONFIG_TEMPLATE} > ${CONFIG_FILE}
7293

7394
echo "配置文件已生成: ${CONFIG_FILE}"
74-
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
95+
echo "参数确认: RM=${RM_WEIGHT}, PresentationQuality=${PRESENTATION_QUALITY_WEIGHT}, Grounding=${GROUNDING_WEIGHT}, CGCV=${CGCV_WEIGHT}, Audit=${AUDIT_WEIGHT}, Traceability=${TRACEABILITY_WEIGHT}, EBTU=${EBTU_WEIGHT}, OpenJudge=${OPENJUDGE_LLM}, RM_LLM=${RM_LLM}"
7596

7697

7798
#===============================================================================
@@ -115,15 +136,16 @@ export RAY_CLUSTER_MODE="multi_node"
115136
#===============================================================================
116137
# 6. 主流程
117138
#===============================================================================
118-
log "节点数: ${NNODES}, 每节点GPU数: ${GPUS_PER_NODE}"
119-
mkdir -p ${LOG_DIR}
120-
mkdir -p $(dirname ${CONFIG_FILE})
139+
log "单机调试模式: NNODES=${NNODES}, GPUS_PER_NODE=${GPUS_PER_NODE}"
121140

122141
#===============================================================================
123142
# 6.1 Master 节点启动流程
124143
#===============================================================================
125144
# 启动训练任务(最核心)
145+
# 请注意:只有单节点需要 --with-ray,多节点运行时应删除该参数
126146
python ajet/launcher.py \
127147
--conf ${CONFIG_FILE} \
148+
--with-deepfinance \
149+
--with-ray \
128150
--backbone="debug" \
129151
2>&1 | tee ${TRAIN_LOG}
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
# 使得可以通过 from judge import PresentationQualityGrader 直接引用
22
from .grounding.grader import GroundingGrader
33
from .presentation_quality.grader import PresentationQualityGrader
4+
from .cgcv.grader import CGCVGrader
5+
from .audit.grader import AuditGrader
6+
from .traceability.grader import TraceabilityRewardGrader
7+
from .ebtu.grader import EBTUTraceabilityGrader
48
# from .research_depth.grader import ResearchDepthGrader
59
# from .research_breadth.grader import ResearchBreadthGrader
610

711
# 以后添加了其他 grader 也可以加在这里
812
# from .grounding.grader import GroundingGrader
913
# from .research_breadth.grader import ResearchBreadthGrader
1014
# __all__ = ["PresentationQualityGrader", "GroundingGrader", "ResearchDepthGrader", "ResearchBreadthGrader"]
11-
__all__ = ["PresentationQualityGrader", "GroundingGrader"]
15+
__all__ = ["PresentationQualityGrader", "GroundingGrader", "CGCVGrader", "AuditGrader", "TraceabilityRewardGrader", "EBTUTraceabilityGrader"]

0 commit comments

Comments
 (0)