Skip to content

Commit 8a8064f

Browse files
authored
deep-fin-pre-commit-patch (#15)
* deep-fin-pre-commit-patch * revise openclaw training
1 parent f556a55 commit 8a8064f

File tree

7 files changed

+259
-37
lines changed

7 files changed

+259
-37
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,4 @@ werewolves_swarm
174174
.claude
175175
tensorboard_log
176176
tutorial/**/*.json
177+
node_modules

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ repos:
55
- id: trailing-whitespace
66
- id: end-of-file-fixer
77
- id: check-yaml
8+
exclude: ^tutorial/example_deep_finance/
89
- id: check-added-large-files
910
- id: check-ast
1011
- id: check-json

tutorial/example_deep_finance/deep_finance_judge.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def _load(path, key):
200200
_load(train_ref_ans_path, "train")
201201
_load(val_ref_ans_path, "val")
202202

203-
def _get_reference_data(self, task_id: str) -> Tuple[str, str]:
203+
def _get_reference_data(self, task_id: str) -> Tuple[str, str | None]:
204204
"""获取任务的参考答案和领域"""
205205
cache_key = "val" if task_id.startswith("val_") else "train"
206206
ans = DeepFinanceJudgeByOpenJudge._ref_answers_cache.get(cache_key, {}).get(task_id, "")
@@ -301,8 +301,8 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO
301301

302302
# 1. 提取输入数据
303303
history = metadata.get("conversation_history", [])
304-
query = metadata.get("query") or getattr(workflow_task.task, "main_query", "")
305-
task_id = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "")
304+
query: str = metadata.get("query") or getattr(workflow_task.task, "main_query", "")
305+
task_id: str = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "")
306306
rubrics = metadata.get("rubrics") # 可能是 None 或 list of dicts
307307
step_reward = metadata.get("reward_stats", {}).get("step_reward", 0.0)
308308
chat_date = metadata.get("chat_date") if metadata else datetime.now().strftime("%Y-%m-%d")
@@ -318,7 +318,7 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO
318318
# RM Gallery 耗时记录
319319
rm_start_time = time.time()
320320
if self._rm_enabled and self.rm_evaluator:
321-
rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain)
321+
rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain or "")
322322
else:
323323
rm_raw = 0.0
324324
rm_time = time.time() - rm_start_time
@@ -788,19 +788,20 @@ def _save_evaluation_log(self, task_id: str, grader_results: Dict[str, List[Any]
788788
保存 OpenJudge 评估日志(可选)
789789
"""
790790
try:
791+
grader_results_log: Dict[str, List[Dict[str, Any]]] = {}
791792
log = {
792793
"task_id": task_id,
793794
"query": query,
794795
"timestamp": datetime.now().isoformat(),
795-
"grader_results": {}
796+
"grader_results": grader_results_log
796797
}
797798

798799
# 简化 grader_results 以便序列化
799800
for grader_name, score_list in grader_results.items():
800-
log["grader_results"][grader_name] = []
801+
grader_results_log[grader_name] = []
801802
for score in score_list:
802803
if hasattr(score, "score"):
803-
log["grader_results"][grader_name].append({
804+
grader_results_log[grader_name].append({
804805
"score": score.score,
805806
"reason": score.reason[:200] if hasattr(score, "reason") else "",
806807
})

tutorial/example_deep_finance/judge/cgcv/json_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ def validate_cgcv_schema(obj: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]],
299299
# 验证 status
300300
if normalized["status"] not in VALID_STATUSES:
301301
# 尝试模糊匹配
302-
status_lower = normalized["status"]
302+
status_lower: str = normalized["status"]
303303
matched = False
304304
for valid_status in VALID_STATUSES:
305305
if valid_status in status_lower or status_lower in valid_status:

tutorial/opencode_build_openclaw_agent/README.md

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,19 @@ In a new terminal:
7575

7676
```bash
7777
cd tutorial/opencode_build_openclaw_agent
78+
79+
# Option 1: Use OpenJudge pointwise grading (default)
80+
export AJET_SWARM_URL="http://localhost:10086"
81+
export NUM_REPEAT=4
82+
export REWARD_MODE=pointwise
83+
export DASHSCOPE_API_KEY=your_api_key_here
84+
python fake_vllm_endpoint.py
85+
86+
# Option 2: Use OpenJudge listwise ranking
7887
export AJET_SWARM_URL="http://localhost:10086"
7988
export NUM_REPEAT=4
89+
export REWARD_MODE=listwise
90+
export DASHSCOPE_API_KEY=your_api_key_here
8091
python fake_vllm_endpoint.py
8192
```
8293

@@ -113,13 +124,40 @@ Key parameters in `fake_vllm_endpoint.py`:
113124
- `num_repeat=4` - GRPO N parameter (responses per query)
114125
- `model` - Base model path
115126

127+
Environment variables for reward computation:
128+
129+
- `REWARD_MODE` - Reward computation mode: `pointwise` (default) or `listwise`
130+
- `DASHSCOPE_API_KEY` - API key for OpenJudge LLM grader
131+
- `JUDGE_BASE_URL` - Base URL for judge model API (default: DashScope)
132+
- `JUDGE_MODEL` - Judge model name (default: `qwen-plus`)
133+
116134
## Reward Function
117135

118-
The `ExtraversionGrader` evaluates responses on a 1-10 scale:
119-
- 1 = Highly introverted (reserved, quiet)
120-
- 10 = Highly extraverted (energetic, enthusiastic)
136+
Two OpenJudge-based reward modes are available:
137+
138+
### 1. Pointwise Mode (Default)
121139

122-
Scores are normalized to [-1, 1] for GRPO training.
140+
Uses OpenJudge LLM grader to evaluate each response independently:
141+
- Evaluates extraversion traits on a 0.0-1.0 scale (0.0 = highly introverted, 1.0 = highly extraverted)
142+
- Provides detailed reasoning for each score
143+
- Scores (already in the 0-1 range) are used directly as rewards for GRPO training
144+
145+
```bash
146+
export REWARD_MODE=pointwise
147+
export DASHSCOPE_API_KEY=your_api_key_here
148+
```
149+
150+
### 2. Listwise Mode
151+
152+
Uses OpenJudge to rank all responses together:
153+
- Compares responses directly against each other
154+
- Produces relative rankings
155+
- Best for capturing subtle differences
156+
157+
```bash
158+
export REWARD_MODE=listwise
159+
export DASHSCOPE_API_KEY=your_api_key_here
160+
```
123161

124162
## Monitoring
125163

Lines changed: 113 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,129 @@
11
# -*- coding: utf-8 -*-
2-
"""Compute relative rewards based on extraversion personality alignment."""
2+
"""Compute relative rewards based on extraversion personality alignment using OpenJudge."""
33

4+
import os
45
from typing import List, Dict
56
from beast_logger import print_listofdict
7+
from openjudge.graders.base_grader import GraderMode, GraderScore, GraderRank
8+
from openjudge.graders.llm_grader import LLMGrader
9+
from openjudge.models import OpenAIChatModel
610

7-
def score_extraversion(response_text: str) -> float:
8-
"""Score response for extraversion traits (1-10 scale)."""
9-
extraversion_keywords = [
10-
'excited', 'love', 'amazing', 'awesome', 'fantastic', 'great',
11-
'wonderful', 'thrilled', 'energetic', 'enthusiastic', 'fun',
12-
'social', 'outgoing', 'active', 'lively', 'vibrant', 'happy',
13-
'enjoy', 'delighted', 'cheerful', 'positive'
14-
]
11+
# Configuration
# Every value below is read from the environment once, when this module is imported.
# Reward mode: "listwise" selects listwise ranking; any other value uses pointwise grading.
REWARD_MODE = os.getenv("REWARD_MODE", "pointwise")  # Options: pointwise, listwise
# NOTE(review): "sk-xxx" is a placeholder default, not a usable key — presumably
# judge requests fail unless DASHSCOPE_API_KEY is set; confirm.
API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-xxx")
# Base URL of the judge model API (defaults to the DashScope compatible-mode endpoint).
BASE_URL = os.getenv("JUDGE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")
# Name of the judge model.
JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen-plus")
1516

16-
text_lower = response_text.lower()
17-
score = 5.0
17+
# OpenJudge grader setup
# Judge model client, created once at import time from the configuration above
# and passed as `model=` to both the pointwise and the listwise grader.
judge_model = OpenAIChatModel(
    model=JUDGE_MODEL,
    api_key=API_KEY,
    base_url=BASE_URL,
)
1823

19-
for keyword in extraversion_keywords:
20-
if keyword in text_lower:
21-
score += 0.5
24+
EXTRAVERSION_PROMPT = """You are evaluating responses for extraversion personality traits.
2225
23-
score += min(response_text.count('!') * 0.3, 2.0)
26+
Extraversion characteristics include:
27+
- Outgoing, energetic, enthusiastic tone
28+
- Social engagement and excitement
29+
- Positive, upbeat language
30+
- Action-oriented expressions
31+
- Use of exclamation marks and emotional words
2432
25-
if len(response_text) < 50:
26-
score -= 1.0
33+
Rate the response on a scale of 0.0-1.0:
34+
0.0 = Highly introverted (reserved, quiet, minimal emotion)
35+
1.0 = Highly extraverted (energetic, enthusiastic, very expressive)
2736
28-
return max(1.0, min(10.0, score))
37+
Question: {question}
38+
Response: {response}
2939
30-
async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]:
31-
"""Compute relative rewards for extraversion alignment."""
40+
Return a json object with exactly two fields:
41+
- "score": float between 0.0 and 1.0
42+
- "reason": brief explanation"""
43+
44+
def build_listwise_template(n: int) -> str:
    """Return the listwise ranking prompt for ``n`` candidate responses.

    The returned text is itself a format template: it keeps a literal
    ``{question}`` placeholder plus one numbered ``{answer_k}`` placeholder
    (k = 1..n) per response, to be filled in when the prompt is rendered.
    """
    # One "k. {answer_k}" line per response, numbered from 1.
    answer_slots = "\n".join(f"{k}. {{answer_{k}}}" for k in range(1, n + 1))

    prompt_lines = [
        "You are ranking multiple responses based on extraversion personality traits.",
        "",
        "Extraversion characteristics include:",
        "- Outgoing, energetic, enthusiastic tone",
        "- Social engagement and excitement",
        "- Positive, upbeat language",
        "- Action-oriented expressions",
        "",
        "Question: {question}",
        "",
        "Responses to rank:",
        answer_slots,
        "",
        "Rank these responses from most extraverted to least extraverted.",
        "Return a json object with exactly two fields:",
        '- "rank": list of integers (1-indexed) ordered from most to least extraverted, e.g. [2, 1, 3]',
        '- "reason": brief explanation of the ranking',
    ]
    return "\n".join(prompt_lines)
64+
65+
# Module-level pointwise grader, created once at import time.
# It sends EXTRAVERSION_PROMPT to the shared judge model; the prompt's
# {question} and {response} placeholders match the keyword arguments passed to
# aevaluate() in compute_pointwise_rewards.
pointwise_grader = LLMGrader(
    name="extraversion_pointwise",
    mode=GraderMode.POINTWISE,
    description="Evaluate extraversion traits",
    model=judge_model,
    template=EXTRAVERSION_PROMPT,
)
72+
73+
74+
async def compute_pointwise_rewards(question: str, all_answers: List[Dict]) -> List[float]:
    """Grade each answer independently with the pointwise OpenJudge grader.

    Answers are evaluated one after another. Each answer dict is updated in
    place with its reward under the ``"reward"`` key.

    Args:
        question: The question the answers respond to.
        all_answers: Answer dicts; the text is read from ``"content"``
            (a missing key is graded as an empty string).

    Returns:
        One reward per answer, in input order.
    """
    rewards = []
    for candidate in all_answers:
        verdict = await pointwise_grader.aevaluate(
            question=question,
            response=candidate.get("content", ""),
        )
        # The grader's score is used as-is (no rescaling); any result that is
        # not a GraderScore counts as a zero reward.
        reward = verdict.score if isinstance(verdict, GraderScore) else 0.0
        rewards.append(reward)
        candidate["reward"] = reward
    return rewards
88+
89+
90+
async def compute_listwise_rewards(question: str, all_answers: List[Dict]) -> List[float]:
    """Compute rewards using OpenJudge listwise ranking.

    All answers are sent to the judge in a single prompt. The returned ranking
    (1-indexed answer numbers ordered from most to least extraverted) is mapped
    linearly onto rewards: 1.0 for the top-ranked answer down to 0.0 for the
    last of ``n``. Each answer dict is updated in place with its reward under
    the ``"reward"`` key.

    Args:
        question: The question the answers respond to.
        all_answers: Answer dicts; the text is read from ``"content"``
            (a missing key is ranked as an empty string).

    Returns:
        One reward per answer, in input order. Answers the judge did not
        validly rank keep a reward of 0.0, as do all answers when the judge
        does not return a ``GraderRank``. An empty input yields ``[]``.
    """
    n = len(all_answers)
    if n == 0:
        # Nothing to rank: skip the judge call entirely.
        return []

    template = build_listwise_template(n)
    grader = LLMGrader(
        name="extraversion_listwise",
        mode=GraderMode.LISTWISE,
        description="Rank responses by extraversion",
        model=judge_model,
        template=template,
    )
    # Fill the template's {question} and {answer_1}..{answer_n} placeholders.
    kwargs = {"question": question}
    for i, ans in enumerate(all_answers, start=1):
        kwargs[f"answer_{i}"] = ans.get("content", "")

    result = await grader.aevaluate(**kwargs)

    scores = [0.0] * n
    if isinstance(result, GraderRank):
        # The ranking is produced by an LLM, so it can contain out-of-range,
        # repeated or non-integer entries. Skip those instead of crashing
        # (IndexError) or rewarding the wrong answer (0 or negative values
        # would index from the end of the list).
        seen = set()
        for idx in result.rank:
            if not isinstance(idx, int) or not 1 <= idx <= n or idx in seen:
                continue
            # Position among the valid entries: 0 = best. At most n entries
            # are accepted, so the reward always stays within [0.0, 1.0].
            position = len(seen)
            seen.add(idx)
            # rank 1 (best) -> 1.0, rank n (worst) -> 0.0; a lone answer gets 0.5
            scores[idx - 1] = 1.0 - (position / (n - 1)) if n > 1 else 0.5

    for answer, score in zip(all_answers, scores):
        answer["reward"] = score
    return scores
117+
118+
119+
async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]:
    """Compute relative rewards for extraversion alignment.

    The question is taken from the ``"question"`` key of the first entry of
    ``valid_results`` (empty string when the list is empty or the key is
    missing). Rewards are computed with listwise ranking when ``REWARD_MODE``
    is ``"listwise"`` and with pointwise grading for any other value. The
    annotated answers are printed before the rewards are returned.
    """
    question = ""
    if valid_results:
        question = valid_results[0].get("question", "")

    # Pointwise grading is the default for every mode other than "listwise".
    compute_rewards = (
        compute_listwise_rewards if REWARD_MODE == "listwise" else compute_pointwise_rewards
    )
    scores = await compute_rewards(question, all_answers)

    print_listofdict(all_answers, header=f"on_compute_relative_reward (mode={REWARD_MODE})")
    return scores
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/usr/bin/env python3
2+
"""Test script for on_compute_relative_reward.py using real OpenJudge API."""
3+
4+
import asyncio
5+
import sys
6+
import os
7+
8+
# Make the sibling on_compute_relative_reward module importable from this directory.
script_dir = os.path.dirname(__file__)
sys.path.insert(0, script_dir)
# Keep an already-exported key; otherwise fall back to the placeholder value.
os.environ.setdefault("DASHSCOPE_API_KEY", "sk-xxx")
10+
11+
12+
async def test_pointwise():
    """Run the pointwise reward mode end to end against the real judge API.

    Returns True when every check passes and False otherwise; failures are
    printed together with their traceback rather than raised.
    """
    import importlib
    import traceback

    print("\n=== Testing Pointwise Mode (real API) ===")
    # The reward module reads REWARD_MODE at import time, so set the variable
    # first and reload the module so the new value takes effect.
    os.environ["REWARD_MODE"] = "pointwise"
    import on_compute_relative_reward as mod
    importlib.reload(mod)

    valid_results = [{"question": "What are your thoughts on Paris?"}]
    contents = (
        "I'm so excited about Paris! It's amazing and wonderful!",
        "Paris is a city in France.",
        "I absolutely love Paris! The energy is fantastic and vibrant!",
    )
    all_answers = [{"content": text} for text in contents]

    try:
        scores = await mod.on_compute_relative_reward(valid_results, all_answers)
        print(f"Scores: {scores}")
        assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}"
        assert all(isinstance(s, float) for s in scores), "All scores should be floats"
        # Both extraverted answers (index 0 and 2) must beat the neutral one (index 1).
        neutral = scores[1]
        for extraverted in (scores[0], scores[2]):
            assert extraverted > neutral, f"Extraverted response should score higher than neutral: {scores}"
        print("✓ Pointwise mode test passed")
        return True
    except Exception as err:
        print(f"✗ Pointwise mode test failed: {err}")
        traceback.print_exc()
        return False
43+
44+
45+
async def test_listwise():
    """Run the listwise reward mode end to end against the real judge API.

    Returns True when every check passes and False otherwise; failures are
    printed together with their traceback rather than raised.
    """
    import importlib
    import traceback

    print("\n=== Testing Listwise Mode (real API) ===")
    # The reward module reads REWARD_MODE at import time, so set the variable
    # first and reload the module so the new value takes effect.
    os.environ["REWARD_MODE"] = "listwise"
    import on_compute_relative_reward as mod
    importlib.reload(mod)

    valid_results = [{"question": "What are your thoughts on Paris?"}]
    contents = (
        "I'm so excited about Paris! It's amazing and wonderful!",
        "Paris is a city in France.",
        "I absolutely love Paris! The energy is fantastic and vibrant!",
    )
    all_answers = [{"content": text} for text in contents]

    try:
        scores = await mod.on_compute_relative_reward(valid_results, all_answers)
        print(f"Scores: {scores}")
        assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}"
        assert all(isinstance(s, float) for s in scores), "All scores should be floats"
        # The neutral answer (index 1) must score below at least one of the
        # two extraverted answers (index 0 and 2).
        neutral = scores[1]
        assert any(neutral < other for other in (scores[0], scores[2])), \
            f"Neutral response should score lower than at least one extraverted response: {scores}"
        print("✓ Listwise mode test passed")
        return True
    except Exception as err:
        print(f"✗ Listwise mode test failed: {err}")
        traceback.print_exc()
        return False
76+
77+
78+
async def main():
    """Run the pointwise test, then the listwise test, and print a summary.

    Returns True only when both tests passed.
    """
    separator = "=" * 50
    print("Testing on_compute_relative_reward.py (real API)")
    print(separator)

    # Awaited one after the other, pointwise first.
    outcomes = [await test_pointwise(), await test_listwise()]

    print("\n" + separator)
    print(f"Tests passed: {sum(outcomes)}/{len(outcomes)}")
    return all(outcomes)
89+
90+
91+
# Script entry point: run the async test suite and use its result as the
# process exit status (0 when all tests passed, 1 otherwise).
if __name__ == "__main__":
    success = asyncio.run(main())
    sys.exit(0 if success else 1)

0 commit comments

Comments
 (0)