modelscope
diff --git a/docs/en/example_train_multi_model.md
Lines changed: 3 additions & 0 deletions b/docs/en/example_train_multi_model.md
Lines changed: 3 additions & 0 deletions
diff --git a/docs/en/example_train_multi_model.zh.md
Lines changed: 5 additions & 0 deletions b/docs/en/example_train_multi_model.zh.md
Lines changed: 5 additions & 0 deletions
diff --git a/tutorial/opencode_build_openclaw_agent/cheatsheet.md
Lines changed: 47 additions & 0 deletions b/tutorial/opencode_build_openclaw_agent/cheatsheet.md
Lines changed: 47 additions & 0 deletions
diff --git a/tutorial/opencode_build_openclaw_agent/fake_vllm_endpoint.py
Lines changed: 11 additions & 3 deletions b/tutorial/opencode_build_openclaw_agent/fake_vllm_endpoint.py
Lines changed: 11 additions & 3 deletions
@@ -90,6 +90,9 @@ graph TB
     C -->|end_episode + reward_14b| S2
 ```
 
+![Multi-model training architecture overview](https://img.alicdn.com/imgextra/i3/O1CN01vHfNt41LRcQeDMjE4_!!6000000001296-2-tps-1408-768.png)
+
+
 **Architecture Explanation**:
 
 - **Swarm Server 1 (Port 10086)**: Hosts the 7B model, responsible for Agent 1 and Agent 3's inference and training
 
@@ -88,6 +88,9 @@ graph TB
     C -->|end_episode + reward_14b| S2
 ```
 
+![多模型训练架构示意图](https://img.alicdn.com/imgextra/i3/O1CN01vHfNt41LRcQeDMjE4_!!6000000001296-2-tps-1408-768.png)
+
+
 **架构说明**：
 
 - **Swarm Server 1 (端口 10086)**：承载 7B 模型，负责 Agent 1 和 Agent 3 的推理与训练
@@ -176,6 +179,8 @@ sequenceDiagram
 4. 将各自的奖励汇报给对应的 Swarm Server
 5. 两个 Server 独立执行策略梯度更新
 
+
+
 ## 训练曲线
 
 ![训练曲线](https://img.alicdn.com/imgextra/i2/O1CN0161wtDk1zZwFmIX15x_!!6000000006729-2-tps-2978-1413.png)
 
@@ -0,0 +1,47 @@
+# OpenClaw Reward Cheatsheet
+
+## Run the test
+
+```bash
+cd agentjet/tutorial/opencode_build_openclaw_agent
+
+# pointwise (default)
+DASHSCOPE_API_KEY=your_key python test_reward.py
+
+# listwise
+REWARD_MODE=listwise DASHSCOPE_API_KEY=your_key python test_reward.py
+```
+
+## Run the training endpoint
+
+```bash
+# pointwise (default)
+AJET_SWARM_URL=http://localhost:10086 \
+DASHSCOPE_API_KEY=your_key \
+REWARD_MODE=pointwise \
+python fake_vllm_endpoint.py
+
+# listwise
+AJET_SWARM_URL=http://localhost:10086 \
+DASHSCOPE_API_KEY=your_key \
+REWARD_MODE=listwise \
+python fake_vllm_endpoint.py
+```
+
+## Reward modes
+
+| Mode | Description |
+|------|-------------|
+| `pointwise` | Each response scored independently (0.0–1.0) |
+| `listwise` | All responses ranked together (best=1.0, worst=0.0) |
+
+## Environment variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `REWARD_MODE` | `pointwise` | `pointwise` or `listwise` |
+| `DASHSCOPE_API_KEY` | — | DashScope API key (required) |
+| `JUDGE_MODEL` | `qwen-plus` | Judge model name |
+| `JUDGE_BASE_URL` | DashScope endpoint | Judge model base URL |
+| `AJET_SWARM_URL` | `http://localhost:10086` | Swarm server URL |
+| `NUM_REPEAT` | `4` | GRPO group size N (number of responses sampled per query) |
@@ -25,7 +25,7 @@
 import sys
 sys.path.insert(0, os.path.dirname(__file__))
 
-from on_user_submit_new_requests import on_user_submit_new_requests
+from on_user_submit_new_requests import on_user_submit_new_requests, get_query_history
 from on_compute_relative_reward import on_compute_relative_reward
 
 # Configuration
@@ -91,6 +91,14 @@ async def proxy_chat_completion(base_url: str, api_key: str, request: Request, i
     json_data = await request.json()
     json_data["stream"] = is_stream
 
+    # Remove fields not supported by vLLM to avoid warnings
+    UNSUPPORTED_FIELDS = {"strict", "store"}
+    for field in UNSUPPORTED_FIELDS:
+        json_data.pop(field, None)
+    # Also remove 'strict' from response_format if present
+    if "response_format" in json_data and isinstance(json_data["response_format"], dict):
+        json_data["response_format"].pop("strict", None)
+
     async with httpx.AsyncClient(timeout=300.0) as client:
         resp = await client.post(f"{base_url}/chat/completions", json=json_data, headers=headers)
         resp.raise_for_status()
@@ -200,7 +208,7 @@ async def handle_one2many_request(request: Request, request_id: str) -> Dict | L
 
     valid_results = await run_all_episodes(request, is_stream)
     all_answers = [extract_assistant_message(r.response) for r in valid_results]
-    rewards = await on_compute_relative_reward(valid_results, all_answers)
+    rewards = await on_compute_relative_reward(valid_results, all_answers, question=user_query)
 
     await finalize_episodes(task, valid_results, rewards)
 
@@ -259,7 +267,7 @@ async def health_check():
 @app.get("/requests")
 async def get_requests():
     """Get all recorded user requests."""
-    return {"requests": USER_REQUEST_RECORD}
+    return {"requests": get_query_history()}
 
 
 if __name__ == "__main__":