Skip to content

Commit 47812cb

Browse files
committed
revise pro-trans
1 parent f1edf19 commit 47812cb

File tree

7 files changed

+94
-76
lines changed

7 files changed

+94
-76
lines changed

ajet/context_tracker/multiagent_tracking.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def save_llm_interaction_timeline(self, tools, llm_ext_msg, timeline):
294294
# save to self.saved_timelines
295295
self.saved_timelines += [copy.deepcopy(timeline)]
296296

297-
# DEBUG = True # warn when merge fails
297+
# warn when merge fails
298298
timeline_merging_policy: TimelineMergingPolicyConfig = self.config.ajet.context_tracker.timeline_merging_policy
299299
if (
300300
self.config.ajet.context_tracker.detect_timeline_snap

ajet/task_runner/swarm_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
from ajet import Workflow
1717
from typing import Callable
1818

19-
DEBUG = True
19+
# DEBUG = True
20+
DEBUG = False
2021

2122
context = zmq.Context()
2223
atexit.register(context.term)

ajet/tuner_lib/weight_tuner/experimental/as_oai_model_client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
if TYPE_CHECKING:
2323
from ajet.context_tracker.multiagent_tracking import MultiAgentContextTracker
2424

25-
# DEBUG = False
26-
DEBUG = True
25+
DEBUG = False
26+
# DEBUG = True
2727

2828
def generate_auth_token(agent_name, target_tag, episode_uuid, episode_address):
2929
"""

ajet/tuner_lib/weight_tuner/experimental/as_oai_model_server.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ class HealthCheckRequest(BaseModel):
5454

5555
# Create FastAPI app
5656
SERVER_SHUTDOWN_EVENT = threading.Event()
57-
# DEBUG = False
58-
DEBUG = True
57+
DEBUG = False
58+
# DEBUG = True
5959

6060
context = zmq.Context()
6161
atexit.register(context.term)

ajet/tuner_lib/weight_tuner/experimental/as_swarm_server.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
VALID_STATUSES,
2727
)
2828

29-
DEBUG = True
29+
# DEBUG = True
30+
DEBUG = False
3031
RCVTIMEO = 2 * 1000
3132
RCVTIMEO_OUT = 300 * 1000
3233
RCVTIMEO_WAIT_N = RCVTIMEO_OUT // RCVTIMEO

tutorial/example_academic_trans/trans.py

Lines changed: 63 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
import os
44
import time
55
import asyncio
6-
import requests
76
import threading
87
from loguru import logger
98
from textwrap import dedent
9+
from openai import OpenAI
1010

1111
from ajet import WorkflowOutput
1212
from ajet.schema.task import Task
@@ -22,18 +22,6 @@
2222
from .trans_reward import TranslationQualityGrader, build_translation_quality_messages, examples
2323

2424

25-
LOCAL_DATASET_PATH = "/mnt/data_cpfs/qingxu.fu/agentjet/agentjet/tmp/arxiv_papers/train.parquet"
26-
27-
28-
# Handshake with swarm remote, then send training param to swarm remote (such as model to be trained, algorithm, etc)
29-
dataset = RouterTaskReader(
30-
reader_type = "huggingface_dat_repo",
31-
reader_config = AjetTaskReader(
32-
huggingface_dat_repo = HuggingfaceDatRepo(
33-
dataset_path = LOCAL_DATASET_PATH
34-
)
35-
)
36-
)
3725

3826
@retry_with_backoff(max_retry=3)
3927
def execute_agent(task: Task, api_baseurl_key: OpenaiBaseUrlAndApiKey):
@@ -48,68 +36,91 @@ def execute_agent(task: Task, api_baseurl_key: OpenaiBaseUrlAndApiKey):
4836
messages, rough_translate = rough_translate_agent(base_url, api_key, abstract)
4937
# print_listofdict(messages, header="rough_translate_agent", mod="c")
5038

51-
messages, fix_nouns = detect_hard_proper_nouns(messages, base_url, api_key, abstract, rough_translate)
39+
# messages, fix_nouns = detect_hard_proper_nouns(messages, base_url, api_key, abstract, rough_translate)
40+
messages, fix_nouns = detect_hard_proper_nouns(messages, grader_base_url, grader_api_key, abstract, rough_translate)
5241
# print_listofdict(messages, header="detect_hard_proper_nouns", mod="c")
5342

5443
messages, final_translation = produce_final_translation(messages, base_url, api_key, abstract, rough_translate, fix_nouns)
5544
print_listofdict(messages, header="final_translation", mod="c")
5645

57-
grader = TranslationQualityGrader(
58-
model=OpenAIChatModel(base_url=grader_base_url, api_key=grader_api_key, model="qwen-max")
59-
)
60-
grader_score = asyncio.run(grader.aevaluate(original_text=abstract, translation=final_translation))
61-
raw_reward = grader_score.score # Normalize to 0-1 range (score is 0-3)
46+
if final_translation is None:
47+
raw_reward = 0.0
48+
else:
49+
grader = TranslationQualityGrader(
50+
model=OpenAIChatModel(base_url=grader_base_url, api_key=grader_api_key, model="qwen3-max-2026-01-23")
51+
)
52+
grader_score = asyncio.run(grader.aevaluate(original_text=abstract, translation=final_translation))
53+
raw_reward = grader_score.score
54+
print(f"Grader Score: {grader_score.score}, Reason: {grader_score.reason}, Metadata: {grader_score.metadata}")
6255
return WorkflowOutput(reward=raw_reward, metadata={
6356
"rough_translate": rough_translate,
6457
"fix_nouns": fix_nouns,
6558
"final_translation": final_translation
6659
})
6760

6861

69-
def detect_hard_proper_nouns(messages, base_url, api_key, abstract, rough_translate):
62+
def produce_final_translation(messages, base_url, api_key, abstract, rough_translate, fix_nouns):
7063
messages = messages + [
71-
7264
{
7365
"role": "user",
74-
"content": "Your new job is to detect translation errors of discipline-specific proper nouns. "
75-
"Use json to list all errors found in the translation result and provide correction. "
76-
"Json format: [{\"original_word\": \"xxx\", \"wrong_translation\": \"xxx\", \"wrong_reason\": \"xxx\", \"correct_translation\": \"xxx\"}, ...]. "
77-
"If no errors are found, return an empty list []."
78-
"Please list all translation errors of discipline-specific proper nouns found in the translation result according to the requirements."
66+
"content": "Please produce the final, corrected Chinese translation by applying all the corrections listed above. "
67+
"Output only the final translation between <final_result> ... </final_result>, so I will extract result with regex."
7968
},
8069
]
8170

82-
response = requests.post( f"{base_url}/chat/completions", json = { "model": "qwen-turbo", "messages": messages, }, headers = { "Authorization": f"Bearer {api_key}" } )
83-
fix_nouns = response.json()['choices'][0]['message']['content']
71+
client = OpenAI(base_url=base_url, api_key=api_key)
72+
response = client.chat.completions.create(
73+
model="agentjet-model",
74+
messages=messages
75+
)
76+
final_translation = response.choices[0].message.content
77+
8478
messages += [
8579
{
8680
"role": "assistant",
87-
"content": fix_nouns
81+
"content": final_translation
8882
}
8983
]
90-
return messages, fix_nouns
9184

85+
# Extract final translation
86+
match = re.search(r"<final_result>(.*?)</final_result>", final_translation, re.DOTALL)
87+
if match:
88+
final_translation = match.group(1).strip()
89+
else:
90+
final_translation = None
9291

93-
def produce_final_translation(messages, base_url, api_key, abstract, rough_translate, fix_nouns):
92+
return messages, final_translation
93+
94+
95+
96+
def detect_hard_proper_nouns(messages, base_url, api_key, abstract, rough_translate):
9497
messages = messages + [
98+
9599
{
96100
"role": "user",
97-
"content": "Please produce the final, corrected Chinese translation by applying all the corrections listed above. "
98-
"Output only the final translation without any explanations or additional text."
101+
"content": "Your new job is to detect translation errors of discipline-specific proper nouns. "
102+
"Use json to list all errors found in the translation result and provide correction. "
103+
"Json format: [{\"original_word\": \"xxx\", \"wrong_translation\": \"xxx\", \"wrong_reason\": \"xxx\", \"correct_translation\": \"xxx\"}, ...]. "
104+
"If no errors are found, return an empty list []."
105+
"Please list all translation errors of discipline-specific proper nouns found in the translation result according to the requirements."
99106
},
100-
]
101107

102-
response = requests.post( f"{base_url}/chat/completions", json = { "model": "qwen-turbo", "messages": messages, }, headers = { "Authorization": f"Bearer {api_key}" } )
103-
final_translation = response.json()['choices'][0]['message']['content']
108+
]
104109

110+
client = OpenAI(base_url=base_url, api_key=api_key)
111+
response = client.chat.completions.create(
112+
model="qwen3-max-2026-01-23",
113+
messages=messages,
114+
extra_body={"enable_thinking":True}
115+
)
116+
fix_nouns = response.choices[0].message.content
105117
messages += [
106118
{
107119
"role": "assistant",
108-
"content": final_translation
120+
"content": fix_nouns
109121
}
110122
]
111-
112-
return messages, final_translation
123+
return messages, fix_nouns
113124

114125

115126
def rough_translate_agent(base_url, api_key, abstract):
@@ -123,9 +134,12 @@ def rough_translate_agent(base_url, api_key, abstract):
123134
"such as conforming to the logic of the Chinese language, being simple, rigorous, and concise, "
124135
"and avoiding the use of first-person pronouns when passive voice is appropriate. "
125136
"Ensure that specialized terms are translated correctly according to academic standards. "
126-
"Replace 我们 with 本研究 or 本文. "
127-
"If an abbreviation is short in Chinese, use Chinese. "
128-
"If an abbreviation is long in Chinese, use abbreviation. "
137+
"Replace 我/我们 with 本研究 or 本文 or 研究者 or simply remove it and rephrase the sentence. "
138+
"If an English abbreviation is short in Chinese, use Chinese. "
139+
"If an English abbreviation is long in Chinese, use English abbreviation. "
140+
"To use an English abbreviation, if the author has mentioned the full form first, mention the full form at its first appearance. "
141+
"e.g. `We have used the LAsMA heterodyne array installed on the Atacama Pathfinder EXperiment (APEX)` should be translated as "
142+
"`本研究使用了安装在阿塔卡马探路者实验望远镜(APEX, Atacama Pathfinder EXperiment)上的LAsMA外差阵列`. "
129143
},
130144
{
131145
"role": "user",
@@ -135,8 +149,13 @@ def rough_translate_agent(base_url, api_key, abstract):
135149

136150
for ex in examples:
137151
messages[0]['content'] += f"\n\nExample:\n\tOriginal: {ex['original']}\n\tBad Translation: {ex['bad']}\n\tHint: {ex['hint']}\n\tGood Translation: {ex['good']}"
138-
response = requests.post( f"{base_url}/chat/completions", json = { "model": "qwen-turbo", "messages": messages, }, headers = { "Authorization": f"Bearer {api_key}" } )
139-
rough_translate = response.json()['choices'][0]['message']['content']
152+
153+
client = OpenAI(base_url=base_url, api_key=api_key)
154+
response = client.chat.completions.create(
155+
model="agentjet-model",
156+
messages=messages
157+
)
158+
rough_translate = response.choices[0].message.content
140159
messages += [
141160
{
142161
"role": "assistant",
@@ -145,18 +164,3 @@ def rough_translate_agent(base_url, api_key, abstract):
145164
]
146165

147166
return messages, rough_translate
148-
149-
150-
151-
if __name__ == "__main__":
152-
153-
for i, task in enumerate(dataset.generate_training_tasks()):
154-
execute_agent(
155-
task,
156-
OpenaiBaseUrlAndApiKey(
157-
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
158-
api_key=os.environ.get("DASHSCOPE_API_KEY", "")
159-
)
160-
)
161-
162-

tutorial/example_academic_trans/trans_reward.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from openjudge.models.base_chat_model import BaseChatModel
55
from typing import List
66
from textwrap import dedent
7+
from beast_logger import print_listofdict
78

89

910
examples = [
@@ -68,15 +69,19 @@ def get_translation_quality_system_prompt() -> str:
6869
return dedent("""
6970
You are an objective translation quality evaluator for academic paper translations from English to Chinese. Your task is to identify ONLY the specific types of errors demonstrated in the provided examples - not general translation quality issues.
7071
71-
Focus (but do not limit to) on issues below (as shown in the examples):
72+
重点关注(但不限于)以下问题类型(如示例所示):
7273
73-
1. **First-person pronoun issues** - Using "我们" instead of "本研究" or "本文" in academic contexts
74-
2. **Abbreviation translation errors** - Using abbreviations when concise Chinese exists (e.g., "GWs" instead of "引力波"), or translating abbreviations that should remain in English (like "EMBB")
75-
3. **Word order problems** - Not adjusting sentence structure to emphasize key points in Chinese academic style
76-
4. **Subject-verb inconsistencies** - Mismatched subjects due to improper sentence structure (e.g., "在...中,本文展示..." where the subject is confused)
77-
5. **Inappropriate word choices** - Using colloquial or incorrect terms instead of proper academic expressions (e.g., "效率" vs "有效性" in certain contexts)
78-
6. **Redundant punctuation** - Unnecessary commas or other punctuation that disrupts Chinese reading flow
74+
1. **错误使用第一人称代词** - 禁止使用"我们"。正确的方法是使用"本研究""本文"、“研究者”,或者直接删除we并改写句子替换主语。不要漏掉出现的任何第一人称代词。
75+
2. **缩写翻译错误** - 当存在简洁的中文表达时使用缩写(例如,使用"GWs"而非"引力波"),或翻译本应保留英文的缩写(如"EMBB")
76+
3. **语序问题** - 未调整句子结构以符合中文学术风格强调重点的习惯
77+
4. **主谓不一致、主语缺失** - 由于句子结构不当导致主语混乱(例如,"在...中,本文展示..."中主语混淆)
78+
5. **用词不当** - 使用口语化或不正确的术语而非恰当的学术表达
79+
6. **多余标点和停顿** - 不必要的逗号或其他标点符号影响中文阅读流畅性
7980
7. **主语不清晰** - 中文句子主语缺失或不明确。例如:“通过该实验,证明了该药物对癌细胞有抑制作用”(缺少主语)
81+
8. **缩写问题** - 首次出现自定义缩写、且原文中已经提供自定义缩写的英文全称时,没有在首次出现的地方提供英文全称。
82+
(正确的例子:`We have used the LAsMA heterodyne array installed on the Atacama Pathfinder EXperiment (APEX)`->`本研究使用了安装在阿塔卡马探路者实验望远镜(APEX, Atacama Pathfinder EXperiment)上的LAsMA外差阵列`)
83+
9. **专有名词翻译错误** - 领域特定的专有名词翻译错误,例如技术术语、学科术语等。如错把Agent翻译成“代理”(实际上应为“智能体”)等。
84+
10. **表意偏差** - 翻译结果与原文在意义上存在偏差,导致信息传达不准确。
8085
8186
**Examples of these errors:**
8287
[[examples_text]]
@@ -90,15 +95,19 @@ def get_translation_quality_system_prompt() -> str:
9095
* For each key issue found, provide the specific error, its type, and where it appears in the translation.
9196
* Be precise about which error category each issue belongs to.
9297
* Focus on objective errors matching the example patterns, not subjective preferences.
98+
* 当出现 **语序问题**、**主谓不一致、主语缺失**、**主语不清晰**、**专有名词翻译错误**、**表意偏差** 等严重问题时,直接给 0 分。
99+
* 逐句分析,切勿遗漏。
93100
94101
Think carefully before flagging any error. Ask yourself: Does this match one of the specific error types from the examples? Is this truly an objective error or just a stylistic preference?
95102
96103
Return your response in this format:
97-
<score>X</score>
98-
<reasoning>Your detailed step-by-step reasoning analyzing the translation against the error categories</reasoning>
104+
<reasoning>
105+
Your analysis
106+
</reasoning>
99107
<key_issues>
100108
- Error Type: [category]. Error: [specific issue]. Location: [where it appears in the translation]
101109
</key_issues>
110+
<score>X</score>
102111
103112
The score must be 0, 1, 2. Each key issue should be on its own line starting with a dash. If no errors are found, the key_issues section should be empty or state "None detected".
104113
""".replace("[[examples_text]]", examples_text))
@@ -129,7 +138,10 @@ def parse_translation_quality_response(text: str) -> dict:
129138

130139
def build_translation_quality_messages(original_text: str, translation: str) -> List[dict]:
131140
return [
132-
{"role": "system", "content": get_translation_quality_system_prompt()},
141+
{
142+
"role": "system",
143+
"content": get_translation_quality_system_prompt()
144+
},
133145
{
134146
"role": "user",
135147
"content": TRANSLATION_QUALITY_USER_PROMPT.format(

0 commit comments

Comments
 (0)