Skip to content

Commit 904a54c

Browse files
committed
feat: robust evaluation with multiple test cases and detailed scoring
1 parent 42f4745 commit 904a54c

2 files changed

Lines changed: 144 additions & 90 deletions

File tree

src/agent.py

Lines changed: 139 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,8 @@ def validate_request(self, request: EvalRequest) -> tuple[bool, str]:
4646
return False, "No participants provided in the evaluation request."
4747
if len(request.participants) > 1:
4848
return False, "Only one participant is supported per evaluation."
49-
if "code_to_translate" not in request.config:
50-
return False, "Missing 'code_to_translate' in config."
49+
if "code_to_translate" not in request.config and "test_cases" not in request.config:
50+
return False, "Missing 'code_to_translate' or 'test_cases' in config."
5151
if "source_language" not in request.config:
5252
return False, "Missing 'source_language' in config."
5353
if "target_language" not in request.config:
@@ -57,47 +57,76 @@ def validate_request(self, request: EvalRequest) -> tuple[bool, str]:
5757
async def run_eval(self, request: EvalRequest, updater: TaskUpdater) -> None:
5858
# Extract the single participant
5959
role, endpoint = next(iter(request.participants.items()))
60-
code_to_translate = request.config["code_to_translate"]
60+
61+
# Determine inputs: support both single 'code_to_translate' and list 'test_cases'
62+
code_inputs = []
63+
if "test_cases" in request.config and isinstance(request.config["test_cases"], list):
64+
code_inputs = request.config["test_cases"]
65+
elif "code_to_translate" in request.config:
66+
code_inputs = [request.config["code_to_translate"]]
67+
6168
source_language = request.config["source_language"]
6269
target_language = request.config["target_language"]
63-
64-
# Step 1: Request translation from the participant agent
65-
await updater.update_status(
66-
"working",
67-
new_agent_text_message(f"Requesting translation from participant '{role}'...")
68-
)
69-
try:
70-
# Send the code to translate to the participant agent
71-
print(f"[DEBUG] Sending message to Purple Agent at {endpoint}", flush=True)
72-
response = await self._tool_provider.talk_to_agent(
73-
url=endpoint,
74-
message=json.dumps({
75-
"code_to_translate": code_to_translate,
76-
"source_language": source_language,
77-
"target_language": target_language
78-
})
70+
71+
evaluations = []
72+
73+
for i, code_to_translate in enumerate(code_inputs):
74+
case_label = f"Case {i+1}/{len(code_inputs)}"
75+
await updater.update_status(
76+
"working",
77+
new_agent_text_message(f"Processing {case_label} with participant '{role}'...")
7978
)
80-
print(f"[DEBUG] Received response from Purple Agent: '{response}'", flush=True)
81-
# The response is expected to be a JSON string with the translated code
82-
translated_code_data = json.loads(response)
83-
translated_code = translated_code_data.get("translated_code", "")
84-
85-
if not translated_code:
86-
await updater.failed(new_agent_text_message("Participant did not return translated code."))
87-
return
79+
80+
# --- TRANSLATION STEP ---
81+
try:
82+
print(f"[DEBUG] Sending {case_label} to Purple Agent at {endpoint}", flush=True)
83+
response = await self._tool_provider.talk_to_agent(
84+
url=endpoint,
85+
message=json.dumps({
86+
"code_to_translate": code_to_translate,
87+
"source_language": source_language,
88+
"target_language": target_language
89+
})
90+
)
91+
print(f"[DEBUG] Received response for {case_label}: '{response}'", flush=True)
92+
93+
translated_code = None
94+
# Attempt 1: JSON
95+
try:
96+
data = json.loads(response)
97+
if isinstance(data, dict):
98+
translated_code = data.get("translated_code") or data.get("code") or data.get("content") or data.get("message")
99+
elif isinstance(data, str):
100+
translated_code = data
101+
except json.JSONDecodeError:
102+
pass
103+
104+
# Attempt 2: Markdown
105+
if not translated_code:
106+
import re
107+
matches = re.findall(r"```(?:\w+)?\n(.*?)```", response, re.DOTALL)
108+
if matches:
109+
translated_code = max(matches, key=len).strip()
110+
111+
# Attempt 3: Raw
112+
if not translated_code:
113+
translated_code = response.strip()
88114

89-
except Exception as e:
90-
print(f"[DEBUG] Exception communicating with participant: {e}", flush=True)
91-
await updater.failed(new_agent_text_message(f"Error communicating with participant: {e}"))
92-
return
115+
if not translated_code:
116+
print(f"[WARN] Empty response for {case_label}")
117+
translated_code = "// Error: No Code Translated"
118+
119+
except Exception as e:
120+
print(f"[ERROR] Communication failed for {case_label}: {e}")
121+
translated_code = f"// Error: Communication failed: {e}"
93122

94-
await updater.update_status(
95-
"working",
96-
new_agent_text_message("Received translated code. Evaluating...")
97-
)
123+
# --- EVALUATION STEP ---
124+
await updater.update_status(
125+
"working",
126+
new_agent_text_message(f"Evaluating {case_label}...")
127+
)
98128

99-
# Step 2: Use the judge agent to evaluate the translated code
100-
prompt = f"""
129+
prompt = f"""
101130
{SYSTEM_PROMPT}
102131
103132
Please evaluate the following code translation based on the criteria:
@@ -116,59 +145,81 @@ async def run_eval(self, request: EvalRequest, updater: TaskUpdater) -> None:
116145
{translated_code}
117146
```
118147
119-
Provide your evaluation in the TranslatorEval schema, including reasoning, winner (the participant's role if it's a good translation, or 'N/A' otherwise), and scores.
148+
Provide your evaluation in the TranslatorEval schema, including reasoning, winner (participant's role or 'N/A'), execution_correctness, style_score, conciseness, and relevance.
120149
"""
121-
models_to_try = [
122-
"gemini-2.5-flash",
123-
"gemini-2.0-flash",
124-
"gemma-3-27b-it",
125-
"gemma-3-12b-it",
126-
"gemini-flash-latest",
127-
"gemini-pro-latest",
128-
"gemini-2.5-pro"
129-
]
130-
131-
last_error = None
132-
for model in models_to_try:
133-
try:
134-
print(f"[DEBUG] Trying evaluation with model: {model}")
135-
response = await self.client.aio.models.generate_content(
136-
model=model,
137-
contents=prompt,
138-
config=types.GenerateContentConfig(
139-
response_mime_type='application/json',
140-
response_schema=TranslatorEval
150+
models_to_try = [
151+
"gemini-2.5-flash",
152+
"gemini-2.0-flash",
153+
"gemma-3-27b-it",
154+
"gemini-flash-latest"
155+
]
156+
157+
case_eval = None
158+
for model in models_to_try:
159+
try:
160+
response = await self.client.aio.models.generate_content(
161+
model=model,
162+
contents=prompt,
163+
config=types.GenerateContentConfig(
164+
response_mime_type='application/json',
165+
response_schema=TranslatorEval
166+
)
141167
)
168+
case_eval = response.parsed
169+
if case_eval:
170+
break
171+
except Exception as e:
172+
print(f"[DEBUG] Model {model} failed for {case_label}: {e}")
173+
if "429" in str(e):
174+
import asyncio
175+
await asyncio.sleep(5)
176+
177+
if not case_eval:
178+
# Fallback if evaluation fails
179+
case_eval = TranslatorEval(
180+
reasoning=f"Evaluation failed for {case_label}",
181+
winner="N/A",
182+
execution_correctness=0,
183+
style_score=0,
184+
conciseness=0,
185+
relevance=0
142186
)
143-
eval_result: TranslatorEval = response.parsed
144-
145-
# If parsed is None (should not happen with structured output)
146-
if not eval_result:
147-
raise ValueError("Model failed to return structured output")
148-
149-
# import json removed since it's global
150-
# from a2a.types import Part, DataPart moved to global (or just imported here)
151-
152-
await updater.add_artifact(
153-
parts=[Part(root=DataPart(data=eval_result.model_dump()))],
154-
name="Evaluation Result"
155-
)
156-
157-
await updater.update_status(
158-
"completed",
159-
new_agent_text_message(f"Evaluation complete. Winner: {eval_result.winner}, Scores: {eval_result.scores}")
160-
)
161-
return # Assessment successful, exit function
187+
188+
evaluations.append(case_eval)
189+
190+
# --- AGGREGATION STEP ---
191+
count = len(evaluations)
192+
if count == 0:
193+
await updater.failed(new_agent_text_message("No evaluations occurred."))
194+
return
195+
196+
avg_exec = sum(e.execution_correctness for e in evaluations) / count
197+
avg_style = sum(e.style_score for e in evaluations) / count
198+
avg_conciseness = sum(e.conciseness for e in evaluations) / count
199+
avg_relevance = sum(e.relevance for e in evaluations) / count
200+
201+
combined_reasoning = "\n\n".join([f"[{i+1}/{count}] Winner: {e.winner}. {e.reasoning}" for i, e in enumerate(evaluations)])
202+
203+
# Determine overall winner (majority wins or high score?)
204+
# For simplicity, the role named as winner in the most cases (ignoring 'N/A') is propagated; if no case has a winner, the result is N/A
205+
winners = [e.winner for e in evaluations if e.winner != "N/A"]
206+
overall_winner = max(set(winners), key=winners.count) if winners else "N/A"
207+
208+
final_result = TranslatorEval(
209+
reasoning=f"Aggregated Score across {count} test cases.\n\nDetails:\n{combined_reasoning}",
210+
winner=overall_winner,
211+
execution_correctness=round(avg_exec, 2),
212+
style_score=round(avg_style, 2),
213+
conciseness=round(avg_conciseness, 2),
214+
relevance=round(avg_relevance, 2)
215+
)
162216

163-
except Exception as e:
164-
print(f"[DEBUG] Model {model} failed: {e}")
165-
last_error = e
166-
# Check for resource exhausted and wait if needed
167-
if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e):
168-
print("[DEBUG] Quota exhausted. Waiting 30 seconds before trying next model...", flush=True)
169-
import asyncio
170-
await asyncio.sleep(30)
171-
# Continue to next model
217+
await updater.add_artifact(
218+
parts=[Part(root=DataPart(data=final_result.model_dump()))],
219+
name="Evaluation Result"
220+
)
172221

173-
# If all models failed
174-
await updater.failed(new_agent_text_message(f"All evaluation models failed. Last error: {last_error}"))
222+
await updater.update_status(
223+
"completed",
224+
new_agent_text_message(f"Evaluation complete. Winner: {final_result.winner}, Execution: {final_result.execution_correctness}, Style: {final_result.style_score}, Conciseness: {final_result.conciseness}, Relevance: {final_result.relevance}")
225+
)

src/common.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,11 @@ class ParticipantScore(BaseModel):
1212

1313
class TranslatorEval(BaseModel):
1414
reasoning: str = Field(description="The reasoning behind the evaluation.")
15-
winner: str = Field(description="The role of the winning agent (e.g., 'researcher_translator').")
16-
scores: List[ParticipantScore] = Field(description="Scores for each participant (0-10).")
15+
execution_correctness: float = Field(description="Score for execution correctness (0-10).")
16+
style_score: float = Field(description="Score for style and documentation (0-10).")
17+
conciseness: float = Field(description="Score for conciseness (0-10).")
18+
relevance: float = Field(description="Score for relevance (0-10).")
19+
winner: str = Field(description="The role of the winning agent (e.g., 'researcher_translator') or 'N/A'.")
1720

1821
def translator_judge_agent_card(name: str, url: str):
1922
return AgentCard(

0 commit comments

Comments
 (0)