@@ -46,8 +46,8 @@ def validate_request(self, request: EvalRequest) -> tuple[bool, str]:
4646 return False , "No participants provided in the evaluation request."
4747 if len (request .participants ) > 1 :
4848 return False , "Only one participant is supported per evaluation."
49- if "code_to_translate" not in request .config :
50- return False , "Missing 'code_to_translate' in config."
49+ if "code_to_translate" not in request .config and "test_cases" not in request . config :
50+ return False , "Missing 'code_to_translate' or 'test_cases' in config."
5151 if "source_language" not in request .config :
5252 return False , "Missing 'source_language' in config."
5353 if "target_language" not in request .config :
@@ -57,47 +57,76 @@ def validate_request(self, request: EvalRequest) -> tuple[bool, str]:
5757 async def run_eval (self , request : EvalRequest , updater : TaskUpdater ) -> None :
5858 # Extract the single participant
5959 role , endpoint = next (iter (request .participants .items ()))
60- code_to_translate = request .config ["code_to_translate" ]
60+
61+ # Determine inputs: support both single 'code_to_translate' and list 'test_cases'
62+ code_inputs = []
63+ if "test_cases" in request .config and isinstance (request .config ["test_cases" ], list ):
64+ code_inputs = request .config ["test_cases" ]
65+ elif "code_to_translate" in request .config :
66+ code_inputs = [request .config ["code_to_translate" ]]
67+
6168 source_language = request .config ["source_language" ]
6269 target_language = request .config ["target_language" ]
63-
64- # Step 1: Request translation from the participant agent
65- await updater .update_status (
66- "working" ,
67- new_agent_text_message (f"Requesting translation from participant '{ role } '..." )
68- )
69- try :
70- # Send the code to translate to the participant agent
71- print (f"[DEBUG] Sending message to Purple Agent at { endpoint } " , flush = True )
72- response = await self ._tool_provider .talk_to_agent (
73- url = endpoint ,
74- message = json .dumps ({
75- "code_to_translate" : code_to_translate ,
76- "source_language" : source_language ,
77- "target_language" : target_language
78- })
70+
71+ evaluations = []
72+
73+ for i , code_to_translate in enumerate (code_inputs ):
74+ case_label = f"Case { i + 1 } /{ len (code_inputs )} "
75+ await updater .update_status (
76+ "working" ,
77+ new_agent_text_message (f"Processing { case_label } with participant '{ role } '..." )
7978 )
80- print (f"[DEBUG] Received response from Purple Agent: '{ response } '" , flush = True )
81- # The response is expected to be a JSON string with the translated code
82- translated_code_data = json .loads (response )
83- translated_code = translated_code_data .get ("translated_code" , "" )
84-
85- if not translated_code :
86- await updater .failed (new_agent_text_message ("Participant did not return translated code." ))
87- return
79+
80+ # --- TRANSLATION STEP ---
81+ try :
82+ print (f"[DEBUG] Sending { case_label } to Purple Agent at { endpoint } " , flush = True )
83+ response = await self ._tool_provider .talk_to_agent (
84+ url = endpoint ,
85+ message = json .dumps ({
86+ "code_to_translate" : code_to_translate ,
87+ "source_language" : source_language ,
88+ "target_language" : target_language
89+ })
90+ )
91+ print (f"[DEBUG] Received response for { case_label } : '{ response } '" , flush = True )
92+
93+ translated_code = None
94+ # Attempt 1: JSON
95+ try :
96+ data = json .loads (response )
97+ if isinstance (data , dict ):
98+ translated_code = data .get ("translated_code" ) or data .get ("code" ) or data .get ("content" ) or data .get ("message" )
99+ elif isinstance (data , str ):
100+ translated_code = data
101+ except json .JSONDecodeError :
102+ pass
103+
104+ # Attempt 2: Markdown
105+ if not translated_code :
106+ import re
107+ matches = re .findall (r"```(?:\w+)?\n(.*?)```" , response , re .DOTALL )
108+ if matches :
109+ translated_code = max (matches , key = len ).strip ()
110+
111+ # Attempt 3: Raw
112+ if not translated_code :
113+ translated_code = response .strip ()
88114
89- except Exception as e :
90- print (f"[DEBUG] Exception communicating with participant: { e } " , flush = True )
91- await updater .failed (new_agent_text_message (f"Error communicating with participant: { e } " ))
92- return
115+ if not translated_code :
116+ print (f"[WARN] Empty response for { case_label } " )
117+ translated_code = "// Error: No Code Translated"
118+
119+ except Exception as e :
120+ print (f"[ERROR] Communication failed for { case_label } : { e } " )
121+ translated_code = f"// Error: Communication failed: { e } "
93122
94- await updater .update_status (
95- "working" ,
96- new_agent_text_message ("Received translated code. Evaluating..." )
97- )
123+ # --- EVALUATION STEP ---
124+ await updater .update_status (
125+ "working" ,
126+ new_agent_text_message (f"Evaluating { case_label } ..." )
127+ )
98128
99- # Step 2: Use the judge agent to evaluate the translated code
100- prompt = f"""
129+ prompt = f"""
101130{ SYSTEM_PROMPT }
102131
103132Please evaluate the following code translation based on the criteria:
@@ -116,59 +145,81 @@ async def run_eval(self, request: EvalRequest, updater: TaskUpdater) -> None:
116145{ translated_code }
117146```
118147
119- Provide your evaluation in the TranslatorEval schema, including reasoning, winner (the participant's role if it's a good translation, or 'N/A' otherwise ), and scores .
148+ Provide your evaluation in the TranslatorEval schema, including reasoning, winner (participant's role or 'N/A'), execution_correctness, style_score, conciseness, and relevance .
120149"""
121- models_to_try = [
122- "gemini-2.5-flash" ,
123- "gemini-2.0-flash" ,
124- "gemma-3-27b-it" ,
125- "gemma-3-12b-it" ,
126- "gemini-flash-latest" ,
127- "gemini-pro-latest" ,
128- "gemini-2.5-pro"
129- ]
130-
131- last_error = None
132- for model in models_to_try :
133- try :
134- print (f"[DEBUG] Trying evaluation with model: { model } " )
135- response = await self .client .aio .models .generate_content (
136- model = model ,
137- contents = prompt ,
138- config = types .GenerateContentConfig (
139- response_mime_type = 'application/json' ,
140- response_schema = TranslatorEval
150+ models_to_try = [
151+ "gemini-2.5-flash" ,
152+ "gemini-2.0-flash" ,
153+ "gemma-3-27b-it" ,
154+ "gemini-flash-latest"
155+ ]
156+
157+ case_eval = None
158+ for model in models_to_try :
159+ try :
160+ response = await self .client .aio .models .generate_content (
161+ model = model ,
162+ contents = prompt ,
163+ config = types .GenerateContentConfig (
164+ response_mime_type = 'application/json' ,
165+ response_schema = TranslatorEval
166+ )
141167 )
168+ case_eval = response .parsed
169+ if case_eval :
170+ break
171+ except Exception as e :
172+ print (f"[DEBUG] Model { model } failed for { case_label } : { e } " )
173+ if "429" in str (e ):
174+ import asyncio
175+ await asyncio .sleep (5 )
176+
177+ if not case_eval :
178+ # Fallback if evaluation fails
179+ case_eval = TranslatorEval (
180+ reasoning = f"Evaluation failed for { case_label } " ,
181+ winner = "N/A" ,
182+ execution_correctness = 0 ,
183+ style_score = 0 ,
184+ conciseness = 0 ,
185+ relevance = 0
142186 )
143- eval_result : TranslatorEval = response .parsed
144-
145- # If parsed is None (should not happen with structured output)
146- if not eval_result :
147- raise ValueError ("Model failed to return structured output" )
148-
149- # import json removed since it's global
150- # from a2a.types import Part, DataPart moved to global (or just imported here)
151-
152- await updater .add_artifact (
153- parts = [Part (root = DataPart (data = eval_result .model_dump ()))],
154- name = "Evaluation Result"
155- )
156-
157- await updater .update_status (
158- "completed" ,
159- new_agent_text_message (f"Evaluation complete. Winner: { eval_result .winner } , Scores: { eval_result .scores } " )
160- )
161- return # Assessment successful, exit function
187+
188+ evaluations .append (case_eval )
189+
190+ # --- AGGREGATION STEP ---
191+ count = len (evaluations )
192+ if count == 0 :
193+ await updater .failed (new_agent_text_message ("No evaluations occurred." ))
194+ return
195+
196+ avg_exec = sum (e .execution_correctness for e in evaluations ) / count
197+ avg_style = sum (e .style_score for e in evaluations ) / count
198+ avg_conciseness = sum (e .conciseness for e in evaluations ) / count
199+ avg_relevance = sum (e .relevance for e in evaluations ) / count
200+
201+ combined_reasoning = "\n \n " .join ([f"[{ i + 1 } /{ count } ] Winner: { e .winner } . { e .reasoning } " for i , e in enumerate (evaluations )])
202+
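203+ # Determine overall winner: the most frequently named winner among the cases that named one.
204+ # NOTE: 'N/A' verdicts are excluded before counting, so this is a plurality of non-'N/A' winners, not a >50% majority of all cases; the result is 'N/A' only when no case named a winner.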
205+ winners = [e .winner for e in evaluations if e .winner != "N/A" ]
206+ overall_winner = max (set (winners ), key = winners .count ) if winners else "N/A"
207+
208+ final_result = TranslatorEval (
209+ reasoning = f"Aggregated Score across { count } test cases.\n \n Details:\n { combined_reasoning } " ,
210+ winner = overall_winner ,
211+ execution_correctness = round (avg_exec , 2 ),
212+ style_score = round (avg_style , 2 ),
213+ conciseness = round (avg_conciseness , 2 ),
214+ relevance = round (avg_relevance , 2 )
215+ )
162216
163- except Exception as e :
164- print (f"[DEBUG] Model { model } failed: { e } " )
165- last_error = e
166- # Check for resource exhausted and wait if needed
167- if "429" in str (e ) or "RESOURCE_EXHAUSTED" in str (e ):
168- print ("[DEBUG] Quota exhausted. Waiting 30 seconds before trying next model..." , flush = True )
169- import asyncio
170- await asyncio .sleep (30 )
171- # Continue to next model
217+ await updater .add_artifact (
218+ parts = [Part (root = DataPart (data = final_result .model_dump ()))],
219+ name = "Evaluation Result"
220+ )
172221
173- # If all models failed
174- await updater .failed (new_agent_text_message (f"All evaluation models failed. Last error: { last_error } " ))
222+ await updater .update_status (
223+ "completed" ,
224+ new_agent_text_message (f"Evaluation complete. Winner: { final_result .winner } , Execution: { final_result .execution_correctness } , Style: { final_result .style_score } , Conciseness: { final_result .conciseness } , Relevance: { final_result .relevance } " )
225+ )
0 commit comments