@@ -355,41 +355,41 @@ def test_evaluation_metric_resource_name(client):
355355 client ._api_client ._http_options .api_version = "v1beta1"
356356 tone_check_metric = types .LLMMetric (
357357 name = "tone_check" ,
358- prompt_template = """
359- # Instruction
360- You are a professional writing evaluator. Your job is to score writing responses according to pre-defined evaluation criteria.
361-
362- # Criteria
363- Analyze the tone of the response based on these two criteria :
364- 1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.
365- 2. Empathy: The response should acknowledge the user's feelings and show understanding.
366-
367- # Input
368- Prompt: {agent_data.turns[0].events[0]}
369- Response: {agent_data.turns[0].events[1]}
370-
371- # Output Format
372- Respond in a JSON format with the following schema:
373- {
374- "type": "OBJECT",
375- "properties": {
376- "score": {"type": "NUMBER"},
377- "explanation": {"type": "STRING"},
378- },
379- "required": ["score", "explanation"],
380- }
381- Return the JSON format output in a string representation of a Python dictionary directly, without strings like '```json' or '```'.
382-
383- The output would include the following fields:
384- score: based on your evaluation, the score should be a number based on the rating rubrics.
385- explanation: your explanation for the score rating, in one line.
386-
387- ## Example Output Format:
388- {"score" : -1, "explanation": "Here is the reason that the response is given a score of -1 based on the rating rubric."}
389- {"score" : 3, "explanation": "Here is the reason that the response is given a score of 3 based on the rating rubric." }
390- {"score" : 0, "explanation": "Here is the reason that the response is given a score of 0 based on the rating rubric."}
391- {"score" : 5 , "explanation": "Here is the reason that the response is given a score of 5 based on the rating rubric. "}
392- """ ,
358+ prompt_template = """Analyze the tone of the response based on these two criteria: \n
359+ 1. Professionalism: The response should use appropriate language and maintain a business-like demeanor. \n
360+ 2. Empathy: The response should acknowledge the user's feelings and show understanding. \n \n
361+ Prompt: {agent_data.turns[0].events[0]}
362+ Response: {agent_data.turns[0].events[1]}
363+ Return ONLY a JSON list of objects for these two properties :
364+ [{"property": "Professionalism", "verdict": true, "reasoning": "..."},
365+ {"property": "Empathy", "verdict": true, "reasoning": "..."}]
366+ """ ,
367+ result_parsing_function = """
368+ import json, re
369+ def parse_results(responses):
370+ text = responses[0]
371+ # Use robust regex to find the JSON list block
372+ match = re.search("[\\[].*[]]", text, re.DOTALL)
373+ if not match: return {"score": 0.0, "explanation": "No valid JSON found"}
374+
375+ try:
376+ data = json.loads(match.group(0))
377+ # Calculate an overall score (e.g., average of verdicts)
378+ passed_count = sum(1 for r in data if r.get("verdict", False))
379+ total_count = len(data)
380+ score = passed_count / total_count if total_count > 0 else 0.0
381+
382+ # Consolidate reasoning into a single explanation string
383+ explanation = "\\n".join([f"{r.get('property')}: {r.get('reasoning')}" for r in data])
384+
385+ # IMPORTANT: Return a dictionary, not a list
386+ return {
387+ "score": float(score),
388+ "explanation": explanation
389+ }
390+ except Exception as e:
391+ return {"score": 0.0, "explanation": f"Parsing failed: {str(e)}"}
392+ """ ,
393393 )
394394 metric_resource_name = client .evals .create_evaluation_metric (
395395 metric = tone_check_metric ,
0 commit comments