Skip to content

Commit 4fe745c

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add support for custom result parsing in LLM-based evaluation metrics
PiperOrigin-RevId: 892593906
1 parent 62656c2 commit 4fe745c

File tree

2 files changed

+51
-35
lines changed

2 files changed

+51
-35
lines changed

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -355,41 +355,41 @@ def test_evaluation_metric_resource_name(client):
355355
client._api_client._http_options.api_version = "v1beta1"
356356
tone_check_metric = types.LLMMetric(
357357
name="tone_check",
358-
prompt_template="""
359-
# Instruction
360-
You are a professional writing evaluator. Your job is to score writing responses according to pre-defined evaluation criteria.
361-
362-
# Criteria
363-
Analyze the tone of the response based on these two criteria:
364-
1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.
365-
2. Empathy: The response should acknowledge the user's feelings and show understanding.
366-
367-
# Input
368-
Prompt: {agent_data.turns[0].events[0]}
369-
Response: {agent_data.turns[0].events[1]}
370-
371-
# Output Format
372-
Respond in a JSON format with the following schema:
373-
{
374-
"type": "OBJECT",
375-
"properties": {
376-
"score": {"type": "NUMBER"},
377-
"explanation": {"type": "STRING"},
378-
},
379-
"required": ["score", "explanation"],
380-
}
381-
Return the JSON format output in a string representation of a Python dictionary directly, without strings like '```json' or '```'.
382-
383-
The output would include the following fields:
384-
score: based on your evaluation, the score should be a number based on the rating rubrics.
385-
explanation: your explanation for the score rating, in one line.
386-
387-
## Example Output Format:
388-
{"score" : -1, "explanation": "Here is the reason that the response is given a score of -1 based on the rating rubric."}
389-
{"score" : 3, "explanation": "Here is the reason that the response is given a score of 3 based on the rating rubric."}
390-
{"score" : 0, "explanation": "Here is the reason that the response is given a score of 0 based on the rating rubric."}
391-
{"score" : 5, "explanation": "Here is the reason that the response is given a score of 5 based on the rating rubric."}
392-
""",
358+
prompt_template="""Analyze the tone of the response based on these two criteria:\n
359+
1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.\n
360+
2. Empathy: The response should acknowledge the user's feelings and show understanding.\n\n
361+
Prompt: {agent_data.turns[0].events[0]}
362+
Response: {agent_data.turns[0].events[1]}
363+
Return ONLY a JSON list of objects for these two properties:
364+
[{"property": "Professionalism", "verdict": true, "reasoning": "..."},
365+
{"property": "Empathy", "verdict": true, "reasoning": "..."}]
366+
""",
367+
result_parsing_function="""
368+
import json, re
369+
def parse_results(responses):
370+
text = responses[0]
371+
# Use robust regex to find the JSON list block
372+
match = re.search("[\\[].*[]]", text, re.DOTALL)
373+
if not match: return {"score": 0.0, "explanation": "No valid JSON found"}
374+
375+
try:
376+
data = json.loads(match.group(0))
377+
# Calculate an overall score (e.g., average of verdicts)
378+
passed_count = sum(1 for r in data if r.get("verdict", False))
379+
total_count = len(data)
380+
score = passed_count / total_count if total_count > 0 else 0.0
381+
382+
# Consolidate reasoning into a single explanation string
383+
explanation = "\\n".join([f"{r.get('property')}: {r.get('reasoning')}" for r in data])
384+
385+
# IMPORTANT: Return a dictionary, not a list
386+
return {
387+
"score": float(score),
388+
"explanation": explanation
389+
}
390+
except Exception as e:
391+
return {"score": 0.0, "explanation": f"Parsing failed: {str(e)}"}
392+
""",
393393
)
394394
metric_resource_name = client.evals.create_evaluation_metric(
395395
metric=tone_check_metric,

vertexai/_genai/_transformers.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ def t_metrics(
119119
if autorater_config:
120120
llm_based_spec["judge_autorater_config"] = autorater_config
121121

122+
result_parsing_function = getv(metric, ["result_parsing_function"])
123+
if result_parsing_function:
124+
llm_based_spec["result_parser_config"] = {
125+
"custom_code_parser_config": {
126+
"parsing_function": result_parsing_function
127+
}
128+
}
129+
122130
metric_payload_item["llm_based_metric_spec"] = llm_based_spec
123131
elif getattr(metric, "metric_resource_name", None) is not None:
124132
# Safe pass
@@ -235,6 +243,14 @@ def t_metric_for_registry(
235243
if autorater_config:
236244
llm_based_spec["judge_autorater_config"] = autorater_config
237245

246+
result_parsing_function = getv(metric, ["result_parsing_function"])
247+
if result_parsing_function:
248+
llm_based_spec["result_parser_config"] = {
249+
"custom_code_parser_config": {
250+
"parsing_function": result_parsing_function
251+
}
252+
}
253+
238254
metric_payload_item["llm_based_metric_spec"] = llm_based_spec
239255

240256
else:

0 commit comments

Comments (0)