-
Notifications
You must be signed in to change notification settings - Fork 11
feat: add Runner, RunnerResult, Judge, and Evaluator #180
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 30 commits
7c4dbde
a0c8784
1a7e1f6
bed4ca2
2b47c86
4ef3de2
1be0a1e
8e81ea0
e81e2f5
c21fdd7
4c96dca
4da5478
a94b2bf
394a044
6c80aed
5381bf4
1355033
add48f9
1bd6777
9a8143e
3aa5d08
121b140
faa4981
f42de0b
59835e3
25346b3
26a61b4
ef762f5
a6a1ca1
792d33c
01f0386
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| package com.launchdarkly.sdk.server.ai; | ||
|
|
||
| import com.launchdarkly.logging.LDLogger; | ||
| import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.JudgeConfiguration; | ||
| import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.concurrent.CompletableFuture; | ||
|
|
||
| /** | ||
| * Coordinates evaluation of an AI Config output by running a set of {@link Judge} instances. | ||
| * <p> | ||
| * An {@code Evaluator} is attached to an {@link AICompletionConfig} or {@link AIAgentConfig} and | ||
| * invoked by managed AI types (plan 4). In v1.0, the evaluator returned by the config retrieval | ||
| * methods is always a noop that returns an empty list immediately. | ||
| * <p> | ||
| * Instances are immutable and thread-safe. | ||
| */ | ||
| public final class Evaluator { | ||
| /** Shared instance handed out by {@link #noop()}. */ | ||
| private static final Evaluator NOOP = new Evaluator(); | ||
|
|
||
| /** Judges keyed by judge config key; never {@code null}. */ | ||
| private final Map<String, Judge> judges; | ||
| /** Lists the judges to run and their sampling rates; {@code null} for the noop instance. */ | ||
| private final JudgeConfiguration judgeConfiguration; | ||
| /** Receives warnings about skipped judges; {@code null} for the noop instance. */ | ||
| private final LDLogger logger; | ||
| /** {@code true} only for the instance built by the private constructor. */ | ||
| private final boolean isNoop; | ||
|
|
||
| /** | ||
| * Builds the noop evaluator: no judges, no configuration, and no logger. | ||
| */ | ||
| private Evaluator() { | ||
| this.isNoop = true; | ||
| this.logger = null; | ||
| this.judgeConfiguration = null; | ||
| this.judges = Collections.emptyMap(); | ||
| } | ||
|
|
||
| /** | ||
| * Creates an evaluator that runs the supplied judges according to the supplied configuration. | ||
| * <p> | ||
| * The judges map is copied, so later changes to the caller's map do not affect this instance. | ||
| * | ||
| * @param judges judges keyed by judge config key; {@code null} is treated as an empty map | ||
| * @param judgeConfiguration names the judges to run and gives each one's sampling rate | ||
| * @param logger the logger used for warnings | ||
| */ | ||
| public Evaluator(Map<String, Judge> judges, JudgeConfiguration judgeConfiguration, LDLogger logger) { | ||
| this.isNoop = false; | ||
| this.logger = logger; | ||
| this.judgeConfiguration = judgeConfiguration; | ||
| if (judges == null) { | ||
| this.judges = Collections.emptyMap(); | ||
| } else { | ||
| Map<String, Judge> snapshot = new HashMap<>(judges); | ||
| this.judges = Collections.unmodifiableMap(snapshot); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Gets the shared noop evaluator. Evaluating with it returns an empty result list right away | ||
| * and never logs a warning. | ||
| * | ||
| * @return the noop singleton; never {@code null} | ||
| */ | ||
| public static Evaluator noop() { | ||
| return NOOP; | ||
| } | ||
|
|
||
| /** | ||
| * Evaluates one input/output pair with each judge named in the {@link JudgeConfiguration}. | ||
| * <p> | ||
| * The noop evaluator, and an evaluator constructed without a configuration, return an empty list. | ||
| * Otherwise the judges run one after another on the calling thread, in configuration order. A | ||
| * configuration entry whose key has no matching judge is skipped with a warning (when a logger is | ||
| * present) and is not treated as an error. | ||
| * <p> | ||
| * Results are not reported through {@code trackJudgeResult}; the caller must do that. | ||
| * | ||
| * @param input the message history or prompt that was sent to the model | ||
| * @param output the model's response to evaluate | ||
| * @return an already-completed future holding the judge results; never {@code null} | ||
| */ | ||
| public CompletableFuture<List<JudgeResult>> evaluate(String input, String output) { | ||
| // The noop instance and an evaluator without a configuration both have nothing to run. | ||
| if (isNoop || judgeConfiguration == null) { | ||
| return CompletableFuture.completedFuture(Collections.emptyList()); | ||
| } | ||
|
|
||
| List<JudgeResult> collected = new ArrayList<>(); | ||
| for (JudgeConfiguration.Judge configured : judgeConfiguration.getJudges()) { | ||
| Judge judge = judges.get(configured.getKey()); | ||
| if (judge != null) { | ||
| collected.add(judge.evaluate(input, output, configured.getSamplingRate())); | ||
| } else if (logger != null) { | ||
| logger.warn("Evaluator: no judge found for key '{}', skipping", configured.getKey()); | ||
| } | ||
| } | ||
| return CompletableFuture.completedFuture(collected); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,216 @@ | ||
| package com.launchdarkly.sdk.server.ai; | ||
|
|
||
| import com.launchdarkly.logging.LDLogger; | ||
| import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message; | ||
| import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult; | ||
|
|
||
| import java.util.Arrays; | ||
| import java.util.Collections; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.concurrent.ThreadLocalRandom; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| /** | ||
| * Evaluates an AI model output against a judge prompt, returning a scored {@link JudgeResult}. | ||
| * <p> | ||
| * A {@code Judge} wraps an {@link AIJudgeConfig} and a {@link Runner}. Each call to | ||
| * {@link #evaluate} or {@link #evaluateMessages} invokes the runner with a formatted evaluation | ||
| * prompt and parses the structured {@code {score, reasoning}} response. Evaluation can be sampled | ||
| * to reduce cost: pass a {@code samplingRate} of {@code 0.0} to always skip, or {@code 1.0} to | ||
| * always run. | ||
| * <p> | ||
| * Instances are immutable and thread-safe. | ||
| */ | ||
| public final class Judge { | ||
| /** | ||
| * JSON-Schema fragment sent to the runner as the {@code outputType}, requesting structured | ||
| * {@code {score, reasoning}} output. | ||
| * <p> | ||
| * The same instance is handed to every runner call, so each nested map and the {@code required} | ||
| * list are wrapped as unmodifiable to keep a runner from altering the shared schema. | ||
| */ | ||
| private static final Map<String, Object> EVALUATION_SCHEMA; | ||
| static { | ||
| Map<String, Object> scoreSchema = new HashMap<>(); | ||
| scoreSchema.put("type", "number"); | ||
| scoreSchema.put("minimum", 0); | ||
| scoreSchema.put("maximum", 1); | ||
| scoreSchema.put("description", "Score between 0.0 and 1.0."); | ||
|
|
||
| Map<String, Object> reasoningSchema = new HashMap<>(); | ||
| reasoningSchema.put("type", "string"); | ||
| reasoningSchema.put("description", "Reasoning behind the score."); | ||
|
|
||
| Map<String, Object> properties = new HashMap<>(); | ||
| properties.put("score", Collections.unmodifiableMap(scoreSchema)); | ||
| properties.put("reasoning", Collections.unmodifiableMap(reasoningSchema)); | ||
|
|
||
| Map<String, Object> schema = new HashMap<>(); | ||
| schema.put("type", "object"); | ||
| schema.put("properties", Collections.unmodifiableMap(properties)); | ||
| // Arrays.asList alone still permits set(), so wrap it to make the list read-only. | ||
| schema.put("required", Collections.unmodifiableList(Arrays.asList("score", "reasoning"))); | ||
|
cursor[bot] marked this conversation as resolved.
|
||
| schema.put("additionalProperties", false); | ||
|
|
||
| EVALUATION_SCHEMA = Collections.unmodifiableMap(schema); | ||
| } | ||
|
|
||
| /** Judge AI Config that supplies the config key, evaluation metric key, and tracker. */ | ||
| private final AIJudgeConfig config; | ||
| /** Runner invoked with the formatted evaluation prompt. */ | ||
| private final Runner runner; | ||
| /** Destination for warnings; may be {@code null}, in which case nothing is logged. */ | ||
| private final LDLogger logger; | ||
|
|
||
| /** | ||
| * Creates a judge from its AI Config, the runner that executes it, and a logger. | ||
| * | ||
| * @param config the judge AI Config | ||
| * @param runner the runner to invoke | ||
| * @param logger the logger | ||
| */ | ||
| public Judge(AIJudgeConfig config, Runner runner, LDLogger logger) { | ||
| this.logger = logger; | ||
| this.runner = runner; | ||
| this.config = config; | ||
| } | ||
|
|
||
| /** | ||
| * Evaluates the given input/output pair without sampling: the evaluation always runs. | ||
| * <p> | ||
| * Equivalent to calling the three-argument overload with a sampling rate of {@code 1.0}. | ||
| * | ||
| * @param input the message history or prompt that was sent to the model | ||
| * @param output the model's response to evaluate | ||
| * @return the evaluation result; never {@code null} | ||
| */ | ||
| public JudgeResult evaluate(String input, String output) { | ||
| final double alwaysRun = 1.0; | ||
| return evaluate(input, output, alwaysRun); | ||
| } | ||
|
|
||
| /** | ||
| * Evaluates the given input/output pair, subject to the given sampling rate. | ||
| * | ||
| * @param input the message history or prompt that was sent to the model | ||
| * @param output the model's response to evaluate | ||
| * @param samplingRate the fraction of evaluations to actually run; {@code 0.0} always skips, | ||
| * {@code 1.0} always runs | ||
| * @return the evaluation result; never {@code null} | ||
| */ | ||
| public JudgeResult evaluate(String input, String output, double samplingRate) { | ||
| if (samplingRate <= 0.0) { | ||
| return JudgeResult.builder() | ||
| .sampled(false) | ||
| .success(false) | ||
| .build(); | ||
| } | ||
|
cursor[bot] marked this conversation as resolved.
|
||
| if (ThreadLocalRandom.current().nextDouble() > samplingRate) { | ||
| return JudgeResult.builder() | ||
| .sampled(false) | ||
| .success(false) | ||
| .build(); | ||
| } | ||
|
|
||
| try { | ||
| String formatted = "MESSAGE HISTORY:\n" + input + "\n\nRESPONSE TO EVALUATE:\n" + output; | ||
| LDAIConfigTracker tracker = config.createTracker(); | ||
|
|
||
| RunnerResult result = tracker.trackMetricsOf(RunnerResult::getMetrics, () -> runner.run(formatted, EVALUATION_SCHEMA)); | ||
|
|
||
| Map<String, Object> parsed = result.getParsed(); | ||
| if (parsed == null) { | ||
| if (logger != null) logger.warn("Judge {}: runner returned null parsed output", config.getKey()); | ||
| return JudgeResult.builder() | ||
| .sampled(true) | ||
| .success(false) | ||
| .judgeConfigKey(config.getKey()) | ||
| .metricKey(config.getEvaluationMetricKey()) | ||
| .build(); | ||
| } | ||
|
|
||
| Object scoreRaw = parsed.get("score"); | ||
| if (!(scoreRaw instanceof Number)) { | ||
| if (logger != null) logger.warn("Judge {}: parsed output missing numeric score", config.getKey()); | ||
| return JudgeResult.builder() | ||
| .sampled(true) | ||
| .success(false) | ||
| .judgeConfigKey(config.getKey()) | ||
| .metricKey(config.getEvaluationMetricKey()) | ||
| .build(); | ||
| } | ||
| double score = ((Number) scoreRaw).doubleValue(); | ||
| if (!Double.isFinite(score) || score < 0.0 || score > 1.0) { | ||
| if (logger != null) logger.warn("Judge {}: score {} is outside [0.0, 1.0]", config.getKey(), score); | ||
| return JudgeResult.builder() | ||
| .sampled(true) | ||
| .success(false) | ||
| .judgeConfigKey(config.getKey()) | ||
| .metricKey(config.getEvaluationMetricKey()) | ||
| .build(); | ||
| } | ||
|
|
||
| JudgeResult.Builder resultBuilder = JudgeResult.builder() | ||
| .sampled(true) | ||
| .success(true) | ||
| .judgeConfigKey(config.getKey()) | ||
| .metricKey(config.getEvaluationMetricKey()) | ||
| .score(score); | ||
|
|
||
| Object reasoningRaw = parsed.get("reasoning"); | ||
| if (reasoningRaw instanceof String) { | ||
| resultBuilder.reasoning((String) reasoningRaw); | ||
| } else if (reasoningRaw != null) { | ||
| if (logger != null) logger.warn("Judge {}: reasoning is not a string, ignoring", config.getKey()); | ||
| } | ||
|
|
||
| return resultBuilder.build(); | ||
| } catch (Exception ex) { | ||
| return JudgeResult.builder() | ||
| .sampled(true) | ||
| .success(false) | ||
| .errorMessage(ex.getMessage()) | ||
| .build(); | ||
| } | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exception path omits judge identityMedium Severity When Reviewed by Cursor Bugbot for commit 792d33c. Configure here. |
||
| } | ||
|
|
||
| /** | ||
| * Evaluates a message list and runner response without sampling: the evaluation always runs. | ||
| * <p> | ||
| * Each message becomes one {@code role: content} line, and the lines are joined by newlines. | ||
| * Equivalent to calling the three-argument overload with a sampling rate of {@code 1.0}. | ||
| * | ||
| * @param messages the messages that were sent to the model | ||
| * @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated | ||
| * @return the evaluation result; never {@code null} | ||
| */ | ||
| public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response) { | ||
| final double alwaysRun = 1.0; | ||
| return evaluateMessages(messages, response, alwaysRun); | ||
| } | ||
|
|
||
| /** | ||
| * Evaluates a message list and runner response, subject to the given sampling rate. | ||
| * <p> | ||
| * Messages are formatted as {@code role: content} lines, joined by newlines. | ||
| * | ||
| * @param messages the messages that were sent to the model | ||
| * @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated | ||
| * @param samplingRate the fraction of evaluations to actually run | ||
| * @return the evaluation result; never {@code null} | ||
| */ | ||
| public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response, double samplingRate) { | ||
| String formattedMessages = messages == null ? "" : messages.stream() | ||
| .map(m -> m.getRole().getWireValue() + ": " + m.getContent()) | ||
| .collect(Collectors.joining("\n")); | ||
| return evaluate(formattedMessages, response == null ? "" : response.getContent(), samplingRate); | ||
| } | ||
|
|
||
| /** | ||
| * Gets the judge AI Config that was passed to the constructor. | ||
| * | ||
| * @return the judge config, exactly as supplied at construction | ||
| */ | ||
| public AIJudgeConfig getConfig() { | ||
| return this.config; | ||
| } | ||
|
|
||
| /** | ||
| * Gets the runner that was passed to the constructor. | ||
| * | ||
| * @return the runner, exactly as supplied at construction | ||
| */ | ||
| public Runner getRunner() { | ||
| return this.runner; | ||
| } | ||
| } | ||


Uh oh!
There was an error while loading. Please reload this page.