Skip to content

Commit 1a7e1f6

Browse files
committed
feat: add Runner, RunnerResult, Judge, and Evaluator
1 parent a0c8784 commit 1a7e1f6

12 files changed

Lines changed: 981 additions & 10 deletions

File tree

lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIAgentConfig.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,9 @@ public final class AIAgentConfig extends AIConfig {
3030
String instructions,
3131
JudgeConfiguration judgeConfiguration,
3232
Map<String, Tool> tools,
33-
Supplier<LDAIConfigTracker> trackerFactory) {
34-
super(key, enabled, Mode.AGENT, model, provider, trackerFactory);
33+
Supplier<LDAIConfigTracker> trackerFactory,
34+
Evaluator evaluator) {
35+
super(key, enabled, Mode.AGENT, model, provider, trackerFactory, evaluator);
3536
this.instructions = instructions;
3637
this.judgeConfiguration = judgeConfiguration;
3738
this.tools = tools == null ? null : Collections.unmodifiableMap(tools);

lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AICompletionConfig.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ public final class AICompletionConfig extends AIConfig {
3232
List<Message> messages,
3333
JudgeConfiguration judgeConfiguration,
3434
Map<String, Tool> tools,
35-
Supplier<LDAIConfigTracker> trackerFactory) {
36-
super(key, enabled, Mode.COMPLETION, model, provider, trackerFactory);
35+
Supplier<LDAIConfigTracker> trackerFactory,
36+
Evaluator evaluator) {
37+
super(key, enabled, Mode.COMPLETION, model, provider, trackerFactory, evaluator);
3738
this.messages = messages == null ? null : Collections.unmodifiableList(messages);
3839
this.judgeConfiguration = judgeConfiguration;
3940
this.tools = tools == null ? null : Collections.unmodifiableMap(tools);

lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIConfig.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,23 @@ public abstract class AIConfig {
2424
private final Model model;
2525
private final Provider provider;
2626
private final Supplier<LDAIConfigTracker> trackerFactory;
27+
private final Evaluator evaluator;
2728

2829
AIConfig(
2930
String key,
3031
boolean enabled,
3132
Mode mode,
3233
Model model,
3334
Provider provider,
34-
Supplier<LDAIConfigTracker> trackerFactory) {
35+
Supplier<LDAIConfigTracker> trackerFactory,
36+
Evaluator evaluator) {
3537
this.key = key;
3638
this.enabled = enabled;
3739
this.mode = mode;
3840
this.model = model;
3941
this.provider = provider;
4042
this.trackerFactory = Objects.requireNonNull(trackerFactory, "trackerFactory");
43+
this.evaluator = Objects.requireNonNull(evaluator, "evaluator");
4144
}
4245

4346
/**
@@ -102,4 +105,17 @@ public Provider getProvider() {
102105
public LDAIConfigTracker createTracker() {
103106
return trackerFactory.get();
104107
}
108+
109+
/**
110+
* Returns the evaluator that coordinates judge execution for this configuration.
111+
* <p>
112+
* For {@link AIJudgeConfig} this is always {@link Evaluator#noop()}. For
113+
* {@link AICompletionConfig} and {@link AIAgentConfig} it is the evaluator supplied at
114+
* construction time (also {@link Evaluator#noop()} unless a custom one is wired in).
115+
*
116+
* @return the evaluator, never {@code null}
117+
*/
118+
public Evaluator getEvaluator() {
119+
return evaluator;
120+
}
105121
}

lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIJudgeConfig.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ public final class AIJudgeConfig extends AIConfig {
2929
List<Message> messages,
3030
String evaluationMetricKey,
3131
Supplier<LDAIConfigTracker> trackerFactory) {
32-
super(key, enabled, Mode.JUDGE, model, provider, trackerFactory);
32+
super(key, enabled, Mode.JUDGE, model, provider, trackerFactory, Evaluator.noop());
3333
this.messages = messages == null ? null : Collections.unmodifiableList(messages);
3434
this.evaluationMetricKey = evaluationMetricKey;
3535
}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package com.launchdarkly.sdk.server.ai;
2+
3+
import com.launchdarkly.logging.LDLogger;
4+
import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.JudgeConfiguration;
5+
import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
6+
7+
import java.util.ArrayList;
8+
import java.util.Collections;
9+
import java.util.List;
10+
import java.util.Map;
11+
import java.util.Objects;
12+
import java.util.concurrent.CompletableFuture;
13+
14+
/**
15+
* Coordinates evaluation of an AI Config output by running a set of {@link Judge} instances.
16+
* <p>
17+
* An {@code Evaluator} is attached to an {@link AICompletionConfig} or {@link AIAgentConfig} and
18+
* invoked by managed AI types (plan 4). In v1.0, the evaluator returned by the config retrieval
19+
* methods is always a noop that returns an empty list immediately.
20+
* <p>
21+
* Instances are immutable and thread-safe.
22+
*/
23+
public final class Evaluator {
24+
private static final Evaluator NOOP = new Evaluator();
25+
26+
private final Map<String, Judge> judges;
27+
private final JudgeConfiguration judgeConfiguration;
28+
private final LDLogger logger;
29+
private final boolean isNoop;
30+
31+
private Evaluator() {
32+
this.judges = Collections.emptyMap();
33+
this.judgeConfiguration = null;
34+
this.logger = null;
35+
this.isNoop = true;
36+
}
37+
38+
/**
39+
* Constructs an evaluator with the given judges and configuration.
40+
*
41+
* @param judges a map from judge config key to {@link Judge} instance; must not be {@code null}
42+
* @param judgeConfiguration the judge configuration listing which judges to run and their sampling
43+
* rates; must not be {@code null}
44+
* @param logger the logger; must not be {@code null}
45+
*/
46+
public Evaluator(Map<String, Judge> judges, JudgeConfiguration judgeConfiguration, LDLogger logger) {
47+
this.judges = Objects.requireNonNull(judges, "judges");
48+
this.judgeConfiguration = Objects.requireNonNull(judgeConfiguration, "judgeConfiguration");
49+
this.logger = Objects.requireNonNull(logger, "logger");
50+
this.isNoop = false;
51+
}
52+
53+
/**
54+
* Returns the shared noop evaluator, which immediately returns an empty result list without
55+
* logging any warnings.
56+
*
57+
* @return the noop singleton, never {@code null}
58+
*/
59+
public static Evaluator noop() {
60+
return NOOP;
61+
}
62+
63+
/**
64+
* Runs all configured judges against the given input/output pair and returns their results.
65+
* <p>
66+
* When this is the noop evaluator, returns a completed future holding an empty list immediately.
67+
* Otherwise, judges are run sequentially in the order specified by the {@link JudgeConfiguration}.
68+
* Judges referenced in the configuration but absent from the judges map are skipped with a
69+
* warning; this is not an error.
70+
* <p>
71+
* This method does NOT call {@code trackJudgeResult} — that is the caller's responsibility.
72+
*
73+
* @param input the message history or prompt that was sent to the model
74+
* @param output the model's response to evaluate
75+
* @return a completed future holding the list of judge results; never {@code null}
76+
*/
77+
public CompletableFuture<List<JudgeResult>> evaluate(String input, String output) {
78+
if (isNoop) {
79+
return CompletableFuture.completedFuture(Collections.emptyList());
80+
}
81+
82+
List<JudgeResult> results = new ArrayList<>();
83+
for (JudgeConfiguration.Judge entry : judgeConfiguration.getJudges()) {
84+
Judge judge = judges.get(entry.getKey());
85+
if (judge == null) {
86+
logger.warn("Evaluator: no judge found for key '{}', skipping", entry.getKey());
87+
continue;
88+
}
89+
results.add(judge.evaluate(input, output, entry.getSamplingRate()));
90+
}
91+
return CompletableFuture.completedFuture(results);
92+
}
93+
}
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
package com.launchdarkly.sdk.server.ai;
2+
3+
import com.launchdarkly.logging.LDLogger;
4+
import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
5+
import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
6+
7+
import java.util.List;
8+
import java.util.Map;
9+
import java.util.Objects;
10+
import java.util.concurrent.ThreadLocalRandom;
11+
import java.util.stream.Collectors;
12+
13+
/**
14+
* Evaluates an AI model output against a judge prompt, returning a scored {@link JudgeResult}.
15+
* <p>
16+
* A {@code Judge} wraps an {@link AIJudgeConfig} and a {@link Runner}. Each call to
17+
* {@link #evaluate} or {@link #evaluateMessages} invokes the runner with a formatted evaluation
18+
* prompt and parses the structured {@code {score, reasoning}} response. Evaluation can be sampled
19+
* to reduce cost: pass a {@code samplingRate} of {@code 0.0} to always skip, or {@code 1.0} to
20+
* always run.
21+
* <p>
22+
* Instances are immutable and thread-safe.
23+
*/
24+
public final class Judge {
25+
/**
26+
* JSON-Schema fragment sent to the runner as the {@code outputType}, requesting structured
27+
* {@code {score, reasoning}} output.
28+
*/
29+
private static final Map<String, Object> EVALUATION_SCHEMA = Map.of(
30+
"type", "object",
31+
"properties", Map.of(
32+
"score", Map.of("type", "number"),
33+
"reasoning", Map.of("type", "string")),
34+
"required", List.of("score", "reasoning"));
35+
36+
private final AIJudgeConfig config;
37+
private final Runner runner;
38+
private final LDLogger logger;
39+
40+
/**
41+
* Constructs a judge.
42+
*
43+
* @param config the judge AI Config; must not be {@code null}
44+
* @param runner the runner to invoke; must not be {@code null}
45+
* @param logger the logger; must not be {@code null}
46+
*/
47+
public Judge(AIJudgeConfig config, Runner runner, LDLogger logger) {
48+
this.config = Objects.requireNonNull(config, "config");
49+
this.runner = Objects.requireNonNull(runner, "runner");
50+
this.logger = Objects.requireNonNull(logger, "logger");
51+
}
52+
53+
/**
54+
* Evaluates the given input/output pair, always running (sampling rate {@code 1.0}).
55+
*
56+
* @param input the message history or prompt that was sent to the model
57+
* @param output the model's response to evaluate
58+
* @return the evaluation result; never {@code null}
59+
*/
60+
public JudgeResult evaluate(String input, String output) {
61+
return evaluate(input, output, 1.0);
62+
}
63+
64+
/**
65+
* Evaluates the given input/output pair, subject to the given sampling rate.
66+
*
67+
* @param input the message history or prompt that was sent to the model
68+
* @param output the model's response to evaluate
69+
* @param samplingRate the fraction of evaluations to actually run; {@code 0.0} always skips,
70+
* {@code 1.0} always runs
71+
* @return the evaluation result; never {@code null}
72+
*/
73+
public JudgeResult evaluate(String input, String output, double samplingRate) {
74+
if (ThreadLocalRandom.current().nextDouble() >= samplingRate) {
75+
return JudgeResult.builder()
76+
.sampled(false)
77+
.success(false)
78+
.judgeConfigKey(config.getKey())
79+
.metricKey(config.getEvaluationMetricKey())
80+
.build();
81+
}
82+
83+
String formatted = "MESSAGE HISTORY:\n" + input + "\n\nRESPONSE TO EVALUATE:\n" + output;
84+
LDAIConfigTracker tracker = config.createTracker();
85+
86+
RunnerResult result;
87+
try {
88+
result = tracker.trackMetricsOf(RunnerResult::getMetrics, () -> runner.run(formatted, EVALUATION_SCHEMA));
89+
} catch (Exception ex) {
90+
return JudgeResult.builder()
91+
.sampled(true)
92+
.success(false)
93+
.judgeConfigKey(config.getKey())
94+
.metricKey(config.getEvaluationMetricKey())
95+
.errorMessage(ex.getMessage())
96+
.build();
97+
}
98+
99+
Map<String, Object> parsed = result.getParsed();
100+
if (parsed == null) {
101+
logger.warn("Judge {}: runner returned null parsed output", config.getKey());
102+
return JudgeResult.builder()
103+
.sampled(true)
104+
.success(false)
105+
.judgeConfigKey(config.getKey())
106+
.metricKey(config.getEvaluationMetricKey())
107+
.build();
108+
}
109+
110+
Object scoreRaw = parsed.get("score");
111+
if (!(scoreRaw instanceof Number)) {
112+
logger.warn("Judge {}: parsed output missing numeric score", config.getKey());
113+
return JudgeResult.builder()
114+
.sampled(true)
115+
.success(false)
116+
.judgeConfigKey(config.getKey())
117+
.metricKey(config.getEvaluationMetricKey())
118+
.build();
119+
}
120+
double score = ((Number) scoreRaw).doubleValue();
121+
if (score < 0.0 || score > 1.0) {
122+
logger.warn("Judge {}: score {} is outside [0.0, 1.0]", config.getKey(), score);
123+
return JudgeResult.builder()
124+
.sampled(true)
125+
.success(false)
126+
.judgeConfigKey(config.getKey())
127+
.metricKey(config.getEvaluationMetricKey())
128+
.build();
129+
}
130+
131+
JudgeResult.Builder resultBuilder = JudgeResult.builder()
132+
.sampled(true)
133+
.success(true)
134+
.judgeConfigKey(config.getKey())
135+
.metricKey(config.getEvaluationMetricKey())
136+
.score(score);
137+
138+
Object reasoningRaw = parsed.get("reasoning");
139+
if (reasoningRaw instanceof String) {
140+
resultBuilder.reasoning((String) reasoningRaw);
141+
} else if (reasoningRaw != null) {
142+
logger.warn("Judge {}: reasoning is not a string, ignoring", config.getKey());
143+
}
144+
145+
return resultBuilder.build();
146+
}
147+
148+
/**
149+
* Evaluates a message list and runner response, always running (sampling rate {@code 1.0}).
150+
* <p>
151+
* Messages are formatted as {@code role: content} lines, joined by newlines.
152+
*
153+
* @param messages the messages that were sent to the model
154+
* @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated
155+
* @return the evaluation result; never {@code null}
156+
*/
157+
public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response) {
158+
return evaluateMessages(messages, response, 1.0);
159+
}
160+
161+
/**
162+
* Evaluates a message list and runner response, subject to the given sampling rate.
163+
* <p>
164+
* Messages are formatted as {@code role: content} lines, joined by newlines.
165+
*
166+
* @param messages the messages that were sent to the model
167+
* @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated
168+
* @param samplingRate the fraction of evaluations to actually run
169+
* @return the evaluation result; never {@code null}
170+
*/
171+
public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response, double samplingRate) {
172+
String formattedMessages = messages == null ? "" : messages.stream()
173+
.map(m -> m.getRole().getWireValue() + ": " + m.getContent())
174+
.collect(Collectors.joining("\n"));
175+
return evaluate(formattedMessages, response == null ? "" : response.getContent(), samplingRate);
176+
}
177+
178+
/**
179+
* Returns the judge AI Config this instance was constructed with.
180+
*
181+
* @return the judge config, never {@code null}
182+
*/
183+
public AIJudgeConfig getConfig() {
184+
return config;
185+
}
186+
187+
/**
188+
* Returns the runner this instance was constructed with.
189+
*
190+
* @return the runner, never {@code null}
191+
*/
192+
public Runner getRunner() {
193+
return runner;
194+
}
195+
}

0 commit comments

Comments
 (0)