Skip to content

Commit 4a9d03d

Browse files
authored
feat: add AI Config judge support (#345)
**Requirements** - [X] I have added test coverage for new or changed functionality - [X] I have followed the repository's [pull request submission guidelines](../blob/v5/CONTRIBUTING.md#submitting-pull-requests) - [X] I have validated my changes against all supported platform versions **Related issues** See https://docs.google.com/document/d/1lzYwQqCcTzN_2zkxJZDfJtgUcEJ4jbpx0KSsJ2bRENw/edit?tab=t.0#heading=h.5d8l30brvyuw for context. For other SDK implementations, see: - launchdarkly/js-core#1073 - launchdarkly/python-server-sdk-ai#86 & launchdarkly/python-server-sdk-ai#64 **Describe the solution you've provided** This change extends the Go SDK to support AI Config evaluations, including custom evaluators. The implementation is intended to be consistent with the Python and Node implementations. Changes were verified with a local test app; [the resultant data can be observed in the evaluator metrics for this AI config](https://ld-stg.launchdarkly.com/projects/default/ai-configs/kf-comp-feb-3/monitoring?from_ts=1770094800000&to_ts=1770353999999&env=staging&selected-env=staging&chartTypes=Tokens%2CSatisfaction%2CGenerations%2CTime+to+generate%2CError+rate%2CTime+to+first+token%2CCosts%2CEvaluator+metrics+%28avg%29). **Describe alternatives you've considered** Provide a clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context about the pull request here. <!-- CURSOR_SUMMARY --> --- > [!NOTE] > **Medium Risk** > Adds new evaluation and metric-tracking paths (including dynamic metric keys and new event payload fields), which could affect analytics correctness and runtime behavior if misconfigured. Changes are well-covered by tests but touch core SDK tracking surfaces. 
> > **Overview** > Adds **judge-mode support** to AI Configs by extending the config datamodel and builder with `mode`, `evaluationMetricKey`/`evaluationMetricKeys`, and `judgeConfiguration` (with defensive copying to keep configs immutable). > > Introduces `Client.JudgeConfig` to fetch judge configs while preserving `{{message_history}}` / `{{response_to_evaluate}}` placeholders for a second Mustache interpolation pass during evaluation, and adds a new `ldai/judge` package that samples, interpolates, invokes a structured provider, and parses judge responses. > > Extends `Tracker` with `TrackJudgeResponse` to emit evaluation scores as metrics (including optional `judgeConfigKey` in event data), and adds comprehensive tests covering parsing, placeholder preservation, schema generation, sampling, and response validation. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 41141b9. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY -->
1 parent 0956ce9 commit 4a9d03d

8 files changed

Lines changed: 1628 additions & 7 deletions

File tree

ldai/client.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,17 @@ func (c *Client) Config(
7171
variables map[string]interface{},
7272
) (Config, *Tracker) {
7373
_ = c.sdk.TrackMetric("$ld:ai:config:function:single", context, 1, ldvalue.String(key))
74+
return c.evaluateConfig(key, context, defaultValue, variables)
75+
}
7476

77+
// evaluateConfig fetches and interpolates an AI Config without emitting any metric.
78+
// Callers (Config, JudgeConfig) are meant to emit their own metric before calling this.
79+
func (c *Client) evaluateConfig(
80+
key string,
81+
context ldcontext.Context,
82+
defaultValue Config,
83+
variables map[string]interface{},
84+
) (Config, *Tracker) {
7585
result, _ := c.sdk.JSONVariation(key, context, defaultValue.AsLdValue())
7686

7787
// The spec requires the config to at least be an object (although all properties are optional, so it may be an
@@ -102,7 +112,11 @@ func (c *Client) Config(
102112
builder := NewConfig().
103113
WithModelName(parsed.Model.Name).
104114
WithProviderName(parsed.Provider.Name).
105-
WithEnabled(parsed.Meta.Enabled)
115+
WithEnabled(parsed.Meta.Enabled).
116+
WithMode(parsed.Mode).
117+
WithEvaluationMetricKey(parsed.EvaluationMetricKey).
118+
WithEvaluationMetricKeys(parsed.EvaluationMetricKeys).
119+
WithJudgeConfiguration(parsed.JudgeConfiguration)
106120

107121
for k, v := range parsed.Model.Parameters {
108122
builder.WithModelParam(k, v)
@@ -174,3 +188,38 @@ func interpolateTemplate(template string, variables map[string]interface{}) (str
174188
}
175189
return m.RenderString(variables)
176190
}
191+
192+
// JudgeConfig evaluates an AI Config, tracking it as a judge function. See Config for details.
193+
//
194+
// This method extends the provided variables with reserved judge variables:
195+
// - "message_history": "{{message_history}}"
196+
// - "response_to_evaluate": "{{response_to_evaluate}}"
197+
//
198+
// These literal placeholder strings preserve the Mustache templates through the first interpolation
199+
// (during config fetch), allowing Judge.Evaluate() to perform a second interpolation with actual values.
200+
func (c *Client) JudgeConfig(
201+
key string,
202+
context ldcontext.Context,
203+
defaultValue Config,
204+
variables map[string]interface{},
205+
) (Config, *Tracker) {
206+
_ = c.sdk.TrackMetric("$ld:ai:judge:function:single", context, 1, ldvalue.String(key))
207+
208+
// Extend variables with reserved judge placeholders
209+
extendedVariables := make(map[string]interface{})
210+
for k, v := range variables {
211+
// Warn if user tries to override reserved variables
212+
if k == "message_history" || k == "response_to_evaluate" {
213+
c.logger.Warnf("AI Config '%s': variable '%s' is reserved by judge and will be ignored", key, k)
214+
continue
215+
}
216+
extendedVariables[k] = v
217+
}
218+
219+
// Inject reserved variables as literal placeholder strings
220+
// These will be preserved through the first interpolation and resolved during Judge.Evaluate()
221+
extendedVariables["message_history"] = "{{message_history}}"
222+
extendedVariables["response_to_evaluate"] = "{{response_to_evaluate}}"
223+
224+
return c.evaluateConfig(key, context, defaultValue, extendedVariables)
225+
}

ldai/client_test.go

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,42 @@ func TestConfigMethodTracking(t *testing.T) {
329329
assert.ElementsMatch(t, expectedEvents, mockSDK.events)
330330
}
331331

332+
// TestJudgeConfigMethodTracking verifies that JudgeConfig emits only the judge metric,
333+
// not the config metric, so judge evaluations are not double-counted on the dashboard.
334+
func TestJudgeConfigMethodTracking(t *testing.T) {
335+
json := []byte(`{
336+
"_ldMeta": {"variationKey": "1", "enabled": true},
337+
"mode": "judge",
338+
"evaluationMetricKey": "toxicity",
339+
"messages": [{"content": "test", "role": "system"}]
340+
}`)
341+
mockSDK := newMockSDK(json, nil)
342+
client, err := NewClient(mockSDK)
343+
require.NoError(t, err)
344+
require.NotNil(t, client)
345+
346+
defaultConfig := Disabled()
347+
context := ldcontext.New("user-key")
348+
configKey := "judge-config-key"
349+
350+
config, tracker := client.JudgeConfig(configKey, context, defaultConfig, nil)
351+
352+
require.NotNil(t, config)
353+
require.NotNil(t, tracker)
354+
355+
// Only the judge metric should be emitted; evaluateConfig does not emit any metric.
356+
expectedEvents := []mockEvent{
357+
{
358+
eventName: "$ld:ai:judge:function:single",
359+
context: context,
360+
metricValue: 1,
361+
data: ldvalue.String(configKey),
362+
},
363+
}
364+
assert.ElementsMatch(t, expectedEvents, mockSDK.events,
365+
"JudgeConfig must not emit $ld:ai:config:function:single to avoid double-counting")
366+
}
367+
332368
func TestCanSetModelParameters(t *testing.T) {
333369
client, err := NewClient(newMockSDK(nil, nil))
334370
require.NoError(t, err)
@@ -579,3 +615,167 @@ func TestInterpolation(t *testing.T) {
579615
assert.Equal(t, "user_kind=<>,cat_kind=<>", result)
580616
})
581617
}
618+
619+
func TestParseJudgeSpecificFields(t *testing.T) {
620+
json := []byte(`{
621+
"_ldMeta": {"variationKey": "1", "enabled": true},
622+
"mode": "judge",
623+
"evaluationMetricKey": "toxicity",
624+
"judgeConfiguration": {
625+
"judges": [
626+
{"key": "judge1", "samplingRate": 0.5},
627+
{"key": "judge2", "samplingRate": 1.0}
628+
]
629+
},
630+
"messages": [
631+
{"content": "test", "role": "system"}
632+
]
633+
}`)
634+
635+
client, err := NewClient(newMockSDK(json, nil))
636+
require.NoError(t, err)
637+
require.NotNil(t, client)
638+
639+
cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil)
640+
641+
assert.Equal(t, "judge", cfg.Mode())
642+
assert.Equal(t, "toxicity", cfg.EvaluationMetricKey())
643+
644+
judgeConfig := cfg.JudgeConfiguration()
645+
require.NotNil(t, judgeConfig)
646+
require.Len(t, judgeConfig.Judges, 2)
647+
assert.Equal(t, "judge1", judgeConfig.Judges[0].Key)
648+
assert.Equal(t, 0.5, judgeConfig.Judges[0].SamplingRate)
649+
assert.Equal(t, "judge2", judgeConfig.Judges[1].Key)
650+
assert.Equal(t, 1.0, judgeConfig.Judges[1].SamplingRate)
651+
}
652+
653+
func TestParseEvaluationMetricKeys(t *testing.T) {
654+
json := []byte(`{
655+
"_ldMeta": {"variationKey": "1", "enabled": true},
656+
"mode": "judge",
657+
"evaluationMetricKeys": ["relevance", "accuracy"],
658+
"messages": [
659+
{"content": "test", "role": "system"}
660+
]
661+
}`)
662+
663+
client, err := NewClient(newMockSDK(json, nil))
664+
require.NoError(t, err)
665+
require.NotNil(t, client)
666+
667+
cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil)
668+
669+
assert.Equal(t, "judge", cfg.Mode())
670+
assert.Equal(t, "", cfg.EvaluationMetricKey())
671+
assert.Equal(t, []string{"relevance", "accuracy"}, cfg.EvaluationMetricKeys())
672+
}
673+
674+
func TestParseEvaluationMetricKeyPriority(t *testing.T) {
675+
json := []byte(`{
676+
"_ldMeta": {"variationKey": "1", "enabled": true},
677+
"mode": "judge",
678+
"evaluationMetricKey": "toxicity",
679+
"evaluationMetricKeys": ["relevance", "accuracy"],
680+
"messages": [
681+
{"content": "test", "role": "system"}
682+
]
683+
}`)
684+
685+
client, err := NewClient(newMockSDK(json, nil))
686+
require.NoError(t, err)
687+
require.NotNil(t, client)
688+
689+
cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil)
690+
691+
assert.Equal(t, "judge", cfg.Mode())
692+
// Both fields should be parsed
693+
assert.Equal(t, "toxicity", cfg.EvaluationMetricKey())
694+
assert.Equal(t, []string{"relevance", "accuracy"}, cfg.EvaluationMetricKeys())
695+
}
696+
697+
func TestJudgeConfigurationImmutable(t *testing.T) {
698+
// Test that mutations to JudgeConfiguration don't affect the Config
699+
judgeConfig := &datamodel.JudgeConfiguration{
700+
Judges: []datamodel.Judge{
701+
{Key: "judge1", SamplingRate: 0.5},
702+
{Key: "judge2", SamplingRate: 1.0},
703+
},
704+
}
705+
706+
builder := NewConfig().
707+
Enable().
708+
WithJudgeConfiguration(judgeConfig)
709+
cfg := builder.Build()
710+
711+
// Mutate the original
712+
judgeConfig.Judges[0].Key = "mutated"
713+
judgeConfig.Judges = append(judgeConfig.Judges, datamodel.Judge{Key: "judge3", SamplingRate: 0.3})
714+
715+
// Config should not be affected
716+
retrieved := cfg.JudgeConfiguration()
717+
require.NotNil(t, retrieved)
718+
require.Len(t, retrieved.Judges, 2)
719+
assert.Equal(t, "judge1", retrieved.Judges[0].Key) // Should still be original value
720+
assert.Equal(t, "judge2", retrieved.Judges[1].Key)
721+
722+
// Mutate the retrieved config
723+
retrieved.Judges[0].Key = "mutated_again"
724+
retrieved.Judges = append(retrieved.Judges, datamodel.Judge{Key: "judge4", SamplingRate: 0.4})
725+
726+
// Config should still not be affected
727+
retrieved2 := cfg.JudgeConfiguration()
728+
require.NotNil(t, retrieved2)
729+
require.Len(t, retrieved2.Judges, 2)
730+
assert.Equal(t, "judge1", retrieved2.Judges[0].Key) // Should still be original value
731+
assert.Equal(t, "judge2", retrieved2.Judges[1].Key)
732+
}
733+
734+
// TestJudgeConfig_PreservesReservedPlaceholders verifies that JudgeConfig injects reserved variables
735+
// so that {{message_history}} and {{response_to_evaluate}} are preserved for the second interpolation
736+
// pass during Judge.Evaluate(). Without this, Config's first Mustache pass would render them as empty.
737+
func TestJudgeConfig_PreservesReservedPlaceholders(t *testing.T) {
738+
json := []byte(`{
739+
"_ldMeta": {"variationKey": "1", "enabled": true},
740+
"mode": "judge",
741+
"evaluationMetricKey": "toxicity",
742+
"messages": [
743+
{"content": "You are a judge.", "role": "system"},
744+
{"content": "Input: {{message_history}}\nOutput: {{response_to_evaluate}}", "role": "user"}
745+
]
746+
}`)
747+
748+
client, err := NewClient(newMockSDK(json, nil))
749+
require.NoError(t, err)
750+
require.NotNil(t, client)
751+
752+
cfg, _ := client.JudgeConfig("judge-key", ldcontext.New("user"), Disabled(), nil)
753+
754+
msgs := cfg.Messages()
755+
require.Len(t, msgs, 2)
756+
assert.Equal(t, "You are a judge.", msgs[0].Content)
757+
assert.Contains(t, msgs[1].Content, "{{message_history}}", "JudgeConfig must preserve placeholder for second interpolation")
758+
assert.Contains(t, msgs[1].Content, "{{response_to_evaluate}}", "JudgeConfig must preserve placeholder for second interpolation")
759+
assert.Equal(t, "Input: {{message_history}}\nOutput: {{response_to_evaluate}}", msgs[1].Content)
760+
}
761+
762+
// TestConfig_WithoutReservedVarsWipesJudgePlaceholders documents that Config (without reserved vars)
763+
// renders {{message_history}} and {{response_to_evaluate}} as empty when used for judge templates.
764+
func TestConfig_WithoutReservedVarsWipesJudgePlaceholders(t *testing.T) {
765+
json := []byte(`{
766+
"_ldMeta": {"variationKey": "1", "enabled": true},
767+
"messages": [
768+
{"content": "Input: {{message_history}}\nOutput: {{response_to_evaluate}}", "role": "user"}
769+
]
770+
}`)
771+
772+
client, err := NewClient(newMockSDK(json, nil))
773+
require.NoError(t, err)
774+
require.NotNil(t, client)
775+
776+
cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil)
777+
778+
msgs := cfg.Messages()
779+
require.Len(t, msgs, 1)
780+
assert.Equal(t, "Input: \nOutput: ", msgs[0].Content, "Config without reserved vars renders placeholders as empty")
781+
}

ldai/config.go

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,33 @@ func (c *Config) CustomModelParam(key string) (ldvalue.Value, bool) {
6060
return val, ok
6161
}
6262

63+
// Mode returns the AI Config mode (e.g., "completion", "agent", "judge").
// The value is the Mode field of the underlying datamodel config, returned as-is.
64+
func (c *Config) Mode() string {
65+
return c.c.Mode
66+
}
67+
68+
// EvaluationMetricKey returns the evaluation metric key for judge mode configs.
// The value is the EvaluationMetricKey field of the underlying datamodel config, returned as-is.
69+
func (c *Config) EvaluationMetricKey() string {
70+
return c.c.EvaluationMetricKey
71+
}
72+
73+
// EvaluationMetricKeys returns the array of evaluation metric keys.
// The returned slice is a clone, so modifying it does not affect the Config.
74+
//
// Deprecated: Use EvaluationMetricKey instead.
75+
func (c *Config) EvaluationMetricKeys() []string {
76+
return slices.Clone(c.c.EvaluationMetricKeys)
77+
}
78+
79+
// JudgeConfiguration returns the judge configuration attached to this config, if any.
80+
// Returns a defensive copy to prevent mutations.
81+
func (c *Config) JudgeConfiguration() *datamodel.JudgeConfiguration {
82+
if c.c.JudgeConfiguration == nil {
83+
return nil
84+
}
85+
return &datamodel.JudgeConfiguration{
86+
Judges: slices.Clone(c.c.JudgeConfiguration.Judges),
87+
}
88+
}
89+
6390
// AsLdValue is used internally.
6491
func (c *Config) AsLdValue() ldvalue.Value {
6592
return ldvalue.FromJSONMarshal(c.c)
@@ -68,12 +95,16 @@ func (c *Config) AsLdValue() ldvalue.Value {
6895
// ConfigBuilder is used to define a default AI Config, returned when LaunchDarkly is unreachable or there
6996
// is an error evaluating the Config.
7097
type ConfigBuilder struct {
71-
messages []datamodel.Message
72-
enabled bool
73-
providerName string
74-
modelName string
75-
modelParams map[string]ldvalue.Value
76-
modelCustomParams map[string]ldvalue.Value
98+
messages []datamodel.Message
99+
enabled bool
100+
providerName string
101+
modelName string
102+
modelParams map[string]ldvalue.Value
103+
modelCustomParams map[string]ldvalue.Value
104+
mode string
105+
evaluationMetricKey string
106+
evaluationMetricKeys []string
107+
judgeConfiguration *datamodel.JudgeConfiguration
77108
}
78109

79110
// NewConfig returns a new ConfigBuilder. By default, the Config is disabled.
@@ -141,8 +172,47 @@ func (cb *ConfigBuilder) WithCustomModelParam(key string, value ldvalue.Value) *
141172
return cb
142173
}
143174

175+
// WithMode sets the AI Config mode (e.g., "completion", "agent", "judge").
// It returns the same builder so that calls can be chained.
176+
func (cb *ConfigBuilder) WithMode(mode string) *ConfigBuilder {
177+
cb.mode = mode
178+
return cb
179+
}
180+
181+
// WithEvaluationMetricKey sets the evaluation metric key for judge mode configs.
// It returns the same builder so that calls can be chained.
182+
func (cb *ConfigBuilder) WithEvaluationMetricKey(key string) *ConfigBuilder {
183+
cb.evaluationMetricKey = key
184+
return cb
185+
}
186+
187+
// WithEvaluationMetricKeys sets the array of evaluation metric keys. The provided slice is
// cloned, so later changes to it do not affect the builder. It returns the same builder so
// that calls can be chained.
188+
//
// Deprecated: Use WithEvaluationMetricKey instead.
189+
func (cb *ConfigBuilder) WithEvaluationMetricKeys(keys []string) *ConfigBuilder {
190+
cb.evaluationMetricKeys = slices.Clone(keys)
191+
return cb
192+
}
193+
194+
// WithJudgeConfiguration sets the judge configuration for this config.
195+
// The provided judgeConfig is defensively copied.
196+
func (cb *ConfigBuilder) WithJudgeConfiguration(judgeConfig *datamodel.JudgeConfiguration) *ConfigBuilder {
197+
if judgeConfig == nil {
198+
cb.judgeConfiguration = nil
199+
return cb
200+
}
201+
cb.judgeConfiguration = &datamodel.JudgeConfiguration{
202+
Judges: slices.Clone(judgeConfig.Judges),
203+
}
204+
return cb
205+
}
206+
144207
// Build creates a Config from the current builder state.
145208
func (cb *ConfigBuilder) Build() Config {
209+
var judgeConfig *datamodel.JudgeConfiguration
210+
if cb.judgeConfiguration != nil {
211+
judgeConfig = &datamodel.JudgeConfiguration{
212+
Judges: slices.Clone(cb.judgeConfiguration.Judges),
213+
}
214+
}
215+
146216
return Config{
147217
c: datamodel.Config{
148218
Messages: slices.Clone(cb.messages),
@@ -157,6 +227,10 @@ func (cb *ConfigBuilder) Build() Config {
157227
Provider: datamodel.Provider{
158228
Name: cb.providerName,
159229
},
230+
Mode: cb.mode,
231+
EvaluationMetricKey: cb.evaluationMetricKey,
232+
EvaluationMetricKeys: slices.Clone(cb.evaluationMetricKeys),
233+
JudgeConfiguration: judgeConfig,
160234
},
161235
}
162236
}

0 commit comments

Comments
 (0)