
Commit 420e7de

Changed: Use NewEval() instead of RunEval()
1 parent: 7da6d76

9 files changed: 52 additions & 36 deletions


README.md

Lines changed: 4 additions & 5 deletions
@@ -139,8 +139,8 @@ func main() {
 		log.Fatal(err)
 	}
 
-	// Define a reusable eval (task + scorers)
-	greetingEval := &eval.Eval[string, string]{
+	// Create an eval
+	e := braintrust.NewEval(client, &eval.Eval[string, string]{
 		Name: "greeting-experiment",
 		Task: eval.T(func(ctx context.Context, input string) (string, error) {
 			return "Hello " + input, nil
@@ -154,11 +154,10 @@ func main() {
 				return eval.S(score), nil
 			}),
 		},
-	}
+	})
 
 	// Run against a dataset
-	evaluator := braintrust.NewEvaluator[string, string](client)
-	_, err = evaluator.RunEval(ctx, greetingEval, eval.RunOpts[string, string]{
+	_, err = e.Run(ctx, eval.RunOpts[string, string]{
 		Dataset: eval.NewDataset([]eval.Case[string, string]{
 			{Input: "World", Expected: "Hello World"},
 			{Input: "Alice", Expected: "Hello Alice"},

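Net effect of the README change, reduced to the call shape: a minimal sketch, assuming a client from braintrust.New, with greetingEval and dataset standing in for the values built in the surrounding README code.

// Old shape, removed by this commit: a separate Evaluator runs the Eval.
evaluator := braintrust.NewEvaluator[string, string](client)
_, err = evaluator.RunEval(ctx, greetingEval, eval.RunOpts[string, string]{Dataset: dataset})

// New shape: NewEval binds the definition to the client; Run executes it.
e := braintrust.NewEval(client, greetingEval)
_, err = e.Run(ctx, eval.RunOpts[string, string]{Dataset: dataset})
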
client.go

Lines changed: 11 additions & 10 deletions
@@ -191,23 +191,24 @@ func (c *Client) Tracer(name string, opts ...oteltrace.TracerOption) oteltrace.T
 	return c.tracerProvider.Tracer(name, opts...)
 }
 
-// NewEvaluator creates a new evaluator for running evaluations with the same
-// input and output types.
+// NewEval creates a runnable [eval.Eval] by combining a client with an eval definition.
 //
 // Example:
 //
 //	client, _ := braintrust.New(tp)
-//	evaluator := braintrust.NewEvaluator[string, string](client)
-//
-//	// Define a reusable eval, then run it with different datasets
-//	myEval := &eval.Eval[string, string]{
-//		Name: "my-eval",
+//	e := braintrust.NewEval(client, &eval.Eval[string, string]{
+//		Name: "classify",
 //		Task: task,
 //		Scorers: scorers,
-//	}
-//	result, _ := evaluator.RunEval(ctx, myEval, eval.RunOpts[string, string]{
-//		Dataset: dataset,
 //	})
+//	result, _ := e.Run(ctx, eval.RunOpts[string, string]{Dataset: dataset})
+func NewEval[I, R any](client *Client, e *eval.Eval[I, R]) *eval.Eval[I, R] {
+	evaluator := eval.NewEvaluator[I, R](client.session, client.tracerProvider, client.API(), client.config.DefaultProjectName)
+	return eval.NewEval(evaluator, e)
+}
+
+// NewEvaluator creates a new evaluator for running evaluations with the same
+// input and output types.
 func NewEvaluator[I, R any](client *Client) *eval.Evaluator[I, R] {
 	return eval.NewEvaluator[I, R](client.session, client.tracerProvider, client.API(), client.config.DefaultProjectName)
 }

eval/eval.go

Lines changed: 19 additions & 2 deletions
@@ -107,7 +107,8 @@ type CaseProgress struct {
 }
 
 // Eval defines an evaluation: the task to run and the scorers to apply.
-// Run it via [Evaluator.RunEval] or register it with a remote eval server.
+// Create one with [braintrust.NewEval], then call [Eval.Run] to execute it
+// or pass it to a remote eval server.
 type Eval[I, R any] struct {
 	// Name is the eval name. Used as the default experiment name and as
 	// the registration key when registered with a remote eval server.
@@ -120,8 +121,24 @@ type Eval[I, R any] struct {
 	Scorers []Scorer[I, R]
 
 	// ProjectName is the Braintrust project for this eval.
-	// Optional; falls back to the Evaluator's default project.
+	// Optional; falls back to the default project from the client.
 	ProjectName string
+
+	// evaluator holds the infrastructure (session, tracer, API client)
+	// needed to run the eval. Set by NewEval / braintrust.NewEval.
+	evaluator *Evaluator[I, R]
+}
+
+// NewEval creates a runnable Eval by attaching an [Evaluator] as the default
+// runner. Users should call braintrust.NewEval rather than this directly.
+func NewEval[I, R any](evaluator *Evaluator[I, R], e *Eval[I, R]) *Eval[I, R] {
+	e.evaluator = evaluator
+	return e
+}
+
+// Run executes the evaluation using the default [Evaluator].
+func (e *Eval[I, R]) Run(ctx context.Context, opts RunOpts[I, R]) (*Result, error) {
+	return e.evaluator.Run(ctx, mergeOpts(e, opts))
 }
 
 // RunOpts configures a single evaluation run. These vary per invocation;

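Because Eval.Run passes the per-run options through mergeOpts, one bound Eval can be executed repeatedly with different run-level settings. A short sketch of that reuse (error handling elided), assuming client is a *braintrust.Client and classifyEval, fruitCases, and veggieCases are placeholder values:

e := braintrust.NewEval(client, classifyEval)

// First run: the Eval's Name doubles as the default experiment name.
_, err := e.Run(ctx, eval.RunOpts[string, string]{
	Dataset: eval.NewDataset(fruitCases),
})

// Second run: same task and scorers, different dataset and experiment name.
_, err = e.Run(ctx, eval.RunOpts[string, string]{
	Experiment: "classify-v2",
	Dataset:    eval.NewDataset(veggieCases),
})
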
eval/eval_integration_test.go

Lines changed: 4 additions & 3 deletions
@@ -728,8 +728,8 @@ func TestEval_NoProjectName(t *testing.T) {
 	assert.Contains(t, err.Error(), "project name is required")
 }
 
-// TestRunEval_Integration tests RunEval with a reusable Eval definition.
-func TestRunEval_Integration(t *testing.T) {
+// TestEvalRun_Integration tests Eval.Run with a reusable Eval definition.
+func TestEvalRun_Integration(t *testing.T) {
 	session, apiClient := setupIntegrationTest(t)
 	t.Parallel()
 
@@ -758,7 +758,8 @@ func TestRunEval_Integration(t *testing.T) {
 	defer func() { _ = tp.Shutdown(ctx) }()
 
 	evaluator := NewEvaluator[string, string](session, tp, apiClient, cfg.DefaultProjectName)
-	result, err := evaluator.RunEval(ctx, classify, RunOpts[string, string]{
+	e := NewEval(evaluator, classify)
+	result, err := e.Run(ctx, RunOpts[string, string]{
 		Dataset: NewDataset([]Case[string, string]{
 			{Input: "apple", Expected: "category-apple"},
 			{Input: "banana", Expected: "category-banana"},

eval/eval_runopts_test.go

Lines changed: 4 additions & 4 deletions
@@ -159,10 +159,10 @@ func TestMergeOpts_EvalReuse(t *testing.T) {
 	assert.Equal(t, "base-project", ev.ProjectName)
 }
 
-// TestRunEval_Success verifies that RunEval produces the same span structure as
-// the equivalent Run call. It uses the same testNewEval path as other unit tests
-// to avoid needing a real API client for experiment registration.
-func TestRunEval_Success(t *testing.T) {
+// TestEvalRun_Success verifies that Eval.Run produces the same span structure as
+// the equivalent Evaluator.Run call. It uses the same testNewEval path as other
+// unit tests to avoid needing a real API client for experiment registration.
+func TestEvalRun_Success(t *testing.T) {
 	t.Parallel()
 
 	task := T(func(_ context.Context, in testInput) (testOutput, error) {

eval/evaluator.go

Lines changed: 0 additions & 5 deletions
@@ -52,8 +52,3 @@ func (e *Evaluator[I, R]) Datasets() *DatasetAPI[I, R] {
 func (e *Evaluator[I, R]) Run(ctx context.Context, opts Opts[I, R]) (*Result, error) {
 	return run(ctx, opts, e.session, e.tracerProvider, e.api, e.defaultProjectName)
 }
-
-// RunEval executes an evaluation from a reusable [Eval] definition.
-func (e *Evaluator[I, R]) RunEval(ctx context.Context, ev *Eval[I, R], opts RunOpts[I, R]) (*Result, error) {
-	return run(ctx, mergeOpts(ev, opts), e.session, e.tracerProvider, e.api, e.defaultProjectName)
-}

eval/example_test.go

Lines changed: 5 additions & 6 deletions
@@ -77,8 +77,8 @@ func Example_evalDefinition() {
 		log.Fatal(err)
 	}
 
-	// Define a reusable eval (task + scorers + project)
-	classify := &eval.Eval[string, string]{
+	// Create a runnable eval
+	e := braintrust.NewEval(client, &eval.Eval[string, string]{
 		Name: "classify",
 		Task: eval.T(func(ctx context.Context, input string) (string, error) {
 			return input + "!", nil
@@ -92,11 +92,10 @@ func Example_evalDefinition() {
 			}),
 		},
 		ProjectName: "test-project",
-	}
+	})
 
-	// Run the eval with a specific dataset
-	evaluator := braintrust.NewEvaluator[string, string](client)
-	result, err := evaluator.RunEval(ctx, classify, eval.RunOpts[string, string]{
+	// Run it
+	result, err := e.Run(ctx, eval.RunOpts[string, string]{
 		Dataset: eval.NewDataset([]eval.Case[string, string]{
 			{Input: "hello", Expected: "hello!"},
 		}),
One file renamed without changes.

server/register.go

Lines changed: 5 additions & 1 deletion
@@ -207,8 +207,12 @@ func (r *registeredEvalImpl[I, R]) run(ctx context.Context, cfg *evalRunConfig)
 		}
 	}
 
+	// Create a per-request evaluator with the caller's session, not the
+	// default evaluator on the Eval, so traces are attributed to the user
+	// who triggered the request.
 	evaluator := eval.NewEvaluator[I, R](cfg.auth.session, tp, apiClient, r.projectName())
-	result, evalErr := evaluator.RunEval(evalCtx, r.def, eval.RunOpts[I, R]{
+	e := eval.NewEval(evaluator, r.def)
+	result, evalErr := e.Run(evalCtx, eval.RunOpts[I, R]{
 		Experiment: experimentName,
 		Dataset: dataset,
 		ProjectName: r.projectName(),

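A note on the rebinding above: as the eval/eval.go hunk shows, eval.NewEval mutates the Eval it receives (it sets the unexported evaluator field and returns the same pointer), which is what lets the server swap in a per-request runner for the registered definition. A small sketch of those semantics, with evaluatorA/evaluatorB as placeholder *eval.Evaluator values and def as a placeholder definition:

e1 := eval.NewEval(evaluatorA, def) // def is now bound to evaluatorA
e2 := eval.NewEval(evaluatorB, def) // rebinds: def (and e1) now run via evaluatorB

// e1, e2, and def all point at the same Eval; only the attached runner changed.
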