// Command dataset-api is an example from the braintrust-sdk-go repository
// (examples/dataset-api/main.go, main branch). It creates a prompt and a small
// question/answer dataset, runs an evaluation over them, and then deletes the
// dataset again.
package main

import (
	"context"
	"fmt"
	"log"

	"go.opentelemetry.io/otel/sdk/trace"

	"github.com/braintrustdata/braintrust-sdk-go"
	"github.com/braintrustdata/braintrust-sdk-go/api"
	"github.com/braintrustdata/braintrust-sdk-go/api/datasets"
	functionsapi "github.com/braintrustdata/braintrust-sdk-go/api/functions"
	"github.com/braintrustdata/braintrust-sdk-go/api/projects"
	"github.com/braintrustdata/braintrust-sdk-go/eval"
)

// Payload types used as the input and output type parameters of the evaluator.
type (
	// QuestionInput is the input of one evaluation case. It is serialized as
	// a JSON object with a single "question" key.
	QuestionInput struct {
		Question string `json:"question"`
	}

	// AnswerOutput is the output (and expected value) of one evaluation
	// case. It is serialized as a JSON object with a single "answer" key.
	AnswerOutput struct {
		Answer string `json:"answer"`
	}
)

// main runs the example and exits with a non-zero status if any step fails.
//
// The actual work lives in run: log.Fatal terminates the process via os.Exit,
// which skips deferred functions, so all cleanup has to be deferred in a
// function that returns before log.Fatal is reached.
func main() {
	if err := run(context.Background()); err != nil {
		log.Fatal(err)
	}
}

// run executes the example end to end: it creates a prompt and a dataset,
// evaluates the prompt against the dataset with an exact-match scorer, and
// prints the evaluation result.
//
// Cleanup is deferred, so it also happens when a later step fails: the dataset
// is deleted (best effort) once it has been created, and the tracer provider
// is shut down last.
//
// It returns the first error encountered, wrapped with the step that failed.
func run(ctx context.Context) error {
	// Create tracer provider
	tp := trace.NewTracerProvider()
	defer func() {
		if err := tp.Shutdown(ctx); err != nil {
			log.Printf("Error shutting down tracer provider: %v", err)
		}
	}()

	client, err := braintrust.New(tp,
		braintrust.WithProject("go-sdk-examples"),
	)
	if err != nil {
		return fmt.Errorf("failed to create client: %w", err)
	}

	// Get API client for prompt and dataset operations
	apiClient := client.API()

	// Step 1: Create a prompt function for answering questions
	const promptSlug = "qa-answer-prompt"
	if err := createPrompt(ctx, apiClient, promptSlug); err != nil {
		return fmt.Errorf("prompt setup: %w", err)
	}

	// Step 2: Create a dataset with some test data
	datasetID, err := createDataset(ctx, apiClient)
	if err != nil {
		return fmt.Errorf("dataset setup: %w", err)
	}

	// Step 4 (deferred): Cleanup - delete the test dataset, whether or not
	// the evaluation below succeeds.
	defer func() {
		if err := apiClient.Datasets().Delete(ctx, datasetID); err != nil {
			// Note: Dataset deletion may fail due to permissions or timing
			// The dataset can be manually deleted from the Braintrust UI if needed
			log.Printf("Note: Dataset cleanup skipped (this is normal): %v\n", err)
		}
	}()

	// Step 3: Run an evaluation using the dataset and prompt
	evaluator := braintrust.NewEvaluator[QuestionInput, AnswerOutput](client)

	// Load dataset using the new DatasetAPI
	cases, err := evaluator.Datasets().Get(ctx, datasetID)
	if err != nil {
		return fmt.Errorf("failed to load dataset: %w", err)
	}

	// Load the prompt as a task
	task, err := evaluator.Functions().Task(ctx, eval.FunctionOpts{Slug: promptSlug})
	if err != nil {
		return fmt.Errorf("failed to load prompt: %w", err)
	}

	// Define an exact match scorer: 1.0 when the produced answer equals the
	// expected answer, 0.0 otherwise.
	exactMatchScorer := eval.NewScorer("exact_match", func(ctx context.Context, result eval.TaskResult[QuestionInput, AnswerOutput]) (eval.Scores, error) {
		if result.Expected.Answer == result.Output.Answer {
			return eval.S(1.0), nil
		}
		return eval.S(0.0), nil
	})

	// Run the evaluation
	result, err := evaluator.Run(ctx, eval.Opts[QuestionInput, AnswerOutput]{
		Experiment: "qa-dataset-example",
		Dataset:    cases,
		Task:       task,
		Scorers:    []eval.Scorer[QuestionInput, AnswerOutput]{exactMatchScorer},
	})
	if err != nil {
		return fmt.Errorf("failed to run evaluation: %w", err)
	}

	fmt.Printf("Evaluation complete! View results at: %s\n", result)
	return nil
}

// createPrompt creates a prompt function, identified by slug, in the
// "go-sdk-examples" project.
//
// If a function with the same slug is found in that project it is deleted
// first. Both the lookup and the delete are best effort: a failure is logged
// and creation is still attempted.
//
// The prompt is a two-message chat prompt (a system instruction asking for a
// JSON object with an "answer" key, and a user message templated from
// {{input.question}}) with model options requesting JSON output.
//
// It returns an error if the project or the prompt cannot be created.
func createPrompt(ctx context.Context, apiClient *api.API, slug string) error {
	// First, get or create the project
	project, err := apiClient.Projects().Create(ctx, projects.CreateParams{
		Name: "go-sdk-examples",
	})
	if err != nil {
		return fmt.Errorf("failed to create project: %w", err)
	}

	// Check if the prompt already exists and delete it. Failures here are
	// not fatal, but they are logged so that a subsequent Create error can be
	// traced back to them.
	functions := apiClient.Functions()
	existing, err := functions.Query(ctx, functionsapi.QueryParams{
		ProjectName: "go-sdk-examples",
		Slug:        slug,
		Limit:       1,
	})
	if err != nil {
		log.Printf("Warning: could not check for existing prompt %q: %v", slug, err)
	}
	if len(existing) > 0 {
		if err := functions.Delete(ctx, existing[0].ID); err != nil {
			log.Printf("Warning: could not delete existing prompt %q: %v", slug, err)
		}
	}

	// Create a prompt that answers questions
	// The prompt will receive the question as input and should return an answer
	_, err = functions.Create(ctx, functionsapi.CreateParams{
		ProjectID: project.ID,
		Name:      "QA Answer Prompt",
		Slug:      slug,
		FunctionData: map[string]any{
			"type": "prompt",
		},
		PromptData: map[string]any{
			"prompt": map[string]any{
				"type": "chat",
				"messages": []map[string]any{
					{
						"role":    "system",
						"content": "You are a helpful assistant that answers questions accurately and concisely. You must respond with valid JSON in the format: {\"answer\": \"your answer here\"}",
					},
					{
						"role":    "user",
						"content": "{{input.question}}",
					},
				},
			},
			"options": map[string]any{
				"model": "gpt-4o-mini",
				"params": map[string]any{
					"temperature":     0,
					"max_tokens":      50,
					"response_format": map[string]any{"type": "json_object"},
				},
			},
		},
	})
	if err != nil {
		return fmt.Errorf("failed to create prompt: %w", err)
	}

	return nil
}

// createDataset creates the "qa-test-dataset" dataset in the
// "go-sdk-examples" project, inserts four question/answer events into it, and
// returns the dataset ID.
//
// If inserting the events fails, the dataset is deleted again (best effort) so
// that it is not left behind with no way for the caller to clean it up; a
// failure of that delete is reported in the returned error.
//
// On any error the returned ID is empty.
func createDataset(ctx context.Context, apiClient *api.API) (string, error) {
	// First, get or create the project
	project, err := apiClient.Projects().Create(ctx, projects.CreateParams{
		Name: "go-sdk-examples",
	})
	if err != nil {
		return "", fmt.Errorf("failed to create project: %w", err)
	}

	// Create the dataset
	dataset, err := apiClient.Datasets().Create(ctx, datasets.CreateParams{
		ProjectID:   project.ID,
		Name:        "qa-test-dataset",
		Description: "Test dataset for DatasetAPI example",
	})
	if err != nil {
		return "", fmt.Errorf("failed to create dataset: %w", err)
	}

	// qa builds one dataset event whose input and expected value use the
	// same JSON keys as QuestionInput and AnswerOutput.
	qa := func(question, answer string, tags ...string) datasets.Event {
		return datasets.Event{
			Input:    map[string]any{"question": question},
			Expected: map[string]any{"answer": answer},
			Tags:     tags,
		}
	}

	// Insert test data
	events := []datasets.Event{
		qa("What is 2 + 2?", "4", "math", "easy"),
		qa("What is the capital of France?", "Paris", "geography", "easy"),
		qa("What is the square root of 144?", "12", "math", "medium"),
		qa("Who wrote Romeo and Juliet?", "William Shakespeare", "literature", "easy"),
	}

	if err := apiClient.Datasets().InsertEvents(ctx, dataset.ID, events); err != nil {
		// The ID is not returned on this path, so remove the dataset here
		// rather than leaking it.
		if delErr := apiClient.Datasets().Delete(ctx, dataset.ID); delErr != nil {
			return "", fmt.Errorf("failed to insert events: %w (cleanup of dataset %s also failed: %v)", err, dataset.ID, delErr)
		}
		return "", fmt.Errorf("failed to insert events: %w", err)
	}

	return dataset.ID, nil
}