Skip to content

Commit c1c3e79

Browse files
authored
[codex] Fix local experiment ID stability (#770)
1 parent 3d7c98b commit c1c3e79

4 files changed

Lines changed: 31 additions & 7 deletions

File tree

packages/client/src/experiment/ExperimentManager.ts

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,7 @@ export class ExperimentManager {
119119
* @param config.maxConcurrency - Maximum number of concurrent task executions (default: 50)
120120
*
121121
* @returns Promise that resolves to experiment results including:
122+
* - experimentId: Stable identifier for the experiment execution
122123
* - runName: The experiment run name (either provided or generated)
123124
* - itemResults: Results for each processed data item
124125
* - runEvaluations: Results from run-level evaluators
@@ -193,6 +194,7 @@ export class ExperimentManager {
193194
name,
194195
runName: providedRunName,
195196
});
197+
const fallbackExperimentId = await createExperimentId();
196198

197199
if (!this.isOtelRegistered()) {
198200
this.logger.warn(
@@ -217,6 +219,7 @@ export class ExperimentManager {
217219
experimentRunName: runName,
218220
experimentDescription: description,
219221
experimentMetadata: metadata,
222+
fallbackExperimentId,
220223
datasetVersion: config.datasetVersion,
221224
});
222225
});
@@ -244,8 +247,10 @@ export class ExperimentManager {
244247
}
245248

246249
// Get dataset run URL
247-
const datasetRunId =
248-
itemResults.length > 0 ? itemResults[0].datasetRunId : undefined;
250+
const datasetRunId = itemResults.find(
251+
(item) => item.datasetRunId,
252+
)?.datasetRunId;
253+
const experimentId = datasetRunId || fallbackExperimentId;
249254

250255
let datasetRunUrl = undefined;
251256
if (datasetRunId && data.length > 0 && "datasetId" in data[0]) {
@@ -294,6 +299,7 @@ export class ExperimentManager {
294299
await this.langfuseClient.score.flush();
295300

296301
return {
302+
experimentId,
297303
runName,
298304
itemResults,
299305
datasetRunId,
@@ -355,6 +361,7 @@ export class ExperimentManager {
355361
ExpectedOutput,
356362
Metadata
357363
>["metadata"];
364+
fallbackExperimentId: string;
358365
item: ExperimentParams<Input, ExpectedOutput, Metadata>["data"][0];
359366
task: ExperimentTask<Input, ExpectedOutput, Metadata>;
360367
evaluators?: Evaluator<Input, ExpectedOutput, Metadata>[];
@@ -405,7 +412,7 @@ export class ExperimentManager {
405412
// Generate IDs
406413
const experimentItemId =
407414
datasetItemId || (await createExperimentItemId(input));
408-
const experimentId = datasetRunId || (await createExperimentId());
415+
const experimentId = datasetRunId || params.fallbackExperimentId;
409416

410417
// Set non-propagated experiment attributes directly on root span
411418
const rootSpanAttributes: Record<string, string> = {

packages/client/src/experiment/types.ts

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -306,6 +306,8 @@ export type ExperimentItemResult<
306306
* ```typescript
307307
* const result = await langfuse.experiment.run(config);
308308
*
309+
* console.log(`Experiment ID: ${result.experimentId}`);
310+
*
309311
* // Access individual results
310312
* console.log(`Processed ${result.itemResults.length} items`);
311313
*
@@ -332,6 +334,15 @@ export type ExperimentResult<
332334
ExpectedOutput = any,
333335
Metadata extends Record<string, any> = Record<string, any>,
334336
> = {
337+
/**
338+
* Stable identifier for this experiment execution.
339+
*
340+
* For Langfuse datasets, this is the dataset run ID when available.
341+
* For local data, this is a generated fallback ID shared across all items
342+
* in the run.
343+
*/
344+
experimentId: string;
345+
335346
/**
336347
* The experiment run name.
337348
*

tests/e2e/experiments.e2e.test.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -155,6 +155,7 @@ describe("Langfuse Datasets E2E", () => {
155155
expect(result.runName).toMatch(
156156
/^Euro capitals - \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
157157
);
158+
expect(result.experimentId).toMatch(/^[0-9a-f]{16}$/);
158159
// No datasetRunId for local datasets
159160
expect(result.datasetRunId).toBeUndefined();
160161

@@ -233,6 +234,7 @@ describe("Langfuse Datasets E2E", () => {
233234
/^Euro capitals on LF dataset - \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/,
234235
);
235236
expect(result.datasetRunId).toBeDefined();
237+
expect(result.experimentId).toBe(result.datasetRunId);
236238

237239
// Validate item results structure
238240
result.itemResults.forEach((itemResult, index) => {
@@ -333,6 +335,7 @@ describe("Langfuse Datasets E2E", () => {
333335
// Should use the custom run name exactly
334336
expect(result.runName).toBe(customRunName);
335337
expect(result.datasetRunId).toBeDefined();
338+
expect(result.experimentId).toBe(result.datasetRunId);
336339

337340
// Fetch dataset run and verify it has the custom name
338341
const datasetRun = await langfuse.api.datasets.getRun(
@@ -365,6 +368,7 @@ describe("Langfuse Datasets E2E", () => {
365368

366369
// Should use the custom run name exactly
367370
expect(result.runName).toBe(customRunName);
371+
expect(result.experimentId).toMatch(/^[0-9a-f]{16}$/);
368372
expect(result.itemResults).toHaveLength(2);
369373
// No dataset run for local datasets
370374
expect(result.datasetRunId).toBeUndefined();
@@ -483,6 +487,7 @@ describe("Langfuse Datasets E2E", () => {
483487
await waitForServerIngestion(500);
484488

485489
expect(result.itemResults).toHaveLength(0);
490+
expect(result.experimentId).toMatch(/^[0-9a-f]{16}$/);
486491
expect(result.runEvaluations).toHaveLength(1); // Run evaluators will still execute with empty data
487492
expect(await result.format()).toContain("No experiment results");
488493
});

tests/integration/experiment-propagation.integration.test.ts

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -251,7 +251,7 @@ describe("Experiment Attribute Propagation", () => {
251251
});
252252

253253
describe("Multiple Experiment Items", () => {
254-
it("should not leak attributes between experiment items", async () => {
254+
it("should share experiment ID across local items while keeping item IDs distinct", async () => {
255255
const items = [
256256
{ input: "item1", metadata: { index: "1" } },
257257
{ input: "item2", metadata: { index: "2" } },
@@ -260,7 +260,7 @@ describe("Experiment Attribute Propagation", () => {
260260
const experimentIds: string[] = [];
261261
const itemIds: string[] = [];
262262

263-
await langfuse.experiment.run({
263+
const result = await langfuse.experiment.run({
264264
name: "no-leakage-test",
265265
data: items,
266266
task: async (item) => {
@@ -285,8 +285,9 @@ describe("Experiment Attribute Propagation", () => {
285285
// Each item should have different item IDs
286286
expect(itemIds[0]).not.toBe(itemIds[1]);
287287

288-
// Each item should have different experiment IDs (randomly generated)
289-
expect(experimentIds[0]).not.toBe(experimentIds[1]);
288+
// All local items should share the same experiment ID
289+
expect(experimentIds[0]).toBe(experimentIds[1]);
290+
expect(result.experimentId).toBe(experimentIds[0]);
290291
});
291292
});
292293

0 commit comments

Comments
 (0)