Skip to content

Commit 321b40a

Browse files
authored
feat(scores): support text scores in v3 (#1661)
* feat(scores): support text scores in v3
  Backport TEXT score typing and generated API models to v3-stable.
* ci: update pnpm action setup
* test: fix v3 stable CI failures
  Backport dataset archived-item assertions and update flaky live test expectations.
* test: address v3 review feedback
  Update generated export ordering and use the requested audio model.
* test: tolerate omitted text score value
  Allow the TEXT score response to omit null numeric values while asserting stringValue.
1 parent e4353f3 commit 321b40a

24 files changed

Lines changed: 565 additions & 49 deletions

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@ jobs:
7575
name: Test on Python version ${{ matrix.python-version }}
7676
steps:
7777
- uses: actions/checkout@v3
78-
- uses: pnpm/action-setup@v3
78+
- uses: pnpm/action-setup@739bfe42ca9233c5e6aca07c1a25a9d34aca49b0 # v6.0.7
7979
with:
80-
version: 9.5.0
80+
version: 11.1.3
8181

8282
- name: Clone langfuse server
8383
run: |

langfuse/_client/client.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class Langfuse:
201201
cost_details={"total_cost": 0.0023}
202202
)
203203
204-
# Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
204+
# Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL, TEXT)
205205
generation.score(name="relevance", value=0.95, data_type="NUMERIC")
206206
```
207207
"""
@@ -1992,7 +1992,7 @@ def create_score(
19921992
trace_id: Optional[str] = None,
19931993
score_id: Optional[str] = None,
19941994
observation_id: Optional[str] = None,
1995-
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
1995+
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
19961996
comment: Optional[str] = None,
19971997
config_id: Optional[str] = None,
19981998
metadata: Optional[Any] = None,
@@ -2022,13 +2022,13 @@ def create_score(
20222022
20232023
Args:
20242024
name: Name of the score (e.g., "relevance", "accuracy")
2025-
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2025+
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
20262026
session_id: ID of the Langfuse session to associate the score with
20272027
dataset_run_id: ID of the Langfuse dataset run to associate the score with
20282028
trace_id: ID of the Langfuse trace to associate the score with
20292029
observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
20302030
score_id: Optional custom ID for the score (auto-generated if not provided)
2031-
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2031+
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
20322032
comment: Optional comment or explanation for the score
20332033
config_id: Optional ID of a score config defined in Langfuse
20342034
metadata: Optional metadata to be attached to the score
@@ -2152,7 +2152,7 @@ def score_current_span(
21522152
name: str,
21532153
value: str,
21542154
score_id: Optional[str] = None,
2155-
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2155+
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
21562156
comment: Optional[str] = None,
21572157
config_id: Optional[str] = None,
21582158
metadata: Optional[Any] = None,
@@ -2176,9 +2176,9 @@ def score_current_span(
21762176
21772177
Args:
21782178
name: Name of the score (e.g., "relevance", "accuracy")
2179-
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2179+
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
21802180
score_id: Optional custom ID for the score (auto-generated if not provided)
2181-
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2181+
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
21822182
comment: Optional comment or explanation for the score
21832183
config_id: Optional ID of a score config defined in Langfuse
21842184
metadata: Optional metadata to be attached to the score
@@ -2216,7 +2216,7 @@ def score_current_span(
22162216
name=name,
22172217
value=cast(str, value),
22182218
score_id=score_id,
2219-
data_type=cast(Literal["CATEGORICAL"], data_type),
2219+
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
22202220
comment=comment,
22212221
config_id=config_id,
22222222
metadata=metadata,
@@ -2242,7 +2242,7 @@ def score_current_trace(
22422242
name: str,
22432243
value: str,
22442244
score_id: Optional[str] = None,
2245-
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2245+
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
22462246
comment: Optional[str] = None,
22472247
config_id: Optional[str] = None,
22482248
metadata: Optional[Any] = None,
@@ -2267,9 +2267,9 @@ def score_current_trace(
22672267
22682268
Args:
22692269
name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2270-
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2270+
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
22712271
score_id: Optional custom ID for the score (auto-generated if not provided)
2272-
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2272+
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
22732273
comment: Optional comment or explanation for the score
22742274
config_id: Optional ID of a score config defined in Langfuse
22752275
metadata: Optional metadata to be attached to the score
@@ -2305,7 +2305,7 @@ def score_current_trace(
23052305
name=name,
23062306
value=cast(str, value),
23072307
score_id=score_id,
2308-
data_type=cast(Literal["CATEGORICAL"], data_type),
2308+
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
23092309
comment=comment,
23102310
config_id=config_id,
23112311
metadata=metadata,

langfuse/_client/span.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ def score(
287287
name: str,
288288
value: str,
289289
score_id: Optional[str] = None,
290-
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
290+
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
291291
comment: Optional[str] = None,
292292
config_id: Optional[str] = None,
293293
timestamp: Optional[datetime] = None,
@@ -313,9 +313,9 @@ def score(
313313
314314
Args:
315315
name: Name of the score (e.g., "relevance", "accuracy")
316-
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
316+
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
317317
score_id: Optional custom ID for the score (auto-generated if not provided)
318-
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
318+
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
319319
comment: Optional comment or explanation for the score
320320
config_id: Optional ID of a score config defined in Langfuse
321321
timestamp: Optional timestamp for the score (defaults to current UTC time)
@@ -342,7 +342,7 @@ def score(
342342
trace_id=self.trace_id,
343343
observation_id=self.id,
344344
score_id=score_id,
345-
data_type=cast(Literal["CATEGORICAL"], data_type),
345+
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
346346
comment=comment,
347347
config_id=config_id,
348348
timestamp=timestamp,
@@ -370,7 +370,7 @@ def score_trace(
370370
name: str,
371371
value: str,
372372
score_id: Optional[str] = None,
373-
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
373+
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
374374
comment: Optional[str] = None,
375375
config_id: Optional[str] = None,
376376
timestamp: Optional[datetime] = None,
@@ -397,9 +397,9 @@ def score_trace(
397397
398398
Args:
399399
name: Name of the score (e.g., "user_satisfaction", "overall_quality")
400-
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
400+
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
401401
score_id: Optional custom ID for the score (auto-generated if not provided)
402-
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
402+
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
403403
comment: Optional comment or explanation for the score
404404
config_id: Optional ID of a score config defined in Langfuse
405405
timestamp: Optional timestamp for the score (defaults to current UTC time)
@@ -425,7 +425,7 @@ def score_trace(
425425
value=cast(str, value),
426426
trace_id=self.trace_id,
427427
score_id=score_id,
428-
data_type=cast(Literal["CATEGORICAL"], data_type),
428+
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
429429
comment=comment,
430430
config_id=config_id,
431431
timestamp=timestamp,

langfuse/api/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,12 @@
8888
GetScoresResponseDataCategorical,
8989
GetScoresResponseDataCorrection,
9090
GetScoresResponseDataNumeric,
91+
GetScoresResponseDataText,
9192
GetScoresResponseData_Boolean,
9293
GetScoresResponseData_Categorical,
9394
GetScoresResponseData_Correction,
9495
GetScoresResponseData_Numeric,
96+
GetScoresResponseData_Text,
9597
GetScoresResponseTraceData,
9698
HealthResponse,
9799
IngestionError,
@@ -200,10 +202,12 @@
200202
ScoreV1_Boolean,
201203
ScoreV1_Categorical,
202204
ScoreV1_Numeric,
205+
ScoreV1_Text,
203206
Score_Boolean,
204207
Score_Categorical,
205208
Score_Correction,
206209
Score_Numeric,
210+
Score_Text,
207211
SdkLogBody,
208212
SdkLogEvent,
209213
ServiceProviderConfig,
@@ -212,6 +216,8 @@
212216
SessionWithTraces,
213217
Sort,
214218
TextPrompt,
219+
TextScore,
220+
TextScoreV1,
215221
Trace,
216222
TraceBody,
217223
TraceEvent,
@@ -349,10 +355,12 @@
349355
"GetScoresResponseDataCategorical",
350356
"GetScoresResponseDataCorrection",
351357
"GetScoresResponseDataNumeric",
358+
"GetScoresResponseDataText",
352359
"GetScoresResponseData_Boolean",
353360
"GetScoresResponseData_Categorical",
354361
"GetScoresResponseData_Correction",
355362
"GetScoresResponseData_Numeric",
363+
"GetScoresResponseData_Text",
356364
"GetScoresResponseTraceData",
357365
"HealthResponse",
358366
"IngestionError",
@@ -461,10 +469,12 @@
461469
"ScoreV1_Boolean",
462470
"ScoreV1_Categorical",
463471
"ScoreV1_Numeric",
472+
"ScoreV1_Text",
464473
"Score_Boolean",
465474
"Score_Categorical",
466475
"Score_Correction",
467476
"Score_Numeric",
477+
"Score_Text",
468478
"SdkLogBody",
469479
"SdkLogEvent",
470480
"ServiceProviderConfig",
@@ -473,6 +483,8 @@
473483
"SessionWithTraces",
474484
"Sort",
475485
"TextPrompt",
486+
"TextScore",
487+
"TextScoreV1",
476488
"Trace",
477489
"TraceBody",
478490
"TraceEvent",

langfuse/api/resources/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,16 @@
100100
ScoreV1_Boolean,
101101
ScoreV1_Categorical,
102102
ScoreV1_Numeric,
103+
ScoreV1_Text,
103104
Score_Boolean,
104105
Score_Categorical,
105106
Score_Correction,
106107
Score_Numeric,
108+
Score_Text,
107109
Session,
108110
SessionWithTraces,
111+
TextScore,
112+
TextScoreV1,
109113
Trace,
110114
TraceWithDetails,
111115
TraceWithFullDetails,
@@ -272,10 +276,12 @@
272276
GetScoresResponseDataCategorical,
273277
GetScoresResponseDataCorrection,
274278
GetScoresResponseDataNumeric,
279+
GetScoresResponseDataText,
275280
GetScoresResponseData_Boolean,
276281
GetScoresResponseData_Categorical,
277282
GetScoresResponseData_Correction,
278283
GetScoresResponseData_Numeric,
284+
GetScoresResponseData_Text,
279285
GetScoresResponseTraceData,
280286
)
281287
from .sessions import PaginatedSessions
@@ -369,10 +375,12 @@
369375
"GetScoresResponseDataCategorical",
370376
"GetScoresResponseDataCorrection",
371377
"GetScoresResponseDataNumeric",
378+
"GetScoresResponseDataText",
372379
"GetScoresResponseData_Boolean",
373380
"GetScoresResponseData_Categorical",
374381
"GetScoresResponseData_Correction",
375382
"GetScoresResponseData_Numeric",
383+
"GetScoresResponseData_Text",
376384
"GetScoresResponseTraceData",
377385
"HealthResponse",
378386
"IngestionError",
@@ -481,10 +489,12 @@
481489
"ScoreV1_Boolean",
482490
"ScoreV1_Categorical",
483491
"ScoreV1_Numeric",
492+
"ScoreV1_Text",
484493
"Score_Boolean",
485494
"Score_Categorical",
486495
"Score_Correction",
487496
"Score_Numeric",
497+
"Score_Text",
488498
"SdkLogBody",
489499
"SdkLogEvent",
490500
"ServiceProviderConfig",
@@ -493,6 +503,8 @@
493503
"SessionWithTraces",
494504
"Sort",
495505
"TextPrompt",
506+
"TextScore",
507+
"TextScoreV1",
496508
"Trace",
497509
"TraceBody",
498510
"TraceEvent",

langfuse/api/resources/commons/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,16 @@
4040
ScoreV1_Boolean,
4141
ScoreV1_Categorical,
4242
ScoreV1_Numeric,
43+
ScoreV1_Text,
4344
Score_Boolean,
4445
Score_Categorical,
4546
Score_Correction,
4647
Score_Numeric,
48+
Score_Text,
4749
Session,
4850
SessionWithTraces,
51+
TextScore,
52+
TextScoreV1,
4953
Trace,
5054
TraceWithDetails,
5155
TraceWithFullDetails,
@@ -103,12 +107,16 @@
103107
"ScoreV1_Boolean",
104108
"ScoreV1_Categorical",
105109
"ScoreV1_Numeric",
110+
"ScoreV1_Text",
106111
"Score_Boolean",
107112
"Score_Categorical",
108113
"Score_Correction",
109114
"Score_Numeric",
115+
"Score_Text",
110116
"Session",
111117
"SessionWithTraces",
118+
"TextScore",
119+
"TextScoreV1",
112120
"Trace",
113121
"TraceWithDetails",
114122
"TraceWithFullDetails",

langfuse/api/resources/commons/types/__init__.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,23 @@
3636
Score_Categorical,
3737
Score_Correction,
3838
Score_Numeric,
39+
Score_Text,
3940
)
4041
from .score_config import ScoreConfig
4142
from .score_config_data_type import ScoreConfigDataType
4243
from .score_data_type import ScoreDataType
4344
from .score_source import ScoreSource
44-
from .score_v_1 import ScoreV1, ScoreV1_Boolean, ScoreV1_Categorical, ScoreV1_Numeric
45+
from .score_v_1 import (
46+
ScoreV1,
47+
ScoreV1_Boolean,
48+
ScoreV1_Categorical,
49+
ScoreV1_Numeric,
50+
ScoreV1_Text,
51+
)
4552
from .session import Session
4653
from .session_with_traces import SessionWithTraces
54+
from .text_score import TextScore
55+
from .text_score_v_1 import TextScoreV1
4756
from .trace import Trace
4857
from .trace_with_details import TraceWithDetails
4958
from .trace_with_full_details import TraceWithFullDetails
@@ -89,12 +98,16 @@
8998
"ScoreV1_Boolean",
9099
"ScoreV1_Categorical",
91100
"ScoreV1_Numeric",
101+
"ScoreV1_Text",
92102
"Score_Boolean",
93103
"Score_Categorical",
94104
"Score_Correction",
95105
"Score_Numeric",
106+
"Score_Text",
96107
"Session",
97108
"SessionWithTraces",
109+
"TextScore",
110+
"TextScoreV1",
98111
"Trace",
99112
"TraceWithDetails",
100113
"TraceWithFullDetails",

0 commit comments

Comments
 (0)