feat(scores): add TEXT type to score overloads and docstrings

wochinge · claude · wochinge · commit a9d96976ae0c · 2026-04-07T16:57:06.000+02:00
Extend string-value overloads in create_score, score_current_span,
score_current_trace, score, and score_trace to accept TEXT alongside
CATEGORICAL. Update all related docstrings. Add ExperimentScoreType
to exclude TEXT from experiments/evals. Add integration test for TEXT
scores.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py
@@ -1747,7 +1747,7 @@ def create_score(
         trace_id: Optional[str] = None,
         score_id: Optional[str] = None,
         observation_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         metadata: Optional[Any] = None,
@@ -1777,13 +1777,13 @@ def create_score(
 
         Args:
             name: Name of the score (e.g., "relevance", "accuracy")
-            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
+            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
             session_id: ID of the Langfuse session to associate the score with
             dataset_run_id: ID of the Langfuse dataset run to associate the score with
             trace_id: ID of the Langfuse trace to associate the score with
             observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             metadata: Optional metadata to be attached to the score
@@ -1907,7 +1907,7 @@ def score_current_span(
         name: str,
         value: str,
         score_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         metadata: Optional[Any] = None,
@@ -1931,9 +1931,9 @@ def score_current_span(
 
         Args:
             name: Name of the score (e.g., "relevance", "accuracy")
-            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
+            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             metadata: Optional metadata to be attached to the score
@@ -1997,7 +1997,7 @@ def score_current_trace(
         name: str,
         value: str,
         score_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         metadata: Optional[Any] = None,
@@ -2022,9 +2022,9 @@ def score_current_trace(
 
         Args:
             name: Name of the score (e.g., "user_satisfaction", "overall_quality")
-            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
+            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             metadata: Optional metadata to be attached to the score
diff --git a/langfuse/_client/span.py b/langfuse/_client/span.py
@@ -308,7 +308,7 @@ def score(
         value: str,
         score_id: Optional[str] = None,
         data_type: Optional[
-            Literal[ScoreDataType.CATEGORICAL]
+            Literal[ScoreDataType.CATEGORICAL, ScoreDataType.TEXT]
         ] = ScoreDataType.CATEGORICAL,
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
@@ -335,9 +335,9 @@ def score(
 
         Args:
             name: Name of the score (e.g., "relevance", "accuracy")
-            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
+            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             timestamp: Optional timestamp for the score (defaults to current UTC time)
@@ -395,7 +395,7 @@ def score_trace(
         value: str,
         score_id: Optional[str] = None,
         data_type: Optional[
-            Literal[ScoreDataType.CATEGORICAL]
+            Literal[ScoreDataType.CATEGORICAL, ScoreDataType.TEXT]
         ] = ScoreDataType.CATEGORICAL,
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
@@ -423,9 +423,9 @@ def score_trace(
 
         Args:
             name: Name of the score (e.g., "user_satisfaction", "overall_quality")
-            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
+            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             timestamp: Optional timestamp for the score (defaults to current UTC time)
diff --git a/langfuse/experiment.py b/langfuse/experiment.py
@@ -17,8 +17,9 @@
     Union,
 )
 
-from langfuse.api import DatasetItem, ScoreDataType
+from langfuse.api import DatasetItem
 from langfuse.logger import langfuse_logger as logger
+from langfuse.types import ExperimentScoreType
 
 
 class LocalExperimentItem(TypedDict, total=False):
@@ -184,7 +185,7 @@ def __init__(
         value: Union[int, float, str, bool],
         comment: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-        data_type: Optional[ScoreDataType] = None,
+        data_type: Optional[ExperimentScoreType] = None,
         config_id: Optional[str] = None,
     ):
         """Initialize an Evaluation with the provided data.
diff --git a/langfuse/types.py b/langfuse/types.py
@@ -35,7 +35,10 @@ def my_evaluator(*, output: str, **kwargs) -> Evaluation:
 
 SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]
 
-ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]
+ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN", "TEXT"]
+
+# Text scores are not supported for evals and experiments
+ExperimentScoreType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]
 
 
 class MaskFunction(Protocol):
diff --git a/tests/test_core_sdk.py b/tests/test_core_sdk.py
@@ -321,6 +321,64 @@ def test_create_categorical_score():
     assert created_score["stringValue"] == "high score"
 
 
+def test_create_text_score():
+    langfuse = Langfuse()
+    api_wrapper = LangfuseAPI()
+
+    # Open a span and pass trace-level attributes (name, user, metadata) via propagate_attributes
+    with langfuse.start_as_current_observation(name="test-span") as span:
+        with propagate_attributes(
+            trace_name="this-is-so-great-new",
+            user_id="test",
+            metadata={"test": "test"},
+        ):
+            # Remember the trace ID so the score and generation below can target this trace
+            trace_id = span.trace_id
+
+    # Flush the client, then wait (NOTE(review): presumably to give the server time to ingest)
+    langfuse.flush()
+    sleep(2)
+
+    # Create a TEXT score with a caller-chosen ID, attached to the trace by trace_id
+    score_id = create_uuid()
+    langfuse.create_score(
+        score_id=score_id,
+        trace_id=trace_id,
+        name="this-is-a-score",
+        value="This is a detailed text evaluation of the output quality.",
+        data_type="TEXT",
+    )
+
+    # Create a generation in the same trace (linked through trace_context)
+    generation = langfuse.start_observation(
+        as_type="generation",
+        name="yet another child",
+        metadata="test",
+        trace_context={"trace_id": trace_id},
+    )
+    generation.end()
+
+    # Flush again and wait before reading the trace back
+    langfuse.flush()
+    sleep(2)
+
+    # Fetch the trace through the API wrapper and verify the stored score
+    trace = api_wrapper.get_trace(trace_id)
+
+    # Find the score we created by name (None if it is missing, so the assert below reports it)
+    created_score = next(
+        (s for s in trace["scores"] if s["name"] == "this-is-a-score"), None
+    )
+    assert created_score is not None, "Score not found in trace"
+    assert created_score["id"] == score_id
+    assert created_score["dataType"] == "TEXT"
+    assert created_score["value"] is None  # the text is expected in stringValue, not value
+    assert (
+        created_score["stringValue"]
+        == "This is a detailed text evaluation of the output quality."
+    )
+
+
 def test_create_score_with_custom_timestamp():
     langfuse = Langfuse()
     api_wrapper = LangfuseAPI()