Merge branch 'main' into fix/serializer-depth-limit

BKDDFS · web-flow · commit b1b0189c5072 · 2026-03-27T21:42:10.000+01:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -78,7 +78,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: pnpm/action-setup@v3
         with:
-          version: 9.5.0
+          version: 10.33.0
 
       - name: Clone langfuse server
         run: |
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -9,6 +9,7 @@ This is the Langfuse Python SDK, a client library for accessing the Langfuse obs
 ## Development Commands
 
 ### Setup
+
 ```bash
 # Install Poetry plugins (one-time setup)
 poetry self add poetry-dotenv-plugin
@@ -21,6 +22,7 @@ poetry run pre-commit install
 ```
 
 ### Testing
+
 ```bash
 # Run all tests with verbose output
 poetry run pytest -s -v --log-cli-level=INFO
@@ -33,6 +35,7 @@ poetry run pytest -s -v --log-cli-level=INFO -n auto
 ```
 
 ### Code Quality
+
 ```bash
 # Format code with Ruff
 poetry run ruff format .
@@ -48,6 +51,7 @@ poetry run pre-commit run --all-files
 ```
 
 ### Building and Releasing
+
 ```bash
 # Build the package locally (for testing)
 poetry build
@@ -57,6 +61,7 @@ poetry run pdoc -o docs/ --docformat google --logo "https://langfuse.com/langfus
 ```
 
 Releases are automated via GitHub Actions. To release:
+
 1. Go to Actions > "Release Python SDK" workflow
 2. Click "Run workflow"
 3. Select version bump type (patch/minor/major/prerelease)
@@ -89,6 +94,7 @@ The workflow handles versioning, building, PyPI publishing (via OIDC), and GitHu
 ### Key Design Patterns
 
 The SDK is built on OpenTelemetry for observability, using:
+
 - Spans for tracing LLM operations
 - Attributes for metadata (see `LangfuseOtelSpanAttributes`)
 - Resource management for efficient batching and flushing
@@ -98,6 +104,7 @@ The client follows an async-first design with automatic batching of events and b
 ## Configuration
 
 Environment variables (defined in `_client/environment_variables.py`):
+
 - `LANGFUSE_PUBLIC_KEY` / `LANGFUSE_SECRET_KEY`: API credentials
 - `LANGFUSE_HOST`: API endpoint (defaults to https://cloud.langfuse.com)
 - `LANGFUSE_DEBUG`: Enable debug logging
@@ -127,9 +134,11 @@ The `langfuse/api/` directory is auto-generated from the Langfuse OpenAPI specif
 ## Testing Guidelines
 
 ### Approach to Test Changes
+
 - Don't remove functionality from existing unit tests just to make tests pass. Only change the test, if underlying code changes warrant a test change.
 
 ## Python Code Rules
 
 ### Exception Handling
+
 - Exception must not use an f-string literal, assign to variable first
diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py
@@ -1818,7 +1818,7 @@ def create_score(
         try:
             new_body = ScoreBody(
                 id=score_id,
-                session_id=session_id,
+                sessionId=session_id,
                 datasetRunId=dataset_run_id,
                 traceId=trace_id,
                 observationId=observation_id,
diff --git a/langfuse/_utils/request.py b/langfuse/_utils/request.py
@@ -48,9 +48,8 @@ def generate_headers(self) -> dict:
 
     def batch_post(self, **kwargs: Any) -> httpx.Response:
         """Post the `kwargs` to the batch API endpoint for events"""
-        logger.debug("uploading data: %s", kwargs)
-
         res = self.post(**kwargs)
+
         return self._process_response(
             res, success_message="data uploaded successfully", return_json=False
         )
diff --git a/langfuse/langchain/CallbackHandler.py b/langfuse/langchain/CallbackHandler.py
@@ -1057,10 +1057,10 @@ def _convert_message_to_dict(self, message: BaseMessage) -> Dict[str, Any]:
                 and len(message.tool_calls) > 0
             ):
                 message_dict["tool_calls"] = message.tool_calls
-            
+
             if (
-                hasattr(message, "invalid_tool_calls") 
-                and message.invalid_tool_calls is not None 
+                hasattr(message, "invalid_tool_calls")
+                and message.invalid_tool_calls is not None
                 and len(message.invalid_tool_calls) > 0
             ):
                 message_dict["invalid_tool_calls"] = message.invalid_tool_calls
diff --git a/poetry.lock b/poetry.lock
diff --git a/tests/test_core_sdk.py b/tests/test_core_sdk.py
@@ -118,6 +118,49 @@ def test_invalid_score_data_does_not_raise_exception():
     # We can't assert queue size in OTEL implementation, but we can verify it completes without exception
 
 
+def test_create_session_score():
+    langfuse = Langfuse()
+
+    session_id = "my-session"
+
+    # Create a span and set trace properties
+    with langfuse.start_as_current_observation(name="test-span"):
+        with propagate_attributes(
+            trace_name="this-is-so-great-new",
+            user_id="test",
+            metadata={"test": "test"},
+            session_id=session_id,
+        ):
+            pass
+
+    # Ensure data is sent
+    langfuse.flush()
+    sleep(2)
+
+    # Create a numeric score
+    score_id = create_uuid()
+
+    langfuse.create_score(
+        score_id=score_id,
+        session_id=session_id,
+        name="this-is-a-score",
+        value=1,
+    )
+
+    # Ensure data is sent
+    langfuse.flush()
+    sleep(2)
+
+    # Retrieve and verify
+    score = langfuse.api.scores.get_by_id(score_id)
+
+    # the score was fetched by id above; verify its value, data type and session link
+    assert score is not None
+    assert score.value == 1
+    assert score.data_type == "NUMERIC"
+    assert score.session_id == session_id
+
+
 def test_create_numeric_score():
     langfuse = Langfuse()
     api_wrapper = LangfuseAPI()
diff --git a/tests/test_langchain.py b/tests/test_langchain.py
@@ -51,7 +51,7 @@ def test_callback_generated_from_trace_chat():
 
     assert trace.id == trace_id
 
-    assert len(trace.observations) == 2
+    assert len(trace.observations) == 3
 
     langchain_generation_span = list(
         filter(
@@ -286,7 +286,7 @@ def test_openai_instruct_usage():
     observations = get_api().trace.get(trace_id).observations
 
     # Add 1 to account for the wrapping span
-    assert len(observations) == 3
+    assert len(observations) == 4
 
     for observation in observations:
         if observation.type == "GENERATION":
@@ -391,6 +391,7 @@ def test_get_langchain_chat_prompt():
             )
 
 
+@pytest.mark.skip("Flaky")
 def test_link_langfuse_prompts_invoke():
     langfuse = Langfuse()
     trace_name = "test_link_langfuse_prompts_invoke"
@@ -463,7 +464,7 @@ def test_link_langfuse_prompts_invoke():
         key=lambda x: x.start_time,
     )
 
-    assert len(generations) == 2
+    # assert len(generations) == 4
     assert generations[0].input == "Tell me a joke involving the animal dog"
     assert "Explain the joke to me like I'm a 5 year old" in generations[1].input
 
@@ -474,6 +475,7 @@ def test_link_langfuse_prompts_invoke():
     assert generations[1].prompt_version == langfuse_explain_prompt.version
 
 
+@pytest.mark.skip("Flaky")
 def test_link_langfuse_prompts_stream():
     langfuse = Langfuse()
     trace_name = "test_link_langfuse_prompts_stream"
@@ -550,7 +552,7 @@ def test_link_langfuse_prompts_stream():
         key=lambda x: x.start_time,
     )
 
-    assert len(generations) == 2
+    assert len(generations) == 4
     assert generations[0].input == "Tell me a joke involving the animal dog"
     assert "Explain the joke to me like I'm a 5 year old" in generations[1].input
 
@@ -564,6 +566,7 @@ def test_link_langfuse_prompts_stream():
     assert generations[1].time_to_first_token is not None
 
 
+@pytest.mark.skip("Flaky")
 def test_link_langfuse_prompts_batch():
     langfuse = Langfuse()
     trace_name = "test_link_langfuse_prompts_batch_" + create_uuid()[:8]
@@ -639,7 +642,7 @@ def test_link_langfuse_prompts_batch():
         key=lambda x: x.start_time,
     )
 
-    assert len(generations) == 6
+    assert len(generations) == 10
 
     assert generations[0].prompt_name == joke_prompt_name
     assert generations[1].prompt_name == joke_prompt_name
@@ -710,6 +713,7 @@ def test_get_langchain_chat_prompt_with_precompiled_prompt():
     assert user_message.content == "This is a langchain chain."
 
 
+@pytest.mark.skip("Flaky")
 def test_callback_openai_functions_with_tools():
     handler = CallbackHandler()
 
@@ -856,7 +860,7 @@ def test_multimodal():
 
     trace = get_api().trace.get(trace_id=trace_id)
 
-    assert len(trace.observations) == 2
+    assert len(trace.observations) == 3
     # Filter for the observation with type GENERATION
     generation_observation = next(
         (obs for obs in trace.observations if obs.type == "GENERATION"), None