LEADS-240: Token usage should be 0 for a re-run with successful cache (lightspeed-core#176)

xmican10 · web-flow · commit 1e16a7fa9bed · 2026-03-02T21:07:50.000+05:30
* Add 0 JudgeLLM/API tokens on a cache hit; add unit tests for these scenarios

* Resolving rebase errors

* Resolving pylint disable check
diff --git a/src/lightspeed_evaluation/core/api/client.py b/src/lightspeed_evaluation/core/api/client.py
@@ -321,7 +321,14 @@ def _get_cached_response(self, request: APIRequest) -> APIResponse | None:
         if self.cache is None:
             raise RuntimeError("cache is None, but used")
         key = self._get_cache_key(request)
-        return cast(APIResponse | None, self.cache.get(key))
+        cached_response = cast(APIResponse | None, self.cache.get(key))
+
+        # Zero out token counts for cached responses since no API call was made
+        if cached_response is not None:
+            cached_response.input_tokens = 0
+            cached_response.output_tokens = 0
+
+        return cached_response
 
     def close(self) -> None:
         """Close API client."""
diff --git a/src/lightspeed_evaluation/core/llm/custom.py b/src/lightspeed_evaluation/core/llm/custom.py
@@ -145,17 +145,10 @@ def call(
             **kwargs,
         }
 
+        response = None
         try:
             response = litellm.completion(**call_params)
 
-            # Direct token extraction - capture tokens synchronously from response
-            tracker = TokenTracker.get_active()
-            if tracker and hasattr(response, "usage") and response.usage:
-                tracker.add_tokens(
-                    getattr(response.usage, "prompt_tokens", 0),
-                    getattr(response.usage, "completion_tokens", 0),
-                )
-
             # Extract content from all choices
             results = []
             for choice in response.choices:  # type: ignore
@@ -185,3 +178,24 @@ def call(
 
         except Exception as e:
             raise LLMError(f"LLM call failed: {str(e)}") from e
+
+        finally:
+            # Track tokens if a response arrived, even when processing it failed
+            self._track_tokens(response)
+
+    def _track_tokens(self, response: Any) -> None:
+        """Track JudgeLLM tokens if a tracker is active."""
+        # Only track token counts if response exists and is NOT from cache
+        tracker = TokenTracker.get_active()
+        if tracker and response is not None:
+            cache_hit = getattr(
+                response, "_hidden_params", {}
+            ).get(  # pylint: disable=protected-access
+                "cache_hit", False
+            )
+            # Only add tokens if this response was not retrieved from cache
+            if not cache_hit and hasattr(response, "usage") and response.usage:
+                tracker.add_tokens(
+                    getattr(response.usage, "prompt_tokens", 0),
+                    getattr(response.usage, "completion_tokens", 0),
+                )
diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py
@@ -496,6 +496,45 @@ def test_standard_endpoint_initialization(
 
         assert client.config.endpoint_type == "query"
 
+    def test_get_cached_response_zeros_token_counts(
+        self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture
+    ) -> None:
+        """Test that _get_cached_response zeros out token counts."""
+        basic_api_config_query_endpoint.cache_enabled = True
+
+        mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
+
+        # Create a mock cache with a cached response that has token counts
+        mock_cache = mocker.Mock()
+        cached_response = APIResponse(
+            response="Cached response",
+            conversation_id="conv_123",
+            input_tokens=50,
+            output_tokens=100,
+        )
+        mock_cache.get.return_value = cached_response
+
+        mocker.patch(
+            "lightspeed_evaluation.core.api.client.Cache", return_value=mock_cache
+        )
+
+        client = APIClient(basic_api_config_query_endpoint)
+
+        # Prepare a request
+        request = client._prepare_request("Test query")
+
+        # Get cached response
+        result = client._get_cached_response(request)
+
+        # Verify token counts were zeroed
+        assert result is not None
+        assert result.input_tokens == 0
+        assert result.output_tokens == 0
+
+        # Verify other fields remain unchanged
+        assert result.response == "Cached response"
+        assert result.conversation_id == "conv_123"
+
 
 class TestRetryLogic:
     """Unit tests for retry logic in APIClient."""
diff --git a/tests/unit/core/llm/test_custom.py b/tests/unit/core/llm/test_custom.py
@@ -202,6 +202,7 @@ def test_call_captures_tokens_with_active_tracker(
         mock_response.usage = mocker.Mock()
         mock_response.usage.prompt_tokens = 50
         mock_response.usage.completion_tokens = 100
+        mock_response._hidden_params = {}  # Ensure no cache hit
         mock_litellm.completion.return_value = mock_response
 
         # Start a tracker
@@ -246,3 +247,34 @@ def test_call_does_not_capture_tokens_without_active_tracker(
 
         # Should succeed without error
         assert result == "Test response"
+
+    def test_call_does_not_add_tokens_on_cache_hit(self, mocker: MockerFixture) -> None:
+        """Test call does not add tokens when response is from cache."""
+        mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
+        mocker.patch.dict("os.environ", {})
+
+        # Mock response with cache hit
+        mock_choice = mocker.Mock()
+        mock_choice.message.content = "Cached response"
+        mock_response = mocker.Mock()
+        mock_response.choices = [mock_choice]
+        mock_response.usage = mocker.Mock()
+        mock_response.usage.prompt_tokens = 50
+        mock_response.usage.completion_tokens = 100
+        mock_response._hidden_params = {"cache_hit": True}  # Cache hit
+        mock_litellm.completion.return_value = mock_response
+
+        # Start a tracker
+        tracker = TokenTracker()
+        tracker.start()
+
+        try:
+            llm = BaseCustomLLM("gpt-4", {"temperature": 0.0})
+            llm.call("test prompt")
+
+            # Tokens should NOT be captured due to cache hit
+            input_tokens, output_tokens = tracker.get_counts()
+            assert input_tokens == 0
+            assert output_tokens == 0
+        finally:
+            tracker.stop()