fix(plugin): replace misleading Cache:% status-bar metric with raw cache token display

JeremyDev87 · JeremyDev87 · commit 792976d0432c · 2026-04-05T17:49:41.000+09:00
The Cache:XX% segment is derived from context_window.current_usage, which reflects only the most recent API call, not session-wide cache efficiency, so users frequently misread it as a cumulative cache hit rate. Replace compute_cache_hit_rate() with format_cache_segment(), which renders raw token values (e.g. ♻2k/3.5k) with the following semantics: numerator = cache_read_input_tokens; denominator = input_tokens + cache_creation_input_tokens + cache_read_input_tokens; values represent the latest API call, not session totals. Also add a format_compact_tokens() helper for compact k-suffix rendering (532 → 532, 1000 → 1k, 1500 → 1.5k, 128000 → 128k). Safe fallback: when current_usage is missing, null, or empty, or when its token total is zero, the cache segment is omitted entirely so the status line still renders without a broken slot. Test coverage (#1354): format_cache_segment — 7 cases covering empty, null, input-only, partial, full, large-value k-format, and no-percent regression; format_status_line integration — 3 cases locking in the new output contract and guarding against Cache:% regression. Closes #1355. Closes #1354.
diff --git a/packages/claude-code-plugin/hooks/codingbuddy-hud.py b/packages/claude-code-plugin/hooks/codingbuddy-hud.py
@@ -131,20 +131,53 @@ def estimate_cost(model_id: str, context_window: dict) -> float:
     return input_cost + cache_write_cost + cache_read_cost + output_cost
 
 
-def compute_cache_hit_rate(context_window: dict) -> float:
-    """Compute cache hit rate as percentage (0-100)."""
-    usage = context_window.get("current_usage", {})
+def format_compact_tokens(n: int) -> str:
+    """Format token count compactly for status-bar display.
+
+    - < 1000 → raw integer (e.g. `532`)
+    - >= 1000 → `Nk` with one decimal trimmed of trailing `.0` (e.g. `1.5k`, `128k`)
+    """
+    try:
+        value = int(n)
+    except (TypeError, ValueError):
+        return "0"
+    if value < 1000:
+        return str(value)
+    k = value / 1000.0
+    # Trim trailing .0 for whole thousands
+    if k == int(k):
+        return f"{int(k)}k"
+    return f"{k:.1f}k"
+
+
+def format_cache_segment(context_window: dict) -> str:
+    """Render the cache segment as raw tokens from the latest API call.
+
+    IMPORTANT: `context_window.current_usage` from Claude Code stdin reflects
+    **only the most recent API call**, not cumulative session cache usage.
+    This helper therefore renders raw token counts (numerator/denominator)
+    rather than a percentage, which users tend to misread as session-wide
+    cache efficiency (#1355, #1356).
+
+    Numerator   = `cache_read_input_tokens`
+    Denominator = `input_tokens + cache_creation_input_tokens + cache_read_input_tokens`
+
+    Returns an empty string when usage data is missing so the caller can
+    omit the segment entirely from the status line.
+    """
+    usage = context_window.get("current_usage") if context_window else None
     if not usage:
-        return 0.0
+        return ""
 
-    input_tokens = usage.get("input_tokens", 0)
-    cache_write = usage.get("cache_creation_input_tokens", 0)
-    cache_read = usage.get("cache_read_input_tokens", 0)
+    input_tokens = usage.get("input_tokens", 0) or 0
+    cache_write = usage.get("cache_creation_input_tokens", 0) or 0
+    cache_read = usage.get("cache_read_input_tokens", 0) or 0
     total = input_tokens + cache_write + cache_read
 
     if total == 0:
-        return 0.0
-    return (cache_read / total) * 100
+        return ""
+
+    return f"\u267b{format_compact_tokens(cache_read)}/{format_compact_tokens(total)}"
 
 
 def get_health(ctx_pct: float) -> str:
@@ -378,7 +411,7 @@ def format_status_line(
     model_id, display_name = resolve_model_label(stdin_data)
     cost, is_exact = resolve_cost(stdin_data, model_id, ctx_window)
     duration = resolve_duration(stdin_data, hud_state)
-    cache = compute_cache_hit_rate(ctx_window)
+    cache_segment = format_cache_segment(ctx_window)
     agent = resolve_agent(stdin_data, hud_state, active_agent)
 
     cost_prefix = "$" if is_exact else "~$"
@@ -390,9 +423,10 @@ def format_status_line(
         f"{mode_label} {health}",
         duration,
         f"{cost_prefix}{cost:.2f}",
-        f"Cache:{cache:.0f}%",
-        f"Ctx:{ctx_pct:.0f}%",
     ]
+    if cache_segment:
+        segments.append(cache_segment)
+    segments.append(f"Ctx:{ctx_pct:.0f}%")
 
     rl = format_rate_limits(stdin_data)
     if rl:
diff --git a/packages/claude-code-plugin/tests/test_hud.py b/packages/claude-code-plugin/tests/test_hud.py
@@ -96,34 +96,122 @@ def test_cache_reduces_cost(self):
         assert cost_with < cost_no
 
 
-class TestCacheHitRate:
-    def test_no_cache(self):
-        assert hud.compute_cache_hit_rate({}) == 0.0
+class TestFormatCacheSegment:
+    """Tests for the raw cache token display (#1355).
 
-    def test_zero_tokens(self):
+    The status-bar cache segment reflects the most recent API call only,
+    not session-wide cache efficiency. It must render raw tokens instead
+    of a percentage to avoid misleading users.
+    """
+
+    def test_no_context_window(self):
+        """Empty context window → safe fallback (empty string)."""
+        assert hud.format_cache_segment({}) == ""
+
+    def test_null_current_usage(self):
+        """current_usage missing → safe fallback (empty string)."""
+        assert hud.format_cache_segment({"current_usage": None}) == ""
+        assert hud.format_cache_segment({"current_usage": {}}) == ""
+
+    def test_input_tokens_only_no_cache_read(self):
+        """input_tokens > 0 with no cache read → 0/total."""
         ctx = {"current_usage": {
-            "input_tokens": 0,
+            "input_tokens": 1000,
             "cache_creation_input_tokens": 0,
             "cache_read_input_tokens": 0,
         }}
-        assert hud.compute_cache_hit_rate(ctx) == 0.0
+        result = hud.format_cache_segment(ctx)
+        assert "0/1k" in result
 
-    def test_partial_cache(self):
+    def test_partial_cache_read(self):
+        """Partial cache read → raw numerator/denominator."""
         ctx = {"current_usage": {
             "input_tokens": 500,
             "cache_creation_input_tokens": 200,
             "cache_read_input_tokens": 800,
         }}
-        rate = hud.compute_cache_hit_rate(ctx)
-        assert 53 < rate < 54  # 800/1500 = 53.3%
+        result = hud.format_cache_segment(ctx)
+        # numerator=800, denominator=500+200+800=1500
+        assert "800" in result
+        assert "1500" in result or "1.5k" in result
 
-    def test_full_cache(self):
+    def test_full_cache_read_shows_raw_not_100pct(self):
+        """Full cache read → shows raw tokens, NOT `100%`."""
         ctx = {"current_usage": {
             "input_tokens": 0,
             "cache_creation_input_tokens": 0,
             "cache_read_input_tokens": 1000,
         }}
-        assert hud.compute_cache_hit_rate(ctx) == 100.0
+        result = hud.format_cache_segment(ctx)
+        assert "100%" not in result
+        assert "1k/1k" in result
+
+    def test_large_values_use_k_format(self):
+        """Large values compact as `Nk`."""
+        ctx = {"current_usage": {
+            "input_tokens": 50000,
+            "cache_creation_input_tokens": 78000,
+            "cache_read_input_tokens": 128000,
+        }}
+        result = hud.format_cache_segment(ctx)
+        # numerator=128000 → 128k, denominator=256000 → 256k
+        assert "128k" in result
+        assert "256k" in result
+
+    def test_regression_no_percent_in_output(self):
+        """REGRESSION: Cache segment must never render `%`."""
+        ctx = {"current_usage": {
+            "input_tokens": 500,
+            "cache_creation_input_tokens": 200,
+            "cache_read_input_tokens": 800,
+        }}
+        result = hud.format_cache_segment(ctx)
+        assert "%" not in result
+
+
+class TestFormatStatusLineCacheSegment:
+    """Integration: final status-line output includes raw cache segment (#1354)."""
+
+    _NO_PLUGINS = "/tmp/_nonexistent_plugins_.json"
+
+    def test_status_line_no_longer_contains_cache_percent(self):
+        """REGRESSION: `Cache:XX%` must never appear in format_status_line output."""
+        stdin = {
+            "context_window": {
+                "used_percentage": 45,
+                "current_usage": {
+                    "input_tokens": 1000,
+                    "cache_creation_input_tokens": 500,
+                    "cache_read_input_tokens": 2000,
+                },
+            },
+        }
+        result = hud.format_status_line(stdin, {}, plugins_file=self._NO_PLUGINS)
+        assert "Cache:" not in result
+        assert "%" in result  # Ctx:45% is still a percentage — only cache changes
+
+    def test_status_line_contains_raw_cache_tokens(self):
+        """format_status_line renders raw cache token segment."""
+        stdin = {
+            "context_window": {
+                "used_percentage": 45,
+                "current_usage": {
+                    "input_tokens": 1000,
+                    "cache_creation_input_tokens": 500,
+                    "cache_read_input_tokens": 2000,
+                },
+            },
+        }
+        result = hud.format_status_line(stdin, {}, plugins_file=self._NO_PLUGINS)
+        # cache_read=2000 → 2k, total=3500 → 3.5k
+        assert "2k/3.5k" in result
+
+    def test_status_line_hides_cache_when_usage_absent(self):
+        """Missing current_usage → cache segment is hidden, status line still renders."""
+        stdin = {"context_window": {"used_percentage": 10}}
+        result = hud.format_status_line(stdin, {}, plugins_file=self._NO_PLUGINS)
+        assert "Cache:" not in result
+        assert "Ctx:10%" in result  # other segments still present
 
 
 class TestHealth: