Address PR review: defensive cache decode, stronger cap test, drop manual version bump

Robbie1977 · Robbie1977 · commit 268a2d874859 · 2026-06-16T14:19:53.000+01:00
- _decode_cache_field: catch base64/gzip/unicode errors on corrupt gz: payloads
  and return the raw string, so callers treat it as invalid JSON (and purge it)
  instead of aborting cleanup/stats runs. (Copilot)
- test: prove the cap is enforced on the COMPRESSED payload (raw > cap,
  compressed < cap) with a small cap and a highly compressible result. (Copilot)
- revert manual _version.py bump (1.22.0 -> 1.21.0); the release workflow owns the
  version, and a manual mismatch forces cache recomputes. Legacy plain-JSON
  entries are still read transparently, so no invalidation is needed. (Clare)
diff --git a/src/vfbquery/_version.py b/src/vfbquery/_version.py
@@ -6,4 +6,4 @@
 cache's version stamp can never drift apart.
 """
 
-__version__ = "1.22.0"
+__version__ = "1.21.0"
diff --git a/src/vfbquery/solr_result_cache.py b/src/vfbquery/solr_result_cache.py
@@ -64,8 +64,15 @@ def _decode_cache_field(cached_field) -> str:
     if isinstance(cached_field, list):
         cached_field = cached_field[0] if cached_field else ""
     if isinstance(cached_field, str) and cached_field.startswith(_CACHE_GZIP_PREFIX):
-        blob = base64.b64decode(cached_field[len(_CACHE_GZIP_PREFIX):])
-        return gzip.decompress(blob).decode("utf-8")
+        try:
+            blob = base64.b64decode(cached_field[len(_CACHE_GZIP_PREFIX):])
+            return gzip.decompress(blob).decode("utf-8")
+        except Exception:
+            # Corrupt/truncated gz payload: return the raw string so callers'
+            # json.loads fails and the entry is treated as invalid (and purged),
+            # rather than raising an un-caught error that aborts cleanup/stats.
+            logger.warning("Failed to decode compressed cache payload; treating as invalid")
+            return cached_field
     return cached_field
 
 
diff --git a/tests/test_gzip_cache.py b/tests/test_gzip_cache.py
@@ -21,14 +21,19 @@ def test_decode_handles_legacy_plain_json_and_list_shape():
     assert _decode_cache_field([enc]) == legacy
 
 
-def test_cap_is_on_compressed_size():
-    c = SolrResultCache(max_result_size_mb=100)
-    assert c.max_result_size_bytes == 100 * 1024 * 1024
-    big = {"result": {"rows": [{"id": i, "name": "n"} for i in range(300000)]},
-           "cached_at": "2026-01-01T00:00:00+00:00",
-           "expires_at": "2026-04-01T00:00:00+00:00", "result_size": 0}
-    enc = _encode_cache_field(json.dumps(big))
-    assert len(enc.encode("utf-8")) < c.max_result_size_bytes
+def test_cap_is_enforced_on_compressed_not_raw_size():
+    # Small cap + a highly compressible payload: the RAW JSON must exceed the cap
+    # while the gzip+base64 form stays under it, proving the cap is on the stored
+    # (compressed) size, not the raw size. Kept fast/memory-light via repetition.
+    cap_mb = 1
+    c = SolrResultCache(max_result_size_mb=cap_mb)
+    cap = cap_mb * 1024 * 1024
+    assert c.max_result_size_bytes == cap
+    payload = json.dumps({"result": {"rows": ["x" * 100] * 50000}})  # ~5 MB raw, compresses hard
+    raw = len(payload.encode("utf-8"))
+    compressed = len(_encode_cache_field(payload).encode("utf-8"))
+    assert raw > cap, f"raw {raw} should exceed cap {cap}"
+    assert compressed < cap, f"compressed {compressed} should be under cap {cap}"
 
 
 def test_env_override(monkeypatch):