Skip to content

Commit 6358ac9

Browse files
committed
refactor(data): unify URL+headers cache key + document redaction
Follow-up to #529:

* Extract a single url_cache_key(url, headers) helper in base_url.py so that the in-memory provider dict on Context and the on-disk cache filename hash use the exact same canonical serialization. This prevents future drift where an in-memory hit could read the wrong on-disk file.
* Add a note to the headers= docstrings on Context.fetch_csv / fetch_json / fetch_parquet and DataSource.from_csv / from_json / from_parquet documenting that header values are redacted to '***' in to_dict(), so secrets do not leak into diagnostic payloads.
1 parent 4c4d8ae commit 6358ac9

3 files changed

Lines changed: 46 additions & 13 deletions

File tree

investing_algorithm_framework/app/context.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2383,13 +2383,12 @@ def get_stop_losses(
23832383
return self.trade_stop_loss_service.get_all(query_params)
23842384

23852385
def _get_url_provider_cache_key(self, url, headers):
2386-
if not headers:
2387-
return url
2388-
2389-
return (
2390-
url,
2391-
tuple(sorted(headers.items()))
2392-
)
2386+
# Delegates to the canonical helper so the in-memory provider
2387+
# dict and the on-disk cache filename stay in lockstep — see
2388+
# ``url_cache_key`` for rationale.
2389+
from investing_algorithm_framework.infrastructure \
2390+
.data_providers.base_url import url_cache_key
2391+
return url_cache_key(url, headers)
23932392

23942393
def fetch_csv(
23952394
self,
@@ -2419,7 +2418,9 @@ def fetch_csv(
24192418
refresh_interval (str, optional): Re-fetch interval
24202419
(e.g., "1d", "1h").
24212420
headers (dict, optional): HTTP headers to send with the
2422-
request.
2421+
request. Header values are redacted (replaced
2422+
with "***") in ``DataSource.to_dict``, so
2423+
secrets do not leak into diagnostic payloads.
24232424
pre_process (callable, optional): Transform raw CSV text
24242425
before parsing.
24252426
post_process (callable, optional): Transform the parsed
@@ -2492,7 +2493,9 @@ def fetch_json(
24922493
refresh_interval (str, optional): Re-fetch interval
24932494
(e.g., "1d", "1h").
24942495
headers (dict, optional): HTTP headers to send with the
2495-
request.
2496+
request. Header values are redacted (replaced
2497+
with "***") in ``DataSource.to_dict``, so
2498+
secrets do not leak into diagnostic payloads.
24962499
pre_process (callable, optional): Transform raw JSON text
24972500
before parsing.
24982501
post_process (callable, optional): Transform the parsed
@@ -2560,7 +2563,9 @@ def fetch_parquet(
25602563
refresh_interval (str, optional): Re-fetch interval
25612564
(e.g., "1d", "1h").
25622565
headers (dict, optional): HTTP headers to send with the
2563-
request.
2566+
request. Header values are redacted (replaced
2567+
with "***") in ``DataSource.to_dict``, so
2568+
secrets do not leak into diagnostic payloads.
25642569
post_process (callable, optional): Transform the parsed
25652570
DataFrame.
25662571

investing_algorithm_framework/domain/models/data/data_source.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ def from_csv(
152152
(e.g., "1d", "1h"). If None, data is fetched once and
153153
cached indefinitely.
154154
headers: Optional HTTP headers to send with the request.
155+
Values are redacted ("***") in ``to_dict()``
156+
so secrets do not leak into diagnostic
157+
payloads.
155158
pre_process: Optional callback to transform the raw CSV
156159
text before parsing. Receives a string, must return
157160
a string.
@@ -215,6 +218,9 @@ def from_json(
215218
refresh_interval: How often to re-fetch the data
216219
(e.g., "1d", "1h").
217220
headers: Optional HTTP headers to send with the request.
221+
Values are redacted ("***") in ``to_dict()``
222+
so secrets do not leak into diagnostic
223+
payloads.
218224
pre_process: Optional callback to transform the raw JSON
219225
text before parsing. Receives a string, must return
220226
a string.
@@ -271,6 +277,9 @@ def from_parquet(
271277
refresh_interval: How often to re-fetch the data
272278
(e.g., "1d", "1h").
273279
headers: Optional HTTP headers to send with the request.
280+
Values are redacted ("***") in ``to_dict()``
281+
so secrets do not leak into diagnostic
282+
payloads.
274283
post_process: Optional callback to transform the parsed
275284
DataFrame.
276285

investing_algorithm_framework/infrastructure/data_providers/base_url.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,27 @@
2525
}
2626

2727

28+
def url_cache_key(url, headers=None):
29+
"""Canonical cache key for a (url, headers) pair.
30+
31+
Used in two places:
32+
33+
* the in-memory provider dict on :class:`Context` (so two strategies
34+
hitting the same URL with different credentials don't collide), and
35+
* the on-disk cache filename hash inside
36+
:class:`BaseURLDataProvider._get_cache_path` (same reason).
37+
38+
Both call sites must agree on the exact serialization, otherwise an
39+
in-memory hit could read the wrong on-disk file. Centralizing it
40+
here prevents that drift.
41+
"""
42+
if not headers:
43+
return url
44+
45+
serialized = "&".join(f"{k}={v}" for k, v in sorted(headers.items()))
46+
return f"{url}|headers:{serialized}"
47+
48+
2849
class BaseURLDataProvider(DataProvider):
2950
"""
3051
Abstract base class for data providers that fetch data from a
@@ -357,9 +378,7 @@ def _get_cache_path(self):
357378
if storage_dir is None:
358379
storage_dir = os.path.join(os.getcwd(), ".data_cache")
359380

360-
cache_key = self._url
361-
if self._headers:
362-
cache_key = f"{cache_key}|headers:{sorted(self._headers.items())}"
381+
cache_key = url_cache_key(self._url, self._headers)
363382

364383
url_hash = hashlib.md5(
365384
cache_key.encode()

0 commit comments

Comments (0)