Skip to content

Commit 6358ac9

Browse files
committed
refactor(data): unify URL+headers cache key + document redaction
Follow-up to #529:

* Extract a single url_cache_key(url, headers) helper in base_url.py so that the in-memory provider dict on Context and the on-disk cache filename hash use the exact same canonical serialization. This prevents future drift where an in-memory hit could read the wrong on-disk file.
* Add a note to the headers= docstrings on Context.fetch_csv / fetch_json / fetch_parquet and DataSource.from_csv / from_json / from_parquet documenting that header values are redacted to '***' in to_dict(), so secrets do not leak into diagnostic payloads.
1 parent 4c4d8ae commit 6358ac9

3 files changed

Lines changed: 46 additions & 13 deletions

File tree

investing_algorithm_framework/app/context.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2383,13 +2383,12 @@ def get_stop_losses(
23832383
return self.trade_stop_loss_service.get_all(query_params)
23842384

23852385
def _get_url_provider_cache_key(self, url, headers):
2386-
if not headers:
2387-
return url
2388-
2389-
return (
2390-
url,
2391-
tuple(sorted(headers.items()))
2392-
)
2386+
# Delegates to the canonical helper so the in-memory provider
2387+
# dict and the on-disk cache filename stay in lockstep — see
2388+
# ``url_cache_key`` for rationale.
2389+
from investing_algorithm_framework.infrastructure \
2390+
.data_providers.base_url import url_cache_key
2391+
return url_cache_key(url, headers)
23932392

23942393
def fetch_csv(
23952394
self,
@@ -2419,7 +2418,9 @@ def fetch_csv(
24192418
refresh_interval (str, optional): Re-fetch interval
24202419
(e.g., "1d", "1h").
24212420
headers (dict, optional): HTTP headers to send with the
2422-
request.
2421+
request. Header values are redacted (replaced
2422+
with "***") in ``DataSource.to_dict``, so
2423+
secrets do not leak into diagnostic payloads.
24232424
pre_process (callable, optional): Transform raw CSV text
24242425
before parsing.
24252426
post_process (callable, optional): Transform the parsed
@@ -2492,7 +2493,9 @@ def fetch_json(
24922493
refresh_interval (str, optional): Re-fetch interval
24932494
(e.g., "1d", "1h").
24942495
headers (dict, optional): HTTP headers to send with the
2495-
request.
2496+
request. Header values are redacted (replaced
2497+
with "***") in ``DataSource.to_dict``, so
2498+
secrets do not leak into diagnostic payloads.
24962499
pre_process (callable, optional): Transform raw JSON text
24972500
before parsing.
24982501
post_process (callable, optional): Transform the parsed
@@ -2560,7 +2563,9 @@ def fetch_parquet(
25602563
refresh_interval (str, optional): Re-fetch interval
25612564
(e.g., "1d", "1h").
25622565
headers (dict, optional): HTTP headers to send with the
2563-
request.
2566+
request. Header values are redacted (replaced
2567+
with "***") in ``DataSource.to_dict``, so
2568+
secrets do not leak into diagnostic payloads.
25642569
post_process (callable, optional): Transform the parsed
25652570
DataFrame.
25662571

investing_algorithm_framework/domain/models/data/data_source.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ def from_csv(
152152
(e.g., "1d", "1h"). If None, data is fetched once and
153153
cached indefinitely.
154154
headers: Optional HTTP headers to send with the request.
155+
Values are redacted ("***") in ``to_dict()``
156+
so secrets do not leak into diagnostic
157+
payloads.
155158
pre_process: Optional callback to transform the raw CSV
156159
text before parsing. Receives a string, must return
157160
a string.
@@ -215,6 +218,9 @@ def from_json(
215218
refresh_interval: How often to re-fetch the data
216219
(e.g., "1d", "1h").
217220
headers: Optional HTTP headers to send with the request.
221+
Values are redacted ("***") in ``to_dict()``
222+
so secrets do not leak into diagnostic
223+
payloads.
218224
pre_process: Optional callback to transform the raw JSON
219225
text before parsing. Receives a string, must return
220226
a string.
@@ -271,6 +277,9 @@ def from_parquet(
271277
refresh_interval: How often to re-fetch the data
272278
(e.g., "1d", "1h").
273279
headers: Optional HTTP headers to send with the request.
280+
Values are redacted ("***") in ``to_dict()``
281+
so secrets do not leak into diagnostic
282+
payloads.
274283
post_process: Optional callback to transform the parsed
275284
DataFrame.
276285

investing_algorithm_framework/infrastructure/data_providers/base_url.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,27 @@
2525
}
2626

2727

28+
def url_cache_key(url, headers=None):
29+
"""Canonical cache key for a (url, headers) pair.
30+
31+
Used in two places:
32+
33+
* the in-memory provider dict on :class:`Context` (so two strategies
34+
hitting the same URL with different credentials don't collide), and
35+
* the on-disk cache filename hash inside
36+
:class:`BaseURLDataProvider._get_cache_path` (same reason).
37+
38+
Both call sites must agree on the exact serialization, otherwise an
39+
in-memory hit could read the wrong on-disk file. Centralizing it
40+
here prevents that drift.
41+
"""
42+
if not headers:
43+
return url
44+
45+
serialized = "&".join(f"{k}={v}" for k, v in sorted(headers.items()))
46+
return f"{url}|headers:{serialized}"
47+
48+
2849
class BaseURLDataProvider(DataProvider):
2950
"""
3051
Abstract base class for data providers that fetch data from a
@@ -357,9 +378,7 @@ def _get_cache_path(self):
357378
if storage_dir is None:
358379
storage_dir = os.path.join(os.getcwd(), ".data_cache")
359380

360-
cache_key = self._url
361-
if self._headers:
362-
cache_key = f"{cache_key}|headers:{sorted(self._headers.items())}"
381+
cache_key = url_cache_key(self._url, self._headers)
363382

364383
url_hash = hashlib.md5(
365384
cache_key.encode()

0 commit comments

Comments (0)