Skip to content

Commit 48994b7

Browse files
Add headers for external URL data sources
1 parent ddc309d commit 48994b7

8 files changed

Lines changed: 207 additions & 20 deletions

File tree

docusaurus/docs/Data/external-data.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ The framework provides two ways to load external data:
1313
1. **Data Sources** — Declare `DataSource.from_csv()`, `DataSource.from_json()`, or `DataSource.from_parquet()` in your strategy's `data_sources` list. Data is fetched automatically and available in your `data` dict.
1414
2. **Context methods** — Call `context.fetch_csv()`, `context.fetch_json()`, or `context.fetch_parquet()` on demand inside your strategy's `run_strategy` method.
1515

16-
Both approaches support caching, refresh intervals, date parsing, and pre/post-processing callbacks.
16+
Both approaches support caching, refresh intervals, date parsing, request headers, and pre/post-processing callbacks.
1717

1818
## Supported Formats
1919

@@ -112,6 +112,7 @@ class MyStrategy(TradingStrategy):
112112
earnings = context.fetch_json(
113113
url="https://api.example.com/earnings",
114114
date_column="report_date",
115+
headers={"Authorization": "Bearer <token>"},
115116
)
116117

117118
# Fetch Parquet on demand
@@ -132,9 +133,50 @@ All three factory methods and context methods accept the same core parameters:
132133
| `date_format` | `str` | `None` | Date format string used to parse dates, using strptime-style codes (e.g., `"%Y-%m-%d"`). Auto-detected if omitted. |
133134
| `cache` | `bool` | `True` | Cache fetched data locally to avoid repeated downloads. |
134135
| `refresh_interval` | `str` | `None` | How often to re-fetch: `"1m"`, `"5m"`, `"15m"`, `"30m"`, `"1h"`, `"4h"`, `"1d"`, `"1W"`. |
136+
| `headers` | `dict` | `None` | Optional HTTP headers to send with the request, such as API keys or bearer tokens. |
135137
| `pre_process` | `callable` | `None` | Transform raw text before parsing. Receives `str`, returns `str`. Not available for Parquet. |
136138
| `post_process` | `callable` | `None` | Transform the parsed DataFrame. Receives `DataFrame`, returns `DataFrame`. |
137139

140+
## Authenticated APIs
141+
142+
Use `headers` when an external data API requires authentication. For example, Adanos Market Sentiment can be loaded as an optional alternative-data signal without writing a custom provider:
143+
144+
```python
145+
import json
146+
import os
147+
148+
import polars as pl
149+
150+
from investing_algorithm_framework import TimeUnit, TradingStrategy
151+
152+
153+
def extract_adanos_stocks(raw_text):
154+
payload = json.loads(raw_text)
155+
return json.dumps(payload.get("stocks", []))
156+
157+
158+
class SentimentStrategy(TradingStrategy):
159+
time_unit = TimeUnit.DAY
160+
interval = 1
161+
symbols = ["AAPL", "MSFT"]
162+
163+
def run_strategy(self, context, data):
164+
sentiment = context.fetch_json(
165+
url=(
166+
"https://api.adanos.org/news/stocks/v1/compare"
167+
"?tickers=AAPL,MSFT&days=7"
168+
),
169+
headers={"X-API-Key": os.environ["ADANOS_API_KEY"]},
170+
pre_process=extract_adanos_stocks,
171+
cache=True,
172+
refresh_interval="1d",
173+
)
174+
175+
aapl = sentiment.filter(pl.col("ticker") == "AAPL")
176+
if len(aapl) and aapl["sentiment_score"][0] > 0.2:
177+
context.create_limit_order(...)
178+
```
179+
138180
## Pre/Post Processing
139181

140182
### Pre-Processing

investing_algorithm_framework/app/context.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2092,13 +2092,23 @@ def get_stop_losses(
20922092

20932093
return self.trade_stop_loss_service.get_all(query_params)
20942094

2095+
def _get_url_provider_cache_key(self, url, headers):
2096+
if not headers:
2097+
return url
2098+
2099+
return (
2100+
url,
2101+
tuple(sorted(headers.items()))
2102+
)
2103+
20952104
def fetch_csv(
20962105
self,
20972106
url,
20982107
date_column=None,
20992108
date_format=None,
21002109
cache=True,
21012110
refresh_interval=None,
2111+
headers=None,
21022112
pre_process=None,
21032113
post_process=None,
21042114
):
@@ -2118,6 +2128,8 @@ def fetch_csv(
21182128
cache (bool): Cache fetched data locally (default: True).
21192129
refresh_interval (str, optional): Re-fetch interval
21202130
(e.g., "1d", "1h").
2131+
headers (dict, optional): HTTP headers to send with the
2132+
request.
21212133
pre_process (callable, optional): Transform raw CSV text
21222134
before parsing.
21232135
post_process (callable, optional): Transform the parsed
@@ -2141,20 +2153,23 @@ def run_strategy(self, context, data):
21412153
if not hasattr(self, '_csv_url_providers'):
21422154
self._csv_url_providers = {}
21432155

2144-
if url not in self._csv_url_providers:
2156+
provider_key = self._get_url_provider_cache_key(url, headers)
2157+
2158+
if provider_key not in self._csv_url_providers:
21452159
provider = CSVURLDataProvider(
21462160
url=url,
21472161
date_column=date_column,
21482162
date_format=date_format,
21492163
cache=cache,
21502164
refresh_interval=refresh_interval,
2165+
headers=headers,
21512166
pre_process=pre_process,
21522167
post_process=post_process,
21532168
)
21542169
provider.config = self.configuration_service.get_config()
2155-
self._csv_url_providers[url] = provider
2170+
self._csv_url_providers[provider_key] = provider
21562171

2157-
return self._csv_url_providers[url].get_data()
2172+
return self._csv_url_providers[provider_key].get_data()
21582173

21592174
def fetch_json(
21602175
self,
@@ -2163,6 +2178,7 @@ def fetch_json(
21632178
date_format=None,
21642179
cache=True,
21652180
refresh_interval=None,
2181+
headers=None,
21662182
pre_process=None,
21672183
post_process=None,
21682184
):
@@ -2185,6 +2201,8 @@ def fetch_json(
21852201
cache (bool): Cache fetched data locally (default: True).
21862202
refresh_interval (str, optional): Re-fetch interval
21872203
(e.g., "1d", "1h").
2204+
headers (dict, optional): HTTP headers to send with the
2205+
request.
21882206
pre_process (callable, optional): Transform raw JSON text
21892207
before parsing.
21902208
post_process (callable, optional): Transform the parsed
@@ -2207,20 +2225,23 @@ def run_strategy(self, context, data):
22072225
if not hasattr(self, '_json_url_providers'):
22082226
self._json_url_providers = {}
22092227

2210-
if url not in self._json_url_providers:
2228+
provider_key = self._get_url_provider_cache_key(url, headers)
2229+
2230+
if provider_key not in self._json_url_providers:
22112231
provider = JSONURLDataProvider(
22122232
url=url,
22132233
date_column=date_column,
22142234
date_format=date_format,
22152235
cache=cache,
22162236
refresh_interval=refresh_interval,
2237+
headers=headers,
22172238
pre_process=pre_process,
22182239
post_process=post_process,
22192240
)
22202241
provider.config = self.configuration_service.get_config()
2221-
self._json_url_providers[url] = provider
2242+
self._json_url_providers[provider_key] = provider
22222243

2223-
return self._json_url_providers[url].get_data()
2244+
return self._json_url_providers[provider_key].get_data()
22242245

22252246
def fetch_parquet(
22262247
self,
@@ -2229,6 +2250,7 @@ def fetch_parquet(
22292250
date_format=None,
22302251
cache=True,
22312252
refresh_interval=None,
2253+
headers=None,
22322254
post_process=None,
22332255
):
22342256
"""
@@ -2247,6 +2269,8 @@ def fetch_parquet(
22472269
cache (bool): Cache fetched data locally (default: True).
22482270
refresh_interval (str, optional): Re-fetch interval
22492271
(e.g., "1d", "1h").
2272+
headers (dict, optional): HTTP headers to send with the
2273+
request.
22502274
post_process (callable, optional): Transform the parsed
22512275
DataFrame.
22522276
@@ -2266,19 +2290,22 @@ def run_strategy(self, context, data):
22662290
if not hasattr(self, '_parquet_url_providers'):
22672291
self._parquet_url_providers = {}
22682292

2269-
if url not in self._parquet_url_providers:
2293+
provider_key = self._get_url_provider_cache_key(url, headers)
2294+
2295+
if provider_key not in self._parquet_url_providers:
22702296
provider = ParquetURLDataProvider(
22712297
url=url,
22722298
date_column=date_column,
22732299
date_format=date_format,
22742300
cache=cache,
22752301
refresh_interval=refresh_interval,
2302+
headers=headers,
22762303
post_process=post_process,
22772304
)
22782305
provider.config = self.configuration_service.get_config()
2279-
self._parquet_url_providers[url] = provider
2306+
self._parquet_url_providers[provider_key] = provider
22802307

2281-
return self._parquet_url_providers[url].get_data()
2308+
return self._parquet_url_providers[provider_key].get_data()
22822309

22832310
def batch_order(self, orders, market=None):
22842311
"""

investing_algorithm_framework/domain/models/data/data_source.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class DataSource:
4747
date_format: Optional[str] = None
4848
cache: bool = True
4949
refresh_interval: Optional[str] = None
50+
headers: Optional[dict] = None
5051
pre_process: Optional[Callable] = field(
5152
default=None, repr=False, compare=False
5253
)
@@ -133,6 +134,7 @@ def from_csv(
133134
date_format: str = None,
134135
cache: bool = True,
135136
refresh_interval: str = None,
137+
headers: dict = None,
136138
pre_process: Callable = None,
137139
post_process: Callable = None,
138140
) -> "DataSource":
@@ -149,6 +151,7 @@ def from_csv(
149151
refresh_interval: How often to re-fetch the data
150152
(e.g., "1d", "1h"). If None, data is fetched once and
151153
cached indefinitely.
154+
headers: Optional HTTP headers to send with the request.
152155
pre_process: Optional callback to transform the raw CSV
153156
text before parsing. Receives a string, must return
154157
a string.
@@ -178,6 +181,7 @@ def from_csv(
178181
date_format=date_format,
179182
cache=cache,
180183
refresh_interval=refresh_interval,
184+
headers=headers,
181185
pre_process=pre_process,
182186
post_process=post_process,
183187
)
@@ -191,6 +195,7 @@ def from_json(
191195
date_format: str = None,
192196
cache: bool = True,
193197
refresh_interval: str = None,
198+
headers: dict = None,
194199
pre_process: Callable = None,
195200
post_process: Callable = None,
196201
) -> "DataSource":
@@ -209,6 +214,7 @@ def from_json(
209214
(default: True).
210215
refresh_interval: How often to re-fetch the data
211216
(e.g., "1d", "1h").
217+
headers: Optional HTTP headers to send with the request.
212218
pre_process: Optional callback to transform the raw JSON
213219
text before parsing. Receives a string, must return
214220
a string.
@@ -234,6 +240,7 @@ def from_json(
234240
date_format=date_format,
235241
cache=cache,
236242
refresh_interval=refresh_interval,
243+
headers=headers,
237244
pre_process=pre_process,
238245
post_process=post_process,
239246
)
@@ -247,6 +254,7 @@ def from_parquet(
247254
date_format: str = None,
248255
cache: bool = True,
249256
refresh_interval: str = None,
257+
headers: dict = None,
250258
post_process: Callable = None,
251259
) -> "DataSource":
252260
"""
@@ -262,6 +270,7 @@ def from_parquet(
262270
(default: True).
263271
refresh_interval: How often to re-fetch the data
264272
(e.g., "1d", "1h").
273+
headers: Optional HTTP headers to send with the request.
265274
post_process: Optional callback to transform the parsed
266275
DataFrame.
267276
@@ -284,6 +293,7 @@ def from_parquet(
284293
date_format=date_format,
285294
cache=cache,
286295
refresh_interval=refresh_interval,
296+
headers=headers,
287297
post_process=post_process,
288298
)
289299

@@ -330,6 +340,10 @@ def to_dict(self):
330340
non_null_attributes['data_type'] = self.data_type.value
331341
if self.time_frame is not None:
332342
non_null_attributes['time_frame'] = self.time_frame.value
343+
if self.headers is not None:
344+
non_null_attributes['headers'] = {
345+
key: "***" for key in self.headers
346+
}
333347

334348
return non_null_attributes
335349

investing_algorithm_framework/infrastructure/data_providers/base_url.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def __init__(
4545
date_format=None,
4646
cache=True,
4747
refresh_interval=None,
48+
headers=None,
4849
pre_process=None,
4950
post_process=None,
5051
priority=5,
@@ -62,6 +63,7 @@ def __init__(
6263
self._date_format = date_format
6364
self._cache = cache
6465
self._refresh_interval = refresh_interval
66+
self._headers = headers or {}
6567
self._pre_process = pre_process
6668
self._post_process = post_process
6769
self._cached_data = None
@@ -194,6 +196,7 @@ def copy(self, data_source=None):
194196
date_format = self._date_format
195197
cache = self._cache
196198
refresh_interval = self._refresh_interval
199+
headers = self._headers
197200
pre_process = self._pre_process
198201
post_process = self._post_process
199202
identifier = self.data_provider_identifier
@@ -206,6 +209,7 @@ def copy(self, data_source=None):
206209
else cache
207210
refresh_interval = data_source.refresh_interval \
208211
or refresh_interval
212+
headers = data_source.headers or headers
209213
pre_process = data_source.pre_process or pre_process
210214
post_process = data_source.post_process or post_process
211215

@@ -215,6 +219,7 @@ def copy(self, data_source=None):
215219
date_format=date_format,
216220
cache=cache,
217221
refresh_interval=refresh_interval,
222+
headers=headers,
218223
pre_process=pre_process,
219224
post_process=post_process,
220225
priority=self.priority,
@@ -277,9 +282,11 @@ def _fetch_and_parse(self):
277282

278283
# Fetch from URL
279284
ctx = ssl.create_default_context()
285+
headers = {"User-Agent": "investing-algorithm-framework"}
286+
headers.update(self._headers)
280287
req = urllib.request.Request(
281288
url,
282-
headers={"User-Agent": "investing-algorithm-framework"}
289+
headers=headers
283290
)
284291
with urllib.request.urlopen(req, context=ctx) as response:
285292
raw_bytes = response.read()
@@ -350,8 +357,12 @@ def _get_cache_path(self):
350357
if storage_dir is None:
351358
storage_dir = os.path.join(os.getcwd(), ".data_cache")
352359

360+
cache_key = self._url
361+
if self._headers:
362+
cache_key = f"{cache_key}|headers:{sorted(self._headers.items())}"
363+
353364
url_hash = hashlib.md5(
354-
self._url.encode()
365+
cache_key.encode()
355366
).hexdigest()[:12]
356367
suffix = self._cache_file_suffix()
357368
return os.path.join(storage_dir, f"url_{url_hash}{suffix}")

0 commit comments

Comments
 (0)