-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_data.py
More file actions
126 lines (100 loc) · 4.84 KB
/
prepare_data.py
File metadata and controls
126 lines (100 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""prepare_data.py — fetch the dataset used by experiments/PO_*.py from Yahoo Finance.
Produces two CSVs under ./dataset/ that the experiment scripts read as
"../dataset/top_50_us_stocks_data_20250526_011226_covariance.csv" and
"../dataset/top_50_us_stocks_returns_price.csv":
Ticker, Average_Return, Price, Company_Name (returns_price.csv)
Ticker, AAPL, MSFT, ... (covariance.csv — a square matrix)
The 50-ticker list below is a default placeholder of large-cap US equities; replace
TICKERS with the exact universe used in the paper to reproduce published numbers.
Run from the repo root:
python prepare_data.py
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
import numpy as np
import pandas as pd
try:
import yfinance as yf
except ImportError:
sys.exit("yfinance is required. Install with `pip install yfinance` "
"or activate the conda env from environment.yml.")
# ---------------------------------------------------------------------------
# Replace the list below with the exact 50 tickers used in the paper.
# Defaults are large-cap US equities that span a price range comparable to the
# paper's [108, 216] USD band, sufficient for a smoke test of the pipeline.
# ---------------------------------------------------------------------------
# Default universe: 50 large-cap US tickers (placeholder — swap in the paper's
# exact list to reproduce published numbers, per the module docstring).
TICKERS: list[str] = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "META", "NVDA", "TSLA", "JPM", "V", "JNJ",
    "WMT", "MA", "PG", "UNH", "HD", "BAC", "DIS", "ADBE", "CRM", "NFLX",
    "KO", "PFE", "INTC", "CSCO", "PEP", "ABT", "ABBV", "COST", "NKE", "TMO",
    "MRK", "AVGO", "MCD", "ACN", "T", "NEE", "LIN", "DHR", "TXN", "QCOM",
    "BMY", "AMGN", "HON", "LOW", "UPS", "IBM", "MS", "GS", "CAT", "LMT",
]
# Ten years of daily history. NOTE(review): yfinance appears to treat `end`
# as exclusive — confirm against the yfinance docs if the last day matters.
START_DATE = "2015-04-01"
END_DATE = "2025-04-01"
# Output filenames are hard-coded in the experiment scripts — do not rename.
COV_FILENAME = "top_50_us_stocks_data_20250526_011226_covariance.csv"
RET_FILENAME = "top_50_us_stocks_returns_price.csv"
def fetch_close_prices(tickers: list[str], start: str, end: str) -> pd.DataFrame:
    """Download adjusted-close daily prices from Yahoo Finance.

    Parameters
    ----------
    tickers : symbols to request.
    start, end : ISO date strings bounding the download range.

    Returns
    -------
    DataFrame indexed by date with one column per ticker that actually
    returned data. Tickers with no data in the range are dropped (with a
    warning).
    """
    print(f"Downloading {len(tickers)} tickers from {start} to {end}...")
    df = yf.download(
        tickers, start=start, end=end,
        auto_adjust=True, progress=False, group_by="ticker", threads=True,
    )
    if isinstance(df.columns, pd.MultiIndex):
        # group_by="ticker" produces a (ticker, field) MultiIndex. Pull the
        # Close series per ticker; guard with a membership test because
        # failed tickers may be absent from the result.
        closes = pd.concat(
            {t: df[t]["Close"] for t in tickers if t in df.columns.levels[0]},
            axis=1,
        )
    else:
        # Single-ticker download can fall back to a flat DataFrame.
        closes = df[["Close"]].rename(columns={"Close": tickers[0]})
    # BUG FIX: a bad/delisted symbol can come back as a column of pure NaN.
    # Previously such a column survived (it *is* in closes.columns), so no
    # warning fired and the NaNs polluted downstream mean/covariance. Drop
    # all-NaN columns first so `dropped` reports them too.
    closes = closes.dropna(axis=1, how="all")
    dropped = [t for t in tickers if t not in closes.columns]
    if dropped:
        print(f" warning: no data for {dropped} (dropped from universe)")
    return closes.dropna(how="all")
def build_returns_price(closes: pd.DataFrame) -> pd.DataFrame:
    """Summarise each ticker into the row format the experiment scripts read.

    Parameters
    ----------
    closes : date-indexed DataFrame, one price column per ticker.

    Returns
    -------
    DataFrame with columns Ticker, Average_Return (mean daily simple
    return), Price (last available close), Company_Name.
    """
    returns = closes.pct_change().dropna(how="all")
    summary = pd.DataFrame({
        "Ticker": closes.columns,
        "Average_Return": returns.mean().values,
        "Price": closes.ffill().iloc[-1].values,
        # Company_Name is informational only; reuse the ticker symbol so we
        # avoid one slow yfinance .info request per ticker.
        "Company_Name": closes.columns,
    })
    return summary
def build_covariance(closes: pd.DataFrame) -> pd.DataFrame:
    """Build the daily-return covariance matrix in the wide format the
    experiments read: one row per ticker, with a leading "Ticker" column."""
    returns = closes.pct_change().dropna(how="all")
    matrix = returns.cov()
    # Promote the row labels to an explicit first column and flatten the index.
    matrix.insert(0, "Ticker", matrix.index)
    return matrix.reset_index(drop=True)
def main() -> None:
    """CLI entry point: download prices, then write the two experiment CSVs."""
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("--out-dir", default="dataset",
                        help="Directory to write CSV files into (default: dataset)")
    parser.add_argument("--start", default=START_DATE)
    parser.add_argument("--end", default=END_DATE)
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    closes = fetch_close_prices(TICKERS, args.start, args.end)

    returns_path = out_dir / RET_FILENAME
    returns_df = build_returns_price(closes)
    returns_df.to_csv(returns_path, index=False)
    print(f"Wrote {returns_path} ({len(returns_df)} rows)")

    covariance_path = out_dir / COV_FILENAME
    covariance_df = build_covariance(closes)
    covariance_df.to_csv(covariance_path, index=False)
    print(f"Wrote {covariance_path} ({len(covariance_df)} rows × {len(covariance_df.columns)} cols)")

    print("\nDone. The experiment scripts can now find the CSVs at ../dataset/...")


if __name__ == "__main__":
    main()