-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_data.py
More file actions
126 lines (100 loc) · 4.84 KB
/
prepare_data.py
File metadata and controls
126 lines (100 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""prepare_data.py — fetch the dataset used by experiments/PO_*.py from Yahoo Finance.
Produces two CSVs under ./dataset/ that the experiment scripts read as
"../dataset/top_50_us_stocks_data_20250526_011226_covariance.csv" and
"../dataset/top_50_us_stocks_returns_price.csv":
Ticker, Average_Return, Price, Company_Name (returns_price.csv)
Ticker, AAPL, MSFT, ... (covariance.csv — a square matrix)
The 50-ticker list below is a default placeholder of large-cap US equities; replace
TICKERS with the exact universe used in the paper to reproduce published numbers.
Run from the repo root:
python prepare_data.py
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
import numpy as np
import pandas as pd
try:
import yfinance as yf
except ImportError:
sys.exit("yfinance is required. Install with `pip install yfinance` "
"or activate the conda env from environment.yml.")
# ---------------------------------------------------------------------------
# Replace the list below with the exact 50 tickers used in the paper.
# Defaults are large-cap US equities that span a price range comparable to the
# paper's [108, 216] USD band, sufficient for a smoke test of the pipeline.
# ---------------------------------------------------------------------------
# Default universe: 50 large-cap US tickers (placeholder — swap in the paper's
# exact list to reproduce published numbers, per the module docstring).
TICKERS: list[str] = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "META", "NVDA", "TSLA", "JPM", "V", "JNJ",
    "WMT", "MA", "PG", "UNH", "HD", "BAC", "DIS", "ADBE", "CRM", "NFLX",
    "KO", "PFE", "INTC", "CSCO", "PEP", "ABT", "ABBV", "COST", "NKE", "TMO",
    "MRK", "AVGO", "MCD", "ACN", "T", "NEE", "LIN", "DHR", "TXN", "QCOM",
    "BMY", "AMGN", "HON", "LOW", "UPS", "IBM", "MS", "GS", "CAT", "LMT",
]
# Ten years of daily history. NOTE(review): yfinance appears to treat `end`
# as exclusive — confirm against the yfinance docs if the last day matters.
START_DATE = "2015-04-01"
END_DATE = "2025-04-01"
# Output filenames are hard-coded in the experiment scripts — do not rename.
COV_FILENAME = "top_50_us_stocks_data_20250526_011226_covariance.csv"
RET_FILENAME = "top_50_us_stocks_returns_price.csv"
def fetch_close_prices(tickers: list[str], start: str, end: str) -> pd.DataFrame:
    """Download adjusted-close daily prices from Yahoo Finance.

    Parameters
    ----------
    tickers : symbols to request.
    start, end : ISO date strings bounding the download range.

    Returns
    -------
    DataFrame indexed by date with one column per ticker that actually
    returned data. Tickers with no data in the range are dropped (with a
    warning).
    """
    print(f"Downloading {len(tickers)} tickers from {start} to {end}...")
    df = yf.download(
        tickers, start=start, end=end,
        auto_adjust=True, progress=False, group_by="ticker", threads=True,
    )
    if isinstance(df.columns, pd.MultiIndex):
        # group_by="ticker" produces a (ticker, field) MultiIndex. Pull the
        # Close series per ticker; guard with a membership test because
        # failed tickers may be absent from the result.
        closes = pd.concat(
            {t: df[t]["Close"] for t in tickers if t in df.columns.levels[0]},
            axis=1,
        )
    else:
        # Single-ticker download can fall back to a flat DataFrame.
        closes = df[["Close"]].rename(columns={"Close": tickers[0]})
    # BUG FIX: a bad/delisted symbol can come back as a column of pure NaN.
    # Previously such a column survived (it *is* in closes.columns), so no
    # warning fired and the NaNs polluted downstream mean/covariance. Drop
    # all-NaN columns first so `dropped` reports them too.
    closes = closes.dropna(axis=1, how="all")
    dropped = [t for t in tickers if t not in closes.columns]
    if dropped:
        print(f" warning: no data for {dropped} (dropped from universe)")
    return closes.dropna(how="all")
def build_returns_price(closes: pd.DataFrame) -> pd.DataFrame:
    """Summarise each ticker into the row format the experiment scripts read.

    Parameters
    ----------
    closes : date-indexed DataFrame, one price column per ticker.

    Returns
    -------
    DataFrame with columns Ticker, Average_Return (mean daily simple
    return), Price (last available close), Company_Name.
    """
    returns = closes.pct_change().dropna(how="all")
    summary = pd.DataFrame({
        "Ticker": closes.columns,
        "Average_Return": returns.mean().values,
        "Price": closes.ffill().iloc[-1].values,
        # Company_Name is informational only; reuse the ticker symbol so we
        # avoid one slow yfinance .info request per ticker.
        "Company_Name": closes.columns,
    })
    return summary
def build_covariance(closes: pd.DataFrame) -> pd.DataFrame:
    """Build the daily-return covariance matrix in the wide format the
    experiments read: one row per ticker, with a leading "Ticker" column."""
    returns = closes.pct_change().dropna(how="all")
    matrix = returns.cov()
    # Promote the row labels to an explicit first column and flatten the index.
    matrix.insert(0, "Ticker", matrix.index)
    return matrix.reset_index(drop=True)
def main() -> None:
    """CLI entry point: download prices, then write the two experiment CSVs."""
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("--out-dir", default="dataset",
                        help="Directory to write CSV files into (default: dataset)")
    parser.add_argument("--start", default=START_DATE)
    parser.add_argument("--end", default=END_DATE)
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    closes = fetch_close_prices(TICKERS, args.start, args.end)

    returns_path = out_dir / RET_FILENAME
    returns_df = build_returns_price(closes)
    returns_df.to_csv(returns_path, index=False)
    print(f"Wrote {returns_path} ({len(returns_df)} rows)")

    covariance_path = out_dir / COV_FILENAME
    covariance_df = build_covariance(closes)
    covariance_df.to_csv(covariance_path, index=False)
    print(f"Wrote {covariance_path} ({len(covariance_df)} rows × {len(covariance_df.columns)} cols)")

    print("\nDone. The experiment scripts can now find the CSVs at ../dataset/...")


if __name__ == "__main__":
    main()