Adding new alpha storage class + warm start from DB

Breta01 · Breta01 · commit fb7b711eb31c · 2025-05-30T00:58:52.000+08:00
diff --git a/README.md b/README.md
@@ -53,3 +53,23 @@ pre-commit install
 ```bash
 ./db_migrate.sh up
 ```
+
+## Running Using Docker
+
+Build
+
+```bash
+docker build -t brain:latest .
+```
+
+Run
+
+```bash
+docker run -d --name brain-container --restart=always brain:latest
+```
+
+Monitor
+
+```bash
+docker logs -f brain-container
+```
diff --git a/brain/alpha_class.py b/brain/alpha_class.py
@@ -1,5 +1,6 @@
 import datetime
 import sqlite3
+import uuid
 from dataclasses import asdict, dataclass, field
 from typing import Optional
 
@@ -9,7 +10,11 @@ class Alpha:
     # database-generated primary key (None before insert)
     alpha_id: str = None
 
+    # Lifecycle properties
+    is_temporary: bool = False
     print_counter: int = 0
+    visible: bool = True
+    hide_after: Optional[int] = None
 
     # user-supplied columns
     regular: str = ""
@@ -40,11 +45,17 @@ class Alpha:
         default_factory=lambda: datetime.datetime.now(datetime.timezone.utc).isoformat(" ")
     )
 
+    def _increase_print_counter(self):
+        """Record one more prompt print and hide the alpha once over budget.
+
+        The counter is always incremented. When ``hide_after`` is set and the
+        new counter value is strictly greater than it, ``visible`` is cleared.
+        """
+        self.print_counter = self.print_counter + 1
+
+        limit = self.hide_after
+        if limit is None:
+            # No print budget configured: visibility is left untouched.
+            return
+
+        if self.print_counter > limit:
+            self.visible = False
+
     def prompt_format(self) -> str:
         """Format alpha data for the prompt."""
-        self.print_counter += 1
+        self._increase_print_counter()
 
-        if self.alpha_id is None:
+        if self.alpha_id is None or self.is_temporary:
             return f"**Expression:** `{self.regular}`"
 
         number_of_trades = (
@@ -69,14 +80,14 @@ def prompt_format(self) -> str:
     def as_dict(self) -> dict:
         """Convert the Alpha instance to a dictionary compatible with DB."""
         data = asdict(self)
-        for exclude in ["print_counter"]:
+        for exclude in ["print_counter", "visible", "hide_after"]:
             data.pop(exclude, None)
         data["failing_tests"] = ",".join(data["failing_tests"])
         return data
 
     @classmethod
     def from_config(cls, config: dict) -> "Alpha":
-        """Build an Alpha from a dictionary."""
+        """Build a temporary Alpha from a dictionary."""
         config_cols = [
             "regular",
             "region",
@@ -89,7 +100,9 @@ def from_config(cls, config: dict) -> "Alpha":
             "nan_handling",
             "unit_handling",
         ]
-        return cls(**{k: config.get(k) for k in config_cols})
+        return cls(
+            **{k: config.get(k) for k in config_cols}, alpha_id=str(uuid.uuid4()), is_temporary=True
+        )
 
     @classmethod
     def from_stats(cls, stats: dict) -> "Alpha":
diff --git a/brain/alpha_storage.py b/brain/alpha_storage.py
@@ -0,0 +1,57 @@
+import heapq
+from typing import Callable
+
+from brain.alpha_class import Alpha
+
+
+class Storage:
+    """In-memory registry of alphas grouped into named categories.
+
+    ``data`` maps alpha ids to alpha objects and ``categories`` maps each
+    category name to a list of alpha ids. Every category except "pending" is
+    ranked with ``score_func`` and capped at ``max_size`` ids.
+    """
+
+    def __init__(self, score_func: Callable[[Alpha], float], max_size: int = 50):
+        """Initialize the storage with empty data and categories.
+
+        Args:
+            score_func: Maps an alpha to its ranking score; higher scores rank first.
+            max_size: Maximum number of alphas to keep in each ranked category
+                ("pending" is not capped).
+        """
+        self.score_func = score_func
+        self.max_size = max_size
+        self.data = {}
+        self.categories = {
+            "passing": [],
+            "failing": [],
+            "pending": [],
+        }
+
+    def __getitem__(self, alpha_id: str) -> Alpha:
+        """Return the stored alpha for ``alpha_id``, or None when the id is unknown."""
+        return self.data.get(alpha_id)
+
+    def get_top_k(self, category: str, k: int = 10) -> list[Alpha]:
+        """Get the top k alphas from a specific category.
+
+        Ranked categories are scored at call time, so score changes that happened
+        after insertion (e.g. a score that depends on mutable alpha state) are
+        reflected in the result. "pending" is returned in insertion order.
+
+        Args:
+            category: One of the keys of ``self.categories``.
+            k: Maximum number of alphas to return.
+
+        Returns:
+            Up to ``k`` alphas, best score first for ranked categories.
+
+        Raises:
+            ValueError: If ``category`` is not a known category.
+        """
+        if category not in self.categories:
+            raise ValueError(f"Invalid category: {category}")
+
+        alpha_ids = self.categories[category]
+        if category == "pending":
+            # Pending alphas are never scored (see _append_to_category).
+            top_ids = alpha_ids[:k]
+        else:
+            # Re-rank on every query: the order fixed at insertion time goes stale
+            # as soon as any score changes.
+            top_ids = heapq.nlargest(k, alpha_ids, key=self._score)
+        return [self.data[alpha_id] for alpha_id in top_ids]
+
+    def add_alpha(self, alpha: Alpha, category: str) -> None:
+        """Add an alpha to the storage in the specified category.
+
+        An alpha whose id is already in the category only has its stored object
+        replaced. If a ranked category exceeds ``max_size`` after the insert,
+        the lowest-scoring ids (possibly the new one) are evicted.
+
+        Raises:
+            ValueError: If ``category`` is not a known category.
+        """
+        if category not in self.categories:
+            raise ValueError(f"Invalid category: {category}")
+
+        self.data[alpha.alpha_id] = alpha
+        if alpha.alpha_id not in self.categories[category]:
+            self._append_to_category(alpha.alpha_id, category)
+
+    def remove_pending_alpha(self, alpha_id: str) -> None:
+        """Remove an alpha from the pending category.
+
+        The alpha object is released as well, unless another category still
+        references the same id. Unknown ids are ignored.
+        """
+        if alpha_id in self.categories["pending"]:
+            self.categories["pending"].remove(alpha_id)
+        self._drop_if_unreferenced(alpha_id)
+
+    def _score(self, alpha_id: str) -> float:
+        """Score the stored alpha with the given id."""
+        return self.score_func(self.data[alpha_id])
+
+    def _drop_if_unreferenced(self, alpha_id: str) -> None:
+        """Release the alpha object when no category lists its id anymore."""
+        if not any(alpha_id in ids for ids in self.categories.values()):
+            self.data.pop(alpha_id, None)
+
+    def _append_to_category(self, alpha_id: str, category: str) -> None:
+        """Append an alpha to a specific category, trimming ranked categories."""
+        alpha_ids = self.categories[category]
+        alpha_ids.append(alpha_id)
+        if category == "pending":
+            return
+
+        kept = heapq.nlargest(self.max_size, alpha_ids, key=self._score)
+        self.categories[category] = kept
+        # Evicted ids would otherwise stay in ``data`` forever. Drop them only
+        # after the category was updated, since scoring reads ``data``.
+        for evicted_id in set(alpha_ids) - set(kept):
+            self._drop_if_unreferenced(evicted_id)
diff --git a/brain/database.py b/brain/database.py
@@ -40,6 +40,29 @@ def find_by_code(self, code: str, neutralization: str, delay: int) -> list[Alpha
         rows = self.cursor.fetchall()
         return [Alpha.from_row(r) for r in rows]
 
+    def k_best_alphas(
+        self,
+        metric: str = "sharpe",
+        top_k: int = 100,
+        min_fitness: float = 1.0,
+        max_self_corr: float = 0.6,
+    ) -> list[Alpha]:
+        """Find best performing alphas by certain metric.
+
+        Args:
+            metric: Column of the `alphas` table to rank by, in descending order.
+                Rows where this column is NULL are skipped.
+            top_k: Maximum number of rows to return.
+            min_fitness: Only rows with `fitness` strictly above this value qualify.
+            max_self_corr: Only rows with `self_correlation` strictly below this
+                value qualify.
+
+        Returns:
+            The matching alphas, highest `metric` first.
+
+        Raises:
+            ValueError: If `metric` is not a single bare identifier.
+        """
+        # Column names cannot be bound as query parameters, so `metric` ends up in
+        # the SQL text itself. Refuse anything that is not one bare identifier so
+        # a caller-supplied value cannot smuggle in extra SQL. The name must still
+        # be a real column of `alphas`; otherwise the database rejects the query.
+        if not isinstance(metric, str) or not metric.isidentifier():
+            raise ValueError(f"Invalid metric column: {metric!r}")
+
+        sql = f"""
+            SELECT *
+            FROM alphas
+            WHERE {metric} IS NOT NULL
+                AND fitness > %s
+                AND self_correlation < %s
+            ORDER BY {metric} DESC
+            LIMIT %s
+        """
+        params = (min_fitness, max_self_corr, top_k)
+        self.cursor.execute(sql, params)
+        rows = self.cursor.fetchall()
+        return [Alpha.from_row(r) for r in rows]
+
     def close(self):
         self.cursor.close()
         self.conn.close()
diff --git a/brain/search_algorithm.py b/brain/search_algorithm.py
@@ -1,11 +1,10 @@
-import heapq
 import random
-import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from brain.agent import agent
 from brain.agent_config import DEFAULT_CONFIG
 from brain.alpha_class import Alpha
+from brain.alpha_storage import Storage
 from brain.api import DEFAULT_CONFIG as API_DEFAULT_CONFIG
 from brain.api import BrainAPI
 from brain.database import Database
@@ -20,23 +19,23 @@ def decay_hyperbolic(x, gamma=0.2, delta=0.1):
     return (gamma * x) / (1 + delta * x)
 
 
-def create_alpha_simulation(alphas_dict, alphas_categories):
-    """Create a new alpha based on the given ID."""
+def get_score(alpha: Alpha):
+    if not alpha.visible:
+        return float("-inf")
 
-    def get_score(alpha_id):
-        return (
-            alphas_dict[alpha_id].fitness
-            + 1.5 * alphas_dict[alpha_id].sharpe
-            - decay_hyperbolic(alphas_dict[alpha_id].print_counter, gamma=0.01, delta=0.02)
-        )
+    return (
+        alpha.fitness
+        + 1.5 * alpha.sharpe
+        - decay_hyperbolic(alpha.print_counter, gamma=0.01, delta=0.02)
+    )
 
-    n_largest = 10
-    for cat in ["passing", "failing"]:
-        alphas_categories[cat] = heapq.nlargest(n_largest, alphas_categories[cat], key=get_score)
+
+def create_alpha_simulation(storage: Storage):
+    """Create a new alpha based on the given ID."""
 
     formatted_alphas = {
-        cat: "\n".join(alphas_dict[id].prompt_format() for id in alphas_categories[cat])
-        for cat in alphas_categories.keys()
+        cat: "\n".join(alpha.prompt_format() for alpha in storage.get_top_k(cat, 10))
+        for cat in storage.categories
     }
 
     if random.random() < 0.05:
@@ -114,60 +113,80 @@ def monitor_alpha(response, alpha_config):
         }
 
 
-def update_alphas_dict(alphas_dict, alphas_categories, stats, temp_id):
+def update_alphas_dict(
+    storage: Storage,
+    stats: dict,
+    temp_id: str,
+):
     """Update the alphas dictionary with the new stats."""
-    alphas_categories["pending"].remove(temp_id)
-    alphas_dict.pop(temp_id)
+    storage.remove_pending_alpha(temp_id)
 
     if stats["alpha_id"] is None:
         return
 
-    alpha_id = stats["alpha_id"]
-    alphas_dict[alpha_id] = Alpha.from_stats(stats)
+    alpha = Alpha.from_stats(stats)
     try:
-        Database().insert_alpha(alphas_dict[alpha_id])
+        Database().insert_alpha(alpha)
     except Exception as e:
         print(f"Error during database insertion: {e}")
         pass
 
-    if alphas_dict[alpha_id].short_count + alphas_dict[alpha_id].long_count > 0:
+    if alpha.short_count + alpha.long_count > 0:
         if (stats["is_tests"]["result"] != "FAIL").all():
-            alphas_categories["passing"].append(alpha_id)
+            storage.add_alpha(alpha, "passing")
         else:
-            alphas_categories["failing"].append(alpha_id)
+            storage.add_alpha(alpha, "failing")
 
-    return alphas_dict[alpha_id]
+    return alpha
+
+
+def set_warm_start_alphas(
+    storage: Storage,
+    *,
+    sample_size: int = 10,
+    hide_after: int = 30,
+) -> None:
+    """Seed the storage with a random sample of the best alphas from the database.
+
+    The 100 best alphas by sharpe (fitness above 1.0, self-correlation below 0.6)
+    are fetched, and a random subset of them is added to the storage. This is
+    best-effort: any error is printed and the storage is left as it is.
+
+    Args:
+        storage: Storage that receives the sampled alphas.
+        sample_size: Maximum number of alphas to sample from the query result.
+        hide_after: Value assigned to each sampled alpha's ``hide_after``.
+    """
+    try:
+        alphas = Database().k_best_alphas(
+            metric="sharpe",
+            top_k=100,
+            min_fitness=1.0,
+            max_self_corr=0.6,
+        )
+
+        # Sample rather than take the head so each run starts from a different seed set.
+        alphas = random.sample(alphas, min(sample_size, len(alphas)))
+        for alpha in alphas:
+            alpha.hide_after = hide_after
+            # NOTE(review): warm-start alphas are filed under "failing" although they
+            # are selected for high fitness; confirm this category is intended.
+            storage.add_alpha(alpha, "failing")
+
+    except Exception as e:
+        print(f"Error during database query: {e}")
 
 
 def main():
     """Main function to run the agent."""
-    alphas_dict = {}
-    alphas_categories = {
-        "passing": [],
-        "failing": [],
-        "pending": [],
-    }
+    storage = Storage(score_func=get_score, max_size=50)
+
+    set_warm_start_alphas(storage)
 
     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
         live_jobs = {}
 
         for _ in range(MAX_WORKERS):
             # Start a new alpha simulation
-            response, alpha_config = create_alpha_simulation(alphas_dict, alphas_categories)
-
-            # Generate a unique ID for the alpha
-            temp_id = str(uuid.uuid4())
-            alphas_categories["pending"].append(temp_id)
-            alphas_dict[temp_id] = Alpha.from_config(alpha_config)
-            live_jobs[pool.submit(monitor_alpha, response, alpha_config)] = (temp_id, alpha_config)
+            response, alpha_config = create_alpha_simulation(storage)
+
+            # Create a temporary alpha configuration
+            alpha = Alpha.from_config(alpha_config)
+            storage.add_alpha(alpha, "pending")
+            live_jobs[pool.submit(monitor_alpha, response, alpha_config)] = (
+                alpha.alpha_id,
+                alpha_config,
+            )
 
         while live_jobs:
             for job in as_completed(live_jobs):
                 # Update alphas_dict with the results
                 temp_id, alpha_config = live_jobs.pop(job)  # remove from “running” set
                 stats = job.result()
                 print(f"Stats: {stats}")
-                alpha = update_alphas_dict(alphas_dict, alphas_categories, stats, temp_id)
+                alpha = update_alphas_dict(storage, stats, temp_id)
 
                 # Start a new alpha simulation
                 if alpha is not None and alpha.alpha_id is not None and alpha.fitness < -0.5:
@@ -176,12 +195,11 @@ def main():
                     alpha_config = {**alpha_config, "regular": regular}
                     response = BrainAPI.start_simulation(alpha_config)
                 else:
-                    response, alpha_config = create_alpha_simulation(alphas_dict, alphas_categories)
-                # Generate a unique ID for the alpha
-                temp_id = str(uuid.uuid4())
-                alphas_categories["pending"].append(temp_id)
-                alphas_dict[temp_id] = Alpha.from_config(alpha_config)
+                    response, alpha_config = create_alpha_simulation(storage)
+                # TODO: Turn this into a method + stop using alpha_config
+                alpha = Alpha.from_config(alpha_config)
+                storage.add_alpha(alpha, "pending")
                 live_jobs[pool.submit(monitor_alpha, response, alpha_config)] = (
-                    temp_id,
+                    alpha.alpha_id,
                     alpha_config,
                 )
diff --git a/brain/tools/simulation.py b/brain/tools/simulation.py
@@ -3,10 +3,8 @@
 import time
 from typing import Annotated
 
-from langchain_core.messages import ToolMessage
 from langchain_core.runnables import RunnableConfig
 from langchain_core.tools import InjectedToolCallId, tool
-from langgraph.graph import END
 from langgraph.types import Command
 
 from brain.agent_config import get_universe_config
diff --git a/setup.py b/setup.py
@@ -50,6 +50,8 @@ def parse_requirements(file_name):
     platforms=["Windows", "Linux", "Solaris", "Mac OS-X", "Unix"],
     python_requires=">=3.9",
     install_requires=REQUIREMENTS,
+    include_package_data=True,
+    package_data={"brain": ["tools/data/*"]},
     zip_safe=False,
     entry_points={
         "console_scripts": [