Support timeout trial (#32)

kerthcet · web-flow · commit 85abd50bd028 · 2025-10-29T19:39:23.000Z
* Support timeout trial

Signed-off-by: kerthcet <kerthcet@gmail.com>

* lock poetry

Signed-off-by: kerthcet <kerthcet@gmail.com>

* update tests

Signed-off-by: kerthcet <kerthcet@gmail.com>

* fix test

Signed-off-by: kerthcet <kerthcet@gmail.com>

* poetry lock

Signed-off-by: kerthcet <kerthcet@gmail.com>

---------

Signed-off-by: kerthcet <kerthcet@gmail.com>
diff --git a/Makefile b/Makefile
@@ -23,7 +23,7 @@ format:
 
 .PHONY: test
 test: lint
-	$(POETRY) run pytest tests/unit
+	$(POETRY) run pytest tests/unit --timeout=15
 
 .PHONY: test-integration
 test-integration: lint
diff --git a/alphatrion/experiment/base.py b/alphatrion/experiment/base.py
@@ -1,4 +1,4 @@
-import weakref
+import uuid
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 
@@ -29,7 +29,7 @@ def get_trial(self, id: int) -> trial.Trial | None:
     def _reset(self):
         self._trials = dict()
 
-    def __enter__(self):
+    async def __aenter__(self):
         if self._id is None:
             raise RuntimeError("Experiment is not set. Did you call run()?")
 
@@ -38,10 +38,10 @@ def __enter__(self):
             raise RuntimeError(f"Experiment {self._id} not found in the database.")
 
         # Use weakref to avoid circular reference
-        self._runtime.current_exp = weakref.ref(self)
+        self._runtime.current_exp = self
         return self
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
         self._reset()
         self._runtime.current_exp = None
 
@@ -53,9 +53,12 @@ def run(
         """Return a new experiment."""
         ...
 
-    def _register_trial(self, id: int, instance: trial.Trial):
+    def register_trial(self, id: uuid.UUID, instance: trial.Trial):
         self._trials[id] = instance
 
+    def unregister_trial(self, id: uuid.UUID):
+        self._trials.pop(id, None)
+
     def _create(
         self,
         name: str,
diff --git a/alphatrion/experiment/craft_exp.py b/alphatrion/experiment/craft_exp.py
@@ -1,3 +1,5 @@
+import uuid
+
 from alphatrion.experiment.base import Experiment
 from alphatrion.trial.trial import Trial, TrialConfig
 
@@ -14,21 +16,32 @@ def __init__(self):
         super().__init__()
 
     @classmethod
-    def run(cls, name: str, description: str | None = None, meta: dict | None = None):
+    def run(
+        cls,
+        name: str,
+        id: uuid.UUID | None = None,
+        description: str | None = None,
+        meta: dict | None = None,
+    ) -> "CraftExperiment":
         """
         Begin the experiment. This method must be used to start multi-trial experiment.
+        If id is provided, the experiment with the given id will be used.
         """
 
         exp = CraftExperiment()
-        exp._create(
-            name=name,
-            description=description,
-            meta=meta,
-        )
+
+        if id is not None:
+            exp._id = id
+        else:
+            exp._create(
+                name=name,
+                description=description,
+                meta=meta,
+            )
 
         return exp
 
-    def start_trial(
+    async def start_trial(
         self,
         description: str | None = None,
         meta: dict | None = None,
@@ -46,11 +59,6 @@ def start_trial(
         """
 
         trial = Trial(exp_id=self._id, config=config)
-        trial._start(description=description, meta=meta, params=params)
-        self._register_trial(id=trial._id, instance=trial)
+        await trial._start(description=description, meta=meta, params=params)
+        self.register_trial(id=trial.id, instance=trial)
         return trial
-
-    # @classmethod
-    # # TODO: support async
-    # async def async_trial(cls):
-    #     pass
diff --git a/alphatrion/metadata/sql.py b/alphatrion/metadata/sql.py
@@ -1,3 +1,5 @@
+import uuid
+
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 
@@ -166,7 +168,7 @@ def create_trial(
         meta: dict | None,
         params: dict | None = None,
         status: TrialStatus = TrialStatus.PENDING,
-    ) -> int:
+    ) -> uuid.UUID:
         session = self._session()
         new_trial = Trial(
             experiment_id=exp_id,
@@ -183,13 +185,13 @@ def create_trial(
 
         return trial_id
 
-    def get_trial(self, trial_id: int) -> Trial | None:
+    def get_trial(self, trial_id: uuid.UUID) -> Trial | None:
         session = self._session()
         trial = session.query(Trial).filter(Trial.uuid == trial_id).first()
         session.close()
         return trial
 
-    def update_trial(self, trial_id: int, **kwargs):
+    def update_trial(self, trial_id: uuid.UUID, **kwargs):
         session = self._session()
         trial = session.query(Trial).filter(Trial.uuid == trial_id).first()
         if trial:
@@ -198,7 +200,7 @@ def update_trial(self, trial_id: int, **kwargs):
             session.commit()
         session.close()
 
-    def create_metric(self, trial_id: int, key: str, value: float, step: int):
+    def create_metric(self, trial_id: uuid.UUID, key: str, value: float, step: int):
         session = self._session()
         new_metric = Metrics(
             trial_id=trial_id,
@@ -210,7 +212,7 @@ def create_metric(self, trial_id: int, key: str, value: float, step: int):
         session.commit()
         session.close()
 
-    def list_metrics(self, trial_id: int) -> list[Metrics]:
+    def list_metrics(self, trial_id: uuid.UUID) -> list[Metrics]:
         session = self._session()
         metrics = session.query(Metrics).filter(Metrics.trial_id == trial_id).all()
         session.close()
diff --git a/alphatrion/metadata/sql_models.py b/alphatrion/metadata/sql_models.py
@@ -85,6 +85,7 @@ class Metrics(Base):
     key = Column(String, nullable=False)
     value = Column(Float, nullable=False)
     trial_id = Column(UUID(as_uuid=True), nullable=False)
+    # TODO: do we need?
     step = Column(Integer, nullable=False, default=0)
     created_at = Column(DateTime(timezone=True), default=datetime.now(UTC))
 
diff --git a/alphatrion/runtime/runtime.py b/alphatrion/runtime/runtime.py
@@ -41,7 +41,7 @@ def __init__(self, project_id: str, artifact_insecure: bool = False):
     # current_exp is the current running experiment.
     @property
     def current_exp(self):
-        return self.__current_exp()
+        return self.__current_exp
 
     @current_exp.setter
     def current_exp(self, value):
diff --git a/alphatrion/trial/trial.py b/alphatrion/trial/trial.py
@@ -1,10 +1,12 @@
 import contextvars
+import uuid
 from datetime import UTC, datetime
 
 from pydantic import BaseModel, Field, field_validator
 
 from alphatrion.metadata.sql_models import COMPLETED_STATUS, TrialStatus
 from alphatrion.runtime.runtime import global_runtime
+from alphatrion.utils.context import Context
 
 # Used in record/record.py to log params/metrics
 current_trial_id = contextvars.ContextVar("current_trial_id", default=None)
@@ -57,15 +59,15 @@ class TrialConfig(BaseModel):
     """Configuration for an experiment."""
 
     max_duration_seconds: int = Field(
-        default=86400,
+        default=-1,
         description="Maximum duration in seconds for the experiment. \
-        Default is 86400 seconds (1 day).",
-    )
-    max_retries: int = Field(
-        default=0,
-        description="Maximum number of retries for the experiment. \
-            Default is 0 (no retries).",
+        Default is -1 (no limit).",
     )
+    # max_retries: int = Field(
+    #     default=0,
+    #     description="Maximum number of retries for the experiment. \
+    #         Default is 0 (no retries).",
+    # )
     checkpoint: CheckpointConfig = Field(
         default=CheckpointConfig(),
         description="Configuration for checkpointing.",
@@ -78,8 +80,9 @@ class Trial:
         "_exp_id",
         "_config",
         "_runtime",
-        "_token",
         "_step",
+        "_context",
+        "_token",
     )
 
     def __init__(self, exp_id: int, config: TrialConfig | None = None):
@@ -88,13 +91,25 @@ def __init__(self, exp_id: int, config: TrialConfig | None = None):
         self._runtime = global_runtime()
         # step is used to track the round, e.g. the step in metric logging.
         self._step = 0
+        self._context = Context(
+            cancel_func=self._stop,
+            timeout=self._config.max_duration_seconds
+            if self._config.max_duration_seconds > 0
+            else None,
+        )
+
+    def stopped(self) -> bool:
+        return self._context.cancelled()
 
-    def _start(
+    async def wait_stopped(self):
+        await self._context.wait_cancelled()
+
+    async def _start(
         self,
         description: str | None = None,
         meta: dict | None = None,
         params: dict | None = None,
-    ) -> int:
+    ) -> uuid.UUID:
         self._id = self._runtime._metadb.create_trial(
             exp_id=self._exp_id,
             description=description,
@@ -103,26 +118,31 @@ def _start(
             status=TrialStatus.RUNNING,
         )
 
+        # We don't reset the trial id context var here, because
+        # each trial runs in its own context.
         self._token = current_trial_id.set(self._id)
+        await self._context.start()
         return self._id
 
     @property
-    def id(self):
+    def id(self) -> uuid.UUID:
         return self._id
 
-    # finish function should be called manually as a pair of start
-    def finish(self, status: TrialStatus = TrialStatus.FINISHED):
+    # stop function should be called manually as a pair of start
+    def stop(self):
+        self._context.cancel()
+
+    def _stop(self):
         trial = self._runtime._metadb.get_trial(trial_id=self._id)
         if trial is not None and trial.status not in COMPLETED_STATUS:
             duration = (
                 datetime.now(UTC) - trial.created_at.replace(tzinfo=UTC)
             ).total_seconds()
             self._runtime._metadb.update_trial(
-                trial_id=self._id, status=status, duration=duration
+                trial_id=self._id, status=TrialStatus.FINISHED, duration=duration
             )
 
-        # recover the context var
-        current_trial_id.reset(self._token)
+        self._runtime.current_exp.unregister_trial(self._id)
 
     def _get(self):
         return self._runtime._metadb.get_trial(trial_id=self._id)
diff --git a/alphatrion/utils/context.py b/alphatrion/utils/context.py
@@ -0,0 +1,35 @@
+import asyncio
+from collections.abc import Callable
+
+
+# Inspired by golang context package
+class Context:
+    def __init__(self, cancel_func: Callable | None = None, timeout=None):
+        """A context for managing cancellation and timeouts.
+        :param cancel_func: A function to call when the context is cancelled.
+        :param timeout: Timeout in seconds. If None, no timeout is set.
+        """
+        self._cancel_event = asyncio.Event()
+        self._cancel_func = cancel_func
+        self._timeout = timeout
+
+    async def start(self):
+        if self._timeout:
+            asyncio.create_task(self._auto_cancel(self._timeout))
+
+    async def _auto_cancel(self, timeout):
+        await asyncio.sleep(timeout)
+        self.cancel()
+
+    def cancel(self):
+        if self.cancelled():
+            return
+        if self._cancel_func:
+            self._cancel_func()
+        self._cancel_event.set()
+
+    def cancelled(self):
+        return self._cancel_event.is_set()
+
+    async def wait_cancelled(self):
+        await self._cancel_event.wait()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,6 +17,8 @@ dependencies = [
 [tool.poetry.group.dev.dependencies]
 pytest = ">=8.4.2,<9.0.0"
 ruff = "^0.12.12"
+pytest-asyncio = ">=0.22.0,<1.0.0"
+pytest-timeout = ">=2.1.0,<3.0.0"
 
 [build-system]
 requires = ["poetry-core>=2.0.0,<3.0.0"]
diff --git a/tests/integration/test_log_functions.py b/tests/integration/test_log_functions.py
diff --git a/tests/unit/experiment/test_craft_exp.py b/tests/unit/experiment/test_craft_exp.py
diff --git a/tests/unit/utils/test_context.py b/tests/unit/utils/test_context.py