Add max_run_number (#47)

kerthcet · web-flow · commit 4535011ad572 · 2025-11-06T17:18:39.000Z
Signed-off-by: kerthcet <kerthcet@gmail.com>
diff --git a/alphatrion/__init__.py b/alphatrion/__init__.py
@@ -1,5 +1,14 @@
 from alphatrion.experiment.craft_exp import CraftExperiment
 from alphatrion.log.log import log_artifact, log_metrics, log_params
 from alphatrion.runtime.runtime import init
+from alphatrion.trial.trial import CheckpointConfig, TrialConfig
 
-__all__ = ["log_artifact", "log_params", "log_metrics", "CraftExperiment", "init"]
+__all__ = [
+    "log_artifact",
+    "log_params",
+    "log_metrics",
+    "CraftExperiment",
+    "init",
+    "TrialConfig",
+    "CheckpointConfig",
+]
diff --git a/alphatrion/run/run.py b/alphatrion/run/run.py
@@ -1,3 +1,4 @@
+import asyncio
 import uuid
 
 from alphatrion.runtime.runtime import global_runtime
@@ -16,3 +17,9 @@ def _start(self):
         self._id = self._runtime._metadb.create_run(
             project_id=self._runtime._project_id, trial_id=self._trial_id
         )
+
+    def register_task(self, task: asyncio.Task):
+        self._task = task
+
+    async def wait(self):
+        await self._task
diff --git a/alphatrion/trial/trial.py b/alphatrion/trial/trial.py
@@ -59,6 +59,11 @@ class TrialConfig(BaseModel):
         after which experiment will be stopped. Default is -1 (no early stopping). \
         Count each time when calling log_metrics with the monitored metric.",
     )
+    max_run_number: int = Field(
+        default=-1,
+        description="Maximum number of runs for the trial. \
+        Default is -1 (no limit). Count by the finished runs.",
+    )
     monitor_metric: str | None = Field(
         default=None,
         description="The metric to monitor for saving the best checkpoint. \
@@ -110,7 +115,10 @@ class Trial:
         # key is run_id, value is Run instance
         "_runs",
         "_running_tasks",
+        # Only takes effect when early_stopping_runs > 0
         "_early_stopping_counter",
+        # Only takes effect when max_run_number > 0
+        "_total_runs_counter",
     )
 
     def __init__(self, exp_id: int, config: TrialConfig | None = None):
@@ -126,6 +134,7 @@ def __init__(self, exp_id: int, config: TrialConfig | None = None):
         self._runs = dict()
         self._running_tasks = dict()
         self._early_stopping_counter = 0
+        self._total_runs_counter = 0
 
     async def __aenter__(self):
         return self
@@ -223,9 +232,10 @@ def _timeout(self) -> int | None:
             timeout -= int(elapsed)
         return timeout
 
-    def stopped(self) -> bool:
-        return self._context.cancelled()
-
+    # Make sure a termination condition exists, either a timeout or a cancel() call.
+    # Previously cancel() was called automatically once all the tasks were done, but
+    # that is unpredictable because some tasks may be waiting for external events,
+    # so we leave it to the user to decide when to stop the trial.
     async def wait(self):
         await self._context.wait()
 
@@ -287,18 +297,22 @@ def start_run(self, call_func: callable) -> Run:
         run._start()
         self._runs[run.id] = run
 
-        # the created task will also inherit the current context,
+        # The created task will also inherit the current context,
         # including the current_trial_id context var.
         task = asyncio.create_task(call_func())
         self._running_tasks[run.id] = task
+        run.register_task(task)
+
         task.add_done_callback(lambda t: self._running_tasks.pop(run.id, None))
         task.add_done_callback(lambda t: self._runs.pop(run.id, None))
-        # FIXME: One potential issue here is once the former task finished
-        # very fast, it could lead to cancelling the trial even if there are
-        # other pending tasks ready to run. We may need a more robust way to
-        # handle this.
-        task.add_done_callback(
-            lambda t: self.cancel() if len(self._running_tasks) == 0 else None
-        )
+        if self._config.max_run_number > 0:
+            task.add_done_callback(
+                lambda t: (
+                    setattr(self, "_total_runs_counter", self._total_runs_counter + 1),
+                    self.cancel()
+                    if self._total_runs_counter >= self._config.max_run_number
+                    else None,
+                )
+            )
 
         return run
diff --git a/tests/integration/test_log.py b/tests/integration/test_log.py
@@ -9,7 +9,7 @@
 
 import alphatrion as alpha
 from alphatrion.metadata.sql_models import TrialStatus
-from alphatrion.trial.trial import CheckpointConfig, TrialConfig, current_trial_id
+from alphatrion.trial.trial import current_trial_id
 
 
 @pytest.mark.asyncio
@@ -141,8 +141,8 @@ async def test_log_metrics_with_save_on_max():
 
             _ = exp.start_trial(
                 name="trial-with-save_on_best",
-                config=TrialConfig(
-                    checkpoint=CheckpointConfig(
+                config=alpha.TrialConfig(
+                    checkpoint=alpha.CheckpointConfig(
                         enabled=True,
                         path=tmpdir,
                         save_on_best=True,
@@ -195,8 +195,8 @@ async def test_log_metrics_with_save_on_min():
 
             _ = exp.start_trial(
                 name="trial-with-save_on_best",
-                config=TrialConfig(
-                    checkpoint=CheckpointConfig(
+                config=alpha.TrialConfig(
+                    checkpoint=alpha.CheckpointConfig(
                         enabled=True,
                         path=tmpdir,
                         save_on_best=True,
@@ -251,7 +251,7 @@ async def fake_sleep(value: float):
     ) as exp:
         async with exp.start_trial(
             name="trial-with-early-stopping",
-            config=TrialConfig(
+            config=alpha.TrialConfig(
                 monitor_metric="accuracy",
                 early_stopping_runs=2,
             ),
@@ -284,20 +284,48 @@ async def fake_sleep(value: float):
         await alpha.log_metrics({"accuracy": value})
 
     async with alpha.CraftExperiment.start(
-        name="log_metrics_with_early_stopping_never_triggered"
+        name="log_metrics_with_both_early_stopping_and_timeout"
     ) as exp:
         async with exp.start_trial(
             name="trial-with-early-stopping",
-            config=TrialConfig(
+            config=alpha.TrialConfig(
                 monitor_metric="accuracy",
                 early_stopping_runs=3,
+                max_duration_seconds=3,
             ),
         ) as trial:
             start_time = datetime.now()
             trial.start_run(lambda: fake_work(1))
             trial.start_run(lambda: fake_work(2))
-            trial.start_run(lambda: fake_sleep(3))
+            trial.start_run(lambda: fake_sleep(2))
+            # running in parallel.
             await trial.wait()
 
             assert len(trial._runtime._metadb.list_metrics(trial_id=trial.id)) == 3
             assert datetime.now() - start_time >= timedelta(seconds=3)
+
+
+@pytest.mark.asyncio
+async def test_log_metrics_with_max_run_number():
+    alpha.init(project_id=uuid.uuid4(), artifact_insecure=True, init_tables=True)
+
+    async def fake_work(value: float):
+        await alpha.log_metrics({"accuracy": value})
+        print("fake finished.")
+
+    async with alpha.CraftExperiment.start(
+        name="log_metrics_with_max_run_number"
+    ) as exp:
+        async with exp.start_trial(
+            name="trial-with-max-run-number",
+            config=alpha.TrialConfig(
+                monitor_metric="accuracy",
+                max_run_number=5,
+            ),
+        ) as trial:
+            while not trial.cancelled():
+                run = trial.start_run(lambda: fake_work(1))
+                # running in serial.
+                await run.wait()
+
+            assert len(trial._runtime._metadb.list_metrics(trial_id=trial.id)) == 5
diff --git a/tests/unit/experiment/test_craft_exp.py b/tests/unit/experiment/test_craft_exp.py
@@ -7,7 +7,7 @@
 
 from alphatrion.experiment.craft_exp import CraftExperiment
 from alphatrion.metadata.sql_models import TrialStatus
-from alphatrion.runtime.runtime import init
+from alphatrion.runtime.runtime import global_runtime, init
 from alphatrion.trial.trial import Trial, TrialConfig, current_trial_id
 
 
@@ -117,7 +117,7 @@ async def test_craft_experiment_with_context():
             name="first-trial", config=TrialConfig(max_duration_seconds=2)
         )
         await trial.wait()
-        assert trial.stopped()
+        assert trial.cancelled()
 
         trial = trial._get_obj()
         assert trial.status == TrialStatus.FINISHED
@@ -127,7 +127,9 @@ async def test_craft_experiment_with_context():
 async def test_craft_experiment_with_multi_trials_in_parallel():
     init(project_id=uuid.uuid4(), artifact_insecure=True, init_tables=True)
 
-    async def fake_work(exp: CraftExperiment):
+    async def fake_work():
+        exp = global_runtime().current_exp
+
         duration = random.randint(1, 5)
         trial = exp.start_trial(
             name="first-trial", config=TrialConfig(max_duration_seconds=duration)
@@ -136,7 +138,7 @@ async def fake_work(exp: CraftExperiment):
         assert trial.id == current_trial_id.get()
 
         await trial.wait()
-        assert trial.stopped()
+        assert trial.cancelled()
         # we don't reset the current trial id.
         assert trial.id == current_trial_id.get()
 
@@ -147,10 +149,10 @@ async def fake_work(exp: CraftExperiment):
         name="context_exp",
         description="Context manager test",
         meta={"key": "value"},
-    ) as exp:
+    ):
         await asyncio.gather(
-            fake_work(exp),
-            fake_work(exp),
-            fake_work(exp),
+            fake_work(),
+            fake_work(),
+            fake_work(),
         )
         print("All trials finished.")