kerthcet
diff --git a/‎README.md‎
Lines changed: 13 additions & 9 deletions b/‎README.md‎
Lines changed: 13 additions & 9 deletions
diff --git a/‎alphatrion/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎alphatrion/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎alphatrion/artifact/artifact.py‎
Lines changed: 0 additions & 1 deletion b/‎alphatrion/artifact/artifact.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎alphatrion/experiment/base.py‎
Lines changed: 67 additions & 11 deletions b/‎alphatrion/experiment/base.py‎
Lines changed: 67 additions & 11 deletions
diff --git a/‎alphatrion/experiment/craft_experiment.py‎
Lines changed: 2 additions & 0 deletions b/‎alphatrion/experiment/craft_experiment.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎alphatrion/log/log.py‎
Lines changed: 17 additions & 25 deletions b/‎alphatrion/log/log.py‎
Lines changed: 17 additions & 25 deletions
@@ -18,9 +18,8 @@ Open, modular framework to build GenAI applications.
 
 ## Concepts
 
-- **Team**: A Team is the highest-level organizational unit in AlphaTrion. It represents a group of users collaborating on projects and experiments.
-- **Project**: A Project is a namespace-level abstraction that contains multiple experiments. It helps organize experiments related to a specific goal or topic.
-- **Experiment**: An Experiment is a logic-level abstraction for organizing and managing a series of related runs. It allows users to group runs that share a common purpose or configuration.
+- **Team**: A Team is the highest-level organizational unit in AlphaTrion. It represents a group of users collaborating on experiments.
+- **Experiment**: An Experiment is a logic-level abstraction for organizing and managing a series of related runs. It allows users to group runs that share a common purpose or configuration. Experiments can be organized using labels.
 - **Run**: A Run is a real execution instance of an experiment. It represents the actual execution of the code with the specified configuration and hyperparameters defined in the experiment.
 
 ## Quick Start
@@ -70,7 +69,7 @@ Below is a simple example with two approaches demonstrating how to create an exp
 
 ```python
 import alphatrion as alpha
-from alphatrion import experiment, project
+from alphatrion import experiment
 
 # Use the user ID generated from the `alphatrion init` command.
 alpha.init(user_id=<user_id>)
@@ -79,17 +78,16 @@ async def your_task():
   # Run your code here then log metrics.
   await alpha.log_metrics({"accuracy": 0.95})
 
-async with project.Project.setup(name="my_project"):
-  async with experiment.CraftExperiment.start(name="my_experiment") as exp:
-    task = exp.run(your_task) # use lambda or partial if you need to pass arguments to your_task
-    await task.wait()
+async with experiment.CraftExperiment.start(name="my_experiment") as exp:
+  task = exp.run(your_task) # use lambda or partial if you need to pass arguments to your_task
+  await task.wait()
 ```
 
 ### View Dashboard
 
 ![dashboard](./site/images/dashboard.png)
 
-The dashboard provides a web interface to explore projects, experiments, runs, and metrics through an intuitive UI.
+The dashboard provides a web interface to explore experiments, runs, and metrics through an intuitive UI.
 
 #### Launch Dashboard
 
@@ -111,6 +109,12 @@ The dashboard will automatically open in your browser at `http://127.0.0.1:5173`
 - [Dashboard CLI Guide](./docs/dashboard/dashboard-cli.md) - Using the dashboard CLI command
 - [Dashboard Architecture](./docs/dashboard/dashboard-architecture.md) - Technical architecture and deployment patterns
 
+### Tracing
+
+AlphaTrion automatically captures tracing data for all runs, including spans for each run and associated metadata. You can query this data to analyze model performance, latency, and token usage.
+
+![tracing](./site/images/trace.png)
+
 ### Cleanup
 
 ```bash
 
@@ -1,10 +1,10 @@
-from alphatrion.log.log import log_artifact, log_execution, log_metrics, log_params
+from alphatrion.log.log import log_artifact, log_metrics, log_params, log_result
 from alphatrion.runtime.runtime import init
 
 __all__ = [
     "init",
     "log_artifact",
     "log_params",
     "log_metrics",
-    "log_execution",
+    "log_result",
 ]
@@ -75,7 +75,6 @@ def list_versions(self, repo_name: str) -> list[str]:
                 or "does not exist" in error_msg
             ):
                 # Return empty list if repository doesn't exist yet
-                # This is expected for projects without artifacts
                 return []
             # Re-raise other errors
             raise RuntimeError(f"Failed to list artifacts versions: {e}") from e
 
@@ -1,14 +1,21 @@
+import asyncio
+import contextlib
 import enum
+import os
+import shutil
+import signal
 import uuid
 from abc import ABC, abstractmethod
 from collections.abc import Callable
 from datetime import UTC, datetime
 
 from pydantic import BaseModel, Field, model_validator
 
+from alphatrion import envs
 from alphatrion.run.run import Run
 from alphatrion.runtime.contextvars import current_exp_id
 from alphatrion.runtime.runtime import global_runtime
+from alphatrion.snapshot.snapshot import team_path
 from alphatrion.storage.sql_models import FINISHED_STATUS, Status
 from alphatrion.types import CallableEntry
 from alphatrion.utils import context
@@ -65,7 +72,6 @@ class ExperimentConfig(BaseModel):
     max_execution_seconds: int = Field(
         default=-1,
         description="Maximum execution seconds for the Experiment. \
-        Experiment timeout will override project timeout if both are set. \
         Default is -1 (no limit).",
     )
     early_stopping_runs: int = Field(
@@ -152,11 +158,16 @@ class Experiment(ABC):
         "_total_runs_counter",
         # The end status, None, Err or Cancelled.
         "_end_status",
+        "_stopped",
+        "_signal_task",
     )
 
     def __init__(self, config: ExperimentConfig | None = None):
         self._config = config or ExperimentConfig()
+
         self._runtime = global_runtime()
+        self._runtime.current_experiment = self
+
         self._construct_meta()
         self._runs = dict[uuid.UUID, Run]()
         self._early_stopping_counter = 0
@@ -165,35 +176,59 @@ def __init__(self, config: ExperimentConfig | None = None):
         # if experiment starts to wait, it will auto stop when the runs
         # are all finished.
         self._start_waiting = False
+        self._end_status = None
+        self._stopped = asyncio.Event()
+        self._signal_task: asyncio.Task | None = None
 
     async def __aenter__(self):
+        """Enter the experiment context and return the experiment itself.
+
+        Starts the signal handling via ``_start_signal_handlers()`` and keeps
+        the task it returns in ``_signal_task``.
+        """
+        self._signal_task = self._start_signal_handlers()
         return self
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Leave the experiment context.
+
+        Marks the experiment as done, stops the signal watcher task, gives
+        SIGINT/SIGTERM back to their previous default handling and resets the
+        experiment context variable.
+
+        The exception arguments are not inspected and nothing is returned, so
+        an exception raised inside the ``async with`` body keeps propagating.
+        """
         self.done()
+        self._end_status = None
+
+        if self._signal_task:
+            # Already done, will not update the status again.
+            self._signal_task.cancel()
+
+            with contextlib.suppress(asyncio.CancelledError):
+                await self._signal_task
+
+            # Uninstall the handlers registered on enter. Leaving them in
+            # place would keep routing SIGINT/SIGTERM to an event nobody is
+            # waiting on, so Ctrl+C would silently stop working after the
+            # experiment has exited.
+            loop = asyncio.get_running_loop()
+            for sig in (signal.SIGINT, signal.SIGTERM):
+                # remove_signal_handler is unavailable on non-Unix loops;
+                # it simply returns False when no handler was installed.
+                with contextlib.suppress(NotImplementedError, RuntimeError):
+                    loop.remove_signal_handler(sig)
+
         if self._token:
             current_exp_id.reset(self._token)
+            self._runtime.current_experiment = None
 
     def _start(
         self,
         name: str,
         description: str | None = None,
+        labels: str | None = None,
         meta: dict | None = None,
         params: dict | None = None,
     ):
-        proj = self._runtime.current_proj
-        exp_obj = self._runtime.metadb.get_exp_by_name(name=name, project_id=proj.id)
+        exp_obj = self._runtime.metadb.get_exp_by_name(
+            name=name, team_id=self._runtime.team_id
+        )
 
-        # FIXME: what if the existing Experiment is completed, will lead to confusion?
-        if exp_obj:
+        # Just in case of kubernetes pod restarts, we want to make sure the experiment
+        # can be resumed if it is not completed, instead of creating a new experiment
+        # with the same name. If the experiment is already completed, we raise an error
+        # to avoid confusion.
+        if exp_obj and exp_obj.status != Status.COMPLETED:
             self._id = exp_obj.uuid
+        elif exp_obj and exp_obj.status == Status.COMPLETED:
+            raise RuntimeError(
+                f"Experiment with name '{name}' already exists and is completed. \
+                Please choose a different name or delete the existing experiment."
+            )
         else:
             self._id = self._runtime._metadb.create_experiment(
                 name=name,
                 team_id=self._runtime._team_id,
                 user_id=self._runtime._user_id,
-                project_id=proj.id,
                 description=description,
+                labels=labels,
                 meta=meta,
                 params=params,
                 status=Status.RUNNING,
@@ -204,10 +239,7 @@ def _start(
             timeout=self._timeout(),
         )
 
-        # We don't reset the Experiment id context var,
-        # because each experiment runs in its own context.
         self._token = current_exp_id.set(self._id)
-        proj.register_experiment(id=self.id, instance=self)
 
     @property
     def id(self) -> uuid.UUID:
@@ -335,6 +367,7 @@ def is_done(self) -> bool:
     # TODO: Should we distinguish done and cancel?
     def done(self):
+        """Finish the experiment.
+
+        Calls ``_cancel()`` first and then ``_cleanup()``.
+        """
         self._cancel()
+        self._cleanup()
 
     def done_with_err(self):
         self._end_status = "Err"
@@ -370,8 +403,6 @@ def _stop(self):
                 experiment_id=self._id, status=status, duration=duration
             )
 
-        self._runtime.current_proj.unregister_experiment(self.id)
-
     def _get_obj(self):
         return self._runtime._metadb.get_experiment(experiment_id=self.id)
 
@@ -425,3 +456,28 @@ def start(
         params: dict | None = None,
     ) -> "Experiment":
         raise NotImplementedError
+
+    def _cleanup(self):
+        """Delete the local folder returned by ``team_path()``.
+
+        Nothing is removed when the folder does not exist, or when the
+        ``AUTO_CLEANUP`` environment setting is anything other than "true"
+        (compared case-insensitively; it defaults to "true" when unset).
+        """
+        # NOTE(review): the folder looks team-wide rather than per-experiment;
+        # confirm no other running experiment still needs its contents.
+        workdir = team_path()
+        if not os.path.exists(workdir):
+            return
+
+        auto_cleanup = os.getenv(envs.AUTO_CLEANUP, "true").lower()
+        if auto_cleanup != "true":
+            return
+
+        # Best effort: errors raised while deleting are ignored.
+        shutil.rmtree(workdir, ignore_errors=True)
+
+    def _start_signal_handlers(self):
+        """Install SIGINT/SIGTERM handlers and start the stop watcher.
+
+        Each signal is routed to ``_on_signal`` on the running event loop so
+        that the experiment can shut down gracefully. Installing the handlers
+        is best effort: where the loop cannot register them, the experiment
+        still starts, just without graceful signal handling.
+
+        Must be called while an event loop is running.
+
+        :return: the task running ``_wait_for_stop()``.
+        """
+        loop = asyncio.get_running_loop()
+
+        # Handle SIGINT and SIGTERM to allow graceful shutdown.
+        # Make sure to call done() on receiving the signal.
+        for sig in (signal.SIGINT, signal.SIGTERM):
+            # add_signal_handler is Unix-only (NotImplementedError elsewhere)
+            # and raises RuntimeError when the handler cannot be set up, e.g.
+            # when the loop is not running in the main thread. Neither case
+            # should prevent the experiment from starting.
+            with contextlib.suppress(NotImplementedError, RuntimeError):
+                loop.add_signal_handler(sig, self._on_signal)
+
+        return asyncio.create_task(self._wait_for_stop())
+
+    def _on_signal(self):
+        """Signal callback: set the ``_stopped`` event."""
+        self._stopped.set()
+
+    async def _wait_for_stop(self):
+        """Wait until ``_stopped`` is set, then call ``done_with_cancel()``."""
+        await self._stopped.wait()
+        self.done_with_cancel()
@@ -14,6 +14,7 @@ def start(
         cls,
         name: str,
         description: str | None = None,
+        labels: str | None = None,
         meta: dict | None = None,
         params: dict | None = None,
         config: base.ExperimentConfig | None = None,
@@ -27,6 +28,7 @@ def start(
         exp._start(
             name=name,
             description=description,
+            labels=labels,
             meta=meta,
             params=params,
         )
 
@@ -18,8 +18,8 @@
 
 async def log_artifact(
     paths: str | list[str],
+    repo_name: str,
     version: str | None = None,
-    repo_name: str | None = None,
     pre_save_hook: Callable | None = None,
 ) -> str:
     """
@@ -35,7 +35,7 @@ async def log_artifact(
            If want to save something, make sure it's under the paths.
 
     :return: the path of the logged artifact in the format of
-    {team_id}/{project_id}:{version}
+    {team_id}/{repo_name}:{version}
     """
 
     if not paths:
@@ -57,15 +57,9 @@ async def log_artifact(
         else:
             raise ValueError("pre_save_hook must be a callable function")
 
-    # We use project ID as the repo name rather than the project name,
-    # because project name is not unique and might change over time.
-    proj = runtime.current_proj
-    if proj is None:
-        raise RuntimeError("No running project found in the current context.")
-
     loop = asyncio.get_running_loop()
     return await loop.run_in_executor(
-        None, runtime._artifact.push, repo_name or str(proj.id), paths, version
+        None, runtime._artifact.push, repo_name, paths, version
     )
 
 
@@ -97,13 +91,12 @@ async def log_metrics(metrics: dict[str, float]) -> bool:
         raise RuntimeError("log_metrics must be called inside a Run.")
 
     runtime = global_runtime()
-    proj = runtime.current_proj
 
     exp_id = current_exp_id.get()
     if exp_id is None:
         raise RuntimeError("log_metrics must be called inside a Experiment.")
 
-    exp = proj.get_experiment(id=exp_id)
+    exp = runtime.current_experiment
     if exp is None:
         raise RuntimeError(f"Experiment {exp_id} not found in the database.")
 
@@ -117,7 +110,6 @@ async def log_metrics(metrics: dict[str, float]) -> bool:
             key=key,
             value=value,
             team_id=runtime._team_id,
-            project_id=proj.id,
             experiment_id=exp_id,
             run_id=run_id,
         )
@@ -138,7 +130,7 @@ async def log_metrics(metrics: dict[str, float]) -> bool:
     # TODO: refactor this with an event driven mechanism later.
     if should_checkpoint:
         path = await log_artifact(
-            repo_name=f"{str(proj.id)}/ckpt",
+            repo_name="ckpt",
             # If not provided, will use the default checkpoint path.
             paths=exp.config().checkpoint.path or checkpoint_path(),
             pre_save_hook=exp.config().checkpoint.pre_save_hook,
@@ -154,23 +146,23 @@ async def log_metrics(metrics: dict[str, float]) -> bool:
     return is_best_metric
 
 
-# log_execution is used to log the record of a run/experiment/project,
+# log_result is used to log the result of a run/experiment,
 # including both input and output, e.g. you want to save the code snippet.
 # It will be stored in the object storage as a JSON file if object storage
 # is enabled or locally otherwise.
-async def log_execution(
+async def log_result(
     output: dict[str, Any],
     input: dict[str, Any] | None = None,
     phase: str = "success",
     kind: ExecutionKind = ExecutionKind.RUN,
 ):
-    execution = None
+    result = None
 
     if kind == ExecutionKind.RUN:
-        execution = build_run_execution(output=output, input=input, phase=phase)
+        result = build_run_execution(output=output, input=input, phase=phase)
     else:
         raise NotImplementedError(
-            f"Logging record of kind {execution.kind} is not implemented yet."
+            f"Logging record of kind {result.kind} is not implemented yet."
         )
 
     # Can I get the file size to store in the database?
@@ -179,28 +171,28 @@ async def log_execution(
     if os.path.exists(path) is False:
         os.makedirs(path, exist_ok=True)
 
-    # Will eventually be cleanup on Project done() if AUTO_CLEANUP is enabled.
+    # Eventually cleaned up on Experiment done() if AUTO_CLEANUP is enabled.
     # Considering the record file is small, we just save it locally first.
     # If this changes in the future, we should delete them after uploading.
-    with open(os.path.join(path, "execution.json"), "w") as f:
-        f.write(execution.model_dump_json())
+    with open(os.path.join(path, "result.json"), "w") as f:
+        f.write(result.model_dump_json())
 
-    file_size = os.path.getsize(os.path.join(path, "execution.json"))
+    file_size = os.path.getsize(os.path.join(path, "result.json"))
     runtime = global_runtime()
 
     # If not enabled, only save to local disk.
     if runtime.artifact_storage_enabled():
         path = await log_artifact(
-            paths=os.path.join(path, "execution.json"),
-            repo_name=f"{str(runtime.current_proj.id)}/execution",
+            paths=os.path.join(path, "result.json"),
+            repo_name="execution",
         )
         runtime.metadb.update_run(
             run_id=current_run_id.get(),
             meta={
                 EXECUTION_RESULT: {
                     "path": path,
                     "size": file_size,
-                    "file_name": "execution.json",
+                    "file_name": "result.json",
                 }
             },
         )