Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
78 commits
Select commit Hold shift + click to select a range
aec0d6c
second stage
kerthcet Mar 1, 2026
7664772
change the icon of open in ide
kerthcet Mar 1, 2026
a41145a
add new tables
kerthcet Mar 1, 2026
bfc3cb8
add default users
kerthcet Mar 1, 2026
f11befb
Update the script
kerthcet Mar 1, 2026
5aebca1
update the default values of the helm chart
kerthcet Mar 1, 2026
d1f6c5d
fix deployment problem
kerthcet Mar 1, 2026
bd952e7
Temp: fix the userID error
kerthcet Mar 1, 2026
eb146fc
remove trial-id
kerthcet Mar 2, 2026
d9acdbe
fix error
kerthcet Mar 2, 2026
2a9c242
migrations
kerthcet Mar 2, 2026
e286ee2
remove project id
kerthcet Mar 2, 2026
b29f6ee
fix error
kerthcet Mar 2, 2026
54a0119
fix error
kerthcet Mar 2, 2026
d67815c
fix error
kerthcet Mar 2, 2026
3d7f84b
remove tokens from frontend
kerthcet Mar 2, 2026
21a96c6
update homepage
kerthcet Mar 2, 2026
418ef6d
change the frontend
kerthcet Mar 2, 2026
67f65d0
Add indexes
kerthcet Mar 2, 2026
36f83a5
add indexes
kerthcet Mar 2, 2026
0e8af62
remove non necessary components
kerthcet Mar 2, 2026
3139059
fix change
kerthcet Mar 2, 2026
ebb818c
Optimize the page
kerthcet Mar 2, 2026
0c5990a
Optimize the page
kerthcet Mar 2, 2026
481544e
Optimize the page
kerthcet Mar 2, 2026
aa2d354
optimize the performance
kerthcet Mar 2, 2026
f3bbd29
optimize the performance
kerthcet Mar 2, 2026
70b800a
optimize the performance
kerthcet Mar 2, 2026
122f6c3
optimize layout
kerthcet Mar 2, 2026
418c010
optimize performance
kerthcet Mar 2, 2026
e334fae
change the page router
kerthcet Mar 2, 2026
c9bf1fa
add run duration
kerthcet Mar 2, 2026
5c645f9
add avg durations
kerthcet Mar 2, 2026
9e00e52
add run duration
kerthcet Mar 2, 2026
b59c8ca
fix
kerthcet Mar 2, 2026
87d196e
support duration;
kerthcet Mar 2, 2026
5ffa7ad
add chart
kerthcet Mar 2, 2026
8a1548d
optimize chart
kerthcet Mar 2, 2026
149a5db
optimize chart
kerthcet Mar 2, 2026
1d8c5ab
optimize chart
kerthcet Mar 2, 2026
ace1619
optimize chart
kerthcet Mar 2, 2026
fd0522d
optimize chart
kerthcet Mar 2, 2026
a039610
optimize chart
kerthcet Mar 2, 2026
2c5cb71
optimize chart
kerthcet Mar 2, 2026
8724c50
optimize chart
kerthcet Mar 2, 2026
9d31c66
optimize chart
kerthcet Mar 2, 2026
529a663
optimize chart
kerthcet Mar 2, 2026
4098991
optimize chart
kerthcet Mar 2, 2026
484fe74
optimize performance
kerthcet Mar 2, 2026
1548cf2
optimize layout
kerthcet Mar 2, 2026
9e64932
add back the iterations
kerthcet Mar 3, 2026
fb71d12
add clickhouse support
kerthcet Mar 3, 2026
6f154c7
update password
kerthcet Mar 3, 2026
747d7ee
update chart
kerthcet Mar 3, 2026
f329364
update chart
kerthcet Mar 3, 2026
1d8a2ea
update
kerthcet Mar 3, 2026
a6bcfa5
update trace
kerthcet Mar 3, 2026
b8a1b1f
add duration
kerthcet Mar 3, 2026
3eb0c69
refine the dashboard
kerthcet Mar 3, 2026
8cf2be8
refine the dashboard
kerthcet Mar 3, 2026
c529e4c
fix: resume experiment will reset to running
kerthcet Mar 3, 2026
63180e6
update logo
kerthcet Mar 3, 2026
0326ead
fix lint
kerthcet Mar 3, 2026
76c74d2
fix layout
kerthcet Mar 3, 2026
f1c142a
fix logo
kerthcet Mar 3, 2026
cb91afb
remove tokens from traces
kerthcet Mar 3, 2026
5179a2a
add delete experiment(s) apis
kerthcet Mar 5, 2026
3b22cff
add delete experiment(s)
kerthcet Mar 5, 2026
df8d694
add api to delete exps
kerthcet Mar 6, 2026
3c546ad
fix paginated page
kerthcet Mar 6, 2026
b964be6
Persist the tokens for the experiment
kerthcet Mar 7, 2026
37f24d3
Add log_dataset to APIs
kerthcet Mar 8, 2026
ddce747
fix merge error
kerthcet Mar 8, 2026
98d5773
Extend the api to support file paths
kerthcet Mar 8, 2026
66644c0
update the graphql api for datasets
kerthcet Mar 8, 2026
f024edc
fix merge conflict
kerthcet Mar 8, 2026
c69b617
fix lint
kerthcet Mar 8, 2026
6dc9604
feat: delete Dataset
kerthcet Mar 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,4 @@ ptest/
/node_modules/

.claude/
values-dev.yaml
values-prod.yaml
values-*.yaml
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ IMAGE_NAME ?= alphatrion
IMAGE_REPO := $(IMAGE_REGISTRY)/$(IMAGE_NAME)
GIT_TAG ?= $(shell git describe --tags --dirty --always)
IMG ?= $(IMAGE_REPO):$(GIT_TAG)
PLATFORMS ?= linux/arm64,linux/amd64
PLATFORMS ?= linux/amd64

POETRY := poetry
RUFF := .venv/bin/ruff
Expand Down Expand Up @@ -51,6 +51,7 @@ test-integration: lint
docker-compose -f ./docker-compose.yaml up -d; \
trap "docker-compose -f ./docker-compose.yaml down" EXIT; \
until docker exec postgres pg_isready -U alphatr1on; do sleep 1; done; \
until docker exec clickhouse clickhouse-client --query "SELECT 1"; do sleep 1; done; \
until curl -sf http://localhost:11434/api/tags | grep "smollm:135m" > /dev/null; do sleep 1; done; \
$(PYTEST) tests/integration --timeout=30; \
'
Expand Down
4 changes: 2 additions & 2 deletions alphatrion/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from alphatrion.log.log import log_artifact, log_metrics, log_params, log_result
from alphatrion.log.log import log_artifact, log_dataset, log_metrics, log_params
from alphatrion.runtime.runtime import init

__all__ = [
"init",
"log_artifact",
"log_params",
"log_metrics",
"log_result",
"log_dataset",
]
11 changes: 5 additions & 6 deletions alphatrion/artifact/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@


class Artifact:
def __init__(self, team_id: str, insecure: bool = False):
self._team_id = team_id
def __init__(self, insecure: bool = False):
self._url = get_registry_url()
self._client = oras.client.OrasClient(
hostname=self._url.strip("/"), auth_backend="token", insecure=insecure
Expand Down Expand Up @@ -50,7 +49,7 @@ def push(
if version is None:
version = utiltime.now_2_hash()

path = f"{self._team_id}/{repo_name}:{version}"
path = f"{repo_name}:{version}"
target = f"{self._url}/{path}"

try:
Expand All @@ -61,7 +60,7 @@ def push(
return path

def list_versions(self, repo_name: str) -> list[str]:
target = f"{self._url}/{self._team_id}/{repo_name}"
target = f"{self._url}/{repo_name}"
try:
tags = self._client.get_tags(target)
return tags
Expand Down Expand Up @@ -91,7 +90,7 @@ def pull(
(defaults to ORAS temp directory)
:return: list of absolute file paths that were downloaded
"""
path = f"{self._team_id}/{repo_name}:{version}"
path = f"{repo_name}:{version}"
target = f"{self._url}/{path}"

if output_dir:
Expand All @@ -115,7 +114,7 @@ def pull(
os.chdir(original_dir)

def delete(self, repo_name: str, versions: str | list[str]):
target = f"{self._url}/{self._team_id}/{repo_name}"
target = f"{self._url}/{repo_name}"

try:
self._client.delete_tags(target, tags=versions)
Expand Down
1 change: 0 additions & 1 deletion alphatrion/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,3 @@

# Runtime related envs
ROOT_PATH = "ALPHATRION_ROOT_PATH"
AUTO_CLEANUP = "ALPHATRION_AUTO_CLEANUP"
15 changes: 15 additions & 0 deletions alphatrion/experiment/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,19 @@ def _start(
# to avoid confusion.
if exp_obj and exp_obj.status != Status.COMPLETED:
self._id = exp_obj.uuid
usage = exp_obj.usage

# reset to running status, also need to reset the tokens.
if usage and "total_tokens" in usage:
# delete the tokens in the usage
usage.delete("total_tokens")
usage.delete("input_tokens")
usage.delete("output_tokens")
self._runtime._metadb.update_experiment(
experiment_id=self._id,
status=Status.RUNNING,
usage=usage,
)
elif exp_obj and exp_obj.status == Status.COMPLETED:
raise RuntimeError(
f"Experiment with name '{name}' already exists and is completed. \
Expand Down Expand Up @@ -366,6 +379,8 @@ def is_done(self) -> bool:
# or it could lead to experiment not being marked as completed.
# TODO: Should we distinguish done and cancel?
def done(self):
if self.is_done():
return
self._cancel()
self._cleanup()

Expand Down
101 changes: 54 additions & 47 deletions alphatrion/log/log.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
import asyncio
import json
import os
import tempfile
from collections.abc import Callable
from typing import Any

from alphatrion.runtime.contextvars import current_exp_id, current_run_id
from alphatrion.runtime.runtime import global_runtime
from alphatrion.snapshot.snapshot import (
ExecutionKind,
build_run_execution,
checkpoint_path,
snapshot_path,
)
from alphatrion.storage import runtime as storage_runtime

BEST_RESULT_PATH = "best_result_path"
EXECUTION_RESULT = "execution_result"


async def log_artifact(
Expand Down Expand Up @@ -45,7 +44,7 @@ async def log_artifact(
if runtime is None:
raise RuntimeError("Runtime is not initialized. Please call init() first.")

if not runtime.artifact_storage_enabled():
if not storage_runtime.artifact_storage_enabled():
raise RuntimeError(
"Artifact storage is not enabled in the runtime."
"Set ENABLE_ARTIFACT_STORAGE=true in the environment variables."
Expand All @@ -59,7 +58,7 @@ async def log_artifact(

loop = asyncio.get_running_loop()
return await loop.run_in_executor(
None, runtime._artifact.push, repo_name, paths, version
None, runtime._artifact.push, f"{runtime.team_id}/{repo_name}", paths, version
)


Expand Down Expand Up @@ -146,53 +145,61 @@ async def log_metrics(metrics: dict[str, float]) -> bool:
return is_best_metric


# log_result is used to log the result of a run/experiment,
# including both input and output, e.g. you want to save the code snippet.
# It will be stored in the object storage as a JSON file if object storage
# is enabled or locally otherwise.
async def log_result(
output: dict[str, Any],
input: dict[str, Any] | None = None,
phase: str = "success",
kind: ExecutionKind = ExecutionKind.RUN,
):
result = None

if kind == ExecutionKind.RUN:
result = build_run_execution(output=output, input=input, phase=phase)
else:
raise NotImplementedError(
f"Logging record of kind {result.kind} is not implemented yet."
)
# log_records is used to log a list of records, which is similar to log_metrics
# but for tracing the execution of the code.
# async def log_records():

# Can I get the file size to store in the database?

path = snapshot_path()
if os.path.exists(path) is False:
os.makedirs(path, exist_ok=True)

# Will eventually be cleanup on Experiment done() if AUTO_CLEANUP is enabled.
# Considering the record file is small, we just save it locally first.
# If this changes in the future, we should delete them after uploading.
with open(os.path.join(path, "result.json"), "w") as f:
f.write(result.model_dump_json())
async def log_dataset(
name: str,
data_or_path: dict[str, Any] | str | list[str],
):
"""
Log dataset to the database and artifact registry.

file_size = os.path.getsize(os.path.join(path, "result.json"))
:param name: the name of the dataset.
:param data_or_path: the data to be logged, currently support dict only,
will support more types in the future.
"""
runtime = global_runtime()

# If not enabled, only save to local disk.
if runtime.artifact_storage_enabled():
if isinstance(data_or_path, dict):
with tempfile.TemporaryDirectory() as tmpdir:
os.chdir(tmpdir)
with open(name, "w") as f:
f.write(json.dumps(data_or_path))
file_size = os.path.getsize(name)

path = await log_artifact(
paths=name,
repo_name="dataset",
)

runtime.metadb.create_dataset(
name=name,
team_id=runtime.team_id,
user_id=runtime.user_id,
path=path,
experiment_id=current_exp_id.get(),
run_id=current_run_id.get(),
meta={"size": file_size},
)
return
elif isinstance(data_or_path, (str, list)):
path = await log_artifact(
paths=os.path.join(path, "result.json"),
repo_name="execution",
paths=data_or_path,
repo_name="dataset",
)
runtime.metadb.update_run(
runtime.metadb.create_dataset(
name=name,
team_id=runtime.team_id,
user_id=runtime.user_id,
path=path,
experiment_id=current_exp_id.get(),
run_id=current_run_id.get(),
meta={
EXECUTION_RESULT: {
"path": path,
"size": file_size,
"file_name": "result.json",
}
},
)
return

raise NotImplementedError(
f"Logging dataset of type {type(data_or_path)} is not implemented yet."
)
18 changes: 14 additions & 4 deletions alphatrion/run/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import uuid
from datetime import UTC, datetime

from alphatrion.runtime.contextvars import current_run_id
from alphatrion.runtime.runtime import global_runtime
Expand Down Expand Up @@ -50,19 +51,28 @@ def done(self):
if self.cancelled():
return

run = self._runtime._metadb.get_run(run_id=self.id)
duration = (
datetime.now(UTC) - run.created_at.replace(tzinfo=UTC)
).total_seconds()

self._runtime.metadb.update_run(
run_id=self._id,
status=Status.COMPLETED,
run_id=self._id, status=Status.COMPLETED, duration=duration
)
self._result = self._task.result()

def cancel(self):
# TODO: we should wait for the task to be actually cancelled
# and catch the CancelledError exception in the task function.
self._task.cancel()

run = self._runtime._metadb.get_run(run_id=self.id)
duration = (
datetime.now(UTC) - run.created_at.replace(tzinfo=UTC)
).total_seconds()

self._runtime.metadb.update_run(
run_id=self._id,
status=Status.CANCELLED,
run_id=self._id, status=Status.CANCELLED, duration=duration
)

def cancelled(self) -> bool:
Expand Down
15 changes: 5 additions & 10 deletions alphatrion/runtime/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import uuid

from alphatrion import envs
from alphatrion.artifact.artifact import Artifact
from alphatrion.storage import runtime as storage_runtime
from alphatrion.storage.sqlstore import SQLStore

Expand Down Expand Up @@ -58,6 +57,7 @@ def __init__(
storage_runtime.init()
self._metadb = storage_runtime.storage_runtime().metadb
self._tracestore = storage_runtime.storage_runtime().tracestore
self._artifact = storage_runtime.storage_runtime().artifact

self._user_id = user_id
self._team_id = team_id
Expand All @@ -74,18 +74,9 @@ def __init__(
self._team_id = teams[0].uuid

self._root_path = os.getenv(envs.ROOT_PATH, os.path.expanduser("~/.alphatrion"))

artifact_insecure = os.getenv(envs.ARTIFACT_INSECURE, "false").lower() == "true"

if self.artifact_storage_enabled():
self._artifact = Artifact(team_id=self._team_id, insecure=artifact_insecure)

if not os.path.exists(self._root_path):
os.makedirs(self._root_path, exist_ok=True)

def artifact_storage_enabled(self) -> bool:
return os.getenv(envs.ENABLE_ARTIFACT_STORAGE, "true").lower() == "true"

@property
def metadb(self) -> SQLStore:
return self._metadb
Expand All @@ -94,6 +85,10 @@ def metadb(self) -> SQLStore:
def tracestore(self):
return self._tracestore

@property
def artifact(self):
return self._artifact

@property
def user_id(self) -> uuid.UUID:
return self._user_id
Expand Down
Loading