Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
78 commits
Select commit Hold shift + click to select a range
aec0d6c
second stage
kerthcet Mar 1, 2026
7664772
change the icon of open in ide
kerthcet Mar 1, 2026
a41145a
add new tables
kerthcet Mar 1, 2026
bfc3cb8
add default users
kerthcet Mar 1, 2026
f11befb
Update the script
kerthcet Mar 1, 2026
5aebca1
update the default values of the helm chart
kerthcet Mar 1, 2026
d1f6c5d
fix deployment problem
kerthcet Mar 1, 2026
bd952e7
Temp: fix the userID error
kerthcet Mar 1, 2026
eb146fc
remove trial-id
kerthcet Mar 2, 2026
d9acdbe
fix error
kerthcet Mar 2, 2026
2a9c242
migrations
kerthcet Mar 2, 2026
e286ee2
remove project id
kerthcet Mar 2, 2026
b29f6ee
fix error
kerthcet Mar 2, 2026
54a0119
fix error
kerthcet Mar 2, 2026
d67815c
fix error
kerthcet Mar 2, 2026
3d7f84b
remove tokens from frontend
kerthcet Mar 2, 2026
21a96c6
update homepage
kerthcet Mar 2, 2026
418ef6d
change the frontend
kerthcet Mar 2, 2026
67f65d0
Add indexes
kerthcet Mar 2, 2026
36f83a5
add indexes
kerthcet Mar 2, 2026
0e8af62
remove non necessary components
kerthcet Mar 2, 2026
3139059
fix change
kerthcet Mar 2, 2026
ebb818c
Optimize the page
kerthcet Mar 2, 2026
0c5990a
Optimize the page
kerthcet Mar 2, 2026
481544e
Optimize the page
kerthcet Mar 2, 2026
aa2d354
optimize the performance
kerthcet Mar 2, 2026
f3bbd29
optimize the performance
kerthcet Mar 2, 2026
70b800a
optimize the performance
kerthcet Mar 2, 2026
122f6c3
optimize layout
kerthcet Mar 2, 2026
418c010
optimize performance
kerthcet Mar 2, 2026
e334fae
change the page router
kerthcet Mar 2, 2026
c9bf1fa
add run duration
kerthcet Mar 2, 2026
5c645f9
add avg durations
kerthcet Mar 2, 2026
9e00e52
add run duration
kerthcet Mar 2, 2026
b59c8ca
fix
kerthcet Mar 2, 2026
87d196e
support duration;
kerthcet Mar 2, 2026
5ffa7ad
add chart
kerthcet Mar 2, 2026
8a1548d
optimize chart
kerthcet Mar 2, 2026
149a5db
optimize chart
kerthcet Mar 2, 2026
1d8c5ab
optimize chart
kerthcet Mar 2, 2026
ace1619
optimize chart
kerthcet Mar 2, 2026
fd0522d
optimize chart
kerthcet Mar 2, 2026
a039610
optimize chart
kerthcet Mar 2, 2026
2c5cb71
optimize chart
kerthcet Mar 2, 2026
8724c50
optimize chart
kerthcet Mar 2, 2026
9d31c66
optimize chart
kerthcet Mar 2, 2026
529a663
optimize chart
kerthcet Mar 2, 2026
4098991
optimize chart
kerthcet Mar 2, 2026
484fe74
optimize performance
kerthcet Mar 2, 2026
1548cf2
optimize layout
kerthcet Mar 2, 2026
9e64932
add back the iterations
kerthcet Mar 3, 2026
fb71d12
add clickhouse support
kerthcet Mar 3, 2026
6f154c7
update password
kerthcet Mar 3, 2026
747d7ee
update chart
kerthcet Mar 3, 2026
f329364
update chart
kerthcet Mar 3, 2026
1d8a2ea
update
kerthcet Mar 3, 2026
a6bcfa5
update trace
kerthcet Mar 3, 2026
b8a1b1f
add duration
kerthcet Mar 3, 2026
3eb0c69
refine the dashboard
kerthcet Mar 3, 2026
8cf2be8
refine the dashboard
kerthcet Mar 3, 2026
c529e4c
fix: resume experiment will reset to running
kerthcet Mar 3, 2026
63180e6
update logo
kerthcet Mar 3, 2026
0326ead
fix lint
kerthcet Mar 3, 2026
76c74d2
fix layout
kerthcet Mar 3, 2026
f1c142a
fix logo
kerthcet Mar 3, 2026
cb91afb
remove tokens from traces
kerthcet Mar 3, 2026
5179a2a
add delete experiment(s) apis
kerthcet Mar 5, 2026
3b22cff
add delete experiment(s)
kerthcet Mar 5, 2026
df8d694
add api to delete exps
kerthcet Mar 6, 2026
3c546ad
fix paginated page
kerthcet Mar 6, 2026
b964be6
Persist the tokens for the experiment
kerthcet Mar 7, 2026
37f24d3
Add log_dataset to APIs
kerthcet Mar 8, 2026
ddce747
fix merge error
kerthcet Mar 8, 2026
98d5773
Extend the api to support file paths
kerthcet Mar 8, 2026
66644c0
update the graphql api for datasets
kerthcet Mar 8, 2026
f024edc
fix merge conflict
kerthcet Mar 8, 2026
c69b617
fix lint
kerthcet Mar 8, 2026
6dc9604
feat: delete Dataset
kerthcet Mar 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,4 @@ ptest/
/node_modules/

.claude/
values-dev.yaml
values-prod.yaml
values-*.yaml
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ IMAGE_NAME ?= alphatrion
IMAGE_REPO := $(IMAGE_REGISTRY)/$(IMAGE_NAME)
GIT_TAG ?= $(shell git describe --tags --dirty --always)
IMG ?= $(IMAGE_REPO):$(GIT_TAG)
PLATFORMS ?= linux/arm64,linux/amd64
PLATFORMS ?= linux/amd64

POETRY := poetry
RUFF := .venv/bin/ruff
Expand Down Expand Up @@ -51,6 +51,7 @@ test-integration: lint
docker-compose -f ./docker-compose.yaml up -d; \
trap "docker-compose -f ./docker-compose.yaml down" EXIT; \
until docker exec postgres pg_isready -U alphatr1on; do sleep 1; done; \
until docker exec clickhouse clickhouse-client --query "SELECT 1"; do sleep 1; done; \
until curl -sf http://localhost:11434/api/tags | grep "smollm:135m" > /dev/null; do sleep 1; done; \
$(PYTEST) tests/integration --timeout=30; \
'
Expand Down
4 changes: 2 additions & 2 deletions alphatrion/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from alphatrion.log.log import log_artifact, log_metrics, log_params, log_result
from alphatrion.log.log import log_artifact, log_dataset, log_metrics, log_params
from alphatrion.runtime.runtime import init

__all__ = [
"init",
"log_artifact",
"log_params",
"log_metrics",
"log_result",
"log_dataset",
]
11 changes: 5 additions & 6 deletions alphatrion/artifact/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@


class Artifact:
def __init__(self, team_id: str, insecure: bool = False):
self._team_id = team_id
def __init__(self, insecure: bool = False):
self._url = get_registry_url()
self._client = oras.client.OrasClient(
hostname=self._url.strip("/"), auth_backend="token", insecure=insecure
Expand Down Expand Up @@ -50,7 +49,7 @@ def push(
if version is None:
version = utiltime.now_2_hash()

path = f"{self._team_id}/{repo_name}:{version}"
path = f"{repo_name}:{version}"
target = f"{self._url}/{path}"

try:
Expand All @@ -61,7 +60,7 @@ def push(
return path

def list_versions(self, repo_name: str) -> list[str]:
target = f"{self._url}/{self._team_id}/{repo_name}"
target = f"{self._url}/{repo_name}"
try:
tags = self._client.get_tags(target)
return tags
Expand Down Expand Up @@ -91,7 +90,7 @@ def pull(
(defaults to ORAS temp directory)
:return: list of absolute file paths that were downloaded
"""
path = f"{self._team_id}/{repo_name}:{version}"
path = f"{repo_name}:{version}"
target = f"{self._url}/{path}"

if output_dir:
Expand All @@ -115,7 +114,7 @@ def pull(
os.chdir(original_dir)

def delete(self, repo_name: str, versions: str | list[str]):
target = f"{self._url}/{self._team_id}/{repo_name}"
target = f"{self._url}/{repo_name}"

try:
self._client.delete_tags(target, tags=versions)
Expand Down
1 change: 0 additions & 1 deletion alphatrion/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,3 @@

# Runtime related envs
ROOT_PATH = "ALPHATRION_ROOT_PATH"
AUTO_CLEANUP = "ALPHATRION_AUTO_CLEANUP"
15 changes: 15 additions & 0 deletions alphatrion/experiment/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,19 @@ def _start(
# to avoid confusion.
if exp_obj and exp_obj.status != Status.COMPLETED:
self._id = exp_obj.uuid
usage = exp_obj.usage

# reset to running status, also need to reset the tokens.
if usage and "total_tokens" in usage:
# delete the tokens in the usage
usage.delete("total_tokens")
usage.delete("input_tokens")
usage.delete("output_tokens")
self._runtime._metadb.update_experiment(
experiment_id=self._id,
status=Status.RUNNING,
usage=usage,
)
elif exp_obj and exp_obj.status == Status.COMPLETED:
raise RuntimeError(
f"Experiment with name '{name}' already exists and is completed. \
Expand Down Expand Up @@ -366,6 +379,8 @@ def is_done(self) -> bool:
# or it could lead to experiment not being marked as completed.
# TODO: Should we distinguish done and cancel?
def done(self):
if self.is_done():
return
self._cancel()
self._cleanup()

Expand Down
101 changes: 54 additions & 47 deletions alphatrion/log/log.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
import asyncio
import json
import os
import tempfile
from collections.abc import Callable
from typing import Any

from alphatrion.runtime.contextvars import current_exp_id, current_run_id
from alphatrion.runtime.runtime import global_runtime
from alphatrion.snapshot.snapshot import (
ExecutionKind,
build_run_execution,
checkpoint_path,
snapshot_path,
)
from alphatrion.storage import runtime as storage_runtime

BEST_RESULT_PATH = "best_result_path"
EXECUTION_RESULT = "execution_result"


async def log_artifact(
Expand Down Expand Up @@ -45,7 +44,7 @@ async def log_artifact(
if runtime is None:
raise RuntimeError("Runtime is not initialized. Please call init() first.")

if not runtime.artifact_storage_enabled():
if not storage_runtime.artifact_storage_enabled():
raise RuntimeError(
"Artifact storage is not enabled in the runtime."
"Set ENABLE_ARTIFACT_STORAGE=true in the environment variables."
Expand All @@ -59,7 +58,7 @@ async def log_artifact(

loop = asyncio.get_running_loop()
return await loop.run_in_executor(
None, runtime._artifact.push, repo_name, paths, version
None, runtime._artifact.push, f"{runtime.team_id}/{repo_name}", paths, version
)


Expand Down Expand Up @@ -146,53 +145,61 @@ async def log_metrics(metrics: dict[str, float]) -> bool:
return is_best_metric


# log_result is used to log the result of a run/experiment,
# including both input and output, e.g. you want to save the code snippet.
# It will be stored in the object storage as a JSON file if object storage
# is enabled or locally otherwise.
async def log_result(
output: dict[str, Any],
input: dict[str, Any] | None = None,
phase: str = "success",
kind: ExecutionKind = ExecutionKind.RUN,
):
result = None

if kind == ExecutionKind.RUN:
result = build_run_execution(output=output, input=input, phase=phase)
else:
raise NotImplementedError(
f"Logging record of kind {result.kind} is not implemented yet."
)
# log_records is used to log a list of records, which is similar to log_metrics
# but for tracing the execution of the code.
# async def log_records():

# Can I get the file size to store in the database?

path = snapshot_path()
if os.path.exists(path) is False:
os.makedirs(path, exist_ok=True)

# Will eventually be cleanup on Experiment done() if AUTO_CLEANUP is enabled.
# Considering the record file is small, we just save it locally first.
# If this changes in the future, we should delete them after uploading.
with open(os.path.join(path, "result.json"), "w") as f:
f.write(result.model_dump_json())
async def log_dataset(
name: str,
data_or_path: dict[str, Any] | str | list[str],
):
"""
Log dataset to the database and artifact registry.

file_size = os.path.getsize(os.path.join(path, "result.json"))
:param name: the name of the dataset.
:param data_or_path: the data to be logged, currently support dict only,
will support more types in the future.
"""
runtime = global_runtime()

# If not enabled, only save to local disk.
if runtime.artifact_storage_enabled():
if isinstance(data_or_path, dict):
with tempfile.TemporaryDirectory() as tmpdir:
os.chdir(tmpdir)
with open(name, "w") as f:
f.write(json.dumps(data_or_path))
file_size = os.path.getsize(name)

path = await log_artifact(
paths=name,
repo_name="dataset",
)

runtime.metadb.create_dataset(
name=name,
team_id=runtime.team_id,
user_id=runtime.user_id,
path=path,
experiment_id=current_exp_id.get(),
run_id=current_run_id.get(),
meta={"size": file_size},
)
return
elif isinstance(data_or_path, (str, list)):
path = await log_artifact(
paths=os.path.join(path, "result.json"),
repo_name="execution",
paths=data_or_path,
repo_name="dataset",
)
runtime.metadb.update_run(
runtime.metadb.create_dataset(
name=name,
team_id=runtime.team_id,
user_id=runtime.user_id,
path=path,
experiment_id=current_exp_id.get(),
run_id=current_run_id.get(),
meta={
EXECUTION_RESULT: {
"path": path,
"size": file_size,
"file_name": "result.json",
}
},
)
return

raise NotImplementedError(
f"Logging dataset of type {type(data_or_path)} is not implemented yet."
)
18 changes: 14 additions & 4 deletions alphatrion/run/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import uuid
from datetime import UTC, datetime

from alphatrion.runtime.contextvars import current_run_id
from alphatrion.runtime.runtime import global_runtime
Expand Down Expand Up @@ -50,19 +51,28 @@ def done(self):
if self.cancelled():
return

run = self._runtime._metadb.get_run(run_id=self.id)
duration = (
datetime.now(UTC) - run.created_at.replace(tzinfo=UTC)
).total_seconds()

self._runtime.metadb.update_run(
run_id=self._id,
status=Status.COMPLETED,
run_id=self._id, status=Status.COMPLETED, duration=duration
)
self._result = self._task.result()

def cancel(self):
# TODO: we should wait for the task to be actually cancelled
# and catch the CancelledError exception in the task function.
self._task.cancel()

run = self._runtime._metadb.get_run(run_id=self.id)
duration = (
datetime.now(UTC) - run.created_at.replace(tzinfo=UTC)
).total_seconds()

self._runtime.metadb.update_run(
run_id=self._id,
status=Status.CANCELLED,
run_id=self._id, status=Status.CANCELLED, duration=duration
)

def cancelled(self) -> bool:
Expand Down
15 changes: 5 additions & 10 deletions alphatrion/runtime/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import uuid

from alphatrion import envs
from alphatrion.artifact.artifact import Artifact
from alphatrion.storage import runtime as storage_runtime
from alphatrion.storage.sqlstore import SQLStore

Expand Down Expand Up @@ -58,6 +57,7 @@ def __init__(
storage_runtime.init()
self._metadb = storage_runtime.storage_runtime().metadb
self._tracestore = storage_runtime.storage_runtime().tracestore
self._artifact = storage_runtime.storage_runtime().artifact

self._user_id = user_id
self._team_id = team_id
Expand All @@ -74,18 +74,9 @@ def __init__(
self._team_id = teams[0].uuid

self._root_path = os.getenv(envs.ROOT_PATH, os.path.expanduser("~/.alphatrion"))

artifact_insecure = os.getenv(envs.ARTIFACT_INSECURE, "false").lower() == "true"

if self.artifact_storage_enabled():
self._artifact = Artifact(team_id=self._team_id, insecure=artifact_insecure)

if not os.path.exists(self._root_path):
os.makedirs(self._root_path, exist_ok=True)

def artifact_storage_enabled(self) -> bool:
return os.getenv(envs.ENABLE_ARTIFACT_STORAGE, "true").lower() == "true"

@property
def metadb(self) -> SQLStore:
return self._metadb
Expand All @@ -94,6 +85,10 @@ def metadb(self) -> SQLStore:
def tracestore(self):
return self._tracestore

@property
def artifact(self):
return self._artifact

@property
def user_id(self) -> uuid.UUID:
return self._user_id
Expand Down
Loading