Skip to content

Commit e31160c

Browse files
authored
Persist the tokens for the experiment (#195)
Signed-off-by: kerthcet <kerthcet@gmail.com>
1 parent e73ca1b commit e31160c

11 files changed

Lines changed: 243 additions & 66 deletions

File tree

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ test-integration: lint
5151
docker-compose -f ./docker-compose.yaml up -d; \
5252
trap "docker-compose -f ./docker-compose.yaml down" EXIT; \
5353
until docker exec postgres pg_isready -U alphatrion; do sleep 1; done; \
54+
until docker exec clickhouse clickhouse-client --query "SELECT 1"; do sleep 1; done; \
5455
until curl -sf http://localhost:11434/api/tags | grep "smollm:135m" > /dev/null; do sleep 1; done; \
5556
$(PYTEST) tests/integration --timeout=30; \
5657
'

alphatrion/experiment/base.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,10 +219,18 @@ def _start(
219219
# to avoid confusion.
220220
if exp_obj and exp_obj.status != Status.COMPLETED:
221221
self._id = exp_obj.uuid
222-
# reset to running status.
222+
usage = exp_obj.usage
223+
224+
# reset to running status, also need to reset the tokens.
225+
if usage and "total_tokens" in usage:
226+
# delete the tokens in the usage
227+
usage.delete("total_tokens")
228+
usage.delete("input_tokens")
229+
usage.delete("output_tokens")
223230
self._runtime._metadb.update_experiment(
224231
experiment_id=self._id,
225232
status=Status.RUNNING,
233+
usage=usage,
226234
)
227235
elif exp_obj and exp_obj.status == Status.COMPLETED:
228236
raise RuntimeError(
@@ -369,6 +377,8 @@ def is_done(self) -> bool:
369377
return self._context.cancelled()
370378

371379
def done(self):
380+
if self.is_done():
381+
return
372382
self._cancel()
373383

374384
def done_with_err(self):

alphatrion/server/graphql/resolvers.py

Lines changed: 99 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
from alphatrion import envs
99
from alphatrion.artifact import artifact
1010
from alphatrion.storage import runtime
11-
from alphatrion.storage.sql_models import Status
11+
from alphatrion.storage.sql_models import (
12+
FINISHED_STATUS,
13+
Status,
14+
)
1215

1316
from .types import (
1417
AddUserToTeamInput,
@@ -138,6 +141,7 @@ def list_experiments(
138141
duration=e.duration,
139142
status=GraphQLStatusEnum[Status(e.status).name],
140143
kind=GraphQLExperimentTypeEnum[GraphQLExperimentType(e.kind).name],
144+
cost=e.cost,
141145
created_at=e.created_at,
142146
updated_at=e.updated_at,
143147
)
@@ -160,6 +164,7 @@ def get_experiment(id: strawberry.ID) -> Experiment | None:
160164
duration=exp.duration,
161165
status=GraphQLStatusEnum[Status(exp.status).name],
162166
kind=GraphQLExperimentTypeEnum[GraphQLExperimentType(exp.kind).name],
167+
cost=exp.cost,
163168
created_at=exp.created_at,
164169
updated_at=exp.updated_at,
165170
)
@@ -175,7 +180,7 @@ def list_runs(
175180
) -> list[Run]:
176181
metadb = runtime.storage_runtime().metadb
177182
runs = metadb.list_runs_by_exp_id(
178-
exp_id=uuid.UUID(experiment_id),
183+
experiment_id=uuid.UUID(experiment_id),
179184
page=page,
180185
page_size=page_size,
181186
order_by=order_by,
@@ -190,6 +195,7 @@ def list_runs(
190195
meta=r.meta,
191196
status=GraphQLStatusEnum[Status(r.status).name],
192197
duration=r.duration,
198+
cost=r.cost,
193199
created_at=r.created_at,
194200
)
195201
for r in runs
@@ -208,6 +214,7 @@ def get_run(id: strawberry.ID) -> Run | None:
208214
meta=run.meta,
209215
status=GraphQLStatusEnum[Status(run.status).name],
210216
duration=run.duration,
217+
cost=run.cost,
211218
created_at=run.created_at,
212219
)
213220
return None
@@ -311,6 +318,7 @@ def list_exps_by_timeframe(
311318
duration=e.duration,
312319
status=GraphQLStatusEnum[Status(e.status).name],
313320
kind=GraphQLExperimentTypeEnum[GraphQLExperimentType(e.kind).name],
321+
cost=e.cost,
314322
created_at=e.created_at,
315323
updated_at=e.updated_at,
316324
)
@@ -396,30 +404,22 @@ def aggregate_run_tokens(run_id: strawberry.ID) -> dict[str, int]:
396404
return {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
397405

398406
try:
399-
trace_store = runtime.storage_runtime().tracestore
400-
spans = trace_store.get_llm_spans_by_run_id(run_id)
401-
# Don't close - it's a shared singleton connection
402-
403-
total_tokens = 0
404-
input_tokens = 0
405-
output_tokens = 0
406-
407-
for span in spans:
408-
span_attrs = span.get("SpanAttributes", {})
409-
410-
# Aggregate tokens from LLM spans
411-
if "llm.usage.total_tokens" in span_attrs:
412-
total_tokens += int(span_attrs["llm.usage.total_tokens"])
413-
if "gen_ai.usage.input_tokens" in span_attrs:
414-
input_tokens += int(span_attrs["gen_ai.usage.input_tokens"])
415-
if "gen_ai.usage.output_tokens" in span_attrs:
416-
output_tokens += int(span_attrs["gen_ai.usage.output_tokens"])
417-
418-
return {
419-
"total_tokens": total_tokens,
420-
"input_tokens": input_tokens,
421-
"output_tokens": output_tokens,
422-
}
407+
run = runtime.storage_runtime().metadb.get_run(run_id=run_id)
408+
if run.status in FINISHED_STATUS:
409+
if run.usage and "total_tokens" in run.usage:
410+
return {
411+
"total_tokens": run.usage.get("total_tokens", 0),
412+
"input_tokens": run.usage.get("input_tokens", 0),
413+
"output_tokens": run.usage.get("output_tokens", 0),
414+
}
415+
else:
416+
usage = GraphQLResolvers.get_run_usage(run_id)
417+
runtime.storage_runtime().metadb.update_run(
418+
run_id=run_id, usage=usage
419+
)
420+
return usage
421+
else:
422+
return GraphQLResolvers.get_run_usage(run_id)
423423
except Exception as e:
424424
import logging
425425

@@ -428,6 +428,33 @@ def aggregate_run_tokens(run_id: strawberry.ID) -> dict[str, int]:
428428
)
429429
return {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
430430

431+
@staticmethod
432+
def get_run_usage(run_id: strawberry.ID) -> dict[str, int]:
433+
trace_store = runtime.storage_runtime().tracestore
434+
spans = trace_store.get_llm_spans_by_run_id(run_id)
435+
# Don't close - it's a shared singleton connection
436+
437+
total_tokens = 0
438+
input_tokens = 0
439+
output_tokens = 0
440+
441+
for span in spans:
442+
span_attrs = span.get("SpanAttributes", {})
443+
444+
# Aggregate tokens from LLM spans
445+
if "llm.usage.total_tokens" in span_attrs:
446+
total_tokens += int(span_attrs["llm.usage.total_tokens"])
447+
if "gen_ai.usage.input_tokens" in span_attrs:
448+
input_tokens += int(span_attrs["gen_ai.usage.input_tokens"])
449+
if "gen_ai.usage.output_tokens" in span_attrs:
450+
output_tokens += int(span_attrs["gen_ai.usage.output_tokens"])
451+
452+
return {
453+
"total_tokens": total_tokens,
454+
"input_tokens": input_tokens,
455+
"output_tokens": output_tokens,
456+
}
457+
431458
@staticmethod
432459
def aggregate_experiment_tokens(experiment_id: strawberry.ID) -> dict[str, int]:
433460
"""Aggregate token usage from all spans in an experiment."""
@@ -436,31 +463,24 @@ def aggregate_experiment_tokens(experiment_id: strawberry.ID) -> dict[str, int]:
436463
return {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
437464

438465
try:
439-
trace_store = runtime.storage_runtime().tracestore
440-
# Get all LLM spans for this experiment in a single query
441-
spans = trace_store.get_llm_spans_by_exp_id(experiment_id)
442-
# Don't close - it's a shared singleton connection
443-
444-
total_tokens = 0
445-
input_tokens = 0
446-
output_tokens = 0
447-
448-
for span in spans:
449-
span_attrs = span.get("SpanAttributes", {})
450-
451-
# Aggregate tokens from LLM spans
452-
if "llm.usage.total_tokens" in span_attrs:
453-
total_tokens += int(span_attrs["llm.usage.total_tokens"])
454-
if "gen_ai.usage.input_tokens" in span_attrs:
455-
input_tokens += int(span_attrs["gen_ai.usage.input_tokens"])
456-
if "gen_ai.usage.output_tokens" in span_attrs:
457-
output_tokens += int(span_attrs["gen_ai.usage.output_tokens"])
458-
459-
return {
460-
"total_tokens": total_tokens,
461-
"input_tokens": input_tokens,
462-
"output_tokens": output_tokens,
463-
}
466+
exp = runtime.storage_runtime().metadb.get_experiment(
467+
experiment_id=experiment_id
468+
)
469+
if exp.status in FINISHED_STATUS:
470+
if exp.usage and "total_tokens" in exp.usage:
471+
return {
472+
"total_tokens": exp.usage.get("total_tokens", 0),
473+
"input_tokens": exp.usage.get("input_tokens", 0),
474+
"output_tokens": exp.usage.get("output_tokens", 0),
475+
}
476+
else:
477+
usage = GraphQLResolvers.get_experiment_usage(experiment_id)
478+
runtime.storage_runtime().metadb.update_experiment(
479+
experiment_id=experiment_id, usage=usage
480+
)
481+
return usage
482+
else:
483+
return GraphQLResolvers.get_experiment_usage(experiment_id)
464484
except Exception as e:
465485
import logging
466486

@@ -469,6 +489,34 @@ def aggregate_experiment_tokens(experiment_id: strawberry.ID) -> dict[str, int]:
469489
)
470490
return {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
471491

492+
@staticmethod
493+
def get_experiment_usage(experiment_id: strawberry.ID):
494+
trace_store = runtime.storage_runtime().tracestore
495+
# Get all LLM spans for this experiment in a single query
496+
spans = trace_store.get_llm_spans_by_exp_id(experiment_id)
497+
# Don't close - it's a shared singleton connection
498+
499+
total_tokens = 0
500+
input_tokens = 0
501+
output_tokens = 0
502+
503+
for span in spans:
504+
span_attrs = span.get("SpanAttributes", {})
505+
506+
# Aggregate tokens from LLM spans
507+
if "llm.usage.total_tokens" in span_attrs:
508+
total_tokens += int(span_attrs["llm.usage.total_tokens"])
509+
if "gen_ai.usage.input_tokens" in span_attrs:
510+
input_tokens += int(span_attrs["gen_ai.usage.input_tokens"])
511+
if "gen_ai.usage.output_tokens" in span_attrs:
512+
output_tokens += int(span_attrs["gen_ai.usage.output_tokens"])
513+
514+
return {
515+
"total_tokens": total_tokens,
516+
"input_tokens": input_tokens,
517+
"output_tokens": output_tokens,
518+
}
519+
472520
@staticmethod
473521
def list_spans(run_id: strawberry.ID) -> list[Span]:
474522
"""List all spans for a specific run."""

alphatrion/server/graphql/types.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,11 +125,10 @@ class Experiment:
125125
params: JSON | None
126126
duration: float
127127
status: GraphQLStatusEnum
128+
cost: JSON | None
128129
created_at: datetime
129130
updated_at: datetime
130131

131-
_token_cache: strawberry.Private[dict[str, int] | None] = None
132-
133132
@strawberry.field
134133
def labels(self) -> list[Label]:
135134
from .resolvers import GraphQLResolvers
@@ -163,10 +162,9 @@ class Run:
163162
meta: JSON | None
164163
duration: float
165164
status: GraphQLStatusEnum
165+
cost: JSON | None
166166
created_at: datetime
167167

168-
_token_cache: strawberry.Private[dict[str, int] | None] = None
169-
170168
@strawberry.field
171169
def metrics(self) -> list["Metric"]:
172170
"""Get metrics for this run."""

alphatrion/storage/sql_models.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,17 @@ class Experiment(Base):
146146
0: UNKNOWN, 1: PENDING, 2: RUNNING, 9: COMPLETED, \
147147
10: CANCELLED, 11: FAILED",
148148
)
149+
usage = Column(
150+
MutableDict.as_mutable(JSON),
151+
nullable=True,
152+
comment="The usage information, e.g. for LLM calls: \
153+
{total_tokens: int, input_tokens: int, output_tokens: int}",
154+
)
155+
cost = Column(
156+
MutableDict.as_mutable(JSON),
157+
nullable=True,
158+
comment="Cost of the experiment in dollars",
159+
)
149160

150161
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(UTC))
151162
updated_at = Column(
@@ -190,6 +201,17 @@ class Run(Base):
190201
0: UNKNOWN, 1: PENDING, 2: RUNNING, 9: COMPLETED, \
191202
10: CANCELLED, 11: FAILED",
192203
)
204+
usage = Column(
205+
MutableDict.as_mutable(JSON),
206+
nullable=True,
207+
comment="The usage information, e.g. for LLM calls: \
208+
{total_tokens: int, input_tokens: int, output_tokens: int}",
209+
)
210+
cost = Column(
211+
MutableDict.as_mutable(JSON),
212+
nullable=True,
213+
comment="Cost of the run in dollars",
214+
)
193215

194216
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(UTC))
195217
updated_at = Column(

alphatrion/storage/sqlstore.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ def get_run(self, run_id: uuid.UUID) -> Run | None:
658658

659659
def list_runs_by_exp_id(
660660
self,
661-
exp_id: uuid.UUID,
661+
experiment_id: uuid.UUID,
662662
page: int = 0,
663663
page_size: int = 10,
664664
order_by: str = "created_at",
@@ -667,7 +667,7 @@ def list_runs_by_exp_id(
667667
session = self._session()
668668
runs = (
669669
session.query(Run)
670-
.filter(Run.experiment_id == exp_id, Run.is_del == 0)
670+
.filter(Run.experiment_id == experiment_id, Run.is_del == 0)
671671
.order_by(
672672
getattr(Run, order_by).desc() if order_desc else getattr(Run, order_by)
673673
)
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""add usage and cost fields
2+
3+
Revision ID: fd4984c761c2
4+
Revises: 0f417c7cf4d3
5+
Create Date: 2026-03-07 09:24:35.803615
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = 'fd4984c761c2'
16+
down_revision: Union[str, Sequence[str], None] = '0f417c7cf4d3'
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
22+
"""Upgrade schema."""
23+
# ### commands auto generated by Alembic - please adjust! ###
24+
op.add_column('experiments', sa.Column('usage', sa.JSON(), nullable=True, comment='The usage information, e.g. for LLM calls: {total_tokens: int, input_tokens: int, output_tokens: int}'))
25+
op.add_column('experiments', sa.Column('cost', sa.JSON(), nullable=True, comment='Cost of the experiment in dollars'))
26+
op.add_column('runs', sa.Column('usage', sa.JSON(), nullable=True, comment='The usage information, e.g. for LLM calls: {total_tokens: int, input_tokens: int, output_tokens: int}'))
27+
op.add_column('runs', sa.Column('cost', sa.JSON(), nullable=True, comment='Cost of the run in dollars'))
28+
# ### end Alembic commands ###
29+
30+
31+
def downgrade() -> None:
32+
"""Downgrade schema."""
33+
# ### commands auto generated by Alembic - please adjust! ###
34+
op.drop_column('runs', 'cost')
35+
op.drop_column('runs', 'usage')
36+
op.drop_column('experiments', 'cost')
37+
op.drop_column('experiments', 'usage')
38+
# ### end Alembic commands ###

tests/integration/server/test_graphql_mutation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -328,8 +328,8 @@ def test_add_user_to_team_with_invalid_user():
328328
assert "not found" in str(response.errors[0])
329329

330330

331-
def test_complete_workflow():
332-
"""Test complete workflow: create team, create user, add user to teams"""
331+
def test_user_workflow():
332+
"""Test user workflow: create team, create user, add user to teams"""
333333
runtime.init()
334334

335335
username = unique_username("alice")

0 commit comments

Comments (0)