Skip to content

Commit 0c07267

Browse files
authored
Optimize the trace detail page (#163)
* When unset, take it as success Signed-off-by: kerthcet <kerthcet@gmail.com> * update the workflow icon Signed-off-by: kerthcet <kerthcet@gmail.com> * Use graphql for biz logic Signed-off-by: kerthcet <kerthcet@gmail.com> * add tests Signed-off-by: kerthcet <kerthcet@gmail.com> * update tests Signed-off-by: kerthcet <kerthcet@gmail.com> * update the layout of the sidebar Signed-off-by: kerthcet <kerthcet@gmail.com> * update the layout Signed-off-by: kerthcet <kerthcet@gmail.com> * update test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix lint Signed-off-by: kerthcet <kerthcet@gmail.com> * fix spans Signed-off-by: kerthcet <kerthcet@gmail.com> * fix lint Signed-off-by: kerthcet <kerthcet@gmail.com> * fix query error Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * fix test Signed-off-by: kerthcet <kerthcet@gmail.com> * use lock to fix concurrent problem Signed-off-by: kerthcet <kerthcet@gmail.com> * debug Signed-off-by: kerthcet <kerthcet@gmail.com> * disable batch in integration tests Signed-off-by: kerthcet <kerthcet@gmail.com> * use env for batch Signed-off-by: kerthcet <kerthcet@gmail.com> * debug Signed-off-by: kerthcet <kerthcet@gmail.com> * debug Signed-off-by: kerthcet <kerthcet@gmail.com> * remove execution result from metadata 
Signed-off-by: kerthcet <kerthcet@gmail.com> * fix Signed-off-by: kerthcet <kerthcet@gmail.com> * fix sidebar Signed-off-by: kerthcet <kerthcet@gmail.com> * optimize the layout of traces Signed-off-by: kerthcet <kerthcet@gmail.com> * optimize the layout of traces Signed-off-by: kerthcet <kerthcet@gmail.com> * optimize the layout Signed-off-by: kerthcet <kerthcet@gmail.com> * optimize the layout Signed-off-by: kerthcet <kerthcet@gmail.com> --------- Signed-off-by: kerthcet <kerthcet@gmail.com>
1 parent 1846a83 commit 0c07267

32 files changed

Lines changed: 693 additions & 600 deletions

.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ ALPHATRION_ARTIFACT_INSECURE=false
1515
# Tracing configurations
1616
ALPHATRION_ENABLE_TRACING=true
1717
ALPHATRION_CLICKHOUSE_INIT_TABLES=true
18+
ALPHATRION_CLICKHOUSE_ENABLE_BATCH=true

.env.integration-test

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ ALPHATRION_ARTIFACT_INSECURE=true
55
ALPHATRION_LOG_LEVEL=INFO
66
ALPHATRION_AUTO_CLEANUP=true
77
ALPHATRION_ENABLE_TRACING=true
8-
ALPHATRION_CLICKHOUSE_INIT_TABLES=true
8+
ALPHATRION_CLICKHOUSE_INIT_TABLES=true
9+
ALPHATRION_CLICKHOUSE_ENABLE_BATCH=true

alphatrion/envs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
CLICKHOUSE_DATABASE = "ALPHATRION_CLICKHOUSE_DATABASE"
1515
CLICKHOUSE_USERNAME = "ALPHATRION_CLICKHOUSE_USERNAME"
1616
CLICKHOUSE_PASSWORD = "ALPHATRION_CLICKHOUSE_PASSWORD"
17-
INIT_CLICKHOUSE_TABLES = "ALPHATRION_INIT_CLICKHOUSE_TABLES"
17+
CLICKHOUSE_ENABLE_BATCH = "ALPHATRION_CLICKHOUSE_ENABLE_BATCH"
1818

1919
# Dashboard only related envs
2020
DASHBOARD_USER_ID = "ALPHATRION_DASHBOARD_USER_ID"

alphatrion/server/graphql/resolvers.py

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -220,13 +220,23 @@ def get_run(id: strawberry.ID) -> Run | None:
220220
metadb = runtime.storage_runtime().metadb
221221
run = metadb.get_run(run_id=uuid.UUID(id))
222222
if run:
223+
meta = run.meta or {}
224+
225+
# Aggregate and cache tokens for completed runs.
226+
# It could be slow for the first time.
227+
if Status(run.status) == Status.COMPLETED and "total_tokens" not in meta:
228+
token_data = GraphQLResolvers.aggregate_run_tokens(run_id=id)
229+
if token_data["total_tokens"] > 0:
230+
meta.update(token_data)
231+
metadb.update_run(run_id=uuid.UUID(id), meta=meta)
232+
223233
return Run(
224234
id=run.uuid,
225235
team_id=run.team_id,
226236
user_id=run.user_id,
227237
project_id=run.project_id,
228238
experiment_id=run.experiment_id,
229-
meta=run.meta,
239+
meta=meta,
230240
status=GraphQLStatusEnum[Status(run.status).name],
231241
created_at=run.created_at,
232242
)
@@ -250,6 +260,24 @@ def list_exp_metrics(experiment_id: strawberry.ID) -> list[Metric]:
250260
for m in metrics
251261
]
252262

263+
@staticmethod
264+
def list_run_metrics(run_id: strawberry.ID) -> list[Metric]:
    """Return the metrics recorded for one run as GraphQL ``Metric`` objects.

    Args:
        run_id: Identifier of the run, either a string-like GraphQL ID or
            an already-parsed ``uuid.UUID``.

    Returns:
        One ``Metric`` per row returned by
        ``metadb.list_metrics_by_run_id``, in the order the store yields
        them.

    Raises:
        ValueError: If ``run_id`` is not a valid UUID string.
    """
    # Parse the ID the same way ``get_run`` does (``uuid.UUID(id)``) so the
    # metadata store always receives a real UUID rather than a raw string.
    run_uuid = run_id if isinstance(run_id, uuid.UUID) else uuid.UUID(str(run_id))

    metadb = runtime.storage_runtime().metadb
    return [
        Metric(
            id=m.uuid,
            key=m.key,
            value=m.value,
            team_id=m.team_id,
            project_id=m.project_id,
            experiment_id=m.experiment_id,
            run_id=m.run_id,
            created_at=m.created_at,
        )
        for m in metadb.list_metrics_by_run_id(run_id=run_uuid)
    ]
280+
253281
@staticmethod
254282
def total_projects(team_id: strawberry.ID) -> int:
255283
metadb = runtime.storage_runtime().metadb
@@ -373,8 +401,48 @@ async def get_artifact_content(
373401
raise RuntimeError(f"Failed to get artifact content: {e}") from e
374402

375403
@staticmethod
376-
def list_traces(run_id: strawberry.ID) -> list[Span]:
377-
"""List all traces/spans for a specific run."""
404+
def aggregate_run_tokens(run_id: strawberry.ID) -> dict[str, int]:
    """Sum LLM token usage over every span recorded for a run.

    Args:
        run_id: Identifier of the run (string-like GraphQL ID or
            ``uuid.UUID``).

    Returns:
        A dict with the integer keys ``total_tokens``, ``input_tokens`` and
        ``output_tokens``. All three are 0 when tracing is disabled or when
        anything goes wrong while reading or parsing the spans; failures
        are logged, never raised.
    """
    import logging

    from alphatrion import envs

    # Result key -> span attribute that carries the per-span count.
    attr_by_key = {
        "total_tokens": "llm.usage.total_tokens",
        "input_tokens": "gen_ai.usage.input_tokens",
        "output_tokens": "gen_ai.usage.output_tokens",
    }

    # Check if tracing is enabled
    if os.getenv(envs.ENABLE_TRACING, "false").lower() != "true":
        return dict.fromkeys(attr_by_key, 0)

    try:
        trace_store = runtime.storage_runtime().tracestore
        try:
            spans = trace_store.get_spans_by_run_id(uuid.UUID(str(run_id)))
        finally:
            # Release the store even when the query itself fails.
            # NOTE(review): this closes the store object handed out by the
            # storage runtime — confirm it tolerates reuse after close().
            trace_store.close()

        totals = dict.fromkeys(attr_by_key, 0)
        for span in spans:
            # Treat a missing or null attribute map as "no usage recorded".
            span_attrs = span.get("SpanAttributes") or {}
            for key, attr in attr_by_key.items():
                if attr in span_attrs:
                    totals[key] += int(span_attrs[attr])
        return totals
    except Exception as e:
        # Best effort: a failed aggregation must not break the caller, so
        # report zeros (any partially summed totals are discarded).
        logging.error("Failed to aggregate tokens for run %s: %s", run_id, e)
        return dict.fromkeys(attr_by_key, 0)
442+
443+
@staticmethod
444+
def list_spans(run_id: strawberry.ID) -> list[Span]:
445+
"""List all spans for a specific run."""
378446
from alphatrion import envs
379447

380448
# Check if tracing is enabled
@@ -385,12 +453,12 @@ def list_traces(run_id: strawberry.ID) -> list[Span]:
385453
trace_store = runtime.storage_runtime().tracestore
386454

387455
# Get spans from ClickHouse
388-
traces = trace_store.get_traces_by_run_id(uuid.UUID(run_id))
456+
raw_spans = trace_store.get_spans_by_run_id(uuid.UUID(run_id))
389457
trace_store.close()
390458

391459
# Convert to GraphQL Span objects
392460
spans = []
393-
for t in traces:
461+
for t in raw_spans:
394462
# Convert events
395463
events = []
396464
if t.get("Events"):

alphatrion/server/graphql/types.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,20 @@ class Run:
129129
status: GraphQLStatusEnum
130130
created_at: datetime
131131

132+
@strawberry.field
133+
def metrics(self) -> list["Metric"]:
    """Resolve the metrics recorded for this run."""
    # Deferred import — presumably avoids a circular import with the
    # resolvers module; confirm before moving it to module level.
    from alphatrion.server.graphql.resolvers import GraphQLResolvers as _resolvers

    run_metrics = _resolvers.list_run_metrics(run_id=self.id)
    return run_metrics
138+
139+
@strawberry.field
140+
def spans(self) -> list["Span"]:
    """Resolve the trace spans recorded for this run."""
    # Deferred import — presumably avoids a circular import with the
    # resolvers module; confirm before moving it to module level.
    from alphatrion.server.graphql.resolvers import GraphQLResolvers as _resolvers

    # The span resolver is handed the run ID in string form.
    run_key = str(self.id)
    return _resolvers.list_spans(run_id=run_key)
145+
132146

133147
@strawberry.type
134148
class Metric:

alphatrion/storage/runtime.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33

44
from opentelemetry import trace
5+
from opentelemetry.sdk.trace import TracerProvider
56
from traceloop.sdk import Traceloop
67

78
from alphatrion import envs
@@ -38,10 +39,13 @@ def __init__(self):
3839
== "true",
3940
)
4041

42+
enable_batch = (
43+
os.getenv(envs.CLICKHOUSE_ENABLE_BATCH, "true").lower() == "true"
44+
)
4145
Traceloop.init(
4246
app_name="alphatrion",
4347
exporter=ClickHouseSpanExporter(self.tracestore),
44-
disable_batch=False, # Enable batching
48+
disable_batch=not enable_batch,
4549
telemetry_enabled=False,
4650
)
4751

@@ -60,6 +64,12 @@ def metadb(self):
6064
def tracestore(self):
6165
return self._tracestore
6266

67+
def flush(self, timeout_millis: int = 5000):
    """Force the global tracer provider to export any pending spans.

    Does nothing when no trace store is configured, or when the global
    provider is not an SDK ``TracerProvider``.

    Args:
        timeout_millis: Upper bound in milliseconds handed to
            ``force_flush``. Defaults to 5000, the value that used to be
            hard-coded here.
    """
    if not self._tracestore:
        return

    tracer_provider = trace.get_tracer_provider()
    # Only call force_flush on the SDK provider type; other provider
    # objects are left untouched.
    if isinstance(tracer_provider, TracerProvider):
        tracer_provider.force_flush(timeout_millis=timeout_millis)
72+
6373

6474
def init():
6575
"""

alphatrion/storage/sqlstore.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,3 +695,14 @@ def list_metrics_by_experiment_id(self, experiment_id: uuid.UUID) -> list[Metric
695695
)
696696
session.close()
697697
return metrics
698+
699+
def list_metrics_by_run_id(self, run_id: uuid.UUID) -> list[Metric]:
    """Return every metric row belonging to a run, oldest first.

    Args:
        run_id: UUID of the run whose metrics are requested.

    Returns:
        The matching ``Metric`` rows ordered by ``created_at`` ascending;
        an empty list when the run has no metrics.
    """
    session = self._session()
    try:
        return (
            session.query(Metric)
            .filter(Metric.run_id == run_id)
            .order_by(Metric.created_at.asc())
            .all()
        )
    finally:
        # Close the session even when the query raises, so a failed
        # lookup does not leave it open.
        session.close()

0 commit comments

Comments
 (0)