feat(trace): per-model cost estimation + fix token extraction

kunalkushwaha · claude · kunalkushwaha · commit c24ebdd12a28 · 2026-06-22T00:06:43.000+09:00
The trace fallback path read token attributes under keys the framework never
emits (llm.usage.completion_tokens / llm.completion_tokens), so TotalTokens —
and therefore the cost estimate — was always 0 for runs without a manifest.

- Add internal/pricing: a published-list-price table (per 1M tokens) with
  exact + longest-prefix model matching (handles dated/variant ids and provider
  prefixes); local/unknown models return (0, false). Unit tested.
- Read the real AgenticGoKit observability keys (agk.llm.tokens.input/output/total,
  agk.llm.model, agk.llm.cost.usd) plus legacy aliases for back-compat.
- Replace the hardcoded `tokens * 0.00001` estimate: prefer a directly-reported
  cost, else a price-table estimate, else 0 for unknown/local models.

Improves the accuracy of `agk trace list/view` and the `agk run` trace summary.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/cmd/trace.go b/cmd/trace.go
@@ -13,6 +13,7 @@ import (
 	"time"
 
 	"github.com/agenticgokit/agk/internal/audit"
+	"github.com/agenticgokit/agk/internal/pricing"
 	"github.com/agenticgokit/agk/internal/tui"
 	tea "github.com/charmbracelet/bubbletea"
 	"github.com/spf13/cobra"
@@ -637,7 +638,6 @@ func parseTraceFile(runPath string) (TraceRun, error) {
 	}
 
 	durationSeconds := stats.LastSpan.Sub(stats.FirstSpan).Seconds()
-	estimatedCost := float64(stats.TotalTokens) * 0.00001 // Rough estimate
 
 	return TraceRun{
 		RunID:         runID,
@@ -648,17 +648,22 @@ func parseTraceFile(runPath string) (TraceRun, error) {
 		Duration:      durationSeconds,
 		SpanCount:     stats.SpanCount,
 		LLMCalls:      stats.LLMCalls,
-		TotalTokens:   stats.TotalTokens,
-		EstimatedCost: estimatedCost,
+		TotalTokens:   stats.Tokens(),
+		EstimatedCost: stats.Cost(),
 	}, nil
 }
 
+// RunStats accumulates per-run totals from the spans passed to Update.
 type RunStats struct {
-	SpanCount   int
-	LLMCalls    int
-	TotalTokens int
-	FirstSpan   time.Time
-	LastSpan    time.Time
+	SpanCount     int
+	LLMCalls      int
+	InputTokens   int     // summed from input/prompt token attributes
+	OutputTokens  int     // summed from output/completion token attributes
+	TotalTokens   int     // summed from explicit total-token attributes only
+	Model         string  // most recent non-empty agk.llm.model value
+	DirectCost    float64 // summed from agk.llm.cost.usd attributes, if emitted
+	HasDirectCost bool    // true once any agk.llm.cost.usd value has parsed
+	FirstSpan     time.Time
+	LastSpan      time.Time
 }
 
 func (s *RunStats) Update(span map[string]interface{}) {
@@ -671,34 +676,83 @@ func (s *RunStats) Update(span map[string]interface{}) {
 		}
 	}
 
-	// Extract token count from attributes
+	// Extract token/model/cost data from attributes
 	if attrs, ok := span["Attributes"].([]interface{}); ok {
-		s.extractTokens(attrs)
+		s.extractAttrs(attrs)
 	}
 
 	// Extract start and end times
 	s.updateTimes(span)
 }
 
-func (s *RunStats) extractTokens(attrs []interface{}) {
+// extractAttrs pulls token, model, and cost data from a span's attributes.
+// It recognizes the AgenticGoKit observability keys (agk.llm.tokens.*, agk.llm.model,
+// agk.llm.cost.usd) plus a few legacy aliases for backward compatibility.
+//
+// Token counts and costs are added to the running totals; the model is
+// overwritten by each non-empty value. Attributes that are malformed, or whose
+// value fails conversion, are skipped silently.
+func (s *RunStats) extractAttrs(attrs []interface{}) {
 	for _, attr := range attrs {
-		if attrMap, ok := attr.(map[string]interface{}); ok {
-			if key, ok := attrMap["Key"].(string); ok {
-				// Look for token-related attributes
-				if key == "llm.usage.completion_tokens" || key == "llm.completion_tokens" {
-					if val, ok := attrMap["Value"].(map[string]interface{}); ok {
-						if tokenVal, ok := val["Value"]; ok {
-							if tokenInt, err := toInt64(tokenVal); err == nil {
-								s.TotalTokens += int(tokenInt)
-							}
-						}
-					}
-				}
+		attrMap, ok := attr.(map[string]interface{})
+		if !ok {
+			continue
+		}
+		key, ok := attrMap["Key"].(string)
+		if !ok {
+			continue
+		}
+		val, ok := attrMap["Value"].(map[string]interface{})
+		if !ok {
+			continue
+		}
+		raw := val["Value"] // the scalar sits one level down, under a nested "Value" key
+
+		// NOTE(review): every matching attribute is added independently, so a span
+		// carrying both a canonical key and its legacy alias is counted twice —
+		// confirm the framework never emits both on one span.
+		switch key {
+		case "agk.llm.tokens.input", "agk.llm.tokens.prompt",
+			"llm.usage.prompt_tokens", "llm.prompt_tokens":
+			if n, err := toInt64(raw); err == nil {
+				s.InputTokens += int(n)
+			}
+		case "agk.llm.tokens.output", "agk.llm.tokens.completion",
+			"llm.usage.completion_tokens", "llm.completion_tokens":
+			if n, err := toInt64(raw); err == nil {
+				s.OutputTokens += int(n)
+			}
+		case "agk.llm.tokens.total", "llm.usage.total_tokens":
+			if n, err := toInt64(raw); err == nil {
+				s.TotalTokens += int(n)
+			}
+		case "agk.llm.cost.usd":
+			if f, err := toFloat64(raw); err == nil {
+				s.DirectCost += f
+				s.HasDirectCost = true
+			}
+		case "agk.llm.model":
+			// NOTE(review): the last non-empty model wins, and Cost prices all
+			// accumulated tokens with that single model — confirm runs never
+			// mix models.
+			if str, ok := raw.(string); ok && str != "" {
+				s.Model = str
 			}
 		}
 	}
 }
 
+// Tokens reports the run's total token count. A positive explicit total in
+// TotalTokens wins; otherwise the total is InputTokens plus OutputTokens.
+func (s *RunStats) Tokens() int {
+	derived := s.InputTokens + s.OutputTokens
+	if s.TotalTokens <= 0 {
+		return derived
+	}
+	return s.TotalTokens
+}
+
+// Cost reports the run's estimated cost in USD. A cost reported directly by
+// the trace takes precedence; failing that, the price table is consulted with
+// the recorded model and input/output token counts. Models the price table
+// does not know yield 0.
+func (s *RunStats) Cost() float64 {
+	if !s.HasDirectCost {
+		estimate, priced := pricing.Estimate(s.Model, s.InputTokens, s.OutputTokens)
+		if !priced {
+			return 0
+		}
+		return estimate
+	}
+	return s.DirectCost
+}
+
 func (s *RunStats) updateTimes(span map[string]interface{}) {
 	// Extract start and end times from span
 	// Format: "2026-01-19T18:36:38.897+09:00"
@@ -741,6 +795,23 @@ func toInt64(v interface{}) (int64, error) {
 	}
 }
 
+// toFloat64 converts a loosely-typed value to float64. It accepts float64,
+// int, int64, and strings parseable by strconv.ParseFloat; any other dynamic
+// type (including nil) yields an error naming that type.
+func toFloat64(v interface{}) (float64, error) {
+	switch n := v.(type) {
+	case string:
+		return strconv.ParseFloat(n, 64)
+	case int64:
+		return float64(n), nil
+	case int:
+		return float64(n), nil
+	case float64:
+		return n, nil
+	}
+	return 0, fmt.Errorf("cannot convert %T to float64", v)
+}
+
 func getLatestRunID() string {
 	entries, err := os.ReadDir(runsDirName)
 	if err != nil {
diff --git a/internal/pricing/pricing.go b/internal/pricing/pricing.go
@@ -0,0 +1,83 @@
+// Package pricing provides approximate USD cost estimates for LLM token usage.
+//
+// Prices are published list prices per 1,000,000 tokens and are intended for
+// rough cost reporting in traces, not billing. They drift over time; update the
+// table as providers change pricing. Local models (Ollama, etc.) are treated as
+// free and intentionally absent from the table.
+package pricing
+
+import "strings"
+
+// ModelPrice is the USD price per 1,000,000 tokens, split by input and output.
+type ModelPrice struct {
+	InputPer1M  float64 // USD charged per 1,000,000 input tokens
+	OutputPer1M float64 // USD charged per 1,000,000 output tokens
+}
+
+// table maps a normalized model key to its price. Lookups use exact match first,
+// then the longest matching prefix, so dated/variant model ids (e.g.
+// "gpt-4o-2024-08-06", "claude-sonnet-4-20250514") resolve to their base model.
+// Keys must be lowercase with no provider prefix, because lookup normalizes
+// the requested id that way before consulting the table.
+var table = map[string]ModelPrice{
+	// OpenAI
+	"gpt-4o":        {InputPer1M: 2.50, OutputPer1M: 10.00},
+	"gpt-4o-mini":   {InputPer1M: 0.15, OutputPer1M: 0.60},
+	"gpt-4-turbo":   {InputPer1M: 10.00, OutputPer1M: 30.00},
+	"gpt-4":         {InputPer1M: 30.00, OutputPer1M: 60.00},
+	"gpt-3.5-turbo": {InputPer1M: 0.50, OutputPer1M: 1.50},
+	"o1":            {InputPer1M: 15.00, OutputPer1M: 60.00},
+	"o1-mini":       {InputPer1M: 1.10, OutputPer1M: 4.40},
+	"o3-mini":       {InputPer1M: 1.10, OutputPer1M: 4.40},
+
+	// Anthropic
+	"claude-3-5-sonnet": {InputPer1M: 3.00, OutputPer1M: 15.00},
+	"claude-3-5-haiku":  {InputPer1M: 0.80, OutputPer1M: 4.00},
+	"claude-3-opus":     {InputPer1M: 15.00, OutputPer1M: 75.00},
+	"claude-3-sonnet":   {InputPer1M: 3.00, OutputPer1M: 15.00},
+	"claude-3-haiku":    {InputPer1M: 0.25, OutputPer1M: 1.25},
+	"claude-sonnet-4":   {InputPer1M: 3.00, OutputPer1M: 15.00},
+	"claude-opus-4":     {InputPer1M: 15.00, OutputPer1M: 75.00},
+	"claude-haiku-4":    {InputPer1M: 1.00, OutputPer1M: 5.00},
+}
+
+// Estimate prices the given token usage for model using the package price
+// table. The boolean reports whether the model resolved to a table entry; when
+// it did not (unknown or local models) the result is (0, false), so callers
+// can decide how to present an unpriced run. A matched model with zero tokens
+// yields (0, true).
+func Estimate(model string, inputTokens, outputTokens int) (float64, bool) {
+	price, found := lookup(model)
+	if !found {
+		return 0, false
+	}
+	// Prices are quoted per 1,000,000 tokens, hence the 1e6 divisor.
+	inputCost := float64(inputTokens) / 1e6 * price.InputPer1M
+	outputCost := float64(outputTokens) / 1e6 * price.OutputPer1M
+	return inputCost + outputCost, true
+}
+
+// lookup resolves a model id to its price entry. The id is normalized first;
+// an exact table hit wins, otherwise the longest table key that the id extends
+// with a variant suffix is used. It returns false for empty ids and for ids
+// that match no key.
+func lookup(model string) (ModelPrice, bool) {
+	key := normalize(model)
+	if key == "" {
+		return ModelPrice{}, false
+	}
+	if p, ok := table[key]; ok {
+		return p, true
+	}
+	// Longest-prefix match handles dated/variant ids. Two distinct keys of the
+	// same length cannot both prefix one id, so map iteration order is harmless.
+	var best string
+	for k := range table {
+		if len(k) > len(best) && isVariantOf(key, k) {
+			best = k
+		}
+	}
+	if best == "" {
+		return ModelPrice{}, false
+	}
+	return table[best], true
+}
+
+// isVariantOf reports whether id is base followed by a variant suffix that
+// starts with a separator ('-', '_', ':' or '@'). Requiring the separator keeps
+// a different model family that merely shares leading characters (e.g.
+// "gpt-4.1" vs "gpt-4") from being priced as the base model.
+func isVariantOf(id, base string) bool {
+	if len(id) <= len(base) || !strings.HasPrefix(id, base) {
+		return false
+	}
+	switch id[len(base)] {
+	case '-', '_', ':', '@':
+		return true
+	}
+	return false
+}
+
+// normalize canonicalizes a model id for table lookup: surrounding whitespace
+// is trimmed, the id is lowercased, and everything up to and including the
+// last "/" is dropped (e.g. "openai/gpt-4o" -> "gpt-4o").
+func normalize(model string) string {
+	id := strings.TrimSpace(model)
+	id = strings.ToLower(id)
+	slash := strings.LastIndex(id, "/")
+	if slash < 0 {
+		return id
+	}
+	return id[slash+1:]
+}
diff --git a/internal/pricing/pricing_test.go b/internal/pricing/pricing_test.go
@@ -0,0 +1,51 @@
+package pricing
+
+import "testing"
+
+// TestEstimate covers exact matches, dated/variant ids, provider prefixes,
+// case-insensitivity, and unknown or empty models. Every cost is compared with
+// a 1e-9 tolerance because the estimate is floating-point arithmetic.
+func TestEstimate(t *testing.T) {
+	tests := []struct {
+		name    string
+		model   string
+		in      int
+		out     int
+		want    float64
+		matched bool
+	}{
+		{name: "gpt-4o exact", model: "gpt-4o", in: 1_000_000, out: 1_000_000, want: 12.50, matched: true},
+		{name: "gpt-4o-mini", model: "gpt-4o-mini", in: 1_000_000, out: 0, want: 0.15, matched: true},
+		{name: "gpt-4o dated suffix", model: "gpt-4o-2024-08-06", in: 1_000_000, out: 0, want: 2.50, matched: true},
+		{name: "longest prefix prefers mini", model: "gpt-4o-mini-2024-07-18", in: 1_000_000, out: 0, want: 0.15, matched: true},
+		{name: "provider prefix stripped", model: "openai/gpt-4o", in: 0, out: 1_000_000, want: 10.00, matched: true},
+		{name: "claude sonnet 4 dated", model: "claude-sonnet-4-20250514", in: 1_000_000, out: 0, want: 3.00, matched: true},
+		{name: "case insensitive", model: "GPT-4O", in: 1_000_000, out: 0, want: 2.50, matched: true},
+		{name: "zero tokens still matched", model: "gpt-4o", in: 0, out: 0, want: 0, matched: true},
+		{name: "local model unknown", model: "llama3.2", in: 1_000_000, out: 1_000_000, want: 0, matched: false},
+		{name: "empty model", model: "", in: 100, out: 100, want: 0, matched: false},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, matched := Estimate(tc.model, tc.in, tc.out)
+			if matched != tc.matched {
+				t.Fatalf("matched = %v, want %v", matched, tc.matched)
+			}
+			if diff := got - tc.want; diff > 1e-9 || diff < -1e-9 {
+				t.Fatalf("cost = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+func TestEstimatePartialTokens(t *testing.T) {
+	// 1,500 input + 500 output on gpt-4o:
+	// 1500/1e6*2.50 + 500/1e6*10.00 = 0.00375 + 0.005 = 0.00875
+	got, matched := Estimate("gpt-4o", 1500, 500)
+	if !matched {
+		t.Fatal("expected gpt-4o to match")
+	}
+	want := 0.00875
+	if diff := got - want; diff > 1e-9 || diff < -1e-9 {
+		t.Fatalf("cost = %v, want %v", got, want)
+	}
+}