fix(069-A2): count blocked attempts + wire observability hot-reload

Dumbris · Paperclip-Paperclip · Dumbris · commit 17d85375e5e3 · 2026-06-01T16:05:01.000+03:00
Addresses CodexReviewer findings #2 and #3 on PR #560 (the O(1) fix cleared #1 once CI went green).

#2 — blocked tool attempts were missing from the usage aggregate. They are persisted as blocked `policy_decision` records, but `Apply` dropped all non-tool_call records and `handlePolicyDecision` never fed the aggregate, so the contract's per-tool `blocked` field was permanently 0. `Apply` now also folds blocked policy_decisions: a blocked attempt never executed, so it increments only `Blocked` + `LastUsed` — not `Calls`, latency, bytes, or the executed-call timeline. `handlePolicyDecision` calls `usage.Apply` on save success so the live path matches a cold-start rebuild-from-scan. Extracted a `tool()` get-or-create helper.

#3 — `observability.usage_persist_interval` claimed hot-reload but was only read at construction. `DetectConfigChanges` now flags an `observability` change and `ApplyConfig` pushes the new cadence into the running ActivityService via `SetUsagePersistInterval` (the flush loop already re-reads the interval each cycle).

Test-first (ENG-1): aggregate counts blocked-only (not Calls/latency/timeline); live `handlePolicyDecision` folds blocked into the snapshot; `DetectConfigChanges` detects observability as hot-reloadable; end-to-end `ApplyConfig` applies the new interval to a running runtime. Repointed the "ignores non-tool_calls" test to a non-blocked decision. Contract documents `blocked` semantics.

Full internal/runtime+config+storage -race green; lint 0; personal+server builds.

Related #560
Related MCP-835
Co-Authored-By: Paperclip <noreply@paperclip.ing>
diff --git a/internal/runtime/activity_service.go b/internal/runtime/activity_service.go
@@ -362,6 +362,15 @@ func (s *ActivityService) handlePolicyDecision(evt Event) {
 			zap.Error(err),
 			zap.String("server_name", serverName),
 			zap.String("decision", decision))
+		return
+	}
+
+	// Fold blocked attempts into the usage aggregate (Spec 069 A2). Apply
+	// ignores non-blocked decisions, so passing every policy decision is safe.
+	// Done only on save success so the in-memory rollup stays consistent with a
+	// cold-start rebuild that re-scans persisted records.
+	if s.usage != nil {
+		s.usage.Apply(record)
 	}
 }
 
diff --git a/internal/runtime/apply_config_restart_test.go b/internal/runtime/apply_config_restart_test.go
@@ -5,6 +5,7 @@ import (
 	"path/filepath"
 	"runtime"
 	"testing"
+	"time"
 
 	"go.uber.org/zap"
 
@@ -107,6 +108,41 @@ func TestApplyConfig_HotReloadableChange(t *testing.T) {
 	assert.Equal(t, 20, savedCfg.ToolsLimit, "Config file should be updated with new ToolsLimit value")
 }
 
+// TestApplyConfig_ObservabilityHotReload (MCP-835 / Codex finding #3): changing
+// the observability usage persist interval must hot-reload into the running
+// ActivityService — previously ApplyConfig only handled logging/truncator, so
+// SetUsagePersistInterval's "hot-reloadable" promise was unfulfilled.
+func TestApplyConfig_ObservabilityHotReload(t *testing.T) {
+	tmpDir := t.TempDir()
+	cfgPath := filepath.Join(tmpDir, "config.json")
+
+	initialCfg := config.DefaultConfig()
+	initialCfg.Listen = "127.0.0.1:8080"
+	initialCfg.DataDir = tmpDir
+	require.NoError(t, config.SaveConfig(initialCfg, cfgPath))
+
+	rt, err := New(initialCfg, cfgPath, zap.NewNop())
+	require.NoError(t, err)
+	defer func() { _ = rt.Close() }()
+
+	// Default cadence is 30s before the reload.
+	require.Equal(t, DefaultUsagePersistInterval, rt.ActivityService().usagePersistInterval())
+
+	newCfg := config.DefaultConfig()
+	newCfg.Listen = "127.0.0.1:8080"
+	newCfg.DataDir = tmpDir
+	newCfg.Observability.UsagePersistInterval = config.Duration(10 * time.Second)
+
+	result, err := rt.ApplyConfig(newCfg, cfgPath)
+	require.NoError(t, err)
+	require.NotNil(t, result)
+
+	assert.False(t, result.RequiresRestart, "observability cadence change is hot-reloadable")
+	assert.Contains(t, result.ChangedFields, "observability")
+	assert.Equal(t, 10*time.Second, rt.ActivityService().usagePersistInterval(),
+		"new persist interval must be applied to the running ActivityService")
+}
+
 // TestApplyConfig_SaveFailure tests handling of save errors
 func TestApplyConfig_SaveFailure(t *testing.T) {
 	// Skip on Windows: chmod on directories doesn't reliably prevent file creation
diff --git a/internal/runtime/config_hotreload.go b/internal/runtime/config_hotreload.go
@@ -133,6 +133,12 @@ func DetectConfigChanges(oldCfg, newCfg *config.Config) *ConfigApplyResult {
 		result.ChangedFields = append(result.ChangedFields, "environment")
 	}
 
+	// Observability cadence (Spec 069 A2 — can be hot-reloaded; the usage flush
+	// loop re-reads the interval each cycle, so applying it is just a setter).
+	if !reflect.DeepEqual(oldCfg.Observability, newCfg.Observability) {
+		result.ChangedFields = append(result.ChangedFields, "observability")
+	}
+
 	// If no changes detected
 	if len(result.ChangedFields) == 0 {
 		result.AppliedImmediately = false
diff --git a/internal/runtime/config_hotreload_test.go b/internal/runtime/config_hotreload_test.go
@@ -9,6 +9,32 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+// TestDetectConfigChanges_Observability (MCP-835 / Codex finding #3): changing
+// the observability usage cadence must be detected as a hot-reloadable change so
+// ApplyConfig can push the new persist interval to the running ActivityService.
+// SetUsagePersistInterval advertises hot-reload; the detector must back it.
+func TestDetectConfigChanges_Observability(t *testing.T) {
+	base := &config.Config{
+		Listen: "127.0.0.1:8080", DataDir: "/d", TLS: &config.TLSConfig{},
+		Observability: &config.ObservabilityConfig{
+			UsageCacheTTL:        config.Duration(5 * time.Second),
+			UsagePersistInterval: config.Duration(30 * time.Second),
+		},
+	}
+	changed := &config.Config{
+		Listen: "127.0.0.1:8080", DataDir: "/d", TLS: &config.TLSConfig{},
+		Observability: &config.ObservabilityConfig{
+			UsageCacheTTL:        config.Duration(5 * time.Second),
+			UsagePersistInterval: config.Duration(10 * time.Second),
+		},
+	}
+
+	result := DetectConfigChanges(base, changed)
+	require.True(t, result.Success)
+	assert.Contains(t, result.ChangedFields, "observability")
+	assert.False(t, result.RequiresRestart, "cadence change is hot-reloadable")
+}
+
 func TestDetectConfigChanges(t *testing.T) {
 	baseConfig := &config.Config{
 		Listen:            "127.0.0.1:8080",
@@ -49,7 +75,7 @@ func TestDetectConfigChanges(t *testing.T) {
 				Listen:            ":9090", // Changed
 				DataDir:           "/test/data",
 				APIKey:            "test-key",
-		ToolsLimit:        15,
+				ToolsLimit:        15,
 				ToolResponseLimit: 1000,
 				CallToolTimeout:   config.Duration(60 * time.Second),
 				Servers:           []*config.ServerConfig{},
@@ -67,7 +93,7 @@ func TestDetectConfigChanges(t *testing.T) {
 				Listen:            "127.0.0.1:8080",
 				DataDir:           "/different/data", // Changed
 				APIKey:            "test-key",
-		ToolsLimit:        15,
+				ToolsLimit:        15,
 				ToolResponseLimit: 1000,
 				CallToolTimeout:   config.Duration(60 * time.Second),
 				Servers:           []*config.ServerConfig{},
@@ -85,7 +111,7 @@ func TestDetectConfigChanges(t *testing.T) {
 				Listen:            "127.0.0.1:8080",
 				DataDir:           "/test/data",
 				APIKey:            "new-key", // Changed
-		ToolsLimit:        15,
+				ToolsLimit:        15,
 				ToolResponseLimit: 1000,
 				CallToolTimeout:   config.Duration(60 * time.Second),
 				Servers:           []*config.ServerConfig{},
@@ -103,7 +129,7 @@ func TestDetectConfigChanges(t *testing.T) {
 				Listen:            "127.0.0.1:8080",
 				DataDir:           "/test/data",
 				APIKey:            "test-key",
-		ToolsLimit:        15,
+				ToolsLimit:        15,
 				ToolResponseLimit: 1000,
 				CallToolTimeout:   config.Duration(60 * time.Second),
 				Servers:           []*config.ServerConfig{},
@@ -124,7 +150,7 @@ func TestDetectConfigChanges(t *testing.T) {
 				Listen:            "127.0.0.1:8080",
 				DataDir:           "/test/data",
 				APIKey:            "test-key",
-		ToolsLimit:        20, // Changed
+				ToolsLimit:        20, // Changed
 				ToolResponseLimit: 1000,
 				CallToolTimeout:   config.Duration(60 * time.Second),
 				Servers:           []*config.ServerConfig{},
@@ -144,7 +170,7 @@ func TestDetectConfigChanges(t *testing.T) {
 				Listen:            "127.0.0.1:8080",
 				DataDir:           "/test/data",
 				APIKey:            "test-key",
-		ToolsLimit:        15,
+				ToolsLimit:        15,
 				ToolResponseLimit: 1000,
 				CallToolTimeout:   config.Duration(60 * time.Second),
 				Servers: []*config.ServerConfig{ // Changed
diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go
@@ -1267,6 +1267,15 @@ func (r *Runtime) ApplyConfig(newCfg *config.Config, cfgPath string) (*ConfigApp
 		r.truncator = truncate.NewTruncator(newCfg.ToolResponseLimit)
 	}
 
+	// Apply observability usage cadence (Spec 069 A2 — hot-reloadable). The
+	// usage flush loop re-reads the interval each cycle, so the setter suffices.
+	if contains(result.ChangedFields, "observability") && r.activityService != nil &&
+		newCfg.Observability != nil && newCfg.Observability.UsagePersistInterval.Duration() > 0 {
+		r.logger.Info("Observability usage persist interval changed",
+			zap.Duration("new_interval", newCfg.Observability.UsagePersistInterval.Duration()))
+		r.activityService.SetUsagePersistInterval(newCfg.Observability.UsagePersistInterval.Duration())
+	}
+
 	// Capture app context, config path, and config copy while we still hold the lock
 	appCtx := r.appCtx
 	cfgPathCopy := r.cfgPath
diff --git a/internal/runtime/usage_aggregate.go b/internal/runtime/usage_aggregate.go
@@ -147,30 +147,47 @@ func newUsageAggregate() *UsageAggregate {
 	}
 }
 
-// Apply folds a single activity record into the aggregate. Non tool_call
-// records and records without a tool name are ignored. Apply is not safe for
-// concurrent use; it is called only by the owning goroutine (see UsageStore).
-func (a *UsageAggregate) Apply(rec *storage.ActivityRecord) {
-	if rec == nil || rec.Type != storage.ActivityTypeToolCall || rec.ToolName == "" {
-		return
-	}
-
-	key := toolKey(rec.ServerName, rec.ToolName)
+// tool returns the per-(server,tool) rollup, creating it on first use. It also
+// defensively resizes a persisted snapshot from an older latency-bucket layout
+// rather than panicking on index.
+func (a *UsageAggregate) tool(server, toolName string) *ToolUsage {
+	key := toolKey(server, toolName)
 	tu := a.Tools[key]
 	if tu == nil {
 		tu = &ToolUsage{
-			Server:         rec.ServerName,
-			Tool:           rec.ToolName,
+			Server:         server,
+			Tool:           toolName,
 			LatencyBuckets: make([]int64, numLatencyBuckets()),
 		}
 		a.Tools[key] = tu
 	} else if len(tu.LatencyBuckets) != numLatencyBuckets() {
-		// Defensive: a persisted snapshot from an older bucket layout is
-		// resized rather than panicking on index.
 		resized := make([]int64, numLatencyBuckets())
 		copy(resized, tu.LatencyBuckets)
 		tu.LatencyBuckets = resized
 	}
+	return tu
+}
+
+// Apply folds a single activity record into the aggregate. It accepts executed
+// tool_calls and blocked policy_decisions (the form a policy-prevented tool
+// attempt is persisted as — MCP-835); all other records, and records without a
+// tool name, are ignored. Apply is not safe for concurrent use; it is called
+// only by the owning goroutine (see UsageStore).
+func (a *UsageAggregate) Apply(rec *storage.ActivityRecord) {
+	if rec == nil || rec.ToolName == "" {
+		return
+	}
+	switch {
+	case rec.Type == storage.ActivityTypeToolCall:
+		// folded below
+	case rec.Type == storage.ActivityTypePolicyDecision && rec.Status == "blocked":
+		a.applyBlocked(rec)
+		return
+	default:
+		return
+	}
+
+	tu := a.tool(rec.ServerName, rec.ToolName)
 
 	tu.Calls++
 	switch rec.Status {
@@ -195,6 +212,20 @@ func (a *UsageAggregate) Apply(rec *storage.ActivityRecord) {
 	a.applyTimeBucket(rec)
 }
 
+// applyBlocked folds a policy-blocked attempt into the per-tool Blocked counter.
+// A blocked attempt never executed the tool, so it contributes no Calls,
+// latency, or bytes, and does not enter the executed-call timeline — it only
+// bumps Blocked and LastUsed. This keeps the contract's per-tool `blocked`
+// count non-zero (the field was previously dead) without polluting latency
+// percentiles or byte averages with non-executed attempts.
+func (a *UsageAggregate) applyBlocked(rec *storage.ActivityRecord) {
+	tu := a.tool(rec.ServerName, rec.ToolName)
+	tu.Blocked++
+	if rec.Timestamp.After(tu.LastUsed) {
+		tu.LastUsed = rec.Timestamp
+	}
+}
+
 func (a *UsageAggregate) applyTimeBucket(rec *storage.ActivityRecord) {
 	start := rec.Timestamp.UTC().Truncate(usageBucketWidth)
 	k := start.Unix()
diff --git a/internal/runtime/usage_aggregate_test.go b/internal/runtime/usage_aggregate_test.go
@@ -56,13 +56,42 @@ func TestUsageAggregate_Apply_IgnoresNonToolCalls(t *testing.T) {
 	agg := newUsageAggregate()
 	ts := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
 
-	// Non tool_call records and empty tool names are ignored.
-	agg.Apply(&storage.ActivityRecord{Type: storage.ActivityTypePolicyDecision, ServerName: "x", ToolName: "y", Status: "blocked", Timestamp: ts})
+	// Non-blocked policy decisions and tool_calls with empty tool names are
+	// ignored. (Blocked policy decisions ARE counted — see the test below.)
+	agg.Apply(&storage.ActivityRecord{Type: storage.ActivityTypePolicyDecision, ServerName: "x", ToolName: "y", Status: "approved", Timestamp: ts})
 	agg.Apply(&storage.ActivityRecord{Type: storage.ActivityTypeToolCall, ServerName: "x", ToolName: "", Status: "success", Timestamp: ts})
 
 	assert.Empty(t, agg.Tools)
 }
 
+// TestUsageAggregate_Apply_CountsBlockedPolicyDecisions (MCP-835 / Codex
+// finding #2): blocked tool attempts are persisted as policy_decision records,
+// not tool_calls. The aggregate must still count them so the contract's
+// per-tool `blocked` field is non-zero. A blocked attempt never executed, so it
+// contributes ONLY to Blocked (and LastUsed) — not Calls, latency, bytes, or
+// the timeline (which tracks executed calls).
+func TestUsageAggregate_Apply_CountsBlockedPolicyDecisions(t *testing.T) {
+	agg := newUsageAggregate()
+	ts := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
+
+	agg.Apply(&storage.ActivityRecord{Type: storage.ActivityTypePolicyDecision, ServerName: "github", ToolName: "search", Status: "blocked", Timestamp: ts})
+	agg.Apply(&storage.ActivityRecord{Type: storage.ActivityTypePolicyDecision, ServerName: "github", ToolName: "search", Status: "blocked", Timestamp: ts.Add(time.Minute)})
+
+	tu := agg.Tools[toolKey("github", "search")]
+	require.NotNil(t, tu, "blocked attempts must create a per-tool entry")
+	assert.Equal(t, int64(2), tu.Blocked, "both blocked attempts counted")
+	assert.Equal(t, int64(0), tu.Calls, "blocked attempts are not executed calls")
+	assert.Equal(t, int64(0), tu.Errors)
+	assert.Equal(t, ts.Add(time.Minute), tu.LastUsed, "LastUsed tracks the latest attempt")
+
+	var latencyTotal int64
+	for _, c := range tu.LatencyBuckets {
+		latencyTotal += c
+	}
+	assert.Equal(t, int64(0), latencyTotal, "blocked attempts have no latency sample")
+	assert.Empty(t, agg.Buckets, "blocked attempts do not enter the executed-call timeline")
+}
+
 func TestToolUsage_Averages_ExcludeZeroByteCalls(t *testing.T) {
 	agg := newUsageAggregate()
 	ts := time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC)
diff --git a/internal/runtime/usage_service_test.go b/internal/runtime/usage_service_test.go
@@ -132,6 +132,34 @@ func TestActivityService_HandleEvent_AppliesToolCallToUsage(t *testing.T) {
 	assert.Equal(t, int64(128), tu.ReqBytesSum)
 }
 
+// TestActivityService_HandleEvent_CountsBlockedPolicyDecision (MCP-835 / Codex
+// finding #2): a blocked tool attempt is emitted as a policy_decision event,
+// not a tool_call. The live path must fold it into the usage aggregate so the
+// per-tool `blocked` count is non-zero — matching what a cold-start scan would
+// rebuild from the persisted record.
+func TestActivityService_HandleEvent_CountsBlockedPolicyDecision(t *testing.T) {
+	svc, _ := newUsageTestService(t)
+	evt := Event{
+		Type:      EventTypeActivityPolicyDecision,
+		Timestamp: time.Date(2026, 6, 1, 10, 0, 0, 0, time.UTC),
+		Payload: map[string]any{
+			"server_name": "github",
+			"tool_name":   "search",
+			"decision":    "blocked",
+			"reason":      "Server is quarantined for security review",
+		},
+	}
+
+	svc.handleEvent(evt)
+
+	snap := svc.UsageSnapshot()
+	require.NotNil(t, snap)
+	tu := snap.Tools[toolKey("github", "search")]
+	require.NotNil(t, tu, "blocked policy decision must reach the usage aggregate")
+	assert.Equal(t, int64(1), tu.Blocked)
+	assert.Equal(t, int64(0), tu.Calls, "blocked attempt is not an executed call")
+}
+
 func TestActivityService_SetUsagePersistInterval_HotReload(t *testing.T) {
 	svc, _ := newUsageTestService(t)
 	assert.Equal(t, DefaultUsagePersistInterval, svc.usagePersistInterval())
diff --git a/specs/069-observability-usage-graphs/contracts/usage-endpoint.md b/specs/069-observability-usage-graphs/contracts/usage-endpoint.md
@@ -55,6 +55,7 @@ Auth: `X-API-Key` (REST default).
 - `token_source: "bytes"` labels the size-based proxy (FR-006); FR-010 will switch this to `"estimated_tokens"`.
 - `tokens_saved*` echoed from existing `ServerTokenMetrics` (FR-007 / SC-008).
 - `avg_*` computed over `sized_calls` only (records with `0` bytes excluded); `null`/omitted when `sized_calls == 0`.
+- `blocked` counts policy-prevented attempts (persisted as blocked `policy_decision` records, not executed tool_calls). A blocked attempt never ran, so it is **not** included in `calls` and contributes no latency/bytes — it only increments `blocked` and `last_used`. The timeline tracks executed calls and excludes blocked attempts.
 - `other` present only when the tool list was truncated to `top`.
 - Empty log → `tools: []`, `timeline: []`, `tokens_saved` from metrics (or 0) — never an error (FR-009 / SC-007).