feat(security): per-category precision/FP/F1 in scan-eval gate (T018)

Dumbris · Paperclip-Paperclip · Dumbris · commit b1406a8723a9 · 2026-06-28T07:34:19.000+03:00
CodexReviewer re-review of #777: T018 (tasks.md:75) requires `scan-eval --gate` to print per-category recall/precision/FP/F1, but categoryMetric only carried recall (precision/FP/F1 existed only as overall metrics).

- categoryMetric now carries hard_negatives, false_positives, fp_rate, precision, and f1 per category, populated in the gate computation and JSON.
- Per-category FP is attributed via a new `resembles` field on hard_negative corpus entries (the attack class a benign entry mimics — the SC-003 framing): a flagged hard-negative lowers its resembled category's precision. Clean-benign entries carry no `resembles` and affect only the overall benign FP count.
- detect_corpus_v1.json: every hard_negative now declares `resembles` (consistent with its hn_<class> id); the validator asserts it is set, names a gated category, and matches the id prefix.
- Extracted an f1() helper; overall F1 reuses it.
- Tests: TestGateMetrics_PerCategoryShapeAndFPAttribution proves the per-category JSON exposes recall/precision/FP/F1 and that a resembling hard-negative FP drops that category's precision (1 TP + 1 FP -> precision 0.5); TestEvaluateGateCorpus_DetectsAndExcludesUngated asserts recall/precision/f1 = 1.0 and FP = 0 for every gated category.

Committed corpus: recall 1.0 (16/16 gated), fp_rate 0/9; every gated category reports recall/precision/f1 = 1.0, FP 0.

Related #MCP-3579
Co-Authored-By: Paperclip <noreply@paperclip.ing>
diff --git a/cmd/scan-eval/gate.go b/cmd/scan-eval/gate.go
@@ -35,9 +35,13 @@ type gatePeer struct {
 // gateEntry is one labeled sample: a tool, its owning server, optional peers,
 // the ground-truth label/category, and redistributable provenance.
 type gateEntry struct {
-	ID         string     `json:"id"`
-	Label      string     `json:"label"`    // "malicious" | "benign"
-	Category   string     `json:"category"` // detect taxonomy or benign|hard_negative
+	ID       string `json:"id"`
+	Label    string `json:"label"`    // "malicious" | "benign"
+	Category string `json:"category"` // detect taxonomy or benign|hard_negative
+	// Resembles names the attack class a hard_negative mimics (e.g.
+	// "unicode_smuggling"), so a false positive on it counts toward that
+	// category's precision/FP (SC-003). Empty for clean-benign entries.
+	Resembles  string     `json:"resembles,omitempty"`
 	Server     string     `json:"server"`
 	Tool       gateTool   `json:"tool"`
 	Peers      []gatePeer `json:"peers,omitempty"`
@@ -78,13 +82,21 @@ func gateChecks() []detect.Check {
 	}
 }
 
-// categoryMetric is one category's per-run scorecard.
+// categoryMetric is one category's per-run scorecard (T018: per-category
+// recall/precision/FP/F1). Precision and FP are attributed via hard-negatives
+// that resemble this category (SC-003); a category with no resembling
+// hard-negatives reports zero FP.
 type categoryMetric struct {
-	Category  string  `json:"category"`
-	Gated     bool    `json:"gated"`     // is this category's check registered?
-	Malicious int     `json:"malicious"` // malicious samples in this category
-	Detected  int     `json:"detected"`  // malicious samples the engine flagged
-	Recall    float64 `json:"recall"`
+	Category       string  `json:"category"`
+	Gated          bool    `json:"gated"`     // is this category's check registered?
+	Malicious      int     `json:"malicious"` // malicious samples in this category
+	Detected       int     `json:"detected"`  // malicious samples the engine flagged (TP)
+	Recall         float64 `json:"recall"`
+	HardNegatives  int     `json:"hard_negatives"`  // resembling hard-negatives
+	FalsePositives int     `json:"false_positives"` // resembling hard-negatives flagged (FP)
+	FPRate         float64 `json:"fp_rate"`
+	Precision      float64 `json:"precision"` // TP / (TP + FP)
+	F1             float64 `json:"f1"`
 }
 
 // gateMetrics is the full metrics report emitted for the CI log.
@@ -133,9 +145,19 @@ func evaluateGateCorpus(c *gateCorpus, checkList []detect.Check) gateMetrics {
 	type catTally struct {
 		gated              bool
 		malicious, flagged int
+		hardNeg, hardNegFP int
 	}
 	cats := map[string]*catTally{}
 	order := []string{}
+	getCat := func(cat string) *catTally {
+		ct := cats[cat]
+		if ct == nil {
+			ct = &catTally{gated: gatedCategory(cat)}
+			cats[cat] = ct
+			order = append(order, cat)
+		}
+		return ct
+	}
 
 	var gatedMal, gatedDet, truePos int
 	var benignTotal, benignFP, hardNegTotal, hardNegFP int
@@ -146,12 +168,7 @@ func evaluateGateCorpus(c *gateCorpus, checkList []detect.Check) gateMetrics {
 
 		switch e.Label {
 		case "malicious":
-			ct := cats[e.Category]
-			if ct == nil {
-				ct = &catTally{gated: gatedCategory(e.Category)}
-				cats[e.Category] = ct
-				order = append(order, e.Category)
-			}
+			ct := getCat(e.Category)
 			ct.malicious++
 			if flagged {
 				ct.flagged++
@@ -168,12 +185,21 @@ func evaluateGateCorpus(c *gateCorpus, checkList []detect.Check) gateMetrics {
 			if flagged {
 				benignFP++
 			}
-			// SC-002 gates the FP rate on the hard-negative set specifically.
+			// SC-002 gates the FP rate on the hard-negative set specifically;
+			// SC-003 attributes each hard-negative FP to the attack class it
+			// resembles for the per-category precision/FP.
 			if e.Category == "hard_negative" {
 				hardNegTotal++
 				if flagged {
 					hardNegFP++
 				}
+				if e.Resembles != "" {
+					ct := getCat(e.Resembles)
+					ct.hardNeg++
+					if flagged {
+						ct.hardNegFP++
+					}
+				}
 			}
 		}
 	}
@@ -190,20 +216,25 @@ func evaluateGateCorpus(c *gateCorpus, checkList []detect.Check) gateMetrics {
 	}
 	for _, cat := range order {
 		ct := cats[cat]
+		recall := ratio(ct.flagged, ct.malicious)
+		precision := ratio(ct.flagged, ct.flagged+ct.hardNegFP)
 		m.Categories = append(m.Categories, categoryMetric{
-			Category:  cat,
-			Gated:     ct.gated,
-			Malicious: ct.malicious,
-			Detected:  ct.flagged,
-			Recall:    ratio(ct.flagged, ct.malicious),
+			Category:       cat,
+			Gated:          ct.gated,
+			Malicious:      ct.malicious,
+			Detected:       ct.flagged,
+			Recall:         recall,
+			HardNegatives:  ct.hardNeg,
+			FalsePositives: ct.hardNegFP,
+			FPRate:         ratio(ct.hardNegFP, ct.hardNeg),
+			Precision:      precision,
+			F1:             f1(precision, recall),
 		})
 	}
 	m.OverallRecall = ratio(gatedDet, gatedMal)
 	m.FPRate = ratio(hardNegFP, hardNegTotal)
 	m.Precision = ratio(truePos, truePos+benignFP)
-	if m.Precision+m.OverallRecall > 0 {
-		m.F1 = 2 * m.Precision * m.OverallRecall / (m.Precision + m.OverallRecall)
-	}
+	m.F1 = f1(m.Precision, m.OverallRecall)
 	return m
 }
 
@@ -310,3 +341,11 @@ func ratio(n, d int) float64 {
 	}
 	return float64(n) / float64(d)
 }
+
+// f1 is the harmonic mean of precision and recall (0 when both are 0).
+func f1(precision, recall float64) float64 {
+	if precision+recall == 0 {
+		return 0
+	}
+	return 2 * precision * recall / (precision + recall)
+}
diff --git a/cmd/scan-eval/gate_test.go b/cmd/scan-eval/gate_test.go
@@ -52,12 +52,12 @@ func gateFixture() *gateCorpus {
 			},
 			{
 				// hard-negative: ordinary accented Unicode, no hidden classes.
-				ID: "hn1", Label: "benign", Category: "hard_negative", Server: "i18n",
+				ID: "hn1", Label: "benign", Category: "hard_negative", Resembles: "unicode_smuggling", Server: "i18n",
 				Tool: gateTool{Name: "translate_text", Description: "Translates café and naïve into other languages."},
 			},
 			{
 				// hard-negative: benign base64 that decodes to JSON, not a command.
-				ID: "hn2", Label: "benign", Category: "hard_negative", Server: "cfg",
+				ID: "hn2", Label: "benign", Category: "hard_negative", Resembles: "decoded_payload", Server: "cfg",
 				Tool: gateTool{Name: "load_config", Description: "Loads config blob=" + benignJSONB64},
 			},
 		},
@@ -83,6 +83,14 @@ func TestEvaluateGateCorpus_DetectsAndExcludesUngated(t *testing.T) {
 		if c.Detected != c.Malicious || c.Malicious == 0 {
 			t.Errorf("category %q: want all %d malicious detected, got %d", cat, c.Malicious, c.Detected)
 		}
+		// T018: every gated category caught all malicious and flagged none of its
+		// resembling hard-negatives → recall 1.0, precision 1.0, FP 0, F1 1.0.
+		if c.Recall != 1.0 || c.Precision != 1.0 || c.F1 != 1.0 {
+			t.Errorf("category %q: want recall/precision/f1 = 1.0, got r=%v p=%v f1=%v", cat, c.Recall, c.Precision, c.F1)
+		}
+		if c.FalsePositives != 0 || c.FPRate != 0.0 {
+			t.Errorf("category %q: want 0 FP, got fp=%d rate=%v", cat, c.FalsePositives, c.FPRate)
+		}
 	}
 
 	cm, ok := byCat["capability_mismatch"]
@@ -162,6 +170,51 @@ func TestGateFP_HardNegativeDenominatorOnly(t *testing.T) {
 	}
 }
 
+// TestGateMetrics_PerCategoryShapeAndFPAttribution proves T018's contract: the
+// per-category JSON carries recall/precision/FP/F1, and a hard-negative that
+// resembles a category and is (wrongly) flagged lowers THAT category's precision.
+func TestGateMetrics_PerCategoryShapeAndFPAttribution(t *testing.T) {
+	c := &gateCorpus{Version: "t", Entries: []gateEntry{
+		{ID: "u_m", Label: "malicious", Category: "unicode_smuggling", Server: "evil",
+			Tool: gateTool{Name: "add_numbers", Description: "Adds." + zeroWidthSpace + " hidden."}},
+		{ID: "u_hn_fp", Label: "benign", Category: "hard_negative", Resembles: "unicode_smuggling", Server: "ok",
+			Tool: gateTool{Name: "list_things", Description: "Lists things." + zeroWidthSpace + " benign."}},
+	}}
+	m := evaluateGateCorpus(c, gateChecks())
+
+	var uni *categoryMetric
+	for i := range m.Categories {
+		if m.Categories[i].Category == "unicode_smuggling" {
+			uni = &m.Categories[i]
+		}
+	}
+	if uni == nil {
+		t.Fatal("unicode_smuggling category missing")
+	}
+	// 1 TP, 1 resembling hard-negative flagged → precision 1/2, recall 1, FP 1.
+	if uni.Detected != 1 || uni.FalsePositives != 1 {
+		t.Fatalf("TP/FP = %d/%d, want 1/1", uni.Detected, uni.FalsePositives)
+	}
+	if uni.Recall != 1.0 || uni.Precision != 0.5 {
+		t.Errorf("recall/precision = %v/%v, want 1.0/0.5", uni.Recall, uni.Precision)
+	}
+	wantF1 := 2 * 0.5 * 1.0 / (0.5 + 1.0)
+	if uni.F1 != wantF1 {
+		t.Errorf("f1 = %v, want %v", uni.F1, wantF1)
+	}
+
+	// The serialized per-category object must expose all of recall/precision/FP/F1.
+	blob, err := json.Marshal(m.Categories[0])
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, key := range []string{"recall", "precision", "false_positives", "fp_rate", "f1"} {
+		if !strings.Contains(string(blob), `"`+key+`"`) {
+			t.Errorf("per-category JSON missing key %q: %s", key, blob)
+		}
+	}
+}
+
 func TestGateDecision(t *testing.T) {
 	pass := gateMetrics{OverallRecall: 0.95, FPRate: 0.02}
 	if ok, reasons := pass.decide(0.90, 0.05); !ok {
diff --git a/specs/065-evaluation-foundation/datasets/detect_corpus_test.go b/specs/065-evaluation-foundation/datasets/detect_corpus_test.go
@@ -50,6 +50,7 @@ type detectEntry struct {
 	ID         string       `json:"id"`
 	Label      string       `json:"label"`
 	Category   string       `json:"category"`
+	Resembles  string       `json:"resembles"`
 	Server     string       `json:"server"`
 	Tool       detectTool   `json:"tool"`
 	Peers      []detectPeer `json:"peers"`
@@ -150,11 +151,19 @@ func TestDetectCorpus_GatedCoverage(t *testing.T) {
 			maliciousByCat[e.Category]++
 		}
 		if e.Label == "benign" && e.Category == "hard_negative" {
-			for cat, prefix := range hardNegPrefix {
-				if strings.HasPrefix(e.ID, prefix) {
-					hardNegByCat[cat]++
-				}
+			// `resembles` is the machine-readable attribution the gate uses for
+			// per-category precision/FP; it must be set, name a gated category, and
+			// agree with the id prefix convention.
+			if e.Resembles == "" {
+				t.Errorf("hard_negative %q: missing resembles (needed for per-category FP)", e.ID)
+				continue
 			}
+			if prefix, ok := hardNegPrefix[e.Resembles]; !ok {
+				t.Errorf("hard_negative %q: resembles %q is not a gated category", e.ID, e.Resembles)
+			} else if !strings.HasPrefix(e.ID, prefix) {
+				t.Errorf("hard_negative %q: id should start with %q to match resembles %q", e.ID, prefix, e.Resembles)
+			}
+			hardNegByCat[e.Resembles]++
 		}
 	}
 
diff --git a/specs/065-evaluation-foundation/datasets/detect_corpus_v1.json b/specs/065-evaluation-foundation/datasets/detect_corpus_v1.json
@@ -294,6 +294,7 @@
       "id": "hn_unicode_accents",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "unicode_smuggling",
       "server": "i18n",
       "tool": {
         "name": "translate_phrase",
@@ -308,6 +309,7 @@
       "id": "hn_unicode_cjk",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "unicode_smuggling",
       "server": "i18n",
       "tool": {
         "name": "romanize_text",
@@ -322,6 +324,7 @@
       "id": "hn_decoded_json",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "decoded_payload",
       "server": "cfg",
       "tool": {
         "name": "load_settings",
@@ -336,6 +339,7 @@
       "id": "hn_decoded_text",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "decoded_payload",
       "server": "notes",
       "tool": {
         "name": "decode_note",
@@ -350,6 +354,7 @@
       "id": "hn_decoded_plain_curl",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "decoded_payload",
       "server": "docs",
       "tool": {
         "name": "install_docs",
@@ -364,6 +369,7 @@
       "id": "hn_decoded_hexid",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "decoded_payload",
       "server": "db",
       "tool": {
         "name": "lookup_hash",
@@ -378,6 +384,7 @@
       "id": "hn_shadowing_generic",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "shadowing",
       "server": "engine-a",
       "tool": {
         "name": "search",
@@ -401,6 +408,7 @@
       "id": "hn_shadowing_same_server",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "shadowing",
       "server": "reports",
       "tool": {
         "name": "export_report",
@@ -424,6 +432,7 @@
       "id": "hn_shadowing_selfref",
       "label": "benign",
       "category": "hard_negative",
+      "resembles": "shadowing",
       "server": "billing",
       "tool": {
         "name": "calculate_total",