fix(security): score consensus group at max severity; preserve stronger fields on merge

Dumbris · Dumbris · commit 5c14f7905bb9 · 2026-07-02T07:55:38.000+03:00
CalculateRiskScore consensus branch now scores each (location, threat_type)
group at the most-severe threat category and the max per-finding weight among
all agreeing findings, instead of whichever finding was encountered first. For
severity-derived threat_types (supply_chain, uncategorized, malicious_code
fallback) agreeing findings can carry different threat_levels, so the previous
first-wins behavior made the score order-dependent and could drop a genuine
high/warning finding. Extracted a threatCategory helper and precomputed the
group max in the existing consensus pass for determinism.

MergeFindings phase-1 dedup now absorbs the duplicate's stronger fields: it
takes max(Confidence), the more-severe Tier (hard > soft via tierRank), and the
union of Signals, so merging a hard/high-confidence finding with a same-
(rule_id, location) soft/low-confidence duplicate no longer discards the hard
tier and higher confidence.

Adds tests locking order-independent max-severity consensus scoring and
max-confidence + hard-tier-wins on merge.

Related: Spec 077
diff --git a/internal/security/scanner/sarif.go b/internal/security/scanner/sarif.go
@@ -290,6 +290,13 @@ func CalculateRiskScore(findings []ScanFinding) int {
 	// dedup (so existing single-scanner scoring is unchanged).
 	type consensusKey struct{ location, threatType string }
 	groupSources := make(map[consensusKey]map[string]struct{})
+	// A consensus group is scored ONCE, at the MOST-SEVERE threat category among
+	// all agreeing findings (not whichever is encountered first) and at the
+	// MAX per-finding weight — so the score is order-independent even when
+	// severity-derived threat_types (supply_chain, uncategorized, the
+	// malicious_code fallback) put different threat_levels on agreeing findings.
+	groupMaxCat := make(map[consensusKey]int)
+	groupMaxWeight := make(map[consensusKey]int)
 	for i := range findings {
 		f := &findings[i]
 		if f.ThreatType == "" {
@@ -304,6 +311,12 @@ func CalculateRiskScore(findings []ScanFinding) int {
 		for _, s := range findingSources(*f) {
 			set[s] = struct{}{}
 		}
+		if cat := threatCategory(f); cat > groupMaxCat[ck] {
+			groupMaxCat[ck] = cat
+		}
+		if w := consensusWeight(*f); w > groupMaxWeight[ck] {
+			groupMaxWeight[ck] = w
+		}
 	}
 
 	// Deduplicate for scoring:
@@ -317,46 +330,36 @@ func CalculateRiskScore(findings []ScanFinding) int {
 	seenConsensus := make(map[consensusKey]bool)
 	var dangerousCount, warningCount, infoCount int
 
-	addWeight := func(f *ScanFinding, weight int) {
-		switch f.ThreatLevel {
-		case ThreatLevelDangerous:
+	addWeight := func(category, weight int) {
+		switch category {
+		case threatCatDangerous:
 			dangerousCount += weight
-		case ThreatLevelWarning:
+		case threatCatWarning:
 			warningCount += weight
-		case ThreatLevelInfo:
+		case threatCatInfo:
 			infoCount += weight
-		default:
-			// Unclassified: use severity as fallback
-			switch f.Severity {
-			case SeverityCritical:
-				dangerousCount += weight
-			case SeverityHigh:
-				warningCount += weight
-			case SeverityMedium:
-				warningCount += weight
-			case SeverityLow:
-				infoCount += weight
-			}
 		}
 	}
 
 	for i := range findings {
 		f := &findings[i]
 
 		// Cross-source consensus path: only when ≥2 distinct sources agree on a
-		// classified (location, threat_type). Counted once, weighted by agreement.
+		// classified (location, threat_type). Counted once, at the most-severe
+		// category and max weight of the whole group so the result is
+		// order-independent (not the first finding encountered).
 		if f.ThreatType != "" {
 			ck := consensusKey{f.Location, f.ThreatType}
 			if n := len(groupSources[ck]); n >= 2 {
 				if seenConsensus[ck] {
 					continue
 				}
 				seenConsensus[ck] = true
-				weight := consensusWeight(*f)
+				weight := groupMaxWeight[ck]
 				if n > weight {
 					weight = n
 				}
-				addWeight(f, weight)
+				addWeight(groupMaxCat[ck], weight)
 				continue
 			}
 		}
@@ -372,7 +375,7 @@ func CalculateRiskScore(findings []ScanFinding) int {
 
 		// Consensus weight: independent signals on one tool ADD to the score
 		// (Spec 076 FR-006). A single-signal or signal-less finding weighs 1.
-		addWeight(f, consensusWeight(*f))
+		addWeight(threatCategory(f), consensusWeight(*f))
 	}
 
 	// Logarithmic diminishing returns: score = weight * log2(1 + count)
@@ -398,6 +401,41 @@ func CalculateRiskScore(findings []ScanFinding) int {
 	return score
 }
 
+// Risk buckets used when scoring findings, in ascending severity so two can be
+// compared with ">". threatCatNone is the zero value: an int-map lookup for a
+// missing key yields it, and addWeight adds nothing to any counter for it.
+const (
+	threatCatNone = iota
+	threatCatInfo
+	threatCatWarning
+	threatCatDangerous
+)
+
+// threatCategory reports which risk bucket a finding counts toward. A
+// recognized ThreatLevel decides the bucket outright; any other level falls
+// back to Severity, where critical is dangerous, high and medium are warning,
+// and low is info. A finding that matches neither yields threatCatNone, the
+// zero value, which sorts below every real bucket.
+func threatCategory(f *ScanFinding) int {
+	level, sev := f.ThreatLevel, f.Severity
+	switch {
+	case level == ThreatLevelDangerous:
+		return threatCatDangerous
+	case level == ThreatLevelWarning:
+		return threatCatWarning
+	case level == ThreatLevelInfo:
+		return threatCatInfo
+	case sev == SeverityCritical: // no recognized level: fall back to severity
+		return threatCatDangerous
+	case sev == SeverityHigh || sev == SeverityMedium:
+		return threatCatWarning
+	case sev == SeverityLow:
+		return threatCatInfo
+	default:
+		return threatCatNone
+	}
+}
+
 // consensusWeight returns how much a single (deduplicated) finding contributes
 // to its risk category. The deterministic scanner (Spec 076) aggregates every
 // independent check that fired on a tool into one finding's Signals list, so a
@@ -411,6 +449,19 @@ func consensusWeight(f ScanFinding) int {
 	return 1
 }
 
+// tierRank gives each finding tier a numeric rank so two tiers can be compared
+// with ">": TierHard is 2, TierSoft is 1, and any other value (including "") is 0.
+func tierRank(tier string) int {
+	if tier == TierHard {
+		return 2
+	}
+	if tier == TierSoft {
+		return 1
+	}
+	// Empty or unrecognized tiers never outrank a known one.
+	return 0
+}
+
 // findingSources returns the contributing scanner ids for a finding, preferring
 // the explicit Sources list (Spec 077) and falling back to the single Scanner
 // id for legacy findings that predate multi-source attribution.
@@ -473,6 +524,18 @@ func MergeFindings(findings []ScanFinding) []ScanFinding {
 		k := key{f.RuleID, f.Location}
 		if pos, ok := index[k]; ok {
 			result[pos].Sources = sortedUnion(result[pos].Sources, srcs)
+			// Absorb the duplicate's stronger fields (Spec 077): keep the
+			// higher confidence, the more-severe tier (hard > soft), and the
+			// union of signals — otherwise merging a hard/high-confidence
+			// finding with a same-(rule_id,location) soft/low-confidence
+			// duplicate would silently drop the hard tier and confidence.
+			if f.Confidence > result[pos].Confidence {
+				result[pos].Confidence = f.Confidence
+			}
+			if tierRank(f.Tier) > tierRank(result[pos].Tier) {
+				result[pos].Tier = f.Tier
+			}
+			result[pos].Signals = sortedUnion(result[pos].Signals, f.Signals)
 			continue
 		}
 		f.Sources = sortedUnion(f.Sources, srcs)
diff --git a/internal/security/scanner/sarif_test.go b/internal/security/scanner/sarif_test.go
@@ -424,6 +424,86 @@ func TestCalculateRiskScoreCrossSourceConsensusAdds(t *testing.T) {
 	}
 }
 
+// TestCalculateRiskScoreConsensusUsesMaxSeverity checks that a cross-source
+// consensus group is scored at its most-severe member's threat level, whatever
+// the input order. Two scanners (trivy, grype) report the same Location and
+// ThreatType (supply_chain) but different levels: one warning, one info.
+// Scoring the group at the first finding's level would make the result depend
+// on slice order and could score a real warning as info, so both orders are
+// scored and must agree on the warning-level value.
+func TestCalculateRiskScoreConsensusUsesMaxSeverity(t *testing.T) {
+	warningFinding := ScanFinding{
+		RuleID:      "trivy.CVE-1",
+		Location:    "srv:tool",
+		ThreatType:  ThreatSupplyChain,
+		ThreatLevel: ThreatLevelWarning,
+		Severity:    SeverityHigh,
+		Scanner:     "trivy",
+	}
+	infoFinding := ScanFinding{
+		RuleID:      "grype.CVE-2",
+		Location:    "srv:tool",
+		ThreatType:  ThreatSupplyChain,
+		ThreatLevel: ThreatLevelInfo,
+		Severity:    SeverityLow,
+		Scanner:     "grype",
+	}
+
+	ab := CalculateRiskScore([]ScanFinding{warningFinding, infoFinding})
+	ba := CalculateRiskScore([]ScanFinding{infoFinding, warningFinding})
+
+	if ab != ba {
+		t.Fatalf("consensus score is order-dependent: [warning,info]=%d [info,warning]=%d", ab, ba)
+	}
+	// Two distinct sources agree → weight 2 at the warning level: 6*log2(3)≈9.5,
+	// expected as 9. Scoring at the info level (the bug) would give 2*log2(3)≈3.2, i.e. 3.
+	if ab != 9 {
+		t.Errorf("consensus score = %d, want 9 (warning-level, 2 sources)", ab)
+	}
+}
+
+// TestMergeFindingsAbsorbsStrongerFields checks that merging two findings with
+// the same RuleID and Location keeps the stronger fields of both: the higher
+// Confidence, the higher-ranked Tier (hard over soft), and both Signals. Each
+// input order runs as its own subtest so the outcome cannot depend on order.
+func TestMergeFindingsAbsorbsStrongerFields(t *testing.T) {
+	hard := ScanFinding{
+		RuleID: "detect.tpa", Location: "srv:tool",
+		Tier: TierHard, Confidence: 0.9, Signals: []string{"unicode.hidden"},
+		Scanner: "tpa-descriptions",
+	}
+	soft := ScanFinding{
+		RuleID: "detect.tpa", Location: "srv:tool",
+		Tier: TierSoft, Confidence: 0.2, Signals: []string{"directive.imperative"},
+		Scanner: "cisco-mcp-scanner",
+	}
+
+	for _, tc := range []struct {
+		name string
+		in   []ScanFinding
+	}{
+		{"soft-first", []ScanFinding{soft, hard}},
+		{"hard-first", []ScanFinding{hard, soft}},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			merged := MergeFindings(tc.in)
+			if len(merged) != 1 {
+				t.Fatalf("expected 1 merged finding, got %d: %+v", len(merged), merged)
+			}
+			m := merged[0]
+			if m.Tier != TierHard {
+				t.Errorf("merged tier = %q, want hard (more-severe tier must win)", m.Tier)
+			}
+			if m.Confidence != 0.9 { // exact compare is safe: the merge copies the value, it never computes one
+				t.Errorf("merged confidence = %v, want 0.9 (max)", m.Confidence)
+			}
+			if len(m.Signals) != 2 { // checks only the count, not which signals survived
+				t.Errorf("merged signals = %v, want union of both (2)", m.Signals)
+			}
+		})
+	}
+}
+
 // TestClassifyThreatBackfillsSeverity proves Spec 077 (T022): a finding that
 // arrives with no severity (as some external/legacy SARIF findings do) is given
 // a user-readable severity derived from its classified threat level.