shakacode · justin808 · Jun 9, 2026 · Jun 9, 2026 · claude · Jun 9, 2026
diff --git a/benchmarks/lib/bencher_report.rb b/benchmarks/lib/bencher_report.rb
@@ -15,7 +15,7 @@
 # loud rather than mis-report), while purely informational alert fields are read
 # leniently (see #parse_alerts). The job fails loudly rather than silently rendering
 # garbage or missing a regression.
-class BencherReport
+class BencherReport # rubocop:disable Metrics/ClassLength
   class FormatError < StandardError; end
 
   # One benchmark+measure's value and its prediction interval. baseline/limits are
@@ -74,15 +74,22 @@ def mirror(limit)
 
   Alert = Struct.new(:benchmark, :measure, :limit, keyword_init: true)
 
-  def self.parse(json_string)
-    new(JSON.parse(json_string))
+  def self.parse(json_string, tracked_measures: nil)
+    new(JSON.parse(json_string), tracked_measures:)
   rescue JSON::ParserError => e
     raise FormatError, "Bencher report is not valid JSON: #{e.message}"
   end
 
-  def initialize(raw)
+  # tracked_measures: the measures the caller actually tracks (e.g. track_benchmarks.rb's
+  # THRESHOLDS names). When given, an active alert on any *other* measure is treated as
+  # filtered rather than a regression — this drops alerts from orphaned server-side Bencher
+  # thresholds (e.g. a p90_latency threshold the code stopped passing but never deleted),
+  # which otherwise file an issue that the summary table can't even flag. nil = track every
+  # measure (backward-compatible default for callers that only need parsing/significance).
+  def initialize(raw, tracked_measures: nil)
     raise FormatError, "report is not a JSON object (got #{raw.class})" unless raw.is_a?(Hash)
 
+    @tracked_measures = tracked_measures&.map { |measure| normalize(measure) }
     @boundaries = index_boundaries(raw)
     @alerts, @filtered_alerts = parse_alerts(raw).partition { |alert| current_regression_alert?(alert) }
     # Per-benchmark perf links are informational (they only decide whether a name links
@@ -190,6 +197,7 @@ def parse_alerts(raw)
   # rubocop:disable Metrics/CyclomaticComplexity
   def current_regression_alert?(alert)
     return true unless alert.benchmark
+    return false if untracked_measure_alert?(alert)
 
     direction = { "lower" => :lower, "upper" => :upper }[normalize(alert.limit)]
     return true unless direction
@@ -212,6 +220,16 @@ def current_regression_alert?(alert)
 
   def threshold_side?(boundary, direction) = direction == :lower ? boundary.lower_limit : boundary.upper_limit
 
+  # An active alert on a measure the caller does not track (an orphaned server-side
+  # threshold). Only applies when tracked_measures was given; a measure-less alert can't be
+  # classified here, so it falls through to the existing benchmark-level fail-safe.
+  def untracked_measure_alert?(alert)
-  def untracked_measure_alert?(alert)
+  def untracked_measure_alert?(alert)
+    return false if @tracked_measures.nil? || @tracked_measures.empty?
+    return false unless alert.measure
+
+    !@tracked_measures.include?(normalize(alert.measure))
+  end
-  def untracked_measure_alert?(alert)
+  def untracked_measure_alert?(alert)
+    return false if @tracked_measures.nil? || @tracked_measures.empty?
+    return false unless alert.measure
+
+    !@tracked_measures.include?(normalize(alert.measure))
+  end
+    return false unless @tracked_measures
+    return false unless alert.measure
+
+    !@tracked_measures.include?(normalize(alert.measure))
+  end
+
   def normalize(key)
     key.to_s.downcase.gsub(/[-\s]+/, "_")
   end

diff --git a/benchmarks/spec/bencher_report_spec.rb b/benchmarks/spec/bencher_report_spec.rb
@@ -207,6 +207,74 @@ def rps_regression_result(name: "/foo: Core", value: 80.0)
     end
   end
 
+  # An orphaned server-side threshold (e.g. p90_latency, which the code dropped from
+  # THRESHOLDS but never deleted in Bencher) keeps firing alerts on a measure the code
+  # no longer tracks. Those alerts are invisible in the summary table (the p90 column has
+  # no :direction) yet would still file a regression issue. `tracked_measures` lets the
+  # caller pass the measures it actually tracks so an alert on any other measure is treated
+  # as filtered, not a regression. See the orphaned-p90-threshold investigation.
+  describe "tracked-measure filtering" do
+    def p90_report(tracked_measures: :unset)
+      json = report_json(
+        results: [[benchmark_result(
+          name: "/foo: Core",
+          measures: [measure_entry(slug: "p90-latency", name: "p90_latency", value: 24.2,
+                                   baseline: 17.9, lower_limit: nil, upper_limit: 23.6)]
+        )]],
+        alerts: [alert(benchmark: "/foo: Core", measure_slug: "p90-latency", limit: "upper")]
+      )
+      tracked_measures == :unset ? described_class.parse(json) : described_class.parse(json, tracked_measures:)
+    end
+
+    it "filters an active alert on an untracked measure (orphaned p90 threshold)" do
+      report = p90_report(tracked_measures: %w[rps p50_latency failed_pct])
+      expect(report.regression?).to be(false)
+      expect(report.alerts).to be_empty
+      expect(report.filtered_alert?).to be(true)
+    end
+
+    it "still reports a regression on a tracked measure when tracked_measures is given" do
+      report = described_class.parse(
+        report_json(
+          results: [[rps_regression_result]],
+          alerts: [alert(benchmark: "/foo: Core", measure_slug: "rps", limit: "lower")]
+        ),
+        tracked_measures: %w[rps p50_latency failed_pct]
+      )
+      expect(report.regression?).to be(true)
+      expect(report.alerts.first.measure).to eq("rps")
+    end
+
+    it "matches tracked measures regardless of - / _ / case (slug vs THRESHOLDS name)" do
+      report = described_class.parse(
+        report_json(
+          results: [[benchmark_result(
+            name: "/foo: Core",
+            measures: [measure_entry(slug: "p50-latency", name: "p50_latency", value: 7.0,
+                                     baseline: 5.0, lower_limit: nil, upper_limit: 6.0)]
+          )]],
+          alerts: [alert(benchmark: "/foo: Core", measure_slug: "p50-latency", limit: "upper")]
+        ),
+        tracked_measures: %w[rps p50_latency failed_pct]
+      )
+      expect(report.regression?).to be(true)
+    end
+
+    it "counts every measure when tracked_measures is not given (backward compatible)" do
+      expect(p90_report.regression?).to be(true)
+    end
+
+    it "keeps a measure-less alert fail-safe even with tracked_measures set" do
+      measureless_alert = alert(benchmark: "/foo: Core", measure_slug: "rps", limit: "lower")
+      measureless_alert["threshold"] = {}
+      report = described_class.parse(
+        report_json(results: [[rps_regression_result]], alerts: [measureless_alert]),
+        tracked_measures: %w[rps p50_latency failed_pct]
+      )
+      expect(report.regression?).to be(true)
+    end
+  end
+
   describe "#boundary" do
     let(:report) do
       described_class.parse(
@@ -422,7 +490,7 @@ def result_for(name)
     end
 
     it "is false when the report lists no benchmarks (nothing to link)" do
-      expect(described_class.new("results" => [], "alerts" => []).perf_links_unavailable?).to be(false)
+      expect(described_class.new({ "results" => [], "alerts" => [] }).perf_links_unavailable?).to be(false)
     end
   end
 

diff --git a/benchmarks/track_benchmarks.rb b/benchmarks/track_benchmarks.rb
@@ -178,7 +178,11 @@ def run_bencher(branch, start_point_args)
     # backtrace so the failure is triageable in CI logs. This covers both the initial
     # run and the start-point-hash retry, since both go through run_bencher.
     begin
-      report = BencherReport.parse(stdout)
+      # Pass the measures we actually track so an alert from an orphaned server-side Bencher
+      # threshold (a measure dropped from THRESHOLDS but never deleted in Bencher, e.g.
+      # p90_latency) is treated as filtered rather than a regression — it would otherwise file
+      # an issue the summary table can't even flag (p90 has no :direction column).
+      report = BencherReport.parse(stdout, tracked_measures: THRESHOLDS.map(&:first))
     rescue BencherReport::FormatError => e
       warn "::error::Bencher JSON report has an unexpected shape — re-verify against " \
            "benchmarks/spec/bencher_report_spec.rb before bumping the CLI pin. #{e.message}"