Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions benchmarks/lib/bencher_report.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# loud rather than mis-report), while purely informational alert fields are read
# leniently (see #parse_alerts). The job fails loudly rather than silently rendering
# garbage or missing a regression.
class BencherReport
class BencherReport # rubocop:disable Metrics/ClassLength
class FormatError < StandardError; end

# One benchmark+measure's value and its prediction interval. baseline/limits are
Expand Down Expand Up @@ -74,15 +74,22 @@ def mirror(limit)

Alert = Struct.new(:benchmark, :measure, :limit, keyword_init: true)

def self.parse(json_string)
new(JSON.parse(json_string))
def self.parse(json_string, tracked_measures: nil)
new(JSON.parse(json_string), tracked_measures:)
rescue JSON::ParserError => e
raise FormatError, "Bencher report is not valid JSON: #{e.message}"
end

def initialize(raw)
# tracked_measures: the measures the caller actually tracks (e.g. track_benchmarks.rb's
# THRESHOLDS names). When given, an active alert on any *other* measure is treated as
# filtered rather than a regression — this drops alerts from orphaned server-side Bencher
# thresholds (e.g. a p90_latency threshold the code stopped passing but never deleted),
# which otherwise file an issue that the summary table can't even flag. nil = track every
# measure (backward-compatible default for callers that only need parsing/significance).
def initialize(raw, tracked_measures: nil)
raise FormatError, "report is not a JSON object (got #{raw.class})" unless raw.is_a?(Hash)

@tracked_measures = tracked_measures&.map { |measure| normalize(measure) }
@boundaries = index_boundaries(raw)
@alerts, @filtered_alerts = parse_alerts(raw).partition { |alert| current_regression_alert?(alert) }
# Per-benchmark perf links are informational (they only decide whether a name links
Expand Down Expand Up @@ -190,6 +197,7 @@ def parse_alerts(raw)
# rubocop:disable Metrics/CyclomaticComplexity
def current_regression_alert?(alert)
return true unless alert.benchmark
return false if untracked_measure_alert?(alert)

direction = { "lower" => :lower, "upper" => :upper }[normalize(alert.limit)]
return true unless direction
Expand All @@ -212,6 +220,16 @@ def current_regression_alert?(alert)

def threshold_side?(boundary, direction) = direction == :lower ? boundary.lower_limit : boundary.upper_limit

# An active alert on a measure the caller does not track (an orphaned server-side
# threshold). Only applies when tracked_measures was given; a measure-less alert can't be
# classified here, so it falls through to the existing benchmark-level fail-safe.
def untracked_measure_alert?(alert)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor footgun: if tracked_measures: is ever passed as [], @tracked_measures is [] (truthy), so !@tracked_measures.include?(...) is true for every measure — all alerts get silently filtered and regression? is always false. The current caller can't hit this (THRESHOLDS has 3 entries), but a future caller could pass an empty list accidentally.

A small guard in initialize would make the invariant explicit:

Suggested change
def untracked_measure_alert?(alert)
def untracked_measure_alert?(alert)
return false if @tracked_measures.nil? || @tracked_measures.empty?
return false unless alert.measure
!@tracked_measures.include?(normalize(alert.measure))
end

Or equivalently, coerce an empty list to nil at construction time:

@tracked_measures = tracked_measures&.then { |m| m.empty? ? nil : m.map { |k| normalize(k) } }

Either makes it impossible to silently disable regression detection by passing [].

return false unless @tracked_measures
return false unless alert.measure

!@tracked_measures.include?(normalize(alert.measure))
end

def normalize(key)
key.to_s.downcase.gsub(/[-\s]+/, "_")
end
Expand Down
70 changes: 69 additions & 1 deletion benchmarks/spec/bencher_report_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,74 @@ def rps_regression_result(name: "/foo: Core", value: 80.0)
end
end

# An orphaned server-side threshold (e.g. p90_latency, which the code dropped from
# THRESHOLDS but never deleted in Bencher) keeps firing alerts on a measure the code
# no longer tracks. Those alerts are invisible in the summary table (the p90 column has
# no :direction) yet would still file a regression issue. `tracked_measures` lets the
# caller pass the measures it actually tracks so an alert on any other measure is treated
# as filtered, not a regression. See the orphaned-p90-threshold investigation.
describe "tracked-measure filtering" do
def p90_report(tracked_measures: :unset)
json = report_json(
results: [[benchmark_result(
name: "/foo: Core",
measures: [measure_entry(slug: "p90-latency", name: "p90_latency", value: 24.2,
baseline: 17.9, lower_limit: nil, upper_limit: 23.6)]
)]],
alerts: [alert(benchmark: "/foo: Core", measure_slug: "p90-latency", limit: "upper")]
)
tracked_measures == :unset ? described_class.parse(json) : described_class.parse(json, tracked_measures:)
end

it "filters an active alert on an untracked measure (orphaned p90 threshold)" do
report = p90_report(tracked_measures: %w[rps p50_latency failed_pct])
expect(report.regression?).to be(false)
expect(report.alerts).to be_empty
expect(report.filtered_alert?).to be(true)
end

it "still reports a regression on a tracked measure when tracked_measures is given" do
report = described_class.parse(
report_json(
results: [[rps_regression_result]],
alerts: [alert(benchmark: "/foo: Core", measure_slug: "rps", limit: "lower")]
),
tracked_measures: %w[rps p50_latency failed_pct]
)
expect(report.regression?).to be(true)
expect(report.alerts.first.measure).to eq("rps")
end

it "matches tracked measures regardless of - / _ / case (slug vs THRESHOLDS name)" do
report = described_class.parse(
report_json(
results: [[benchmark_result(
name: "/foo: Core",
measures: [measure_entry(slug: "p50-latency", name: "p50_latency", value: 7.0,
baseline: 5.0, lower_limit: nil, upper_limit: 6.0)]
)]],
alerts: [alert(benchmark: "/foo: Core", measure_slug: "p50-latency", limit: "upper")]
),
tracked_measures: %w[rps p50_latency failed_pct]
)
expect(report.regression?).to be(true)
end

it "counts every measure when tracked_measures is not given (backward compatible)" do
expect(p90_report.regression?).to be(true)
end

it "keeps a measure-less alert fail-safe even with tracked_measures set" do
measureless_alert = alert(benchmark: "/foo: Core", measure_slug: "rps", limit: "lower")
measureless_alert["threshold"] = {}
report = described_class.parse(
report_json(results: [[rps_regression_result]], alerts: [measureless_alert]),
tracked_measures: %w[rps p50_latency failed_pct]
)
expect(report.regression?).to be(true)
end
end

describe "#boundary" do
let(:report) do
described_class.parse(
Expand Down Expand Up @@ -422,7 +490,7 @@ def result_for(name)
end

it "is false when the report lists no benchmarks (nothing to link)" do
expect(described_class.new("results" => [], "alerts" => []).perf_links_unavailable?).to be(false)
expect(described_class.new({ "results" => [], "alerts" => [] }).perf_links_unavailable?).to be(false)
end
end

Expand Down
6 changes: 5 additions & 1 deletion benchmarks/track_benchmarks.rb
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,11 @@ def run_bencher(branch, start_point_args)
# backtrace so the failure is triageable in CI logs. This covers both the initial
# run and the start-point-hash retry, since both go through run_bencher.
begin
report = BencherReport.parse(stdout)
# Pass the measures we actually track so an alert from an orphaned server-side Bencher
# threshold (a measure dropped from THRESHOLDS but never deleted in Bencher, e.g.
# p90_latency) is treated as filtered rather than a regression — it would otherwise file
# an issue the summary table can't even flag (p90 has no :direction column).
report = BencherReport.parse(stdout, tracked_measures: THRESHOLDS.map(&:first))
rescue BencherReport::FormatError => e
warn "::error::Bencher JSON report has an unexpected shape — re-verify against " \
"benchmarks/spec/bencher_report_spec.rb before bumping the CLI pin. #{e.message}"
Expand Down
Loading