Skip to content

Commit 9f71401

Browse files
committed
Add histogram unit guesser
1 parent a996a89 commit 9f71401

4 files changed

Lines changed: 286 additions & 5 deletions

File tree

internal/evaluator/evaluator_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,4 +554,80 @@ func TestEvaluateHistogramInfOverflow(t *testing.T) {
554554
t.Fatalf("expected 0 results, got %d", len(results))
555555
}
556556
})
557+
558+
t.Run("guesses unit from metric name for +Inf output", func(t *testing.T) {
559+
metrics := parser.MetricsData{
560+
"test_duration_seconds_bucket": &parser.Metric{
561+
Name: "test_duration_seconds_bucket",
562+
Type: "histogram",
563+
Values: []parser.MetricValue{
564+
{Value: 10, Labels: map[string]string{"le": "512"}},
565+
{Value: 100, Labels: map[string]string{"le": "+Inf"}},
566+
},
567+
},
568+
}
569+
570+
results := EvaluateHistogramInfOverflow(metrics)
571+
if len(results) != 1 {
572+
t.Fatalf("expected 1 result, got %d", len(results))
573+
}
574+
if !strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 s") {
575+
t.Fatalf("expected guessed seconds unit in message, got: %s", results[0].Message)
576+
}
577+
details := strings.Join(results[0].Details, "\n")
578+
if !strings.Contains(details, "Highest non-infinity bucket: 512 s") {
579+
t.Fatalf("expected guessed seconds unit in details, got: %s", details)
580+
}
581+
})
582+
583+
t.Run("uses help text only when unit is unambiguous", func(t *testing.T) {
584+
metrics := parser.MetricsData{
585+
"mystery_metric": &parser.Metric{
586+
Name: "mystery_metric",
587+
Help: "Time taken in milliseconds for processing",
588+
Type: "histogram",
589+
},
590+
"mystery_metric_bucket": &parser.Metric{
591+
Name: "mystery_metric_bucket",
592+
Type: "histogram",
593+
Values: []parser.MetricValue{
594+
{Value: 10, Labels: map[string]string{"le": "512"}},
595+
{Value: 100, Labels: map[string]string{"le": "+Inf"}},
596+
},
597+
},
598+
}
599+
results := EvaluateHistogramInfOverflow(metrics)
600+
if len(results) != 1 {
601+
t.Fatalf("expected 1 result, got %d", len(results))
602+
}
603+
if !strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 ms") {
604+
t.Fatalf("expected guessed milliseconds unit in message, got: %s", results[0].Message)
605+
}
606+
})
607+
608+
t.Run("does not use help text when multiple units are present", func(t *testing.T) {
609+
metrics := parser.MetricsData{
610+
"mystery_metric": &parser.Metric{
611+
Name: "mystery_metric",
612+
Help: "Latency shown in milliseconds and seconds for compatibility",
613+
Type: "histogram",
614+
},
615+
"mystery_metric_bucket": &parser.Metric{
616+
Name: "mystery_metric_bucket",
617+
Type: "histogram",
618+
Values: []parser.MetricValue{
619+
{Value: 10, Labels: map[string]string{"le": "512"}},
620+
{Value: 100, Labels: map[string]string{"le": "+Inf"}},
621+
},
622+
},
623+
}
624+
results := EvaluateHistogramInfOverflow(metrics)
625+
if len(results) != 1 {
626+
t.Fatalf("expected 1 result, got %d", len(results))
627+
}
628+
if strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 ms") ||
629+
strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 s") {
630+
t.Fatalf("expected no guessed unit in ambiguous help text, got: %s", results[0].Message)
631+
}
632+
})
557633
}

internal/evaluator/histogram.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
143143
return nil
144144
}
145145
metricHelp := resolveMetricHelp(baseName, metrics)
146+
guessedUnit := guessMetricUnit(baseName, metricHelp)
146147

147148
// Group buckets by label combination (excluding "le" label)
148149
// Each label combination represents a separate time series
@@ -252,14 +253,14 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
252253
Timestamp: time.Now(),
253254
ReviewStatus: "Automatically generated rule; reviewed by the code author at the time of implementation.",
254255
PotentialActionUser: fmt.Sprintf("Further investigation is required to understand why values exceed %s. "+
255-
"Check if there are other alerts for this specific metric with more precise context.", formatHumanNumber(eval.highestFiniteLe)),
256+
"Check if there are other alerts for this specific metric with more precise context.", formatHistogramValue(eval.highestFiniteLe, guessedUnit)),
256257
PotentialActionDeveloper: "Review code paths and metric instrumentation to confirm whether observed latencies are expected.",
257258
}
258259
result.Details = append(result.Details,
259260
"Total Number of Observations: "+formatHumanNumber(eval.totalCount),
260261
"Observations in +Inf bucket: "+formatHumanNumber(eval.infObservations),
261262
"Percentage of observations in +Inf bucket: "+formatHumanNumber(eval.infPercentage)+" %",
262-
"Highest non-infinity bucket: "+formatHumanNumber(eval.highestFiniteLe)+" unit",
263+
"Highest non-infinity bucket: "+formatHistogramValue(eval.highestFiniteLe, guessedUnit),
263264
)
264265
result.Message = fmt.Sprintf("%s%% of observations are in +Inf bucket (%s out of %s). "+
265266
"This indicates the metric designer likely didn't expect the values to be so high. "+
@@ -268,7 +269,7 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
268269
formatHumanNumber(eval.infPercentage),
269270
formatHumanNumber(eval.infObservations),
270271
formatHumanNumber(eval.totalCount),
271-
formatHumanNumber(eval.highestFiniteLe))
272+
formatHistogramValue(eval.highestFiniteLe, guessedUnit))
272273
results = append(results, result)
273274
}
274275

@@ -294,10 +295,10 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
294295
"Total Number of Observations: "+formatHumanNumber(worstOverall.totalCount),
295296
"Observations in +Inf bucket: "+formatHumanNumber(worstOverall.infObservations),
296297
"Percentage of observations in +Inf bucket: "+formatHumanNumber(worstOverall.infPercentage)+" %",
297-
"Highest non-infinity bucket: "+formatHumanNumber(worstOverall.highestFiniteLe)+" unit",
298+
"Highest non-infinity bucket: "+formatHistogramValue(worstOverall.highestFiniteLe, guessedUnit),
298299
)
299300
greenResult.Message = fmt.Sprintf("%s%% of observations in +Inf bucket (acceptable). Highest non-infinity bucket: %s",
300-
formatHumanNumber(worstOverall.infPercentage), formatHumanNumber(worstOverall.highestFiniteLe))
301+
formatHumanNumber(worstOverall.infPercentage), formatHistogramValue(worstOverall.highestFiniteLe, guessedUnit))
301302
return []rules.EvaluationResult{greenResult}
302303
}
303304

internal/evaluator/unit_guess.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package evaluator
2+
3+
import (
4+
"regexp"
5+
"strings"
6+
)
7+
8+
var (
9+
unitTokenToCanonical = map[string]string{
10+
"seconds": "seconds",
11+
"second": "seconds",
12+
"s": "seconds",
13+
"sec": "seconds",
14+
"secs": "seconds",
15+
"milliseconds": "milliseconds",
16+
"millisecond": "milliseconds",
17+
"ms": "milliseconds",
18+
"msec": "milliseconds",
19+
"msecs": "milliseconds",
20+
"millis": "milliseconds",
21+
"bytes": "bytes",
22+
"byte": "bytes",
23+
}
24+
25+
helpUnitMatchers = map[string]*regexp.Regexp{
26+
"milliseconds": regexp.MustCompile(`(?i)\bmilliseconds?\b|\bms\b|\bmsecs?\b|\bmillis\b`),
27+
"seconds": regexp.MustCompile(`(?i)\bseconds?\b|\bsecs?\b|\bsec\b`),
28+
"bytes": regexp.MustCompile(`(?i)\bbytes?\b`),
29+
}
30+
)
31+
32+
// guessMetricUnit infers unit from metric name first, then HELP text.
33+
// HELP text is only used when exactly one unit candidate is detected.
34+
func guessMetricUnit(metricName, helpText string) string {
35+
if unit := guessUnitFromMetricName(metricName); unit != "" {
36+
return unit
37+
}
38+
return guessUnitFromHelpText(helpText)
39+
}
40+
41+
func guessUnitFromMetricName(metricName string) string {
42+
metricName = strings.TrimSpace(strings.ToLower(metricName))
43+
if metricName == "" {
44+
return ""
45+
}
46+
47+
parts := strings.Split(metricName, "_")
48+
if len(parts) == 0 {
49+
return ""
50+
}
51+
52+
// Prometheus best-practice suffixes like *_seconds_total.
53+
if len(parts) >= 2 && parts[len(parts)-1] == "total" {
54+
if unit, ok := unitTokenToCanonical[parts[len(parts)-2]]; ok {
55+
return unit
56+
}
57+
}
58+
59+
// Common suffixes like *_seconds and *_bytes.
60+
if unit, ok := unitTokenToCanonical[parts[len(parts)-1]]; ok {
61+
return unit
62+
}
63+
64+
// Handle *_timestamp_seconds.
65+
if len(parts) >= 2 && parts[len(parts)-2] == "timestamp" {
66+
if unit, ok := unitTokenToCanonical[parts[len(parts)-1]]; ok {
67+
return unit
68+
}
69+
}
70+
71+
return ""
72+
}
73+
74+
func guessUnitFromHelpText(helpText string) string {
75+
helpText = strings.TrimSpace(helpText)
76+
if helpText == "" {
77+
return ""
78+
}
79+
80+
var matched []string
81+
for unit, matcher := range helpUnitMatchers {
82+
if matcher.MatchString(helpText) {
83+
matched = append(matched, unit)
84+
}
85+
}
86+
87+
// Use HELP only if it points to exactly one unit.
88+
if len(matched) != 1 {
89+
return ""
90+
}
91+
return matched[0]
92+
}
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
package evaluator
2+
3+
import "testing"
4+
5+
func TestGuessUnitFromMetricName(t *testing.T) {
6+
tests := []struct {
7+
name string
8+
metricName string
9+
want string
10+
}{
11+
{name: "seconds suffix", metricName: "http_request_duration_seconds", want: "seconds"},
12+
{name: "bytes suffix", metricName: "container_memory_usage_bytes", want: "bytes"},
13+
{name: "seconds total suffix", metricName: "process_cpu_seconds_total", want: "seconds"},
14+
{name: "timestamp seconds suffix", metricName: "last_seen_timestamp_seconds", want: "seconds"},
15+
{name: "real metric histogram bytes", metricName: "http_incoming_request_size_histogram_bytes", want: "bytes"},
16+
{name: "real metric bytes total", metricName: "go_memstats_alloc_bytes_total", want: "bytes"},
17+
{name: "real metric duration milliseconds", metricName: "rox_sensor_scan_call_duration_milliseconds", want: "milliseconds"},
18+
{name: "real metric purger duration seconds", metricName: "rox_sensor_network_flow_manager_purger_duration_seconds", want: "seconds"},
19+
{name: "real metric vm report duration milliseconds", metricName: "rox_sensor_virtual_machine_index_report_processing_duration_milliseconds", want: "milliseconds"},
20+
{name: "sec alias suffix", metricName: "request_duration_sec", want: "seconds"},
21+
{name: "msec alias suffix", metricName: "request_duration_msec", want: "milliseconds"},
22+
{name: "no known suffix", metricName: "rox_sensor_events", want: ""},
23+
{name: "empty metric name", metricName: "", want: ""},
24+
}
25+
26+
for _, tt := range tests {
27+
t.Run(tt.name, func(t *testing.T) {
28+
got := guessUnitFromMetricName(tt.metricName)
29+
if got != tt.want {
30+
t.Fatalf("guessUnitFromMetricName(%q) = %q, want %q", tt.metricName, got, tt.want)
31+
}
32+
})
33+
}
34+
}
35+
36+
func TestGuessUnitFromHelpText(t *testing.T) {
37+
tests := []struct {
38+
name string
39+
helpText string
40+
want string
41+
}{
42+
{name: "milliseconds mention", helpText: "Time taken in milliseconds to process events", want: "milliseconds"},
43+
{name: "seconds mention", helpText: "Duration in seconds", want: "seconds"},
44+
{name: "seconds short sec mention", helpText: "Duration in sec", want: "seconds"},
45+
{name: "bytes mention", helpText: "Payload size in bytes", want: "bytes"},
46+
{name: "milliseconds alias msec", helpText: "Call took 32 msec", want: "milliseconds"},
47+
{name: "milliseconds alias millis", helpText: "Observed latency in millis", want: "milliseconds"},
48+
{name: "byte singular mention", helpText: "Equals to /memory/classes/total:byte.", want: "bytes"},
49+
{name: "ambiguous mentions", helpText: "Latency in milliseconds and seconds", want: ""},
50+
{name: "ambiguous time and bytes mentions", helpText: "Duration in seconds with payload bytes", want: ""},
51+
{name: "no known units", helpText: "Number of retries", want: ""},
52+
{name: "empty help text", helpText: "", want: ""},
53+
}
54+
55+
for _, tt := range tests {
56+
t.Run(tt.name, func(t *testing.T) {
57+
got := guessUnitFromHelpText(tt.helpText)
58+
if got != tt.want {
59+
t.Fatalf("guessUnitFromHelpText(%q) = %q, want %q", tt.helpText, got, tt.want)
60+
}
61+
})
62+
}
63+
}
64+
65+
func TestGuessMetricUnit(t *testing.T) {
66+
tests := []struct {
67+
name string
68+
metricName string
69+
helpText string
70+
want string
71+
}{
72+
{
73+
name: "metric name wins over help text",
74+
metricName: "process_cpu_seconds_total",
75+
helpText: "CPU usage in milliseconds",
76+
want: "seconds",
77+
},
78+
{
79+
name: "falls back to help text when name has no unit",
80+
metricName: "rox_central_event_processing",
81+
helpText: "Time taken in milliseconds",
82+
want: "milliseconds",
83+
},
84+
{
85+
name: "real cluster duration without unit in name uses help",
86+
metricName: "rox_sensor_k8s_event_processing_duration",
87+
helpText: "Time spent fully processing an event from Kubernetes in milliseconds",
88+
want: "milliseconds",
89+
},
90+
{
91+
name: "returns empty when both sources fail",
92+
metricName: "rox_sensor_events",
93+
helpText: "Count of events",
94+
want: "",
95+
},
96+
{
97+
name: "real metric without unit in name or help",
98+
metricName: "rox_sensor_k8s_event_processing_duration",
99+
helpText: "Time taken to fully process an event from Kubernetes",
100+
want: "",
101+
},
102+
}
103+
104+
for _, tt := range tests {
105+
t.Run(tt.name, func(t *testing.T) {
106+
got := guessMetricUnit(tt.metricName, tt.helpText)
107+
if got != tt.want {
108+
t.Fatalf("guessMetricUnit(%q, %q) = %q, want %q", tt.metricName, tt.helpText, got, tt.want)
109+
}
110+
})
111+
}
112+
}

0 commit comments

Comments
 (0)