Add histogram unit guesser

vikin91 · vikin91 · commit 9f71401fb3d7 · 2026-03-13T13:50:17.000+01:00
diff --git a/internal/evaluator/evaluator_test.go b/internal/evaluator/evaluator_test.go
@@ -554,4 +554,80 @@ func TestEvaluateHistogramInfOverflow(t *testing.T) {
 			t.Fatalf("expected 0 results, got %d", len(results))
 		}
 	})
+
+	t.Run("guesses unit from metric name for +Inf output", func(t *testing.T) {
+		metrics := parser.MetricsData{
+			"test_duration_seconds_bucket": &parser.Metric{
+				Name: "test_duration_seconds_bucket",
+				Type: "histogram",
+				Values: []parser.MetricValue{
+					{Value: 10, Labels: map[string]string{"le": "512"}},
+					{Value: 100, Labels: map[string]string{"le": "+Inf"}},
+				},
+			},
+		}
+
+		results := EvaluateHistogramInfOverflow(metrics)
+		if len(results) != 1 {
+			t.Fatalf("expected 1 result, got %d", len(results))
+		}
+		if !strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 s") {
+			t.Fatalf("expected guessed seconds unit in message, got: %s", results[0].Message)
+		}
+		details := strings.Join(results[0].Details, "\n")
+		if !strings.Contains(details, "Highest non-infinity bucket: 512 s") {
+			t.Fatalf("expected guessed seconds unit in details, got: %s", details)
+		}
+	})
+
+	t.Run("uses help text only when unit is unambiguous", func(t *testing.T) {
+		metrics := parser.MetricsData{
+			"mystery_metric": &parser.Metric{
+				Name: "mystery_metric",
+				Help: "Time taken in milliseconds for processing",
+				Type: "histogram",
+			},
+			"mystery_metric_bucket": &parser.Metric{
+				Name: "mystery_metric_bucket",
+				Type: "histogram",
+				Values: []parser.MetricValue{
+					{Value: 10, Labels: map[string]string{"le": "512"}},
+					{Value: 100, Labels: map[string]string{"le": "+Inf"}},
+				},
+			},
+		}
+		results := EvaluateHistogramInfOverflow(metrics)
+		if len(results) != 1 {
+			t.Fatalf("expected 1 result, got %d", len(results))
+		}
+		if !strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 ms") {
+			t.Fatalf("expected guessed milliseconds unit in message, got: %s", results[0].Message)
+		}
+	})
+
+	t.Run("does not use help text when multiple units are present", func(t *testing.T) {
+		metrics := parser.MetricsData{
+			"mystery_metric": &parser.Metric{
+				Name: "mystery_metric",
+				Help: "Latency shown in milliseconds and seconds for compatibility",
+				Type: "histogram",
+			},
+			"mystery_metric_bucket": &parser.Metric{
+				Name: "mystery_metric_bucket",
+				Type: "histogram",
+				Values: []parser.MetricValue{
+					{Value: 10, Labels: map[string]string{"le": "512"}},
+					{Value: 100, Labels: map[string]string{"le": "+Inf"}},
+				},
+			},
+		}
+		results := EvaluateHistogramInfOverflow(metrics)
+		if len(results) != 1 {
+			t.Fatalf("expected 1 result, got %d", len(results))
+		}
+		if strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 ms") ||
+			strings.Contains(results[0].Message, "Highest non-infinity bucket: 512 s") {
+			t.Fatalf("expected no guessed unit in ambiguous help text, got: %s", results[0].Message)
+		}
+	})
 }
diff --git a/internal/evaluator/histogram.go b/internal/evaluator/histogram.go
@@ -143,6 +143,7 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
 		return nil
 	}
 	metricHelp := resolveMetricHelp(baseName, metrics)
+	guessedUnit := guessMetricUnit(baseName, metricHelp)
 
 	// Group buckets by label combination (excluding "le" label)
 	// Each label combination represents a separate time series
@@ -252,14 +253,14 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
 			Timestamp:    time.Now(),
 			ReviewStatus: "Automatically generated rule; reviewed by the code author at the time of implementation.",
 			PotentialActionUser: fmt.Sprintf("Further investigation is required to understand why values exceed %s. "+
-				"Check if there are other alerts for this specific metric with more precise context.", formatHumanNumber(eval.highestFiniteLe)),
+				"Check if there are other alerts for this specific metric with more precise context.", formatHistogramValue(eval.highestFiniteLe, guessedUnit)),
 			PotentialActionDeveloper: "Review code paths and metric instrumentation to confirm whether observed latencies are expected.",
 		}
 		result.Details = append(result.Details,
 			"Total Number of Observations: "+formatHumanNumber(eval.totalCount),
 			"Observations in +Inf bucket: "+formatHumanNumber(eval.infObservations),
 			"Percentage of observations in +Inf bucket: "+formatHumanNumber(eval.infPercentage)+" %",
-			"Highest non-infinity bucket: "+formatHumanNumber(eval.highestFiniteLe)+" unit",
+			"Highest non-infinity bucket: "+formatHistogramValue(eval.highestFiniteLe, guessedUnit),
 		)
 		result.Message = fmt.Sprintf("%s%% of observations are in +Inf bucket (%s out of %s). "+
 			"This indicates the metric designer likely didn't expect the values to be so high. "+
@@ -268,7 +269,7 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
 			formatHumanNumber(eval.infPercentage),
 			formatHumanNumber(eval.infObservations),
 			formatHumanNumber(eval.totalCount),
-			formatHumanNumber(eval.highestFiniteLe))
+			formatHistogramValue(eval.highestFiniteLe, guessedUnit))
 		results = append(results, result)
 	}
 
@@ -294,10 +295,10 @@ func evaluateSingleHistogramInfOverflow(baseName string, metrics parser.MetricsD
 		"Total Number of Observations: "+formatHumanNumber(worstOverall.totalCount),
 		"Observations in +Inf bucket: "+formatHumanNumber(worstOverall.infObservations),
 		"Percentage of observations in +Inf bucket: "+formatHumanNumber(worstOverall.infPercentage)+" %",
-		"Highest non-infinity bucket: "+formatHumanNumber(worstOverall.highestFiniteLe)+" unit",
+		"Highest non-infinity bucket: "+formatHistogramValue(worstOverall.highestFiniteLe, guessedUnit),
 	)
 	greenResult.Message = fmt.Sprintf("%s%% of observations in +Inf bucket (acceptable). Highest non-infinity bucket: %s",
-		formatHumanNumber(worstOverall.infPercentage), formatHumanNumber(worstOverall.highestFiniteLe))
+		formatHumanNumber(worstOverall.infPercentage), formatHistogramValue(worstOverall.highestFiniteLe, guessedUnit))
 	return []rules.EvaluationResult{greenResult}
 }
 
diff --git a/internal/evaluator/unit_guess.go b/internal/evaluator/unit_guess.go
@@ -0,0 +1,92 @@
+package evaluator
+
+import (
+	"regexp"
+	"strings"
+)
+
+var (
+	// unitTokenToCanonical maps a lower-case name token to its canonical unit.
+	unitTokenToCanonical = map[string]string{
+		"seconds":      "seconds",
+		"second":       "seconds",
+		"s":            "seconds",
+		"sec":          "seconds",
+		"secs":         "seconds",
+		"milliseconds": "milliseconds",
+		"millisecond":  "milliseconds",
+		"ms":           "milliseconds",
+		"msec":         "milliseconds",
+		"msecs":        "milliseconds",
+		"millis":       "milliseconds",
+		"bytes":        "bytes",
+		"byte":         "bytes",
+	}
+	helpUnitMatchers = map[string]*regexp.Regexp{ // canonical unit -> case-insensitive whole-word HELP matcher
+		"milliseconds": regexp.MustCompile(`(?i)\bmilliseconds?\b|\bms\b|\bmsecs?\b|\bmillis\b`),
+		"seconds":      regexp.MustCompile(`(?i)\bseconds?\b|\bsecs?\b`),
+		"bytes":        regexp.MustCompile(`(?i)\bbytes?\b`),
+	}
+)
+
+// guessMetricUnit returns the unit implied by the metric name, else by HELP text, else "".
+func guessMetricUnit(metricName, helpText string) string {
+	unit := guessUnitFromMetricName(metricName)
+	if unit == "" {
+		unit = guessUnitFromHelpText(helpText)
+	}
+	return unit
+}
+
+// guessUnitFromMetricName infers a canonical unit from the trailing
+// "_"-separated tokens of metricName, after trimming and lower-casing it.
+//
+// Tokens are looked up in unitTokenToCanonical in this order:
+//
+//  1. the token before a trailing "total" (e.g. *_seconds_total);
+//  2. the last token (e.g. *_seconds, *_bytes, *_timestamp_seconds).
+//
+// It returns "" for an empty name or when neither token is a known unit.
+func guessUnitFromMetricName(metricName string) string {
+	metricName = strings.TrimSpace(strings.ToLower(metricName))
+	if metricName == "" {
+		return ""
+	}
+
+	// strings.Split never returns an empty slice for a non-empty separator,
+	// so the last token always exists.
+	parts := strings.Split(metricName, "_")
+	last := parts[len(parts)-1]
+
+	// Prometheus best-practice suffixes like *_seconds_total.
+	if len(parts) >= 2 && last == "total" {
+		if unit, ok := unitTokenToCanonical[parts[len(parts)-2]]; ok {
+			return unit
+		}
+	}
+
+	// Common suffixes like *_seconds and *_bytes. This also covers
+	// *_timestamp_seconds, so no separate timestamp branch is needed.
+	// A missing key yields the zero value "", i.e. "unknown".
+	return unitTokenToCanonical[last]
+}
+
+// guessUnitFromHelpText returns the one canonical unit whose matcher in
+// helpUnitMatchers hits helpText, or "" if zero or several units match.
+func guessUnitFromHelpText(helpText string) string {
+	text := strings.TrimSpace(helpText)
+	if text == "" {
+		return ""
+	}
+
+	found, hits := "", 0
+	for candidate, re := range helpUnitMatchers {
+		if re.MatchString(text) {
+			found, hits = candidate, hits+1
+		}
+	}
+	if hits != 1 {
+		return "" // zero or several units: HELP text is not conclusive
+	}
+	return found
+}
diff --git a/internal/evaluator/unit_guess_test.go b/internal/evaluator/unit_guess_test.go
@@ -0,0 +1,112 @@
+package evaluator
+
+import "testing"
+
+// TestGuessUnitFromMetricName covers unit inference from metric-name suffixes:
+// plain units, *_total, *_timestamp_*, short aliases, and names with no unit.
+func TestGuessUnitFromMetricName(t *testing.T) {
+	cases := []struct{ name, metricName, want string }{
+		// Prometheus-style suffixes.
+		{"seconds suffix", "http_request_duration_seconds", "seconds"},
+		{"bytes suffix", "container_memory_usage_bytes", "bytes"},
+		{"seconds total suffix", "process_cpu_seconds_total", "seconds"},
+		{"timestamp seconds suffix", "last_seen_timestamp_seconds", "seconds"},
+		// Names the case labels describe as real metrics.
+		{"real metric histogram bytes", "http_incoming_request_size_histogram_bytes", "bytes"},
+		{"real metric bytes total", "go_memstats_alloc_bytes_total", "bytes"},
+		{"real metric duration milliseconds", "rox_sensor_scan_call_duration_milliseconds", "milliseconds"},
+		{"real metric purger duration seconds", "rox_sensor_network_flow_manager_purger_duration_seconds", "seconds"},
+		{"real metric vm report duration milliseconds", "rox_sensor_virtual_machine_index_report_processing_duration_milliseconds", "milliseconds"},
+		// Short aliases, then inputs that must yield no unit.
+		{"sec alias suffix", "request_duration_sec", "seconds"},
+		{"msec alias suffix", "request_duration_msec", "milliseconds"},
+		{"no known suffix", "rox_sensor_events", ""},
+		{"empty metric name", "", ""},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := guessUnitFromMetricName(tc.metricName); got != tc.want {
+				t.Fatalf("guessUnitFromMetricName(%q) = %q, want %q", tc.metricName, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestGuessUnitFromHelpText covers unit detection in HELP text, including
+// aliases and texts that mention several units or none.
+func TestGuessUnitFromHelpText(t *testing.T) {
+	cases := []struct{ name, helpText, want string }{
+		// Exactly one unit is mentioned.
+		{"milliseconds mention", "Time taken in milliseconds to process events", "milliseconds"},
+		{"seconds mention", "Duration in seconds", "seconds"},
+		{"seconds short sec mention", "Duration in sec", "seconds"},
+		{"bytes mention", "Payload size in bytes", "bytes"},
+		// Aliases and singular forms.
+		{"milliseconds alias msec", "Call took 32 msec", "milliseconds"},
+		{"milliseconds alias millis", "Observed latency in millis", "milliseconds"},
+		{"byte singular mention", "Equals to /memory/classes/total:byte.", "bytes"},
+		// Several units, or none at all, must yield "".
+		{"ambiguous mentions", "Latency in milliseconds and seconds", ""},
+		{"ambiguous time and bytes mentions", "Duration in seconds with payload bytes", ""},
+		{"no known units", "Number of retries", ""},
+		{"empty help text", "", ""},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := guessUnitFromHelpText(tc.helpText); got != tc.want {
+				t.Fatalf("guessUnitFromHelpText(%q) = %q, want %q", tc.helpText, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestGuessMetricUnit covers how guessMetricUnit combines its two sources:
+// a unit found in the metric name is returned as-is, HELP text is used when
+// the name has none, and "" is returned when neither names a unit.
+func TestGuessMetricUnit(t *testing.T) {
+	cases := []struct{ name, metricName, helpText, want string }{
+		// The name's unit takes precedence over HELP.
+		{
+			name:       "metric name wins over help text",
+			metricName: "process_cpu_seconds_total",
+			helpText:   "CPU usage in milliseconds",
+			want:       "seconds",
+		},
+		// HELP is consulted when the name carries no unit.
+		{
+			name:       "falls back to help text when name has no unit",
+			metricName: "rox_central_event_processing",
+			helpText:   "Time taken in milliseconds",
+			want:       "milliseconds",
+		},
+		{
+			name:       "real cluster duration without unit in name uses help",
+			metricName: "rox_sensor_k8s_event_processing_duration",
+			helpText:   "Time spent fully processing an event from Kubernetes in milliseconds",
+			want:       "milliseconds",
+		},
+		// Neither source names a unit.
+		{
+			name:       "returns empty when both sources fail",
+			metricName: "rox_sensor_events",
+			helpText:   "Count of events",
+			want:       "",
+		},
+		{
+			name:       "real metric without unit in name or help",
+			metricName: "rox_sensor_k8s_event_processing_duration",
+			helpText:   "Time taken to fully process an event from Kubernetes",
+			want:       "",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := guessMetricUnit(tc.metricName, tc.helpText); got != tc.want {
+				t.Fatalf("guessMetricUnit(%q, %q) = %q, want %q", tc.metricName, tc.helpText, got, tc.want)
+			}
+		})
+	}
+}