From e4627b9576037cead3b4192cc17529892c03cea0 Mon Sep 17 00:00:00 2001 From: Jan Radon Date: Mon, 27 Apr 2026 08:53:23 +0200 Subject: [PATCH 1/3] feat(opslog): add bucket SLI metrics for Prometheus SLOs Introduce dedicated low-cardinality bucket SLI tracking for GET/LIST-style operations in the opslog producer. This adds a new TrackBucketSLO metrics option, wires it through CLI flags and environment configuration, and enables it automatically when track-everything is selected. When enabled, metrics updates now classify supported bucket operations, derive HTTP status classes, and emit dedicated Prometheus SLI request and latency metrics. Also document the new flag, environment variable, and exported metrics in the opslog README, and add tests covering SLI operation classification and metric emission. Promote github.com/prometheus/client_model to a direct dependency for the new metric assertions. --- go.mod | 2 +- pkg/commands/producer_ops_log.go | 37 ++++++++++------- pkg/producers/opslog/README.md | 16 ++++++++ pkg/producers/opslog/config.go | 4 +- pkg/producers/opslog/metrics.go | 9 ++++- pkg/producers/opslog/metrics_test.go | 56 ++++++++++++++++++++++++++ pkg/producers/opslog/prometheus.go | 5 +++ pkg/producers/opslog/prometheus_sli.go | 31 ++++++++++++++ pkg/producers/opslog/sli.go | 54 +++++++++++++++++++++++++ 9 files changed, 196 insertions(+), 18 deletions(-) create mode 100644 pkg/producers/opslog/prometheus_sli.go create mode 100644 pkg/producers/opslog/sli.go diff --git a/go.mod b/go.mod index 3a07b90..adc5e0d 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/nats-io/nats-server/v2 v2.12.4 github.com/nats-io/nats.go v1.49.0 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/rs/zerolog v1.34.0 github.com/sapcc/go-api-declarations v1.20.0 github.com/sapcc/go-bits v0.0.0-20260227115921-50512683ebc4 @@ -45,7 +46,6 @@ require ( github.com/nats-io/nuid v1.0.1 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.20.1 // indirect github.com/rabbitmq/amqp091-go v1.10.0 // indirect diff --git a/pkg/commands/producer_ops_log.go b/pkg/commands/producer_ops_log.go index fb9af7e..02e8906 100644 --- a/pkg/commands/producer_ops_log.go +++ b/pkg/commands/producer_ops_log.go @@ -39,6 +39,7 @@ var ( // Shortcut config opsTrackEverything bool + opsTrackBucketSLO bool // Request metrics flags opsTrackRequestsDetailed bool @@ -78,13 +79,13 @@ var ( opsTrackBytesReceivedPerTenant bool // Error metrics flags - opsTrackErrorsDetailed bool - opsTrackErrorsPerUser bool - opsTrackErrorsPerBucket bool - opsTrackErrorsPerTenant bool - opsTrackErrorsPerStatus bool - opsTrackErrorsByIP bool - opsTrackTimeoutErrors bool + opsTrackErrorsDetailed bool + opsTrackErrorsPerUser bool + opsTrackErrorsPerBucket bool + opsTrackErrorsPerTenant bool + opsTrackErrorsPerStatus bool + opsTrackErrorsByIP bool + opsTrackTimeoutErrors bool opsTrackErrorsByCategory bool // IP-based metrics flags @@ -148,6 +149,7 @@ Following this configuration change, the RadosGW will log operations to the file MetricsConfig: opslog.MetricsConfig{ // Shortcut config TrackEverything: opsTrackEverything, + TrackBucketSLO: opsTrackBucketSLO, // Request metrics TrackRequestsDetailed: opsTrackRequestsDetailed, @@ -187,13 +189,13 @@ Following this configuration change, the RadosGW will log operations to the file TrackBytesReceivedPerTenant: opsTrackBytesReceivedPerTenant, // Error metrics - TrackErrorsDetailed: opsTrackErrorsDetailed, - TrackErrorsPerUser: opsTrackErrorsPerUser, - TrackErrorsPerBucket: opsTrackErrorsPerBucket, - TrackErrorsPerTenant: opsTrackErrorsPerTenant, - TrackErrorsPerStatus: opsTrackErrorsPerStatus, - TrackTimeoutErrors: opsTrackTimeoutErrors, - TrackErrorsByCategory: opsTrackErrorsByCategory, + TrackErrorsDetailed: opsTrackErrorsDetailed, + TrackErrorsPerUser: opsTrackErrorsPerUser, + TrackErrorsPerBucket: opsTrackErrorsPerBucket, + TrackErrorsPerTenant: opsTrackErrorsPerTenant, + TrackErrorsPerStatus: opsTrackErrorsPerStatus, + TrackTimeoutErrors: opsTrackTimeoutErrors, + TrackErrorsByCategory: opsTrackErrorsByCategory, // IP-based metrics TrackRequestsByIPDetailed: opsTrackRequestsByIPDetailed, @@ -293,6 +295,11 @@ func debugTrackingConfig(event *zerolog.Event, config opslog.MetricsConfig) { return // Don't add individual flags if everything is enabled } + if config.TrackBucketSLO { + event.Bool("track_bucket_slo", true) + totalEnabled++ + } + // Request tracking requestMetrics := []string{} if config.TrackRequestsDetailed { @@ -564,6 +571,7 @@ func mergeOpsLogConfigWithEnv(cfg opslog.OpsLogConfig) opslog.OpsLogConfig { // Shortcut config cfg.MetricsConfig.TrackEverything = getEnvBool("TRACK_EVERYTHING", cfg.MetricsConfig.TrackEverything) + cfg.MetricsConfig.TrackBucketSLO = getEnvBool("TRACK_BUCKET_SLO", cfg.MetricsConfig.TrackBucketSLO) // Request metrics environment variables cfg.MetricsConfig.TrackRequestsDetailed = getEnvBool("TRACK_REQUESTS_DETAILED", cfg.MetricsConfig.TrackRequestsDetailed) @@ -662,6 +670,7 @@ func init() { // Shortcut flag opsLogCmd.Flags().BoolVar(&opsTrackEverything, "track-everything", false, "Enable detailed tracking for all metric types (efficient mode)") + opsLogCmd.Flags().BoolVar(&opsTrackBucketSLO, "track-bucket-slo", false, "Track low-cardinality bucket GET/LIST SLI metrics for Prometheus SLOs") // Essential request metrics (most commonly used) opsLogCmd.Flags().BoolVar(&opsTrackRequestsDetailed, "track-requests-detailed", false, "Track detailed requests with full labels") diff --git a/pkg/producers/opslog/README.md b/pkg/producers/opslog/README.md index 483bd54..a8f714f 100644 --- a/pkg/producers/opslog/README.md +++ b/pkg/producers/opslog/README.md @@ -65,6 +65,8 @@ prysm local-producer ops-log [flags] existing data. - `--track-everything` - Enable detailed tracking for all metric types (efficient mode). +- `--track-bucket-slo` - Enable low-cardinality bucket GET/LIST SLI metrics for + Prometheus SLOs. - `--track-timeout-errors` - Enable tracking of timeout errors (408, 504, 598, 499) for OSD issue detection. - `--track-errors-by-category` - Enable error categorization (timeout, @@ -141,6 +143,7 @@ prysm local-producer ops-log \ | `IGNORE_ANONYMOUS_REQUESTS` | Ignore anonymous requests in metrics. | | `TRUNCATE_LOG_ON_START` | Whether to rotate the log file on startup. | | `TRACK_EVERYTHING` | Enable detailed tracking for all metric types. | +| `TRACK_BUCKET_SLO` | Enable low-cardinality bucket GET/LIST SLI metrics. | | `AUDIT_ENABLED` | Enable RabbitMQ audit trail publishing. | | `AUDIT_RABBITMQ_URL` | RabbitMQ connection URL. | | `AUDIT_QUEUE_NAME` | RabbitMQ queue name for audit events. | @@ -237,6 +240,12 @@ prysm local-producer ops-log \ | `TRACK_LATENCY_PER_METHOD` | Track latency aggregated per HTTP method. | | `TRACK_LATENCY_PER_BUCKET_AND_METHOD` | Track latency by bucket and method combination. | +#### SLI Tracking Environment Variables: + +| Variable | Description | +|-----------------------------------------------|----------------------------------------------------------------| +| `TRACK_BUCKET_SLO` | Track low-cardinality bucket GET/LIST request SLI metrics for Prometheus SLOs. | + ## Metrics Collected ### Request Counters @@ -339,6 +348,13 @@ prysm local-producer ops-log \ | `radosgw_requests_duration_per_method` | Histogram | `method` | Histogram for request latencies aggregated per method (global). | | `radosgw_requests_duration_per_bucket_and_method` | Histogram | `tenant`, `bucket`, `method` | Histogram for request latencies aggregated per bucket and method (all users combined). | +### Bucket SLI Metrics + +| Metric Name | Type | Labels | Description | +|-----------------------------------------------|-----------|---------------------------------------------|--------------------------------------------------------------------| +| `radosgw_bucket_sli_requests_total` | Counter | `tenant`, `bucket`, `operation`, `status_class` | Low-cardinality bucket SLI request counter for GET/LIST-style operations, labeled by response class such as `2xx` or `5xx`. | +| `radosgw_bucket_sli_request_duration_seconds` | Histogram | `tenant`, `bucket`, `operation` | Latency histogram in seconds for bucket GET/LIST SLI operations, intended for Prometheus SLO evaluation. | + > **Note**: Histogram metrics do **not** include the `pod` label to reduce > cardinality. Each histogram automatically provides `_bucket`, `_count`, and > `_sum` metrics for comprehensive latency analysis. diff --git a/pkg/producers/opslog/config.go b/pkg/producers/opslog/config.go index 61674a9..0255317 100644 --- a/pkg/producers/opslog/config.go +++ b/pkg/producers/opslog/config.go @@ -10,7 +10,7 @@ type AuditSinkConfig struct { RabbitMQURL string `mapstructure:"rabbitmq_url"` QueueName string `mapstructure:"queue_name"` InternalQueueSize int `mapstructure:"internal_queue_size"` // Optional, defaults to 20 - Debug bool `mapstructure:"debug"` // Log published events + Debug bool `mapstructure:"debug"` // Log published events } type OpsLogConfig struct { @@ -38,6 +38,7 @@ type OpsLogConfig struct { type MetricsConfig struct { // === SHORTCUT CONFIGS === TrackEverything bool `yaml:"track_everything"` // Enables all metrics at all levels + TrackBucketSLO bool `yaml:"track_bucket_slo"` // Dedicated low-cardinality GET/LIST SLI metrics for Prometheus SLOs // === REQUEST METRICS === // Total requests @@ -120,6 +121,7 @@ func (c *MetricsConfig) ApplyShortcuts() { if c.TrackEverything { // Enable only detailed metrics - aggregations can be done in Prometheus queries // This is the most efficient approach with lowest cardinality + c.TrackBucketSLO = true c.TrackRequestsDetailed = true c.TrackRequestsByMethodDetailed = true c.TrackRequestsByOperationDetailed = true diff --git a/pkg/producers/opslog/metrics.go b/pkg/producers/opslog/metrics.go index 86069e4..17a3148 100644 --- a/pkg/producers/opslog/metrics.go +++ b/pkg/producers/opslog/metrics.go @@ -262,11 +262,11 @@ func (m *Metrics) ToJSON(metricsConfig *MetricsConfig) ([]byte, error) { if metricsConfig.TrackErrorsByIP { data["errors_per_ip"] = loadSyncMap(&m.ErrorsPerIP) } - + if metricsConfig.TrackTimeoutErrors { data["timeout_errors"] = loadSyncMap(&m.TimeoutErrors) } - + if metricsConfig.TrackErrorsByCategory { data["errors_by_category"] = loadSyncMap(&m.ErrorsByCategory) } @@ -282,6 +282,11 @@ func (m *Metrics) Update(logEntry S3OperationLog, metricsConfig *MetricsConfig) method := ExtractHTTPMethod(logEntry.URI) userStr, tenantStr := extractUserAndTenant(logEntry.User) + + if metricsConfig.TrackBucketSLO { + observeBucketSLI(logEntry, tenantStr) + } + if metricsConfig.TrackRequestsDetailed { key := logEntry.User + "|" + logEntry.Bucket + "|" + method + "|" + logEntry.HTTPStatus incrementSyncMap(&m.RequestsDetailed, key) diff --git a/pkg/producers/opslog/metrics_test.go b/pkg/producers/opslog/metrics_test.go index a2141e5..045ae06 100644 --- a/pkg/producers/opslog/metrics_test.go +++ b/pkg/producers/opslog/metrics_test.go @@ -4,6 +4,8 @@ import ( "sync/atomic" "testing" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" ) @@ -261,6 +263,60 @@ func TestMetricsUpdate_BasicFunctionality(t *testing.T) { assert.Equal(t, 1, latencyCallCount, "LatencyObs should be called once") } +func TestClassifyBucketSLOOperation(t *testing.T) { + testCases := []struct { + name string + operation string + expected SLOperation + ok bool + }{ + {name: "get object", operation: "get_obj", expected: SLOperationGet, ok: true}, + {name: "head object", operation: "head_obj", expected: SLOperationGet, ok: true}, + {name: "list bucket", operation: "list_bucket", expected: SLOperationList, ok: true}, + {name: "list buckets", operation: "list_buckets", expected: SLOperationList, ok: true}, + {name: "bucket info", operation: "get_bucket_info", expected: SLOperationList, ok: true}, + {name: "unsupported", operation: "put_obj", expected: "", ok: false}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual, ok := classifyBucketSLOOperation(tc.operation) + assert.Equal(t, tc.ok, ok) + assert.Equal(t, tc.expected, actual) + }) + } +} + +func TestMetricsUpdate_TrackBucketSLO(t *testing.T) { + config := &MetricsConfig{TrackBucketSLO: true} + logEntry := S3OperationLog{ + Bucket: "bucket-a", + User: "alice$tenant-a", + Operation: "get_obj", + HTTPStatus: "200", + TotalTime: 150, + } + + before := readCounterValue(t, sliRequestsTotal, "tenant-a", "bucket-a", "get", "2xx") + assert.NotPanics(t, func() { + NewMetrics().Update(logEntry, config) + }) + after := readCounterValue(t, sliRequestsTotal, "tenant-a", "bucket-a", "get", "2xx") + + assert.Equal(t, before+1, after) +} + +func readCounterValue(t *testing.T, counter *prometheus.CounterVec, labelValues ...string) float64 { + t.Helper() + + metric, err := counter.GetMetricWithLabelValues(labelValues...) + assert.NoError(t, err) + + dtoMetric := &dto.Metric{} + assert.NoError(t, metric.Write(dtoMetric)) + return dtoMetric.GetCounter().GetValue() +} + func TestMetricsUpdate_ErrorTracking(t *testing.T) { config := &MetricsConfig{ TrackErrorsDetailed: true, diff --git a/pkg/producers/opslog/prometheus.go b/pkg/producers/opslog/prometheus.go index 022968c..4cdffc0 100644 --- a/pkg/producers/opslog/prometheus.go +++ b/pkg/producers/opslog/prometheus.go @@ -51,6 +51,11 @@ func initPrometheusSettings(cfg *OpsLogConfig) { // Register latency metrics and set up LatencyObs function registerLatencyMetrics(metricsConfig) + // Register dedicated low-cardinality SLI metrics for bucket GET/LIST SLOs + if metricsConfig.TrackBucketSLO { + registerSLIMetrics() + } + // Set up the global LatencyObs function LatencyObs = latencyObs } diff --git a/pkg/producers/opslog/prometheus_sli.go b/pkg/producers/opslog/prometheus_sli.go new file mode 100644 index 0000000..7d887bc --- /dev/null +++ b/pkg/producers/opslog/prometheus_sli.go @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and prysm contributors +// +// SPDX-License-Identifier: Apache-2.0 + +package opslog + +import "github.com/prometheus/client_golang/prometheus" + +var ( + sliRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "radosgw_bucket_sli_requests_total", + Help: "Bucket GET/LIST requests for SLI and SLO evaluation", + }, + []string{"tenant", "bucket", "operation", "status_class"}, + ) + + sliRequestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "radosgw_bucket_sli_request_duration_seconds", + Help: "Latency histogram for bucket GET/LIST requests used for SLI and SLO evaluation", + Buckets: []float64{0.05, 0.1, 0.2, 0.3, 0.5, 1, 2, 5, 10}, + }, + []string{"tenant", "bucket", "operation"}, + ) +) + +func registerSLIMetrics() { + prometheus.MustRegister(sliRequestsTotal) + prometheus.MustRegister(sliRequestDuration) +} diff --git a/pkg/producers/opslog/sli.go b/pkg/producers/opslog/sli.go new file mode 100644 index 0000000..9fc2673 --- /dev/null +++ b/pkg/producers/opslog/sli.go @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and prysm contributors +// +// SPDX-License-Identifier: Apache-2.0 + +package opslog + +import "strings" + +type SLOperation string + +const ( + SLOperationGet SLOperation = "get" + SLOperationList SLOperation = "list" +) + +func classifyBucketSLOOperation(operation string) (SLOperation, bool) { + switch strings.ToLower(operation) { + case "get_obj", "head_obj": + return SLOperationGet, true + case "list_bucket", "list_buckets", "get_bucket_info": + return SLOperationList, true + default: + return "", false + } +} + +func statusClass(status string) string { + if len(status) == 0 { + return "unknown" + } + return status[:1] + "xx" +} + +func observeBucketSLI(logEntry S3OperationLog, tenant string) { + sloOperation, ok := classifyBucketSLOOperation(logEntry.Operation) + if !ok || logEntry.Bucket == "" { + return + } + + sliRequestsTotal.WithLabelValues( + tenant, + logEntry.Bucket, + string(sloOperation), + statusClass(logEntry.HTTPStatus), + ).Inc() + + if logEntry.TotalTime > 0 { + sliRequestDuration.WithLabelValues( + tenant, + logEntry.Bucket, + string(sloOperation), + ).Observe(float64(logEntry.TotalTime) / 1000.0) + } +} From 351315b389813a64099edd20846f2c7aab0c3e2a Mon Sep 17 00:00:00 2001 From: Jan Radon Date: Mon, 27 Apr 2026 09:10:05 +0200 Subject: [PATCH 2/3] fix(opslog): bound SLI status labels to valid HTTP status classes Prevent malformed opslog HTTP status values from creating unexpected Prometheus label values in bucket SLI metrics. Instead of blindly deriving the label from the first byte, status classification now returns when the status is empty or does not start with a digit. This keeps SLI metric cardinality bounded for invalid upstream values such as strings or whitespace-prefixed statuses. Add unit tests covering valid statuses and malformed inputs. --- pkg/producers/opslog/metrics_test.go | 20 ++++++++++++++++++++ pkg/producers/opslog/sli.go | 3 +++ 2 files changed, 23 insertions(+) diff --git a/pkg/producers/opslog/metrics_test.go b/pkg/producers/opslog/metrics_test.go index 045ae06..8f39488 100644 --- a/pkg/producers/opslog/metrics_test.go +++ b/pkg/producers/opslog/metrics_test.go @@ -287,6 +287,26 @@ func TestClassifyBucketSLOOperation(t *testing.T) { } } +func TestStatusClass(t *testing.T) { + testCases := []struct { + name string + status string + expected string + }{ + {name: "success", status: "200", expected: "2xx"}, + {name: "server error", status: "503", expected: "5xx"}, + {name: "empty", status: "", expected: "unknown"}, + {name: "alpha", status: "ok", expected: "unknown"}, + {name: "leading whitespace", status: " 200", expected: "unknown"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, statusClass(tc.status)) + }) + } +} + func TestMetricsUpdate_TrackBucketSLO(t *testing.T) { config := &MetricsConfig{TrackBucketSLO: true} logEntry := S3OperationLog{ diff --git a/pkg/producers/opslog/sli.go b/pkg/producers/opslog/sli.go index 9fc2673..552e621 100644 --- a/pkg/producers/opslog/sli.go +++ b/pkg/producers/opslog/sli.go @@ -28,6 +28,9 @@ func statusClass(status string) string { if len(status) == 0 { return "unknown" } + if status[0] < '0' || status[0] > '9' { + return "unknown" + } return status[:1] + "xx" } From 6ee9f15b5851c0510f73f66016cb852eae9fc66f Mon Sep 17 00:00:00 2001 From: "senol.colak" Date: Fri, 8 May 2026 20:51:00 +0200 Subject: [PATCH 3/3] docs(opslog): align --track-bucket-slo descriptions with CLI help text The CLI flag uses "Track" but the README used "Enable" in two places. Align to "Track" for consistency with other TRACK_* flag descriptions. --- pkg/producers/opslog/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/producers/opslog/README.md b/pkg/producers/opslog/README.md index a8f714f..a143b56 100644 --- a/pkg/producers/opslog/README.md +++ b/pkg/producers/opslog/README.md @@ -65,7 +65,7 @@ prysm local-producer ops-log [flags] existing data. - `--track-everything` - Enable detailed tracking for all metric types (efficient mode). -- `--track-bucket-slo` - Enable low-cardinality bucket GET/LIST SLI metrics for +- `--track-bucket-slo` - Track low-cardinality bucket GET/LIST SLI metrics for Prometheus SLOs. - `--track-timeout-errors` - Enable tracking of timeout errors (408, 504, 598, 499) for OSD issue detection. @@ -143,7 +143,7 @@ prysm local-producer ops-log \ | `IGNORE_ANONYMOUS_REQUESTS` | Ignore anonymous requests in metrics. | | `TRUNCATE_LOG_ON_START` | Whether to rotate the log file on startup. | | `TRACK_EVERYTHING` | Enable detailed tracking for all metric types. | -| `TRACK_BUCKET_SLO` | Enable low-cardinality bucket GET/LIST SLI metrics. | +| `TRACK_BUCKET_SLO` | Track low-cardinality bucket GET/LIST SLI metrics. | | `AUDIT_ENABLED` | Enable RabbitMQ audit trail publishing. | | `AUDIT_RABBITMQ_URL` | RabbitMQ connection URL. | | `AUDIT_QUEUE_NAME` | RabbitMQ queue name for audit events. |