diff --git a/go.mod b/go.mod index 3a07b90..adc5e0d 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/nats-io/nats-server/v2 v2.12.4 github.com/nats-io/nats.go v1.49.0 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/rs/zerolog v1.34.0 github.com/sapcc/go-api-declarations v1.20.0 github.com/sapcc/go-bits v0.0.0-20260227115921-50512683ebc4 @@ -45,7 +46,6 @@ require ( github.com/nats-io/nuid v1.0.1 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.20.1 // indirect github.com/rabbitmq/amqp091-go v1.10.0 // indirect diff --git a/pkg/commands/producer_ops_log.go b/pkg/commands/producer_ops_log.go index fb9af7e..02e8906 100644 --- a/pkg/commands/producer_ops_log.go +++ b/pkg/commands/producer_ops_log.go @@ -39,6 +39,7 @@ var ( // Shortcut config opsTrackEverything bool + opsTrackBucketSLO bool // Request metrics flags opsTrackRequestsDetailed bool @@ -78,13 +79,13 @@ var ( opsTrackBytesReceivedPerTenant bool // Error metrics flags - opsTrackErrorsDetailed bool - opsTrackErrorsPerUser bool - opsTrackErrorsPerBucket bool - opsTrackErrorsPerTenant bool - opsTrackErrorsPerStatus bool - opsTrackErrorsByIP bool - opsTrackTimeoutErrors bool + opsTrackErrorsDetailed bool + opsTrackErrorsPerUser bool + opsTrackErrorsPerBucket bool + opsTrackErrorsPerTenant bool + opsTrackErrorsPerStatus bool + opsTrackErrorsByIP bool + opsTrackTimeoutErrors bool opsTrackErrorsByCategory bool // IP-based metrics flags @@ -148,6 +149,7 @@ Following this configuration change, the RadosGW will log operations to the file MetricsConfig: opslog.MetricsConfig{ // Shortcut config TrackEverything: opsTrackEverything, + TrackBucketSLO: opsTrackBucketSLO, // Request metrics TrackRequestsDetailed: opsTrackRequestsDetailed, @@ -187,13 +189,13 @@ Following this configuration change, the RadosGW will log operations to the file TrackBytesReceivedPerTenant: opsTrackBytesReceivedPerTenant, // Error metrics - TrackErrorsDetailed: opsTrackErrorsDetailed, - TrackErrorsPerUser: opsTrackErrorsPerUser, - TrackErrorsPerBucket: opsTrackErrorsPerBucket, - TrackErrorsPerTenant: opsTrackErrorsPerTenant, - TrackErrorsPerStatus: opsTrackErrorsPerStatus, - TrackTimeoutErrors: opsTrackTimeoutErrors, - TrackErrorsByCategory: opsTrackErrorsByCategory, + TrackErrorsDetailed: opsTrackErrorsDetailed, + TrackErrorsPerUser: opsTrackErrorsPerUser, + TrackErrorsPerBucket: opsTrackErrorsPerBucket, + TrackErrorsPerTenant: opsTrackErrorsPerTenant, + TrackErrorsPerStatus: opsTrackErrorsPerStatus, + TrackTimeoutErrors: opsTrackTimeoutErrors, + TrackErrorsByCategory: opsTrackErrorsByCategory, // IP-based metrics TrackRequestsByIPDetailed: opsTrackRequestsByIPDetailed, @@ -293,6 +295,11 @@ func debugTrackingConfig(event *zerolog.Event, config opslog.MetricsConfig) { return // Don't add individual flags if everything is enabled } + if config.TrackBucketSLO { + event.Bool("track_bucket_slo", true) + totalEnabled++ + } + // Request tracking requestMetrics := []string{} if config.TrackRequestsDetailed { @@ -564,6 +571,7 @@ func mergeOpsLogConfigWithEnv(cfg opslog.OpsLogConfig) opslog.OpsLogConfig { // Shortcut config cfg.MetricsConfig.TrackEverything = getEnvBool("TRACK_EVERYTHING", cfg.MetricsConfig.TrackEverything) + cfg.MetricsConfig.TrackBucketSLO = getEnvBool("TRACK_BUCKET_SLO", cfg.MetricsConfig.TrackBucketSLO) // Request metrics environment variables cfg.MetricsConfig.TrackRequestsDetailed = getEnvBool("TRACK_REQUESTS_DETAILED", cfg.MetricsConfig.TrackRequestsDetailed) @@ -662,6 +670,7 @@ func init() { // Shortcut flag opsLogCmd.Flags().BoolVar(&opsTrackEverything, "track-everything", false, "Enable detailed tracking for all metric types (efficient mode)") + opsLogCmd.Flags().BoolVar(&opsTrackBucketSLO, "track-bucket-slo", false, "Track low-cardinality bucket GET/LIST SLI metrics for Prometheus SLOs") // Essential request metrics (most commonly used) opsLogCmd.Flags().BoolVar(&opsTrackRequestsDetailed, "track-requests-detailed", false, "Track detailed requests with full labels") diff --git a/pkg/producers/opslog/README.md b/pkg/producers/opslog/README.md index 483bd54..a143b56 100644 --- a/pkg/producers/opslog/README.md +++ b/pkg/producers/opslog/README.md @@ -65,6 +65,8 @@ prysm local-producer ops-log [flags] existing data. - `--track-everything` - Enable detailed tracking for all metric types (efficient mode). +- `--track-bucket-slo` - Track low-cardinality bucket GET/LIST SLI metrics for + Prometheus SLOs. - `--track-timeout-errors` - Enable tracking of timeout errors (408, 504, 598, 499) for OSD issue detection. - `--track-errors-by-category` - Enable error categorization (timeout, @@ -141,6 +143,7 @@ prysm local-producer ops-log \ | `IGNORE_ANONYMOUS_REQUESTS` | Ignore anonymous requests in metrics. | | `TRUNCATE_LOG_ON_START` | Whether to rotate the log file on startup. | | `TRACK_EVERYTHING` | Enable detailed tracking for all metric types. | +| `TRACK_BUCKET_SLO` | Track low-cardinality bucket GET/LIST SLI metrics. | | `AUDIT_ENABLED` | Enable RabbitMQ audit trail publishing. | | `AUDIT_RABBITMQ_URL` | RabbitMQ connection URL. | | `AUDIT_QUEUE_NAME` | RabbitMQ queue name for audit events. | @@ -237,6 +240,12 @@ prysm local-producer ops-log \ | `TRACK_LATENCY_PER_METHOD` | Track latency aggregated per HTTP method. | | `TRACK_LATENCY_PER_BUCKET_AND_METHOD` | Track latency by bucket and method combination. | +#### SLI Tracking Environment Variables: + +| Variable | Description | +|-----------------------------------------------|----------------------------------------------------------------| +| `TRACK_BUCKET_SLO` | Track low-cardinality bucket GET/LIST request SLI metrics for Prometheus SLOs. | + ## Metrics Collected ### Request Counters @@ -339,6 +348,13 @@ prysm local-producer ops-log \ | `radosgw_requests_duration_per_method` | Histogram | `method` | Histogram for request latencies aggregated per method (global). | | `radosgw_requests_duration_per_bucket_and_method` | Histogram | `tenant`, `bucket`, `method` | Histogram for request latencies aggregated per bucket and method (all users combined). | +### Bucket SLI Metrics + +| Metric Name | Type | Labels | Description | +|-----------------------------------------------|-----------|---------------------------------------------|--------------------------------------------------------------------| +| `radosgw_bucket_sli_requests_total` | Counter | `tenant`, `bucket`, `operation`, `status_class` | Low-cardinality bucket SLI request counter for GET/LIST-style operations, labeled by response class such as `2xx` or `5xx`. | +| `radosgw_bucket_sli_request_duration_seconds` | Histogram | `tenant`, `bucket`, `operation` | Latency histogram in seconds for bucket GET/LIST SLI operations, intended for Prometheus SLO evaluation. | + > **Note**: Histogram metrics do **not** include the `pod` label to reduce > cardinality. Each histogram automatically provides `_bucket`, `_count`, and > `_sum` metrics for comprehensive latency analysis. diff --git a/pkg/producers/opslog/config.go b/pkg/producers/opslog/config.go index 61674a9..0255317 100644 --- a/pkg/producers/opslog/config.go +++ b/pkg/producers/opslog/config.go @@ -10,7 +10,7 @@ type AuditSinkConfig struct { RabbitMQURL string `mapstructure:"rabbitmq_url"` QueueName string `mapstructure:"queue_name"` InternalQueueSize int `mapstructure:"internal_queue_size"` // Optional, defaults to 20 - Debug bool `mapstructure:"debug"` // Log published events + Debug bool `mapstructure:"debug"` // Log published events } type OpsLogConfig struct { @@ -38,6 +38,7 @@ type OpsLogConfig struct { type MetricsConfig struct { // === SHORTCUT CONFIGS === TrackEverything bool `yaml:"track_everything"` // Enables all metrics at all levels + TrackBucketSLO bool `yaml:"track_bucket_slo"` // Dedicated low-cardinality GET/LIST SLI metrics for Prometheus SLOs // === REQUEST METRICS === // Total requests @@ -120,6 +121,7 @@ func (c *MetricsConfig) ApplyShortcuts() { if c.TrackEverything { // Enable only detailed metrics - aggregations can be done in Prometheus queries // This is the most efficient approach with lowest cardinality + c.TrackBucketSLO = true c.TrackRequestsDetailed = true c.TrackRequestsByMethodDetailed = true c.TrackRequestsByOperationDetailed = true diff --git a/pkg/producers/opslog/metrics.go b/pkg/producers/opslog/metrics.go index 86069e4..17a3148 100644 --- a/pkg/producers/opslog/metrics.go +++ b/pkg/producers/opslog/metrics.go @@ -262,11 +262,11 @@ func (m *Metrics) ToJSON(metricsConfig *MetricsConfig) ([]byte, error) { if metricsConfig.TrackErrorsByIP { data["errors_per_ip"] = loadSyncMap(&m.ErrorsPerIP) } - + if metricsConfig.TrackTimeoutErrors { data["timeout_errors"] = loadSyncMap(&m.TimeoutErrors) } - + if metricsConfig.TrackErrorsByCategory { data["errors_by_category"] = loadSyncMap(&m.ErrorsByCategory) } @@ -282,6 +282,11 @@ func (m *Metrics) Update(logEntry S3OperationLog, metricsConfig *MetricsConfig) method := ExtractHTTPMethod(logEntry.URI) userStr, tenantStr := extractUserAndTenant(logEntry.User) + + if metricsConfig.TrackBucketSLO { + observeBucketSLI(logEntry, tenantStr) + } + if metricsConfig.TrackRequestsDetailed { key := logEntry.User + "|" + logEntry.Bucket + "|" + method + "|" + logEntry.HTTPStatus incrementSyncMap(&m.RequestsDetailed, key) diff --git a/pkg/producers/opslog/metrics_test.go b/pkg/producers/opslog/metrics_test.go index a2141e5..8f39488 100644 --- a/pkg/producers/opslog/metrics_test.go +++ b/pkg/producers/opslog/metrics_test.go @@ -4,6 +4,8 @@ import ( "sync/atomic" "testing" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" ) @@ -261,6 +263,80 @@ func TestMetricsUpdate_BasicFunctionality(t *testing.T) { assert.Equal(t, 1, latencyCallCount, "LatencyObs should be called once") } +func TestClassifyBucketSLOOperation(t *testing.T) { + testCases := []struct { + name string + operation string + expected SLOperation + ok bool + }{ + {name: "get object", operation: "get_obj", expected: SLOperationGet, ok: true}, + {name: "head object", operation: "head_obj", expected: SLOperationGet, ok: true}, + {name: "list bucket", operation: "list_bucket", expected: SLOperationList, ok: true}, + {name: "list buckets", operation: "list_buckets", expected: SLOperationList, ok: true}, + {name: "bucket info", operation: "get_bucket_info", expected: SLOperationList, ok: true}, + {name: "unsupported", operation: "put_obj", expected: "", ok: false}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual, ok := classifyBucketSLOOperation(tc.operation) + assert.Equal(t, tc.ok, ok) + assert.Equal(t, tc.expected, actual) + }) + } +} + +func TestStatusClass(t *testing.T) { + testCases := []struct { + name string + status string + expected string + }{ + {name: "success", status: "200", expected: "2xx"}, + {name: "server error", status: "503", expected: "5xx"}, + {name: "empty", status: "", expected: "unknown"}, + {name: "alpha", status: "ok", expected: "unknown"}, + {name: "leading whitespace", status: " 200", expected: "unknown"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, statusClass(tc.status)) + }) + } +} + +func TestMetricsUpdate_TrackBucketSLO(t *testing.T) { + config := &MetricsConfig{TrackBucketSLO: true} + logEntry := S3OperationLog{ + Bucket: "bucket-a", + User: "alice$tenant-a", + Operation: "get_obj", + HTTPStatus: "200", + TotalTime: 150, + } + + before := readCounterValue(t, sliRequestsTotal, "tenant-a", "bucket-a", "get", "2xx") + assert.NotPanics(t, func() { + NewMetrics().Update(logEntry, config) + }) + after := readCounterValue(t, sliRequestsTotal, "tenant-a", "bucket-a", "get", "2xx") + + assert.Equal(t, before+1, after) +} + +func readCounterValue(t *testing.T, counter *prometheus.CounterVec, labelValues ...string) float64 { + t.Helper() + + metric, err := counter.GetMetricWithLabelValues(labelValues...) + assert.NoError(t, err) + + dtoMetric := &dto.Metric{} + assert.NoError(t, metric.Write(dtoMetric)) + return dtoMetric.GetCounter().GetValue() +} + func TestMetricsUpdate_ErrorTracking(t *testing.T) { config := &MetricsConfig{ TrackErrorsDetailed: true, diff --git a/pkg/producers/opslog/prometheus.go b/pkg/producers/opslog/prometheus.go index 022968c..4cdffc0 100644 --- a/pkg/producers/opslog/prometheus.go +++ b/pkg/producers/opslog/prometheus.go @@ -51,6 +51,11 @@ func initPrometheusSettings(cfg *OpsLogConfig) { // Register latency metrics and set up LatencyObs function registerLatencyMetrics(metricsConfig) + // Register dedicated low-cardinality SLI metrics for bucket GET/LIST SLOs + if metricsConfig.TrackBucketSLO { + registerSLIMetrics() + } + // Set up the global LatencyObs function LatencyObs = latencyObs } diff --git a/pkg/producers/opslog/prometheus_sli.go b/pkg/producers/opslog/prometheus_sli.go new file mode 100644 index 0000000..7d887bc --- /dev/null +++ b/pkg/producers/opslog/prometheus_sli.go @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and prysm contributors +// +// SPDX-License-Identifier: Apache-2.0 + +package opslog + +import "github.com/prometheus/client_golang/prometheus" + +var ( + sliRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "radosgw_bucket_sli_requests_total", + Help: "Bucket GET/LIST requests for SLI and SLO evaluation", + }, + []string{"tenant", "bucket", "operation", "status_class"}, + ) + + sliRequestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "radosgw_bucket_sli_request_duration_seconds", + Help: "Latency histogram for bucket GET/LIST requests used for SLI and SLO evaluation", + Buckets: []float64{0.05, 0.1, 0.2, 0.3, 0.5, 1, 2, 5, 10}, + }, + []string{"tenant", "bucket", "operation"}, + ) +) + +func registerSLIMetrics() { + prometheus.MustRegister(sliRequestsTotal) + prometheus.MustRegister(sliRequestDuration) +} diff --git a/pkg/producers/opslog/sli.go b/pkg/producers/opslog/sli.go new file mode 100644 index 0000000..552e621 --- /dev/null +++ b/pkg/producers/opslog/sli.go @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and prysm contributors +// +// SPDX-License-Identifier: Apache-2.0 + +package opslog + +import "strings" + +type SLOperation string + +const ( + SLOperationGet SLOperation = "get" + SLOperationList SLOperation = "list" +) + +func classifyBucketSLOOperation(operation string) (SLOperation, bool) { + switch strings.ToLower(operation) { + case "get_obj", "head_obj": + return SLOperationGet, true + case "list_bucket", "list_buckets", "get_bucket_info": + return SLOperationList, true + default: + return "", false + } +} + +func statusClass(status string) string { + if len(status) == 0 { + return "unknown" + } + if status[0] < '0' || status[0] > '9' { + return "unknown" + } + return status[:1] + "xx" +} + +func observeBucketSLI(logEntry S3OperationLog, tenant string) { + sloOperation, ok := classifyBucketSLOOperation(logEntry.Operation) + if !ok || logEntry.Bucket == "" { + return + } + + sliRequestsTotal.WithLabelValues( + tenant, + logEntry.Bucket, + string(sloOperation), + statusClass(logEntry.HTTPStatus), + ).Inc() + + if logEntry.TotalTime > 0 { + sliRequestDuration.WithLabelValues( + tenant, + logEntry.Bucket, + string(sloOperation), + ).Observe(float64(logEntry.TotalTime) / 1000.0) + } +}