Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/nats-io/nats-server/v2 v2.12.4
github.com/nats-io/nats.go v1.49.0
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/client_model v0.6.2
github.com/rs/zerolog v1.34.0
github.com/sapcc/go-api-declarations v1.20.0
github.com/sapcc/go-bits v0.0.0-20260227115921-50512683ebc4
Expand Down Expand Up @@ -45,7 +46,6 @@ require (
github.com/nats-io/nuid v1.0.1 // indirect
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/procfs v0.20.1 // indirect
github.com/rabbitmq/amqp091-go v1.10.0 // indirect
Expand Down
37 changes: 23 additions & 14 deletions pkg/commands/producer_ops_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (

// Shortcut config
opsTrackEverything bool
opsTrackBucketSLO bool

// Request metrics flags
opsTrackRequestsDetailed bool
Expand Down Expand Up @@ -78,13 +79,13 @@ var (
opsTrackBytesReceivedPerTenant bool

// Error metrics flags
opsTrackErrorsDetailed bool
opsTrackErrorsPerUser bool
opsTrackErrorsPerBucket bool
opsTrackErrorsPerTenant bool
opsTrackErrorsPerStatus bool
opsTrackErrorsByIP bool
opsTrackTimeoutErrors bool
opsTrackErrorsDetailed bool
opsTrackErrorsPerUser bool
opsTrackErrorsPerBucket bool
opsTrackErrorsPerTenant bool
opsTrackErrorsPerStatus bool
opsTrackErrorsByIP bool
opsTrackTimeoutErrors bool
opsTrackErrorsByCategory bool

// IP-based metrics flags
Expand Down Expand Up @@ -148,6 +149,7 @@ Following this configuration change, the RadosGW will log operations to the file
MetricsConfig: opslog.MetricsConfig{
// Shortcut config
TrackEverything: opsTrackEverything,
TrackBucketSLO: opsTrackBucketSLO,

// Request metrics
TrackRequestsDetailed: opsTrackRequestsDetailed,
Expand Down Expand Up @@ -187,13 +189,13 @@ Following this configuration change, the RadosGW will log operations to the file
TrackBytesReceivedPerTenant: opsTrackBytesReceivedPerTenant,

// Error metrics
TrackErrorsDetailed: opsTrackErrorsDetailed,
TrackErrorsPerUser: opsTrackErrorsPerUser,
TrackErrorsPerBucket: opsTrackErrorsPerBucket,
TrackErrorsPerTenant: opsTrackErrorsPerTenant,
TrackErrorsPerStatus: opsTrackErrorsPerStatus,
TrackTimeoutErrors: opsTrackTimeoutErrors,
TrackErrorsByCategory: opsTrackErrorsByCategory,
TrackErrorsDetailed: opsTrackErrorsDetailed,
TrackErrorsPerUser: opsTrackErrorsPerUser,
TrackErrorsPerBucket: opsTrackErrorsPerBucket,
TrackErrorsPerTenant: opsTrackErrorsPerTenant,
TrackErrorsPerStatus: opsTrackErrorsPerStatus,
TrackTimeoutErrors: opsTrackTimeoutErrors,
TrackErrorsByCategory: opsTrackErrorsByCategory,

// IP-based metrics
TrackRequestsByIPDetailed: opsTrackRequestsByIPDetailed,
Expand Down Expand Up @@ -293,6 +295,11 @@ func debugTrackingConfig(event *zerolog.Event, config opslog.MetricsConfig) {
return // Don't add individual flags if everything is enabled
}

if config.TrackBucketSLO {
event.Bool("track_bucket_slo", true)
totalEnabled++
}

// Request tracking
requestMetrics := []string{}
if config.TrackRequestsDetailed {
Expand Down Expand Up @@ -564,6 +571,7 @@ func mergeOpsLogConfigWithEnv(cfg opslog.OpsLogConfig) opslog.OpsLogConfig {

// Shortcut config
cfg.MetricsConfig.TrackEverything = getEnvBool("TRACK_EVERYTHING", cfg.MetricsConfig.TrackEverything)
cfg.MetricsConfig.TrackBucketSLO = getEnvBool("TRACK_BUCKET_SLO", cfg.MetricsConfig.TrackBucketSLO)

// Request metrics environment variables
cfg.MetricsConfig.TrackRequestsDetailed = getEnvBool("TRACK_REQUESTS_DETAILED", cfg.MetricsConfig.TrackRequestsDetailed)
Expand Down Expand Up @@ -662,6 +670,7 @@ func init() {

// Shortcut flag
opsLogCmd.Flags().BoolVar(&opsTrackEverything, "track-everything", false, "Enable detailed tracking for all metric types (efficient mode)")
opsLogCmd.Flags().BoolVar(&opsTrackBucketSLO, "track-bucket-slo", false, "Track low-cardinality bucket GET/LIST SLI metrics for Prometheus SLOs")

// Essential request metrics (most commonly used)
opsLogCmd.Flags().BoolVar(&opsTrackRequestsDetailed, "track-requests-detailed", false, "Track detailed requests with full labels")
Expand Down
16 changes: 16 additions & 0 deletions pkg/producers/opslog/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ prysm local-producer ops-log [flags]
existing data.
- `--track-everything` - Enable detailed tracking for all metric types
(efficient mode).
- `--track-bucket-slo` - Track low-cardinality bucket GET/LIST SLI metrics for
Prometheus SLOs.
- `--track-timeout-errors` - Enable tracking of timeout errors (408, 504, 598,
499) for OSD issue detection.
- `--track-errors-by-category` - Enable error categorization (timeout,
Expand Down Expand Up @@ -141,6 +143,7 @@ prysm local-producer ops-log \
| `IGNORE_ANONYMOUS_REQUESTS` | Ignore anonymous requests in metrics. |
| `TRUNCATE_LOG_ON_START` | Whether to rotate the log file on startup. |
| `TRACK_EVERYTHING` | Enable detailed tracking for all metric types. |
| `TRACK_BUCKET_SLO` | Track low-cardinality bucket GET/LIST SLI metrics. |
| `AUDIT_ENABLED` | Enable RabbitMQ audit trail publishing. |
| `AUDIT_RABBITMQ_URL` | RabbitMQ connection URL. |
| `AUDIT_QUEUE_NAME` | RabbitMQ queue name for audit events. |
Expand Down Expand Up @@ -237,6 +240,12 @@ prysm local-producer ops-log \
| `TRACK_LATENCY_PER_METHOD` | Track latency aggregated per HTTP method. |
| `TRACK_LATENCY_PER_BUCKET_AND_METHOD` | Track latency by bucket and method combination. |

#### SLI Tracking Environment Variables:

| Variable | Description |
|-----------------------------------------------|----------------------------------------------------------------|
| `TRACK_BUCKET_SLO` | Track low-cardinality bucket GET/LIST request SLI metrics for Prometheus SLOs. |

## Metrics Collected

### Request Counters
Expand Down Expand Up @@ -339,6 +348,13 @@ prysm local-producer ops-log \
| `radosgw_requests_duration_per_method` | Histogram | `method` | Histogram for request latencies aggregated per method (global). |
| `radosgw_requests_duration_per_bucket_and_method` | Histogram | `tenant`, `bucket`, `method` | Histogram for request latencies aggregated per bucket and method (all users combined). |

### Bucket SLI Metrics

| Metric Name | Type | Labels | Description |
|-----------------------------------------------|-----------|---------------------------------------------|--------------------------------------------------------------------|
| `radosgw_bucket_sli_requests_total` | Counter | `tenant`, `bucket`, `operation`, `status_class` | Low-cardinality bucket SLI request counter for GET/LIST-style operations, labeled by response class such as `2xx` or `5xx`. |
| `radosgw_bucket_sli_request_duration_seconds` | Histogram | `tenant`, `bucket`, `operation` | Latency histogram in seconds for bucket GET/LIST SLI operations, intended for Prometheus SLO evaluation. |

> **Note**: Histogram metrics do **not** include the `pod` label to reduce
> cardinality. Each histogram automatically provides `_bucket`, `_count`, and
> `_sum` metrics for comprehensive latency analysis.
Expand Down
4 changes: 3 additions & 1 deletion pkg/producers/opslog/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ type AuditSinkConfig struct {
RabbitMQURL string `mapstructure:"rabbitmq_url"`
QueueName string `mapstructure:"queue_name"`
InternalQueueSize int `mapstructure:"internal_queue_size"` // Optional, defaults to 20
Debug bool `mapstructure:"debug"` // Log published events
Debug bool `mapstructure:"debug"` // Log published events
}

type OpsLogConfig struct {
Expand Down Expand Up @@ -38,6 +38,7 @@ type OpsLogConfig struct {
type MetricsConfig struct {
// === SHORTCUT CONFIGS ===
TrackEverything bool `yaml:"track_everything"` // Enables all metrics at all levels
TrackBucketSLO bool `yaml:"track_bucket_slo"` // Dedicated low-cardinality GET/LIST SLI metrics for Prometheus SLOs

// === REQUEST METRICS ===
// Total requests
Expand Down Expand Up @@ -120,6 +121,7 @@ func (c *MetricsConfig) ApplyShortcuts() {
if c.TrackEverything {
// Enable only detailed metrics - aggregations can be done in Prometheus queries
// This is the most efficient approach with lowest cardinality
c.TrackBucketSLO = true
c.TrackRequestsDetailed = true
c.TrackRequestsByMethodDetailed = true
c.TrackRequestsByOperationDetailed = true
Expand Down
9 changes: 7 additions & 2 deletions pkg/producers/opslog/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,11 @@ func (m *Metrics) ToJSON(metricsConfig *MetricsConfig) ([]byte, error) {
if metricsConfig.TrackErrorsByIP {
data["errors_per_ip"] = loadSyncMap(&m.ErrorsPerIP)
}

if metricsConfig.TrackTimeoutErrors {
data["timeout_errors"] = loadSyncMap(&m.TimeoutErrors)
}

if metricsConfig.TrackErrorsByCategory {
data["errors_by_category"] = loadSyncMap(&m.ErrorsByCategory)
}
Expand All @@ -282,6 +282,11 @@ func (m *Metrics) Update(logEntry S3OperationLog, metricsConfig *MetricsConfig)

method := ExtractHTTPMethod(logEntry.URI)
userStr, tenantStr := extractUserAndTenant(logEntry.User)

if metricsConfig.TrackBucketSLO {
observeBucketSLI(logEntry, tenantStr)
}

if metricsConfig.TrackRequestsDetailed {
key := logEntry.User + "|" + logEntry.Bucket + "|" + method + "|" + logEntry.HTTPStatus
incrementSyncMap(&m.RequestsDetailed, key)
Expand Down
76 changes: 76 additions & 0 deletions pkg/producers/opslog/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import (
"sync/atomic"
"testing"

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
)

Expand Down Expand Up @@ -261,6 +263,80 @@ func TestMetricsUpdate_BasicFunctionality(t *testing.T) {
assert.Equal(t, 1, latencyCallCount, "LatencyObs should be called once")
}

func TestClassifyBucketSLOOperation(t *testing.T) {
testCases := []struct {
name string
operation string
expected SLOperation
ok bool
}{
{name: "get object", operation: "get_obj", expected: SLOperationGet, ok: true},
{name: "head object", operation: "head_obj", expected: SLOperationGet, ok: true},
{name: "list bucket", operation: "list_bucket", expected: SLOperationList, ok: true},
{name: "list buckets", operation: "list_buckets", expected: SLOperationList, ok: true},
{name: "bucket info", operation: "get_bucket_info", expected: SLOperationList, ok: true},
{name: "unsupported", operation: "put_obj", expected: "", ok: false},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
actual, ok := classifyBucketSLOOperation(tc.operation)
assert.Equal(t, tc.ok, ok)
assert.Equal(t, tc.expected, actual)
})
}
}

func TestStatusClass(t *testing.T) {
testCases := []struct {
name string
status string
expected string
}{
{name: "success", status: "200", expected: "2xx"},
{name: "server error", status: "503", expected: "5xx"},
{name: "empty", status: "", expected: "unknown"},
{name: "alpha", status: "ok", expected: "unknown"},
{name: "leading whitespace", status: " 200", expected: "unknown"},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
assert.Equal(t, tc.expected, statusClass(tc.status))
})
}
}

func TestMetricsUpdate_TrackBucketSLO(t *testing.T) {
config := &MetricsConfig{TrackBucketSLO: true}
logEntry := S3OperationLog{
Bucket: "bucket-a",
User: "alice$tenant-a",
Operation: "get_obj",
HTTPStatus: "200",
TotalTime: 150,
}

before := readCounterValue(t, sliRequestsTotal, "tenant-a", "bucket-a", "get", "2xx")
assert.NotPanics(t, func() {
NewMetrics().Update(logEntry, config)
})
after := readCounterValue(t, sliRequestsTotal, "tenant-a", "bucket-a", "get", "2xx")

assert.Equal(t, before+1, after)
}

func readCounterValue(t *testing.T, counter *prometheus.CounterVec, labelValues ...string) float64 {
t.Helper()

metric, err := counter.GetMetricWithLabelValues(labelValues...)
assert.NoError(t, err)

dtoMetric := &dto.Metric{}
assert.NoError(t, metric.Write(dtoMetric))
return dtoMetric.GetCounter().GetValue()
}

func TestMetricsUpdate_ErrorTracking(t *testing.T) {
config := &MetricsConfig{
TrackErrorsDetailed: true,
Expand Down
5 changes: 5 additions & 0 deletions pkg/producers/opslog/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ func initPrometheusSettings(cfg *OpsLogConfig) {
// Register latency metrics and set up LatencyObs function
registerLatencyMetrics(metricsConfig)

// Register dedicated low-cardinality SLI metrics for bucket GET/LIST SLOs
if metricsConfig.TrackBucketSLO {
registerSLIMetrics()
}

// Set up the global LatencyObs function
LatencyObs = latencyObs
}
Expand Down
31 changes: 31 additions & 0 deletions pkg/producers/opslog/prometheus_sli.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and prysm contributors
//
// SPDX-License-Identifier: Apache-2.0

package opslog

import "github.com/prometheus/client_golang/prometheus"

var (
sliRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "radosgw_bucket_sli_requests_total",
Help: "Bucket GET/LIST requests for SLI and SLO evaluation",
},
[]string{"tenant", "bucket", "operation", "status_class"},
)

sliRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "radosgw_bucket_sli_request_duration_seconds",
Help: "Latency histogram for bucket GET/LIST requests used for SLI and SLO evaluation",
Buckets: []float64{0.05, 0.1, 0.2, 0.3, 0.5, 1, 2, 5, 10},
},
[]string{"tenant", "bucket", "operation"},
)
)

func registerSLIMetrics() {
prometheus.MustRegister(sliRequestsTotal)
prometheus.MustRegister(sliRequestDuration)
}
57 changes: 57 additions & 0 deletions pkg/producers/opslog/sli.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and prysm contributors
//
// SPDX-License-Identifier: Apache-2.0

package opslog

import "strings"

type SLOperation string

const (
SLOperationGet SLOperation = "get"
SLOperationList SLOperation = "list"
)

func classifyBucketSLOOperation(operation string) (SLOperation, bool) {
switch strings.ToLower(operation) {
case "get_obj", "head_obj":
return SLOperationGet, true
case "list_bucket", "list_buckets", "get_bucket_info":
return SLOperationList, true
default:
return "", false
}
}

func statusClass(status string) string {
if len(status) == 0 {
return "unknown"
}
if status[0] < '0' || status[0] > '9' {
return "unknown"
}
return status[:1] + "xx"
}
Comment on lines +27 to +35
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Restrict status classes to valid HTTP ranges (1xx5xx)

statusClass currently turns any leading digit into a class, so malformed statuses can produce 0xx/6xx9xx. That degrades metric quality and can skew SLO queries expecting valid HTTP classes.

💡 Proposed fix
 func statusClass(status string) string {
 	if len(status) == 0 {
 		return "unknown"
 	}
-	if status[0] < '0' || status[0] > '9' {
+	if status[0] < '1' || status[0] > '5' {
 		return "unknown"
 	}
 	return status[:1] + "xx"
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
func statusClass(status string) string {
if len(status) == 0 {
return "unknown"
}
if status[0] < '0' || status[0] > '9' {
return "unknown"
}
return status[:1] + "xx"
}
func statusClass(status string) string {
if len(status) == 0 {
return "unknown"
}
if status[0] < '1' || status[0] > '5' {
return "unknown"
}
return status[:1] + "xx"
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@pkg/producers/opslog/sli.go` around lines 27 - 35, statusClass currently maps
any leading digit to a class, allowing invalid HTTP classes like 0xx or 6xx–9xx;
update statusClass to only accept leading digits '1' through '5' and return
"unknown" otherwise. Locate the statusClass function and change its digit check
from a generic numeric range to explicitly allow '1' <= status[0] <= '5',
keeping the existing empty and non-digit guards, and return status[:1] + "xx"
only when the leading digit is within 1–5.


func observeBucketSLI(logEntry S3OperationLog, tenant string) {
sloOperation, ok := classifyBucketSLOOperation(logEntry.Operation)
if !ok || logEntry.Bucket == "" {
return
}

sliRequestsTotal.WithLabelValues(
tenant,
logEntry.Bucket,
string(sloOperation),
statusClass(logEntry.HTTPStatus),
).Inc()

if logEntry.TotalTime > 0 {
sliRequestDuration.WithLabelValues(
tenant,
logEntry.Bucket,
string(sloOperation),
).Observe(float64(logEntry.TotalTime) / 1000.0)
}
}