Skip to content
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 30 additions & 9 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,18 @@ import (
"crypto/tls"
"flag"
"fmt"
"log/slog"
"os"
"path/filepath"
"time"

"github.com/spf13/pflag"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"

intController "github.com/splunk/splunk-operator/internal/controller"
"github.com/splunk/splunk-operator/internal/controller/debug"
"github.com/splunk/splunk-operator/pkg/config"
"github.com/splunk/splunk-operator/pkg/logging"
"github.com/splunk/splunk-operator/pkg/splunk/enterprise/validation"
"sigs.k8s.io/controller-runtime/pkg/certwatcher"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
Expand All @@ -46,14 +46,15 @@ import (
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/certwatcher"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3"
enterpriseApi "github.com/splunk/splunk-operator/api/v4"
"github.com/splunk/splunk-operator/internal/controller"
//+kubebuilder:scaffold:imports
//extapi "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
)
Expand All @@ -77,8 +78,11 @@ func main() {
var enableLeaderElection bool
var probeAddr string
var pprofActive bool
var logEncoder string
var logLevel int

// Structured logging flags
var logLevel string
var logFormat string
var logAddSource bool

var leaseDuration time.Duration
var renewDeadline time.Duration
Expand All @@ -87,16 +91,18 @@ func main() {

var tlsOpts []func(*tls.Config)

// TLS certificate configuration for metrics
var metricsCertPath, metricsCertName, metricsCertKey string

pflag.StringVar(&logEncoder, "log-encoder", "json", "log encoding ('json' or 'console')")
pflag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
pflag.BoolVar(&enableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
pflag.BoolVar(&pprofActive, "pprof", true, "Enable pprof endpoint")
pflag.IntVar(&logLevel, "log-level", int(zapcore.InfoLevel), "set log level")

// Structured logging flags (can also be set via LOG_LEVEL, LOG_FORMAT, LOG_ADD_SOURCE env vars)
pflag.StringVar(&logLevel, "log-level", "", "log level: debug, info, warn, error (overrides LOG_LEVEL env var)")
pflag.StringVar(&logFormat, "log-format", "", "log output format: json, text (overrides LOG_FORMAT env var)")
pflag.BoolVar(&logAddSource, "log-add-source", false, "add source file:line to log output (overrides LOG_ADD_SOURCE env var)")
pflag.IntVar(&leaseDurationSecond, "lease-duration", leaseDurationSecond, "manager lease duration in seconds")
pflag.IntVar(&renewDeadlineSecond, "renew-duration", renewDeadlineSecond, "manager renew duration in seconds")
pflag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metrics endpoint binds to. "+
Expand Down Expand Up @@ -168,6 +174,21 @@ func main() {
renewDeadline = time.Duration(renewDeadlineSecond) * time.Second
}

// Initialize structured logging infrastructure
// Flags take precedence over environment variables
var addSourcePtr *bool
if logAddSource {
addSourcePtr = &logAddSource
}
logCfg := logging.LoadConfigWithFlags(logLevel, logFormat, addSourcePtr)
_ = logging.SetupLogger(logCfg)

// Log startup information using slog
slog.Info("Splunk Operator starting",
slog.String("log_level", logging.LevelToString(logCfg.Level)),
slog.String("log_format", logCfg.Format),
slog.Bool("log_add_source", logCfg.AddSource))

// Configure metrics certificate watcher if metrics certs are provided
var metricsCertWatcher *certwatcher.CertWatcher
if len(metricsCertPath) > 0 {
Expand Down Expand Up @@ -275,7 +296,7 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "Standalone")
os.Exit(1)
}
if err := (&controller.IngestorClusterReconciler{
if err := (&intController.IngestorClusterReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("ingestorcluster-controller"),
Expand Down
199 changes: 199 additions & 0 deletions docs/LoggingAndEvents.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Logging & Events
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add this to the "Reference" section of the github docs similar to the examples file: https://github.com/splunk/splunk-operator/blob/main/docs/Examples.md?plain=1#L4


## Logging

The operator uses Go's `log/slog` package for structured logging. The global logger is configured once at startup in `cmd/main.go` via `logging.SetupLogger()` and set as the default with `slog.SetDefault()`.

### Getting a Logger

| Where | How |
|---|---|
| **Controller `Reconcile`** | `logger := slog.Default().With("controller", "Name", "name", req.Name, "namespace", req.Namespace)` |
| **Business logic (pkg)** | `logger := logging.FromContext(ctx).With("func", "FunctionName")` |

Controllers inject the logger into context so downstream code can retrieve it:

```go
logger := slog.Default().With("controller", "Standalone", "name", req.Name, "namespace", req.Namespace)
ctx = logging.WithLogger(ctx, logger)
```

### Log Levels

| Level | Use for |
|---|---|
| `Debug` | Verbose diagnostics (state dumps, intermediate values) |
| `Info` | Normal operations (reconcile start, requeue, status changes) |
| `Warn` | Recoverable issues (deprecated config, fallback behavior) |
| `Error` | Failures that need attention (API errors, broken invariants) |

Configure at runtime via `LOG_LEVEL` env var or `--log-level` flag. Values: `debug`, `info`, `warn`, `error`.

### Writing Log Messages

**Message format:** short, lowercase, past tense or present participle. Describe *what happened*, not the function name.

```go
// Good
logger.InfoContext(ctx, "statefulset updated", "replicas", replicas)
logger.ErrorContext(ctx, "failed to fetch secret", "error", err, "secret", name)

// Bad - don't restate the function name or use generic messages
logger.InfoContext(ctx, "ApplyStandalone")
logger.InfoContext(ctx, "error occurred")
```

**Rules:**

1. Always use `*Context(ctx, ...)` variants (`InfoContext`, `ErrorContext`, etc.).
2. Always pass `"error", err` as a key-value pair — never as the message string.
3. Use consistent key names across the codebase:

| Key | Meaning |
|---|---|
| `"error"` | The `error` value |
| `"name"` | CR or resource name |
| `"namespace"` | Kubernetes namespace |
| `"controller"` | Controller name |
| `"func"` | Function name (in pkg layer) |
| `"replicas"` | Replica count |
| `"phase"` | CR phase |

4. **Don't log and return the same error.** controller-runtime automatically logs every non-nil error returned from `Reconcile()` via `log.Error(err, "Reconciler error")`. If you also log it in the business logic, the same error appears twice. Instead, wrap the error with context using `fmt.Errorf("description: %w", err)` so the single controller-runtime log line is descriptive:
```go
// Good — wrap and return, no explicit log needed
return result, fmt.Errorf("validate standalone spec: %w", err)

// Bad — double-logged: once here, once by controller-runtime
scopedLog.Error(err, "Failed to validate standalone spec")
return result, err
```
5. Sensitive data (passwords, tokens, secrets) is automatically redacted by the handler.

### Configuration

| Env Var | Flag | Default | Description |
|---|---|---|---|
| `LOG_LEVEL` | `--log-level` | `info` | Minimum log level |
| `LOG_FORMAT` | `--log-format` | `json` | `json` or `text` |
| `LOG_ADD_SOURCE` | `--log-add-source` | `false` | Include source file/line (auto-enabled at debug) |

Flags take precedence over env vars.

---

## Kubernetes Events

Events are user-facing signals visible via `kubectl describe`. Use them for significant state changes that an operator user should see — not for internal debugging.

### How It Works

1. Controllers pass the event recorder into context:
```go
ctx = context.WithValue(ctx, splcommon.EventRecorderKey, r.Recorder)
```
2. Business logic retrieves a publisher:
```go
eventPublisher := GetEventPublisher(ctx, cr)
```
3. Emit events using constants from `pkg/splunk/enterprise/event_reasons.go`:
```go
eventPublisher.Normal(ctx, EventReasonScaledUp,
fmt.Sprintf("Successfully scaled %s from %d to %d replicas", cr.GetName(), old, new))
eventPublisher.Warning(ctx, EventReasonValidateSpecFailed,
fmt.Sprintf("Spec validation failed for %s — check operator logs", cr.GetName()))
```

### Event Reason Constants

All event reasons are defined as constants in `pkg/splunk/enterprise/event_reasons.go`. **Never use string literals for event reasons** — always use the `EventReason*` constants. This ensures consistency across controllers and makes reasons searchable.

**Normal reasons:**

| Constant | Value | Use for |
|---|---|---|
| `EventReasonScaledUp` | `ScaledUp` | Successful scale up |
| `EventReasonScaledDown` | `ScaledDown` | Successful scale down |
| `EventReasonClusterInitialized` | `ClusterInitialized` | Cluster first becomes ready |
| `EventReasonClusterQuorumRestored` | `ClusterQuorumRestored` | Quorum recovered |
| `EventReasonPasswordSyncCompleted` | `PasswordSyncCompleted` | Secret sync finished |

**Warning reasons (common):**

| Constant | Value | Use for |
|---|---|---|
| `EventReasonValidateSpecFailed` | `ValidateSpecFailed` | CR spec validation errors |
| `EventReasonApplySplunkConfigFailed` | `ApplySplunkConfigFailed` | General config apply failures |
| `EventReasonAppFrameworkInitFailed` | `AppFrameworkInitFailed` | App framework init errors |
| `EventReasonApplyServiceFailed` | `ApplyServiceFailed` | Service create/update failures |
| `EventReasonStatefulSetFailed` | `StatefulSetFailed` | StatefulSet get failures |
| `EventReasonStatefulSetUpdateFailed` | `StatefulSetUpdateFailed` | StatefulSet update failures |
| `EventReasonDeleteFailed` | `DeleteFailed` | CR deletion failures |
| `EventReasonSecretMissing` | `SecretMissing` | Required secret not found |
| `EventReasonUpgradeCheckFailed` | `UpgradeCheckFailed` | Upgrade path validation errors |

See `event_reasons.go` for the full list.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a link to this file from the doc?


To add a new event reason: add a constant to `event_reasons.go`, then use it in the code.

### When to Use Events vs Logs

| Scenario | Log | Event |
|---|---|---|
| Reconcile start/requeue | Yes | No |
| API call failed (transient) | Yes | No |
| Spec validation failed | Yes | Yes (Warning) |
| Successful scale up/down | Yes | Yes (Normal) |
| Phase transition | Yes | Yes (Normal) |
| Security-related failure | Yes | Yes (Warning) |

### Writing Event Messages

**Reason:** always use an `EventReason*` constant — e.g. `EventReasonScaledUp`, `EventReasonValidateSpecFailed`.

**Message:** sentence case, user-friendly, include the resource name. **Never include raw `err.Error()` in events** — error details belong in logs only. Events are visible to any user with `kubectl describe` access and may leak internal paths, secret names, or stack traces. Instead, write a user-actionable summary and point to logs for details.

```go
// Good — uses constant, event gives a user-actionable summary, log has full error
logger.ErrorContext(ctx, "smartstore volume key validation failed", "error", err)
eventPublisher.Warning(ctx, EventReasonRemoteVolumeKeyCheckFailed,
fmt.Sprintf("Remote volume key change check failed for %s — check operator logs", cr.GetName()))

eventPublisher.Normal(ctx, EventReasonScaledUp,
fmt.Sprintf("Successfully scaled %s from %d to %d replicas", cr.GetName(), oldCount, newCount))

// Bad — string literal instead of constant
eventPublisher.Warning(ctx, "ValidationFailed", "...")

// Bad — leaking err.Error() into events
eventPublisher.Warning(ctx, EventReasonValidateSpecFailed,
fmt.Sprintf("Invalid smartstore config: %s", err.Error()))

// Bad — too vague, no context
eventPublisher.Warning(ctx, "Error", "something went wrong")
```

**Log + Event combo for errors:**

```go
err = validateStandaloneSpec(ctx, client, cr)
if err != nil {
// Log: full error for operator developers / debugging
logger.ErrorContext(ctx, "spec validation failed", "error", err)

// Event: user-actionable summary visible in kubectl describe
eventPublisher.Warning(ctx, EventReasonValidateSpecFailed,
fmt.Sprintf("Spec validation failed for %s — check operator logs", cr.GetName()))

return result, err
}
```

**Rules:**

1. Always use `EventReason*` constants — never string literals for event reasons.
2. Use `Normal` for successful operations the user should know about.
3. Use `Warning` for failures or degraded states that may need user intervention.
4. Always include the resource name in the message.
5. Never pass `err.Error()` into event messages — log the full error, keep events clean.
6. Don't emit events for routine internal operations — keep the event stream actionable.
11 changes: 6 additions & 5 deletions internal/controller/clustermanager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ package controller

import (
"context"
"log/slog"
"time"

enterpriseApi "github.com/splunk/splunk-operator/api/v4"
"github.com/splunk/splunk-operator/internal/controller/common"
"github.com/splunk/splunk-operator/pkg/logging"
splcommon "github.com/splunk/splunk-operator/pkg/splunk/common"

"github.com/pkg/errors"
Expand All @@ -37,7 +39,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)
Expand Down Expand Up @@ -79,8 +80,8 @@ func (r *ClusterManagerReconciler) Reconcile(ctx context.Context, req ctrl.Reque
metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "ClusterManager")).Inc()
defer recordInstrumentionData(time.Now(), req, "controller", "ClusterManager")

reqLogger := log.FromContext(ctx)
reqLogger = reqLogger.WithValues("clustermanager", req.NamespacedName)
logger := slog.Default().With("controller", "ClusterManager", "name", req.Name, "namespace", req.Namespace, "reconcileID", controller.ReconcileIDFromContext(ctx))
ctx = logging.WithLogger(ctx, logger)

// Fetch the ClusterManager
instance := &enterpriseApi.ClusterManager{}
Expand All @@ -105,14 +106,14 @@ func (r *ClusterManagerReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
}

reqLogger.Info("start", "CR version", instance.GetResourceVersion())
logger.InfoContext(ctx, "start", "CR version", instance.GetResourceVersion())
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: to keep it consistent across all logs, we could enforce a pattern for key naming and it could be crVersion instead of CR version


// Pass event recorder through context
ctx = context.WithValue(ctx, splcommon.EventRecorderKey, r.Recorder)

result, err := ApplyClusterManager(ctx, r.Client, instance, nil)
if result.Requeue && result.RequeueAfter != 0 {
reqLogger.Info("Requeued", "period(seconds)", int(result.RequeueAfter/time.Second))
logger.InfoContext(ctx, "Requeued", "period(seconds)", int(result.RequeueAfter/time.Second))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: to keep it consistent across all logs, we could enforce a pattern for key naming and it could be periodSeconds instead of period(seconds)

}

return result, err
Expand Down
Loading
Loading