feat: add enterprise sizing knobs and sizing guide

kylehounslow · kylehounslow · commit a8b819a73425 · 2026-03-19T13:13:35.000-07:00
- OpenSearch: persistence.size, OPENSEARCH_JAVA_OPTS, storageClass
- Data Prepper: number_of_shards/number_of_replicas on opensearch sinks
- Prometheus: server.retention (15d default), persistentVolume options
- README: sizing guide with storage formula, shard rules, quick-reference
  profiles (dev/small team/enterprise)
diff --git a/charts/observability-stack/README.md b/charts/observability-stack/README.md
@@ -102,6 +102,61 @@ Configured via `scrapeConfigs` in `values.yaml`. Default K8s scrape jobs are dis
 
 > **Note:** Targets use the helm release name as prefix. The values in `values.yaml` are hardcoded to `obs-stack-*` — update them if you change the release name.
 
+## Sizing Guide
+
+The default values are tuned for development/demo (single-node OpenSearch, minimal resources). For production or enterprise-scale deployments, adjust the following knobs.
+
+### OpenSearch Cluster
+
+| Knob | Default | Production Guidance |
+|------|---------|---------------------|
+| `opensearch.replicas` | `1` | 3+ data nodes minimum for HA |
+| `opensearch.singleNode` | `true` | Set `false` for multi-node |
+| `opensearch.resources.requests.memory` | `2Gi` | 8–64Gi per node (JVM gets 50%) |
+| `opensearch.persistence.size` | `8Gi` | Size per formula below |
+| `opensearch.extraEnvs[OPENSEARCH_JAVA_OPTS]` | `-Xms1g -Xmx1g` | 50% of node RAM, max 31g |
+
+**Storage formula:**
+```
+storage_per_node = (daily_ingest_GB × 1.45 × (number_of_replicas + 1) × retention_days) / node_count
+```
+The 1.45× multiplier follows the AWS sizing guidance: indexing overhead (~10%), Linux reserved filesystem space (~5%), and headroom for segment merges, logs, and other internal operations (~20%), combined as 1.1 / 0.95 / 0.8 ≈ 1.45.
+
+**Shard sizing:**
+- Logs/traces (write-heavy): 30–50 GB per primary shard
+- Search (latency-sensitive): 10–30 GB per primary shard
+- Total shards should be a multiple of data node count
+- Max 25 shards per GB of JVM heap
+
+Shard count is configurable per Data Prepper pipeline sink via `number_of_shards` and `number_of_replicas` (commented out in `values.yaml`).
+
+### Data Prepper Pipeline Tuning
+
+| Knob | Default | Description |
+|------|---------|-------------|
+| `data-prepper.pipelineConfig.config.otel-logs-pipeline.workers` | `5` | Parallel log processing threads |
+| `...opensearch.number_of_shards` | (OpenSearch default: 1) | Primary shards per index |
+| `...opensearch.number_of_replicas` | (OpenSearch default: 1) | Replica shards per primary |
+| `...opensearch.bulk_size` | `5` (MiB) | Bulk request size to OpenSearch |
+
+### Prometheus
+
+| Knob | Default | Description |
+|------|---------|-------------|
+| `prometheus.server.retention` | `15d` | How long metrics are kept |
+| `prometheus.server.persistentVolume.enabled` | `false` | Enable for production |
+| `prometheus.server.persistentVolume.size` | `8Gi` | Disk for metrics TSDB |
+
+### Quick Reference: Sizing Profiles
+
+| Profile | OS Nodes | OS Memory | OS Disk | Prometheus Retention |
+|---------|----------|-----------|---------|---------------------|
+| **Dev/Demo** (default) | 1 | 2Gi | 8Gi | 15d |
+| **Small team** (~10 GB/day) | 3 | 8Gi | 100Gi | 30d |
+| **Enterprise** (~100 GB/day) | 6+ | 32Gi | 500Gi+ | 90d |
+
+Sources: [OpenSearch shard sizing](https://opensearch.org/blog/optimize-opensearch-index-shard-size/), [AWS sizing guide](https://docs.aws.amazon.com/prescriptive-guidance/latest/opensearch-service-migration/sizing.html), [AWS shard best practices](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/bp-sharding.html)
+
 ## Key Values
 
 See `values.yaml` for all options. Notable settings:
diff --git a/charts/observability-stack/values.yaml b/charts/observability-stack/values.yaml
@@ -2,6 +2,12 @@
 # Mirrors the docker-compose setup for Kubernetes deployment
 
 # -- OpenSearch
+# Sizing guide:
+#   Storage: (daily_ingest_GB × 1.45 × (number_of_replicas + 1) × retention_days) / node_count  (per node; sets persistence.size)
+#   Shards:  30–50 GB per shard for logs/traces, 10–30 GB for search
+#   JVM:     50% of node RAM, max ~31 GB (set via OPENSEARCH_JAVA_OPTS)
+#   Nodes:   minimum 3 for production, 1 for dev/demo
+#   Heap-to-shard ratio: max 25 shards per GB of JVM heap
 opensearch:
   enabled: true
   singleNode: true
@@ -13,9 +19,16 @@ opensearch:
     requests:
       memory: "2Gi"
       cpu: "500m"
+  persistence:
+    enabled: true
+    size: 8Gi                    # Increase for production (e.g. 100Gi, 500Gi)
+    # storageClass: "gp3"       # Uncomment for AWS gp3 (better IOPS/$ than gp2)
   extraEnvs:
     - name: OPENSEARCH_INITIAL_ADMIN_PASSWORD
       value: "My_password_123!@#"
+    # JVM heap — set to 50% of resources.requests.memory, max 31g
+    - name: OPENSEARCH_JAVA_OPTS
+      value: "-Xms1g -Xmx1g"
   config:
     opensearch.yml: |
       plugins.query.datasources.encryption.masterkey: "BTqK4Ytdz67La1kShIKV3Pu9"
@@ -135,6 +148,10 @@ data-prepper:
               password: "My_password_123!@#"
               insecure: true
               index_type: log-analytics-plain
+              # Shard tuning — adjust for ingest volume:
+              #   Target ~30–50 GB per primary shard for logs; keep total shards a multiple of the data node count.
+              # number_of_shards: 1
+              # number_of_replicas: 1
 
       otel-traces-pipeline:
         delay: 100
@@ -160,6 +177,8 @@ data-prepper:
               password: "My_password_123!@#"
               insecure: true
               index_type: trace-analytics-plain-raw
+              # number_of_shards: 1
+              # number_of_replicas: 1
 
       service-map-pipeline:
         delay: 100
@@ -277,8 +296,11 @@ opentelemetry-collector:
 prometheus:
   enabled: true
   server:
+    # Retention — how long Prometheus keeps metrics. Increase for longer history.
+    retention: "15d"
     persistentVolume:
-      enabled: false
+      enabled: false               # Enable for production (so metrics data survives pod restarts)
+      # size: 50Gi                 # Example production size (chart default: 8Gi)
     extraFlags:
       - "web.enable-remote-write-receiver"
       - "web.enable-otlp-receiver"