Skip to content

Commit 35e1c3f

Browse files
authored
feat: add self-monitoring to docker-compose (opensearch-exporter, pipeline + OpenSearch health dashboards) (opensearch-project#107)
1 parent 3233e6f commit 35e1c3f

5 files changed

Lines changed: 368 additions & 0 deletions

File tree

docker-compose.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,34 @@ services:
137137
memory: ${PROMETHEUS_MEMORY_LIMIT}
138138
logging: *logging
139139

140+
# OpenSearch Prometheus Exporter - Exposes OpenSearch metrics for Prometheus scraping
# Uses the Elasticsearch exporter image; metrics are published under the
# elasticsearch_* names that the OpenSearch health dashboard queries.
opensearch-exporter:
  image: prometheuscommunity/elasticsearch-exporter:v1.10.0
  container_name: opensearch-exporter
  command:
    - --es.uri=${OPENSEARCH_PROTOCOL}://${OPENSEARCH_HOST}:${OPENSEARCH_PORT}
    # TLS certificate verification is disabled for the connection to OpenSearch.
    # NOTE(review): presumably because the stack uses self-signed certs — confirm.
    - --es.ssl-skip-verify
    # Collect stats for all nodes, plus index-level and shard-level stats.
    - --es.all
    - --es.indices
    - --es.shards
  environment:
    # Credentials are passed via environment so they stay out of the command line.
    - ES_USERNAME=${OPENSEARCH_USER}
    - ES_PASSWORD=${OPENSEARCH_PASSWORD}
  ports:
    - "9114:9114"
  networks:
    - observability-stack-network
  depends_on:
    opensearch:
      condition: service_healthy
      # Do not fail startup when the opensearch service is not part of the project.
      required: false
  restart: unless-stopped
  deploy:
    resources:
      limits:
        # Overridable like the other services' limits; defaults to the previous fixed value.
        memory: ${OPENSEARCH_EXPORTER_MEMORY_LIMIT:-128M}
  logging: *logging
167+
140168
# OpenSearch Dashboards Initialization - Creates workspace, index patterns, and saved queries
141169
opensearch-dashboards-init:
142170
image: python:3.11-alpine
@@ -157,6 +185,8 @@ services:
157185
- ./docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py:/init.py
158186
- ./docker-compose/opensearch-dashboards/saved-queries-traces.yaml:/config/saved-queries-traces.yaml
159187
- ./docker-compose/opensearch-dashboards/saved-queries-metrics.yaml:/config/saved-queries-metrics.yaml
188+
- ./docker-compose/opensearch-dashboards/dashboard-opensearch-health.yaml:/config/dashboard-opensearch-health.yaml
189+
- ./docker-compose/opensearch-dashboards/dashboard-pipeline-health.yaml:/config/dashboard-pipeline-health.yaml
160190
- ./docker-compose/opensearch-dashboards/init/architecture.png:/config/architecture.png
161191
networks:
162192
- observability-stack-network
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# OpenSearch Cluster Health Dashboard — cluster, shard, index, JVM, search, and CPU metrics.
# Every panel is a PromQL query over elasticsearch_* series
# (presumably scraped from the opensearch-exporter service — confirm in the Prometheus config).

dashboard:
  id: opensearch-cluster-health-dashboard
  title: OpenSearch Cluster Health
  description: Cluster status, index stats, JVM health, and indexing performance

panels:
  # Cluster status and shard allocation
  - id: os-cluster-status
    title: 'Cluster Status (0=green, 1=yellow/red)'
    # One series per color label; `or` returns both because their label sets differ.
    query: 'elasticsearch_cluster_health_status{color="yellow"} or elasticsearch_cluster_health_status{color="red"}'
    chartType: line

  - id: os-active-shards
    title: 'Active Shards'
    query: 'elasticsearch_cluster_health_active_shards'
    chartType: line

  - id: os-unassigned-shards
    title: 'Unassigned Shards'
    query: 'elasticsearch_cluster_health_unassigned_shards'
    chartType: line

  # Document count, indexing throughput, and storage
  - id: os-docs-count
    title: 'Total Documents'
    query: 'sum(elasticsearch_indices_docs)'
    chartType: line

  - id: os-indexing-rate
    title: 'Indexing Rate (docs/sec)'
    query: 'rate(elasticsearch_indices_indexing_index_total[5m])'
    chartType: line

  - id: os-store-size
    title: 'Store Size (bytes)'
    query: 'sum(elasticsearch_indices_store_size_bytes_total)'
    chartType: line

  # JVM health
  - id: os-jvm-heap-used-pct
    title: 'JVM Heap Used %'
    query: '100 * elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}'
    chartType: line

  - id: os-jvm-gc-rate
    title: 'JVM GC Collection Rate (sec/sec)'
    query: 'rate(elasticsearch_jvm_gc_collection_seconds_sum[5m])'
    chartType: line

  # Search load and CPU
  - id: os-search-rate
    title: 'Search Rate (queries/sec)'
    query: 'rate(elasticsearch_indices_search_query_total[5m])'
    chartType: line

  - id: os-cpu-percent
    title: 'OpenSearch CPU %'
    query: 'elasticsearch_os_cpu_percent'
    chartType: line
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Observability Pipeline Health Dashboard — self-monitoring for the OTel Collector,
# Prometheus, and Data Prepper (DP) pipelines.
# Sections below group related panels; they are not numbered rows because some
# sections hold three panels.

dashboard:
  id: observability-pipeline-health-dashboard
  title: Observability Pipeline Health
  description: OTel Collector throughput, Prometheus ingestion, and pipeline health

panels:
  # OTel Collector — span throughput
  - id: pipeline-otel-spans-received
    title: 'OTel Spans Received/sec'
    query: 'rate(otelcol_receiver_accepted_spans_total[5m])'
    chartType: line

  - id: pipeline-otel-spans-exported
    title: 'OTel Spans Exported/sec'
    query: 'rate(otelcol_exporter_sent_spans_total[5m])'
    chartType: line

  # OTel Collector — metric intake and export failures
  - id: pipeline-otel-metrics-received
    title: 'OTel Metrics Received/sec'
    query: 'rate(otelcol_receiver_accepted_metric_points_total[5m])'
    chartType: line

  - id: pipeline-otel-spans-dropped
    title: 'OTel Spans Dropped/sec'
    query: 'rate(otelcol_exporter_send_failed_spans_total[5m])'
    chartType: line

  # OTel Collector — queue and process resources
  - id: pipeline-otel-queue-size
    title: 'OTel Exporter Queue Size'
    query: 'otelcol_exporter_queue_size'
    chartType: line

  - id: pipeline-otel-collector-memory
    title: 'OTel Collector Memory (bytes)'
    query: 'otelcol_process_memory_rss_bytes'
    chartType: line

  - id: pipeline-otel-collector-cpu
    title: 'OTel Collector CPU Usage'
    query: 'rate(otelcol_process_cpu_seconds_total[5m])'
    chartType: line

  - id: pipeline-otel-batch-cardinality
    title: 'OTel Batch Metadata Cardinality'
    query: 'otelcol_processor_batch_metadata_cardinality'
    chartType: line

  # Prometheus — ingestion and active series
  - id: pipeline-prometheus-ingestion
    title: 'Prometheus Ingestion Rate (chunks/sec)'
    query: 'rate(prometheus_tsdb_head_chunks_created_total[5m])'
    chartType: line

  - id: pipeline-prometheus-active-series
    title: 'Prometheus Active Time Series'
    query: 'prometheus_tsdb_head_series'
    chartType: line

  # Prometheus — storage and query latency
  - id: pipeline-prometheus-wal-size
    title: 'Prometheus WAL Size (bytes)'
    query: 'prometheus_tsdb_wal_storage_size_bytes'
    chartType: line

  - id: pipeline-prometheus-head-chunks
    title: 'Prometheus Head Chunks Size (bytes)'
    query: 'prometheus_tsdb_head_chunks_storage_size_bytes'
    chartType: line

  - id: pipeline-prometheus-query-latency
    title: 'Prometheus Query Latency P99 (sec)'
    query: 'histogram_quantile(0.99, rate(prometheus_http_request_duration_seconds_bucket{handler="/api/v1/query"}[5m]))'
    chartType: line

  # Data Prepper — logs pipeline
  - id: pipeline-dp-logs-processed
    title: 'DP Logs Processed/sec'
    query: 'rate(otel_logs_pipeline_recordsProcessed_total[5m])'
    chartType: line

  - id: pipeline-dp-logs-latency
    title: 'DP Logs Pipeline Latency (avg sec)'
    # Average latency = rate of the latency sum divided by rate of the sample count.
    query: 'rate(otel_logs_pipeline_opensearch_PipelineLatency_seconds_sum[5m]) / rate(otel_logs_pipeline_opensearch_PipelineLatency_seconds_count[5m])'
    chartType: line

  # Data Prepper — traces pipeline
  - id: pipeline-dp-traces-processed
    title: 'DP Traces Processed/sec'
    query: 'rate(otel_traces_pipeline_recordsProcessed_total[5m])'
    chartType: line

  - id: pipeline-dp-traces-latency
    title: 'DP Traces Pipeline Latency (avg sec)'
    query: 'rate(traces_raw_pipeline_opensearch_PipelineLatency_seconds_sum[5m]) / rate(traces_raw_pipeline_opensearch_PipelineLatency_seconds_count[5m])'
    chartType: line

  # Data Prepper — OTLP request intake
  - id: pipeline-dp-metrics-received
    title: 'DP Metrics Received/sec'
    # NOTE(review): the series name suggests this counts OTLP metric requests,
    # not individual metric points — confirm the title matches.
    query: 'rate(otlp_metrics_requestsReceived_total[5m])'
    chartType: line

  - id: pipeline-dp-otlp-requests
    title: 'DP OTLP Requests Received/sec (all)'
    query: 'rate(otlp_traces_requestsReceived_total[5m]) + rate(otlp_logs_requestsReceived_total[5m]) + rate(otlp_metrics_requestsReceived_total[5m])'
    chartType: line

  # Data Prepper — documents written to OpenSearch
  - id: pipeline-dp-logs-docs-written
    title: 'DP Logs Docs Written/sec'
    query: 'rate(otel_logs_pipeline_opensearch_documentsSuccess_total[5m])'
    chartType: line

  - id: pipeline-dp-traces-docs-written
    title: 'DP Traces Docs Written/sec'
    query: 'rate(traces_raw_pipeline_opensearch_documentsSuccess_total[5m])'
    chartType: line

  # Data Prepper — bulk errors and buffers
  - id: pipeline-dp-bulk-errors
    title: 'DP Bulk Request Errors'
    query: 'sum(rate(otel_logs_pipeline_opensearch_bulkRequestErrors_total[5m])) + sum(rate(traces_raw_pipeline_opensearch_bulkRequestErrors_total[5m]))'
    chartType: line

  - id: pipeline-dp-buffer-usage
    title: 'DP Buffer Writes/sec'
    query: 'rate(otel_logs_pipeline_BlockingBuffer_recordsWritten_total[5m]) + rate(otel_traces_pipeline_BlockingBuffer_recordsWritten_total[5m])'
    chartType: line

  - id: pipeline-dp-buffer-capacity
    title: 'DP Buffer Capacity Used %'
    # NOTE(review): adds the capacityUsed gauges of three buffers; if each gauge
    # is a percentage, the sum can exceed 100 — confirm this is intended.
    query: 'otlp_pipeline_BlockingBuffer_capacityUsed + otel_logs_pipeline_BlockingBuffer_capacityUsed + otel_traces_pipeline_BlockingBuffer_capacityUsed'
    chartType: line

docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -950,6 +950,119 @@ def create_chart_visualization(workspace_id, vis_id, title, vis_type, field, ind
950950
return None
951951

952952

953+
def create_promql_dashboard_from_yaml(workspace_id, config_path, prometheus_datasource_title="ObservabilityStack_Prometheus"):
    """Create a dashboard with PromQL explore panels from a YAML config file.

    The file is read as a mapping with an optional ``dashboard`` section
    (``id``, ``title``, ``description``) and a ``panels`` list whose entries
    each provide ``id``, ``title`` and ``query``. Each panel is saved as an
    ``explore`` saved object; the dashboard is then deleted and recreated so
    that it references the saved panels two per row, in YAML order.

    Args:
        workspace_id: Target workspace id. A falsy value or ``"default"``
            selects the non-workspace saved-objects endpoints.
        config_path: Path to the YAML dashboard definition.
        prometheus_datasource_title: Used as both ``id`` and ``title`` of the
            dataset attached to every panel query, and as the id of each
            panel's index-pattern reference.

    Returns:
        The dashboard id when the dashboard was created, otherwise ``None``
        (config unreadable or malformed, no panel saved, or the dashboard
        request failed). Problems are reported via ``print`` and never raised.
    """
    import json

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except (OSError, yaml.YAMLError) as e:
        # OSError also covers a path that is a directory or unreadable,
        # not only a missing file.
        print(f"⚠️ Skipping dashboard from {config_path}: {e}")
        return None

    # safe_load returns None for an empty file and a list/scalar for other
    # shapes; anything but a mapping cannot be used below.
    if not isinstance(config, dict):
        print(f"⚠️ Skipping dashboard from {config_path}: top-level YAML value is not a mapping")
        return None

    # `or` guards against keys that are present but empty (parsed as None).
    dashboard_config = config.get("dashboard") or {}
    panel_defs = config.get("panels") or []
    dashboard_id = dashboard_config.get("id", "promql-dashboard")
    dashboard_title = dashboard_config.get("title", "PromQL Dashboard")

    print(f"📊 Creating {dashboard_title} dashboard ({len(panel_defs)} panels)...")

    # Shared by every panel: the visualization settings, serialized once.
    viz_template = json.dumps({
        "title": "", "chartType": "line",
        "params": {
            "addLegend": True, "addTimeMarker": False, "legendPosition": "bottom",
            "legendTitle": "", "lineMode": "straight", "lineStyle": "line", "lineWidth": 2,
            "showFullTimeRange": False, "standardAxes": [],
            "thresholdOptions": {"baseColor": "#00BD6B", "thresholds": [], "thresholdStyle": "off"},
            "titleOptions": {"show": False, "titleName": ""},
            "tooltipOptions": {"mode": "all"}
        },
        "axesMapping": {"color": "Series", "x": "Time", "y": "Value"}
    })

    dataset = {
        "id": prometheus_datasource_title, "title": prometheus_datasource_title,
        "type": "PROMETHEUS", "language": "PROMQL", "timeFieldName": "Time",
        "dataSource": {}, "signalType": "metrics"
    }

    use_workspace = bool(workspace_id) and workspace_id != "default"
    if use_workspace:
        api_base = f"{BASE_URL}/w/{workspace_id}/api/saved_objects"
    else:
        api_base = f"{BASE_URL}/api/saved_objects"
    json_headers = {"Content-Type": "application/json", "osd-xsrf": "true"}

    created_ids = []
    for panel_def in panel_defs:
        # A malformed entry is skipped so one bad panel cannot abort the rest.
        if not isinstance(panel_def, dict) or any(key not in panel_def for key in ("id", "title", "query")):
            print(f" ⚠️ Skipping malformed panel (needs id, title, query): {panel_def!r}")
            continue

        panel_id = panel_def["id"]
        panel_title = panel_def["title"]
        search_source = json.dumps({
            "query": {"query": panel_def["query"], "language": "PROMQL", "dataset": dataset},
            "filter": [], "indexRefName": "kibanaSavedObjectMeta.searchSourceJSON.index"
        })
        payload = {
            "attributes": {
                "title": panel_title, "description": "", "hits": 0,
                "columns": ["_source"], "sort": [], "version": 1, "type": "metrics",
                "visualization": viz_template,
                "uiState": json.dumps({"activeTab": "explore_visualization_tab"}),
                "kibanaSavedObjectMeta": {"searchSourceJSON": search_source}
            },
            "references": [{"name": "kibanaSavedObjectMeta.searchSourceJSON.index", "type": "index-pattern", "id": prometheus_datasource_title}]
        }
        if use_workspace:
            payload["workspaces"] = [workspace_id]
        url = f"{api_base}/explore/{panel_id}"
        try:
            response = requests.post(url, auth=(USERNAME, PASSWORD), headers=json_headers, json=payload, verify=False, timeout=10)
            if response.status_code == 200:
                created_ids.append(panel_id)
                print(f" ✅ {panel_title}")
            elif response.status_code == 409:
                # The panel already exists: overwrite its attributes in place.
                update = requests.put(url, auth=(USERNAME, PASSWORD), headers=json_headers, json={"attributes": payload["attributes"], "references": payload["references"]}, verify=False, timeout=10)
                # The saved object exists either way, so the dashboard can
                # still reference it; only the log line depends on the update.
                created_ids.append(panel_id)
                if update.status_code == 200:
                    print(f" 🔄 {panel_title} (updated)")
                else:
                    print(f" ⚠️ {panel_title}: update failed, keeping existing panel: {update.status_code} {update.text[:100]}")
            else:
                print(f" ⚠️ {panel_title}: {response.status_code} {response.text[:100]}")
        except requests.exceptions.RequestException as e:
            print(f" ⚠️ {panel_title}: {e}")

    if not created_ids:
        print("⚠️ No panels created, skipping dashboard")
        return None

    # Two panels per row: each is 24 grid units wide and 15 high.
    panels = []
    references = []
    for i, pid in enumerate(created_ids):
        panels.append({"version": "3.6.0", "panelIndex": pid, "gridData": {"i": pid, "x": (i % 2) * 24, "y": (i // 2) * 15, "w": 24, "h": 15}, "panelRefName": f"panel_{i}"})
        references.append({"name": f"panel_{i}", "type": "explore", "id": pid})

    dashboard_payload = {
        "attributes": {
            "title": dashboard_title,
            "description": dashboard_config.get("description", ""),
            "panelsJSON": json.dumps(panels),
            "optionsJSON": json.dumps({"useMargins": True, "hidePanelTitles": False}),
            "timeRestore": False,
            "kibanaSavedObjectMeta": {"searchSourceJSON": json.dumps({})}
        },
        "references": references
    }
    if use_workspace:
        dashboard_payload["workspaces"] = [workspace_id]
    url = f"{api_base}/dashboard/{dashboard_id}"
    try:
        # Always delete and recreate the dashboard so panel order matches YAML.
        # The delete result is deliberately ignored: the dashboard may not exist yet.
        requests.delete(url, auth=(USERNAME, PASSWORD), headers={"osd-xsrf": "true"}, verify=False, timeout=10)
        response = requests.post(url, auth=(USERNAME, PASSWORD), headers=json_headers, json=dashboard_payload, verify=False, timeout=10)
        if response.status_code == 200:
            print(f"✅ Created {dashboard_title} dashboard ({len(created_ids)} panels)")
            return dashboard_id
        else:
            print(f"⚠️ Dashboard creation failed: {response.text[:200]}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error creating dashboard: {e}")
        return None
1064+
1065+
9531066
def create_overview_dashboard(workspace_id):
9541067
"""Create an overview landing dashboard with markdown links to all observability features"""
9551068
import json
@@ -1172,6 +1285,10 @@ def main():
11721285
# Create overview landing dashboard (becomes the new default)
11731286
create_overview_dashboard(workspace_id)
11741287

1288+
# Create self-monitoring dashboards (PromQL explore panels)
1289+
create_promql_dashboard_from_yaml(workspace_id, "/config/dashboard-pipeline-health.yaml")
1290+
create_promql_dashboard_from_yaml(workspace_id, "/config/dashboard-opensearch-health.yaml")
1291+
11751292
# Create saved queries for common agent observability patterns
11761293
create_default_saved_queries(workspace_id)
11771294

0 commit comments

Comments
 (0)