11#! /usr/bin/env bash
22# Shared health checks and trace verification for E2E tests.
33# Expects these variables to be set before sourcing:
4- # OPENSEARCH_USER, OPENSEARCH_PASSWORD, OPENSEARCH_PORT,
4+ # OPENSEARCH_USER, OPENSEARCH_PASSWORD,
55# OPENSEARCH_DASHBOARDS_PORT, OTEL_COLLECTOR_PORT_HTTP, PROMETHEUS_PORT
66set -euo pipefail
77
8- OPENSEARCH_URL=" https://localhost:${OPENSEARCH_PORT} "
9- CURL_OPTS=(-s -k -u " ${OPENSEARCH_USER} :${OPENSEARCH_PASSWORD} " )
# Query OpenSearch through the OpenSearch Dashboards (OSD) console proxy,
# routed via the local_cluster data source (dataSourceId), instead of hitting
# localhost:9200 directly. This exercises the configured
# OSD_DATASOURCE_ENDPOINT, so a misconfigured endpoint fails the test.
OSD_URL="http://localhost:${OPENSEARCH_DASHBOARDS_PORT}"
# Shared curl options for every OSD request: silent, basic auth, and the
# osd-xsrf header.
OSD_CURL_OPTS=(-s -u "${OPENSEARCH_USER}:${OPENSEARCH_PASSWORD}" -H "osd-xsrf: true")

# Attempt budget for the retry_check readiness probes; overridable from the
# environment.
HEALTH_CHECK_RETRIES="${HEALTH_CHECK_RETRIES:-30}"
# Separate, larger budget than HEALTH_CHECK_RETRIES: the local_cluster data
# source is seeded by the one-shot opensearch-dashboards-init container, which
# `--wait` does not block on, so it can appear well after OSD reports healthy.
DATASOURCE_RETRIES="${DATASOURCE_RETRIES:-180}"

# Resolved by resolve_datasource_id() before any osd_proxy call.
DATASOURCE_ID=""
22+
# Look up the id of the seeded `local_cluster` data source, polling once per
# second until the init container has created it.
# Usage: resolve_datasource_id <max_attempts>
# Globals: reads OSD_URL and OSD_CURL_OPTS; sets DATASOURCE_ID.
# Returns 0 as soon as an id is found. If no attempt yields an id (including a
# budget of 0), prints a FAIL line and exits the shell with status 1.
resolve_datasource_id() {
  local max="$1" attempt
  local find_url="${OSD_URL}/api/saved_objects/_find?type=data-source&fields=title&search=local_cluster&search_fields=title"

  for ((attempt = 1; attempt <= max; attempt++)); do
    # `|| true`: under `set -euo pipefail` a transient curl failure would
    # otherwise abort the whole script here instead of being retried.
    DATASOURCE_ID=$(curl "${OSD_CURL_OPTS[@]}" "$find_url" \
      | sed -n 's/.*"id":"\([^"]*\)".*/\1/p' | head -1) || true
    if [[ -n "$DATASOURCE_ID" ]]; then
      return 0
    fi
    # Do not sleep after the final attempt; fall through to the failure below.
    if ((attempt < max)); then
      sleep 1
    fi
  done

  # Reached when every attempt came back empty, and also when the budget was
  # 0 -- which must not be reported as success with an empty DATASOURCE_ID.
  echo "FAIL: local_cluster data source not found after ${max}s"
  exit 1
}
36+
# Proxy a request to OpenSearch through the OSD Dev Tools console proxy, routed
# through the data source currently selected in DATASOURCE_ID.
# Usage: osd_proxy <method> <opensearch-path> [extra curl args...]
#   <method>           verb applied against OpenSearch itself; the console
#                      proxy route is always POSTed to.
#   <opensearch-path>  inserted verbatim into the `path` query parameter, so
#                      the caller must URL-encode reserved characters.
# Globals: reads OSD_URL, OSD_CURL_OPTS and DATASOURCE_ID.
# Writes the response body to stdout and returns curl's exit status. Returns 1
# with a message on stderr, without calling curl, when DATASOURCE_ID is empty.
osd_proxy() {
  local method="$1" path="$2"
  shift 2

  # With an empty id the request would not be tied to any data source, which
  # defeats the purpose of this helper; fail instead of querying anyway.
  if [[ -z "${DATASOURCE_ID:-}" ]]; then
    echo "osd_proxy: DATASOURCE_ID is empty; run resolve_datasource_id first" >&2
    return 1
  fi

  curl "${OSD_CURL_OPTS[@]}" -X POST \
    "${OSD_URL}/api/console/proxy?method=${method}&path=${path}&dataSourceId=${DATASOURCE_ID}" "$@"
}
48+
# Check cluster health through the selected data source (DATASOURCE_ID).
# Requires an explicit green/yellow status, not merely "not red": a bad endpoint
# yields a non-JSON error body and thus an empty status, which must fail here.
# Prints the status on success, or a diagnostic plus the raw response on
# failure. Returns 0 for green/yellow, 1 for anything else.
check_cluster_health() {
  local health_body health

  # `|| true`: a transport failure must surface as an empty status (and so the
  # diagnostic + `return 1` below), not abort the script under `set -e` when
  # this function is called outside a conditional.
  health_body=$(osd_proxy GET "/_cluster/health") || true
  health=$(printf '%s\n' "$health_body" | sed -n 's/.*"status":"\([^"]*\)".*/\1/p')

  case "$health" in
    green | yellow)
      echo "    OpenSearch cluster health: $health"
      return 0
      ;;
  esac

  echo "    cluster health not green/yellow (via OSD data source)"
  echo "    Response: $health_body"
  return 1
}
1164
1265# Retry a curl check until it returns the expected HTTP status code.
1366# Usage: retry_check <label> <max_retries> <expected_status> <curl_args...>
1467retry_check () {
1568 local label=" $1 " max=" $2 " expected=" $3 "
1669 shift 3
70+ local status
1771 for i in $( seq 1 " $max " ) ; do
1872 status=$( curl -s -o /dev/null -w " %{http_code}" " $@ " ) && true
1973 [[ " $status " == " $expected " ]] && return 0
@@ -23,13 +77,23 @@ retry_check() {
2377}
2478
2579run_checks () {
26- echo " ==> Checking OpenSearch cluster health..."
27- health=$( curl " ${CURL_OPTS[@]} " " $OPENSEARCH_URL /_cluster/health" | sed -n ' s/.*"status":"\([^"]*\)".*/\1/p' )
28- if [[ " $health " == " red" ]]; then
29- echo " FAIL: OpenSearch cluster health is red"
80+ echo " ==> Checking OpenSearch Dashboards is up..."
81+ # OSD must be ready first: the OpenSearch checks below query through its
82+ # console proxy rather than hitting OpenSearch directly.
83+ retry_check " OpenSearch Dashboards" " $HEALTH_CHECK_RETRIES " " 200" \
84+ -u " ${OPENSEARCH_USER} :${OPENSEARCH_PASSWORD} " \
85+ " http://localhost:${OPENSEARCH_DASHBOARDS_PORT} /api/status"
86+ echo " OpenSearch Dashboards: OK"
87+
88+ echo " ==> Resolving local_cluster data source..."
89+ resolve_datasource_id " $DATASOURCE_RETRIES "
90+ echo " local_cluster data source: $DATASOURCE_ID "
91+
92+ echo " ==> Checking OpenSearch cluster health (via OSD data source)..."
93+ if ! check_cluster_health; then
94+ echo " FAIL: OpenSearch cluster health check failed"
3095 exit 1
3196 fi
32- echo " OpenSearch cluster health: $health "
3397
3498 echo " ==> Checking OTel Collector is accepting OTLP..."
3599 retry_check " OTel Collector" " $HEALTH_CHECK_RETRIES " " 200" \
@@ -45,12 +109,6 @@ run_checks() {
45109 " http://localhost:${PROMETHEUS_PORT} /ready"
46110 echo " Prometheus: OK"
47111
48- echo " ==> Checking OpenSearch Dashboards is up..."
49- retry_check " OpenSearch Dashboards" " $HEALTH_CHECK_RETRIES " " 200" \
50- -u " ${OPENSEARCH_USER} :${OPENSEARCH_PASSWORD} " \
51- " http://localhost:${OPENSEARCH_DASHBOARDS_PORT} /api/status"
52- echo " OpenSearch Dashboards: OK"
53-
54112 echo " ==> Sending test trace through OTel Collector..."
55113 trace_response=$( curl -s -w " \n%{http_code}" " http://localhost:${OTEL_COLLECTOR_PORT_HTTP} /v1/traces" \
56114 -H " Content-Type: application/json" \
@@ -77,14 +135,20 @@ run_checks() {
77135 fi
78136 echo " Test trace sent: OK"
79137
80- echo " ==> Verifying trace landed in OpenSearch..."
138+ echo " ==> Verifying trace landed in OpenSearch (via OSD data source) ..."
81139 TRACE_ID=" 5b8efff798038103d269b633813fc60c"
140+ # The index pattern is URL-encoded for the console proxy's path query param:
141+ # "*span*,*trace*" -> "%2Aspan%2A%2C%2Atrace%2A".
142+ SEARCH_PATH=" %2Aspan%2A%2C%2Atrace%2A/_search"
82143 MAX_RETRIES=90
83144 for i in $( seq 1 " $MAX_RETRIES " ) ; do
84- hits=$( curl " ${CURL_OPTS[@]} " " $OPENSEARCH_URL /*span*,*trace*/_search" \
145+ # `|| true` so a transient transport error (e.g. curl exit 7 if OSD blips
146+ # mid-loop) lets the retry loop continue rather than aborting under
147+ # `set -euo pipefail`. Mirrors retry_check's `&& true` defusing.
148+ hits=$( osd_proxy GET " $SEARCH_PATH " \
85149 -H " Content-Type: application/json" \
86150 -d " {\" query\" :{\" bool\" :{\" should\" :[{\" term\" :{\" traceId\" :\" $TRACE_ID \" }},{\" term\" :{\" traceID\" :\" $TRACE_ID \" }}]}}}" \
87- | sed -n ' s/.*"total":{"value":\([0-9]*\).*/\1/p' )
151+ | sed -n ' s/.*"total":{"value":\([0-9]*\).*/\1/p' ) || true
88152 if [[ " $hits " -gt 0 ]]; then
89153 echo " Trace found in OpenSearch after ${i} s"
90154 break
@@ -99,3 +163,74 @@ run_checks() {
99163 echo " "
100164 echo " ==> All E2E checks passed!"
101165}
166+
# Delete every data source returned by a saved-objects _find on title $1. Used
# to clear orphaned throwaway data sources left by a crashed prior run before
# recreating one, so reruns against a persistent stack are self-healing (the
# saved-objects API does not dedupe by title, so a stale object would otherwise
# accumulate silently).
# Usage: delete_datasources_by_title <title>
# Globals: reads OSD_URL and OSD_CURL_OPTS.
# Best effort: individual DELETE failures are ignored.
# NOTE(review): whether _find's `search` matches only exact titles is not
# visible here -- confirm before using this with a title shared by real data.
delete_datasources_by_title() {
  local title="$1" id

  # The ids are read on fd 3 so the DELETE calls keep the caller's stdin.
  while IFS= read -r id <&3; do
    [[ -n "$id" ]] || continue
    curl "${OSD_CURL_OPTS[@]}" -o /dev/null -X DELETE \
      "${OSD_URL}/api/saved_objects/data-source/${id}" || true
  done 3< <(
    # --get + --data-urlencode appends the title as a properly encoded
    # `search` parameter, so spaces, `&` or `#` in it cannot break the query.
    # `grep -o` extracts EVERY id in the response. A greedy `sed` would capture
    # only one match on the single-line JSON body, so multiple orphans would
    # not all be deleted.
    curl "${OSD_CURL_OPTS[@]}" --get \
      --data-urlencode "search=${title}" \
      "${OSD_URL}/api/saved_objects/_find?type=data-source&fields=title&search_fields=title" \
      | grep -o '"id":"[^"]*"' | sed 's/"id":"\([^"]*\)"/\1/'
  )
}
183+
# Negative-path regression test for OSD_DATASOURCE_ENDPOINT.
#
# Seeds a throwaway data source whose endpoint points at https://localhost:9200
# — the exact misconfiguration that breaks MDS-scoped OSD features when OSD runs
# inside the compose network (the container can't resolve `localhost` to
# OpenSearch). It then asserts that querying through that data source FAILS the
# health check. This guards against a regression where the health check stops
# detecting a bad endpoint (e.g. reverting to a "not red" check that lets an
# empty/error response slip through).
#
# Assumes run_checks has already passed, so OSD is up and DATASOURCE_ID is set.
# Globals: reads OSD_URL, OSD_CURL_OPTS, OPENSEARCH_USER, OPENSEARCH_PASSWORD;
# temporarily overrides DATASOURCE_ID and restores it before returning.
# Exits the shell with status 1 if the throwaway data source cannot be created
# or if the health check passes through it.
run_negative_checks() {
  echo "==> [negative] Verifying a bad data-source endpoint is caught..."
  local bad_id saved_id="$DATASOURCE_ID" bad_title="e2e_bad_endpoint"
  local create_payload health_passed

  # Self-heal: clear any orphan left by a crashed prior run before recreating,
  # so reruns against a persistent stack stay clean.
  delete_datasources_by_title "$bad_title"

  # Data source pointing at the host-only, in-container-unreachable URL.
  # NOTE(review): the credentials are inserted without JSON escaping; a `"` or
  # `\` in them would produce an invalid body.
  printf -v create_payload \
    '{"attributes":{"title":"%s","endpoint":"https://localhost:9200","auth":{"type":"username_password","credentials":{"username":"%s","password":"%s"}},"dataSourceVersion":"3.5.0","dataSourceEngineType":"OpenSearch"}}' \
    "$bad_title" "$OPENSEARCH_USER" "$OPENSEARCH_PASSWORD"

  # `|| true`: let a failed create reach the explicit FAIL below rather than
  # aborting silently under `set -euo pipefail`.
  bad_id=$(curl "${OSD_CURL_OPTS[@]}" -X POST \
    "${OSD_URL}/api/saved_objects/data-source" \
    -H "Content-Type: application/json" \
    -d "$create_payload" \
    | sed -n 's/.*"id":"\([^"]*\)".*/\1/p' | head -1) || true
  if [[ -z "$bad_id" ]]; then
    echo "FAIL: [negative] could not create throwaway data source"
    exit 1
  fi

  # Point the proxy at the bad data source; the health check must fail. The
  # `if` guard captures the result without `set -e` aborting on the expected
  # non-zero return.
  DATASOURCE_ID="$bad_id"
  if check_cluster_health >/dev/null 2>&1; then
    health_passed=1
  else
    health_passed=0
  fi

  # Restore the real data source and delete the throwaway one. Delete by the
  # captured id (not _find-by-title): the just-created object may not yet be
  # visible to _find due to saved-objects index refresh lag. No command between
  # create and here can abort under `set -e`, so inline cleanup needs no trap.
  DATASOURCE_ID="$saved_id"
  curl "${OSD_CURL_OPTS[@]}" -o /dev/null -X DELETE \
    "${OSD_URL}/api/saved_objects/data-source/${bad_id}" || true

  if ((health_passed)); then
    echo "FAIL: [negative] health check PASSED through a bad endpoint (should fail)"
    exit 1
  fi
  echo "    Bad data-source endpoint correctly rejected: OK"

  echo ""
  echo "==> Negative-path checks passed!"
}
0 commit comments