Skip to content

Commit 51345e6

Browse files
authored
fix: default OSD_DATASOURCE_ENDPOINT to in-network host and verify via E2E (#299)
fix: default OSD_DATASOURCE_ENDPOINT to in-network host and verify via E2E (#299)

Signed-off-by: Ashish Agrawal <ashisagr@amazon.com>
1 parent 753f868 commit 51345e6

4 files changed

Lines changed: 164 additions & 27 deletions

File tree

.env

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ OPENSEARCH_PORT=9200
2424
OPENSEARCH_PROTOCOL=https
2525
OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g
2626
# Endpoint written into the `local_cluster` data-source saved object that the
27-
# init container seeds into OpenSearch. Point it at the host-reachable port
28-
# (`https://localhost:9200`, published by the compose file) when running
29-
# OpenSearch Dashboards on the host — the host-side OSD process cannot
30-
# resolve the docker-compose service name `opensearch`, so any MDS-scoped
31-
# OSD feature that dials this SO's endpoint would fail with
32-
# `getaddrinfo ENOTFOUND opensearch`. Leave blank/commented when OSD itself
33-
# runs inside the compose network.
34-
OSD_DATASOURCE_ENDPOINT=https://localhost:9200
27+
# init container seeds into OpenSearch. Defaults to the in-network endpoint
28+
# (`https://opensearch:9200`) so OSD running inside the compose network can
29+
# reach it. When running OpenSearch Dashboards on the host (local OSD dev
30+
# server), override this to `https://localhost:9200` — the host-side OSD
31+
# process cannot resolve the docker-compose service name `opensearch`, so any
32+
# MDS-scoped OSD feature that dials this SO's endpoint would otherwise fail
33+
# with `getaddrinfo ENOTFOUND opensearch`.
34+
OSD_DATASOURCE_ENDPOINT=${OPENSEARCH_PROTOCOL}://${OPENSEARCH_HOST}:${OPENSEARCH_PORT}
3535

3636
# OpenSearch Dashboards Configuration
3737
OPENSEARCH_DASHBOARDS_VERSION=3.7.0

test/checks.sh

Lines changed: 152 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,73 @@
11
#!/usr/bin/env bash
22
# Shared health checks and trace verification for E2E tests.
33
# Expects these variables to be set before sourcing:
4-
# OPENSEARCH_USER, OPENSEARCH_PASSWORD, OPENSEARCH_PORT,
4+
# OPENSEARCH_USER, OPENSEARCH_PASSWORD,
55
# OPENSEARCH_DASHBOARDS_PORT, OTEL_COLLECTOR_PORT_HTTP, PROMETHEUS_PORT
66
set -euo pipefail
77

8-
OPENSEARCH_URL="https://localhost:${OPENSEARCH_PORT}"
9-
CURL_OPTS=(-s -k -u "${OPENSEARCH_USER}:${OPENSEARCH_PASSWORD}")
8+
# Query OpenSearch through the OSD console proxy, routed via the local_cluster
9+
# data source (dataSourceId), instead of hitting localhost:9200 directly. This
10+
# exercises the configured OSD_DATASOURCE_ENDPOINT, so a misconfigured endpoint
11+
# fails the test.
12+
OSD_URL="http://localhost:${OPENSEARCH_DASHBOARDS_PORT}"
13+
OSD_CURL_OPTS=(-s -u "${OPENSEARCH_USER}:${OPENSEARCH_PASSWORD}" -H "osd-xsrf: true")
1014
HEALTH_CHECK_RETRIES="${HEALTH_CHECK_RETRIES:-30}"
15+
# Separate, larger budget than HEALTH_CHECK_RETRIES: the local_cluster data
16+
# source is seeded by the one-shot opensearch-dashboards-init container, which
17+
# `--wait` does not block on, so it can appear well after OSD reports healthy.
18+
DATASOURCE_RETRIES="${DATASOURCE_RETRIES:-180}"
19+
20+
# Resolved by resolve_datasource_id() before any osd_proxy call.
21+
DATASOURCE_ID=""
22+
23+
# Look up the id of the seeded `local_cluster` data source, retrying until the
24+
# init container has created it. Sets the global DATASOURCE_ID.
25+
resolve_datasource_id() {
26+
local max="$1"
27+
for i in $(seq 1 "$max"); do
28+
DATASOURCE_ID=$(curl "${OSD_CURL_OPTS[@]}" \
29+
"$OSD_URL/api/saved_objects/_find?type=data-source&fields=title&search=local_cluster&search_fields=title" \
30+
| sed -n 's/.*"id":"\([^"]*\)".*/\1/p' | head -1)
31+
[[ -n "$DATASOURCE_ID" ]] && return 0
32+
[[ "$i" -eq "$max" ]] && { echo "FAIL: local_cluster data source not found after ${max}s"; exit 1; }
33+
sleep 1
34+
done
35+
}
36+
37+
# Proxy a request to OpenSearch through the OSD Dev Tools console proxy, routed
38+
# through the `local_cluster` data source.
39+
# Usage: osd_proxy <method> <opensearch-path> [extra curl args...]
40+
# The console proxy route is always POSTed to; the <method> arg is the verb
41+
# applied against OpenSearch itself.
42+
osd_proxy() {
43+
local method="$1" path="$2"
44+
shift 2
45+
curl "${OSD_CURL_OPTS[@]}" -X POST \
46+
"$OSD_URL/api/console/proxy?method=${method}&path=${path}&dataSourceId=${DATASOURCE_ID}" "$@"
47+
}
48+
49+
# Check cluster health through the selected data source (DATASOURCE_ID).
50+
# Requires an explicit green/yellow status, not merely "not red": a bad endpoint
51+
# yields an error body with no "status":"..." field and thus an empty status, which must fail here.
52+
check_cluster_health() {
53+
local health_body health
54+
health_body=$(osd_proxy GET "/_cluster/health")
55+
health=$(echo "$health_body" | sed -n 's/.*"status":"\([^"]*\)".*/\1/p')
56+
if [[ "$health" != "green" && "$health" != "yellow" ]]; then
57+
echo " cluster health not green/yellow (via OSD data source)"
58+
echo " Response: $health_body"
59+
return 1
60+
fi
61+
echo " OpenSearch cluster health: $health"
62+
return 0
63+
}
1164

1265
# Retry a curl check until it returns the expected HTTP status code.
1366
# Usage: retry_check <label> <max_retries> <expected_status> <curl_args...>
1467
retry_check() {
1568
local label="$1" max="$2" expected="$3"
1669
shift 3
70+
local status
1771
for i in $(seq 1 "$max"); do
1872
status=$(curl -s -o /dev/null -w "%{http_code}" "$@") && true
1973
[[ "$status" == "$expected" ]] && return 0
@@ -23,13 +77,23 @@ retry_check() {
2377
}
2478

2579
run_checks() {
26-
echo "==> Checking OpenSearch cluster health..."
27-
health=$(curl "${CURL_OPTS[@]}" "$OPENSEARCH_URL/_cluster/health" | sed -n 's/.*"status":"\([^"]*\)".*/\1/p')
28-
if [[ "$health" == "red" ]]; then
29-
echo "FAIL: OpenSearch cluster health is red"
80+
echo "==> Checking OpenSearch Dashboards is up..."
81+
# OSD must be ready first: the OpenSearch checks below query through its
82+
# console proxy rather than hitting OpenSearch directly.
83+
retry_check "OpenSearch Dashboards" "$HEALTH_CHECK_RETRIES" "200" \
84+
-u "${OPENSEARCH_USER}:${OPENSEARCH_PASSWORD}" \
85+
"http://localhost:${OPENSEARCH_DASHBOARDS_PORT}/api/status"
86+
echo " OpenSearch Dashboards: OK"
87+
88+
echo "==> Resolving local_cluster data source..."
89+
resolve_datasource_id "$DATASOURCE_RETRIES"
90+
echo " local_cluster data source: $DATASOURCE_ID"
91+
92+
echo "==> Checking OpenSearch cluster health (via OSD data source)..."
93+
if ! check_cluster_health; then
94+
echo "FAIL: OpenSearch cluster health check failed"
3095
exit 1
3196
fi
32-
echo " OpenSearch cluster health: $health"
3397

3498
echo "==> Checking OTel Collector is accepting OTLP..."
3599
retry_check "OTel Collector" "$HEALTH_CHECK_RETRIES" "200" \
@@ -45,12 +109,6 @@ run_checks() {
45109
"http://localhost:${PROMETHEUS_PORT}/ready"
46110
echo " Prometheus: OK"
47111

48-
echo "==> Checking OpenSearch Dashboards is up..."
49-
retry_check "OpenSearch Dashboards" "$HEALTH_CHECK_RETRIES" "200" \
50-
-u "${OPENSEARCH_USER}:${OPENSEARCH_PASSWORD}" \
51-
"http://localhost:${OPENSEARCH_DASHBOARDS_PORT}/api/status"
52-
echo " OpenSearch Dashboards: OK"
53-
54112
echo "==> Sending test trace through OTel Collector..."
55113
trace_response=$(curl -s -w "\n%{http_code}" "http://localhost:${OTEL_COLLECTOR_PORT_HTTP}/v1/traces" \
56114
-H "Content-Type: application/json" \
@@ -77,14 +135,20 @@ run_checks() {
77135
fi
78136
echo " Test trace sent: OK"
79137

80-
echo "==> Verifying trace landed in OpenSearch..."
138+
echo "==> Verifying trace landed in OpenSearch (via OSD data source)..."
81139
TRACE_ID="5b8efff798038103d269b633813fc60c"
140+
# The index pattern is URL-encoded for the console proxy's path query param:
141+
# "*span*,*trace*" -> "%2Aspan%2A%2C%2Atrace%2A".
142+
SEARCH_PATH="%2Aspan%2A%2C%2Atrace%2A/_search"
82143
MAX_RETRIES=90
83144
for i in $(seq 1 "$MAX_RETRIES"); do
84-
hits=$(curl "${CURL_OPTS[@]}" "$OPENSEARCH_URL/*span*,*trace*/_search" \
145+
# `|| true` so a transient transport error (e.g. curl exit 7 if OSD blips
146+
# mid-loop) lets the retry loop continue rather than aborting under
147+
# `set -euo pipefail`. Mirrors retry_check's `&& true` defusing.
148+
hits=$(osd_proxy GET "$SEARCH_PATH" \
85149
-H "Content-Type: application/json" \
86150
-d "{\"query\":{\"bool\":{\"should\":[{\"term\":{\"traceId\":\"$TRACE_ID\"}},{\"term\":{\"traceID\":\"$TRACE_ID\"}}]}}}" \
87-
| sed -n 's/.*"total":{"value":\([0-9]*\).*/\1/p')
151+
| sed -n 's/.*"total":{"value":\([0-9]*\).*/\1/p') || true
88152
if [[ "$hits" -gt 0 ]]; then
89153
echo " Trace found in OpenSearch after ${i}s"
90154
break
@@ -99,3 +163,74 @@ run_checks() {
99163
echo ""
100164
echo "==> All E2E checks passed!"
101165
}
166+
167+
# Delete every data source whose title matches $1. Used to clear orphaned
168+
# throwaway data sources left by a crashed prior run before recreating one, so
169+
# reruns against a persistent stack are self-healing (the saved-objects API does
170+
# not dedupe by title, so a stale object would otherwise accumulate silently).
171+
delete_datasources_by_title() {
172+
local title="$1" id
173+
# `grep -o` extracts EVERY id in the response. A greedy `sed` would capture
174+
# only one match on the single-line JSON body, so multiple orphans would not
175+
# all be deleted.
176+
for id in $(curl "${OSD_CURL_OPTS[@]}" \
177+
"$OSD_URL/api/saved_objects/_find?type=data-source&fields=title&search=${title}&search_fields=title" \
178+
| grep -o '"id":"[^"]*"' | sed 's/"id":"\([^"]*\)"/\1/'); do
179+
curl "${OSD_CURL_OPTS[@]}" -o /dev/null -X DELETE \
180+
"$OSD_URL/api/saved_objects/data-source/${id}" || true
181+
done
182+
}
183+
184+
# Negative-path regression test for OSD_DATASOURCE_ENDPOINT.
185+
#
186+
# Seeds a throwaway data source whose endpoint points at https://localhost:9200
187+
# — the exact misconfiguration that breaks MDS-scoped OSD features when OSD runs
188+
# inside the compose network (there, `localhost` is the OSD container's own
189+
# loopback, not OpenSearch). It then asserts that querying through that data source FAILS the
190+
# health check. This guards against a regression where the health check stops
191+
# detecting a bad endpoint (e.g. reverting to a "not red" check that lets an
192+
# empty/error response slip through).
193+
#
194+
# Assumes run_checks has already passed, so OSD is up and DATASOURCE_ID is set.
195+
run_negative_checks() {
196+
echo "==> [negative] Verifying a bad data-source endpoint is caught..."
197+
local bad_id saved_id="$DATASOURCE_ID" bad_title="e2e_bad_endpoint" healthy
198+
199+
# Self-heal: clear any orphan left by a crashed prior run before recreating,
200+
# so reruns against a persistent stack stay clean.
201+
delete_datasources_by_title "$bad_title"
202+
203+
# Create a data source pointing at the host-only, in-container-unreachable URL.
204+
bad_id=$(curl "${OSD_CURL_OPTS[@]}" -X POST \
205+
"$OSD_URL/api/saved_objects/data-source" \
206+
-H "Content-Type: application/json" \
207+
-d "{\"attributes\":{\"title\":\"${bad_title}\",\"endpoint\":\"https://localhost:9200\",\"auth\":{\"type\":\"username_password\",\"credentials\":{\"username\":\"${OPENSEARCH_USER}\",\"password\":\"${OPENSEARCH_PASSWORD}\"}},\"dataSourceVersion\":\"3.5.0\",\"dataSourceEngineType\":\"OpenSearch\"}}" \
208+
| sed -n 's/.*"id":"\([^"]*\)".*/\1/p' | head -1)
209+
if [[ -z "$bad_id" ]]; then
210+
echo "FAIL: [negative] could not create throwaway data source"
211+
exit 1
212+
fi
213+
214+
# Point the proxy at the bad data source; the health check must fail. The
215+
# `if` guard captures the result without `set -e` aborting on the expected
216+
# non-zero return.
217+
DATASOURCE_ID="$bad_id"
218+
if check_cluster_health >/dev/null 2>&1; then healthy=0; else healthy=1; fi
219+
220+
# Restore the real data source and delete the throwaway one. Delete by the
221+
# captured id (not _find-by-title): the just-created object may not yet be
222+
# visible to _find due to saved-objects index refresh lag. No command between
223+
# create and here can abort under `set -e`, so inline cleanup needs no trap.
224+
DATASOURCE_ID="$saved_id"
225+
curl "${OSD_CURL_OPTS[@]}" -o /dev/null -X DELETE \
226+
"$OSD_URL/api/saved_objects/data-source/${bad_id}" || true
227+
228+
if [[ "$healthy" -eq 0 ]]; then
229+
echo "FAIL: [negative] health check PASSED through a bad endpoint (should fail)"
230+
exit 1
231+
fi
232+
echo " Bad data-source endpoint correctly rejected: OK"
233+
234+
echo ""
235+
echo "==> Negative-path checks passed!"
236+
}

test/e2e-install.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ printf '%s\n' "$INSTALL_DIR" "n" "n" "n" | bash "$PATCHED_SCRIPT" --skip-pull
3030
rm -f "$PATCHED_SCRIPT"
3131

3232
# Parse .env from the installed directory
33-
eval "$(grep -E '^(OPENSEARCH_USER|OPENSEARCH_PASSWORD|OPENSEARCH_PORT|OPENSEARCH_DASHBOARDS_PORT|OTEL_COLLECTOR_PORT_HTTP|PROMETHEUS_PORT)=' "$INSTALL_DIR/.env")"
33+
eval "$(grep -E '^(OPENSEARCH_USER|OPENSEARCH_PASSWORD|OPENSEARCH_DASHBOARDS_PORT|OTEL_COLLECTOR_PORT_HTTP|PROMETHEUS_PORT)=' "$INSTALL_DIR/.env")"
3434

3535
source "$SCRIPT_DIR/checks.sh"
3636
run_checks
37+
run_negative_checks

test/e2e.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ echo "==> Starting observability stack..."
1515
docker compose --project-directory "$PROJECT_DIR" up -d --wait --wait-timeout "$WAIT_TIMEOUT"
1616

1717
# Parse .env safely (don't source — some values aren't shell-safe)
18-
eval "$(grep -E '^(OPENSEARCH_USER|OPENSEARCH_PASSWORD|OPENSEARCH_PORT|OPENSEARCH_DASHBOARDS_PORT|OTEL_COLLECTOR_PORT_HTTP|PROMETHEUS_PORT)=' "$PROJECT_DIR/.env")"
18+
eval "$(grep -E '^(OPENSEARCH_USER|OPENSEARCH_PASSWORD|OPENSEARCH_DASHBOARDS_PORT|OTEL_COLLECTOR_PORT_HTTP|PROMETHEUS_PORT)=' "$PROJECT_DIR/.env")"
1919

2020
source "$SCRIPT_DIR/checks.sh"
2121
run_checks
22+
run_negative_checks

0 commit comments

Comments
 (0)