Skip to content

Commit 554f997

Browse files
committed
add check for restorebackup kvdump
1 parent ab20fb4 commit 554f997

1 file changed

Lines changed: 166 additions & 7 deletions

File tree

.github/workflows/splunkconf-backup-test.yml

Lines changed: 166 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,6 @@ jobs:
927927
# -------------------------------------------------------
928928
# Step 15: Collect Splunk logs from backup operations (always, for debugging any failure)
929929
# -------------------------------------------------------
930-
931930
- name: Collect Splunk logs
932931
if: always()
933932
run: |
@@ -1329,10 +1328,10 @@ jobs:
13291328
13301329
if [ "$HEALTH" = "healthy" ]; then
13311330
echo "Splunk is healthy after $((RETRIES * RETRY_INTERVAL)) seconds."
1332-
WAITSEC=300
1333-
echo "sleeping ${WAITSEC}s to wait for splunk restore to finish"
1334-
# temp - to be replaced by check
1335-
sleep $WAITSEC
1331+
#WAITSEC=300
1332+
#echo "sleeping ${WAITSEC}s to wait for splunk restore to finish"
1333+
## temp - to be replaced by check
1334+
#sleep $WAITSEC
13361335
break
13371336
fi
13381337
@@ -1369,10 +1368,170 @@ jobs:
13691368
fi
13701369
13711370
# -------------------------------------------------------
1372-
# Step 21: Collect Splunk logs from backup operations (always, for debugging any failure)
1371+
# Step 21: Wait for Splunk to restore kvdump
13731372
# -------------------------------------------------------
1373+
- name: Wait for Splunk to restore kvdump
1374+
run: |
1375+
# should run only if Splunk is up from the previous step
1376+
# task to repeat every 30s until max seconds (about 300s)
1377+
# search in Splunk via the REST API (source=*splunkconf-backup.log sourcetype=splunkd) for result lines such as:
1378+
# 06-05-2026 11:22:33.169 +0000 splunkconf-restore INFO id=1778066430 action=restorebackup type=local COUNTER=97 currentversion=10.2, minimalversionover=7 MGMTURL=0.0.0.0:8089 KVARCHIVE=backupconfsplunk-kvdump-toberestored.tar.gz object=kvdump dest=/opt/splunk/var/lib/splunk/kvstorebackup/backupconfsplunk-kvdump-toberestored.tar.gz result=success kvstore online (kvdump) restore complete
1379+
# result=failure means the restore failed and the test should error (it can fail because we didn't get an OK status from the kvstore API, or for another reason such as no disk space to restore initially); it is produced by fail_log "action=restorebackup type=${TYPE} object=${OBJECT} result=failure dest=$FIC reason=${ERROR_MESS} ${MESS1}"
1380+
# do not use result=running lines; they will be followed by a success or failure at the end of the loop (the restore script calls the Splunk REST API to restore the kvdump, then loops to fetch the kvstore restore status every x seconds)
1381+
# if UF or kvstore is disabled then the log will be different (either we reuse the info from the previous step and know there was no kvdump, or we fetch the correct status from the logs)
1382+
# in that case the log line will start with echo_log "action=restorebackup type=$TYPE object=${OBJECT} result=disabled ..."
1383+
echo "=== Checking kvdump restore status ==="
1384+
echo "Splunk type: ${{ matrix.splunk_type }}"
1385+
echo "Context type: ${{ matrix.context_type }}"
1386+
echo "IS_UF: ${IS_UF}"
13741387
1375-
- name: Collect Splunk logs
1388+
# ------------------------------------------------------------
1389+
# Skip conditions: UF or kvstore_disabled => no kvdump expected
1390+
# ------------------------------------------------------------
1391+
if [ "${IS_UF}" = "1" ]; then
1392+
echo "✅ Universal Forwarder: no kvdump restore expected. Skipping."
1393+
exit 0
1394+
fi
1395+
1396+
if [ "${{ matrix.context_type }}" = "kvstore_disabled" ]; then
1397+
echo "✅ ¸ kvstore_disabled context: no kvdump restore expected."
1398+
echo "Verifying that splunkconf-backup logged result=disabled (informational)..."
1399+
docker exec --user ${SPLUNK_USER} splunk bash -c \
1400+
"grep -E 'action=restorebackup.*object=kvdump.*result=disabled' \
1401+
${SPLUNK_HOME}/var/log/splunk/splunkconf-backup.log 2>/dev/null | tail -5" \
1402+
|| echo "(no result=disabled line found yet, but not fatal)"
1403+
exit 0
1404+
fi
1405+
1406+
# ------------------------------------------------------------
1407+
# Polling parameters
1408+
# ------------------------------------------------------------
1409+
MAX_WAIT=300 # total max wait in seconds
1410+
INTERVAL=30 # seconds between polls
1411+
ELAPSED=0
1412+
KV_RESULT="" # final captured result: success | failure | disabled
1413+
KV_LINE="" # the matching log line (for reporting)
1414+
1415+
LOG_PATH="${SPLUNK_HOME}/var/log/splunk/splunkconf-backup.log"
1416+
1417+
echo "Polling ${LOG_PATH} every ${INTERVAL}s (max ${MAX_WAIT}s)..."
1418+
echo "Looking for: action=restorebackup ... object=kvdump ... result=(success|failure|disabled)"
1419+
echo ""
1420+
1421+
# ------------------------------------------------------------
1422+
# Try Splunk REST API first (search), fall back to grep on the
1423+
# log file inside the container if the search returns nothing.
1424+
# We accept either path: whichever surfaces a terminal result.
1425+
# ------------------------------------------------------------
1426+
while [ $ELAPSED -lt $MAX_WAIT ]; do
1427+
1428+
# ---- (a) Try via Splunk REST search ----
1429+
SEARCH_QUERY='search index=_internal source=*splunkconf-backup.log action=restorebackup object=kvdump (result=success OR result=failure OR result=disabled) | head 5'
1430+
1431+
SEARCH_RESULTS=$(docker exec --user ${SPLUNK_USER} splunk curl -s -k \
1432+
-u admin:${{ env.SPLUNK_PASSWORD }} \
1433+
https://localhost:8089/services/search/jobs/export \
1434+
-d "search=${SEARCH_QUERY}" \
1435+
-d "output_mode=json" \
1436+
-d "earliest_time=-1h" \
1437+
-d "latest_time=now" \
1438+
2>/dev/null || echo "")
1439+
1440+
# Extract the most recent matching raw line, if any
1441+
REST_LINE=$(echo "$SEARCH_RESULTS" \
1442+
| grep -oE '"_raw"[[:space:]]*:[[:space:]]*"[^"]*action=restorebackup[^"]*object=kvdump[^"]*result=(success|failure|disabled)[^"]*"' \
1443+
| head -n 1 || true)
1444+
1445+
# ---- (b) Fall back to direct log grep inside container ----
1446+
FILE_LINE=$(docker exec --user ${SPLUNK_USER} splunk bash -c \
1447+
"grep -E 'action=restorebackup.*object=kvdump.*result=(success|failure|disabled)' \
1448+
'${LOG_PATH}' 2>/dev/null | tail -n 1" || true)
1449+
1450+
# Prefer the file line (most authoritative & easiest to parse);
1451+
# fall back to REST line if file is empty.
1452+
CANDIDATE_LINE=""
1453+
if [ -n "$FILE_LINE" ]; then
1454+
CANDIDATE_LINE="$FILE_LINE"
1455+
elif [ -n "$REST_LINE" ]; then
1456+
CANDIDATE_LINE="$REST_LINE"
1457+
fi
1458+
1459+
if [ -n "$CANDIDATE_LINE" ]; then
1460+
# Extract the result= token (success|failure|disabled)
1461+
MATCHED=$(echo "$CANDIDATE_LINE" \
1462+
| grep -oE 'result=(success|failure|disabled)' \
1463+
| head -n 1 \
1464+
| cut -d= -f2)
1465+
1466+
if [ -n "$MATCHED" ]; then
1467+
KV_RESULT="$MATCHED"
1468+
KV_LINE="$CANDIDATE_LINE"
1469+
echo " [${ELAPSED}s] Found terminal kvdump restore status: result=${KV_RESULT}"
1470+
break
1471+
fi
1472+
fi
1473+
1474+
echo " [${ELAPSED}s/${MAX_WAIT}s] No terminal kvdump restore status yet (still running or not started). Retrying in ${INTERVAL}s..."
1475+
sleep $INTERVAL
1476+
ELAPSED=$((ELAPSED + INTERVAL))
1477+
done
1478+
1479+
# ------------------------------------------------------------
1480+
# Evaluate result
1481+
# ------------------------------------------------------------
1482+
echo ""
1483+
echo "============================================"
1484+
echo "kvdump restore final evaluation"
1485+
echo "============================================"
1486+
1487+
if [ -z "$KV_RESULT" ]; then
1488+
echo "❌ ERROR: Did not observe a terminal kvdump restore status (success/failure/disabled) within ${MAX_WAIT}s."
1489+
echo ""
1490+
echo "--- Last 50 lines of splunkconf-backup.log ---"
1491+
docker exec --user ${SPLUNK_USER} splunk bash -c \
1492+
"tail -50 '${LOG_PATH}' 2>/dev/null" || true
1493+
echo ""
1494+
echo "--- Any kvdump-related lines (incl. running) ---"
1495+
docker exec --user ${SPLUNK_USER} splunk bash -c \
1496+
"grep -E 'object=kvdump' '${LOG_PATH}' 2>/dev/null | tail -20" || true
1497+
exit 1
1498+
fi
1499+
1500+
echo "Matching log line:"
1501+
echo " ${KV_LINE}"
1502+
echo ""
1503+
1504+
case "$KV_RESULT" in
1505+
success)
1506+
echo "✅ kvdump restore SUCCESS."
1507+
exit 0
1508+
;;
1509+
disabled)
1510+
# We don't expect 'disabled' here because UF / kvstore_disabled
1511+
# were short-circuited above. If we do see it, it's unexpected.
1512+
echo "❌¸ kvdump restore reported result=disabled, but this context expected an actual restore."
1513+
echo " Treating as failure for this matrix combination."
1514+
exit 1
1515+
;;
1516+
failure)
1517+
echo "❌ kvdump restore FAILED."
1518+
echo ""
1519+
echo "--- Recent kvdump-related log lines ---"
1520+
docker exec --user ${SPLUNK_USER} splunk bash -c \
1521+
"grep -E 'object=kvdump' '${LOG_PATH}' 2>/dev/null | tail -20" || true
1522+
exit 1
1523+
;;
1524+
*)
1525+
echo "❌ Unexpected result token: ${KV_RESULT}"
1526+
exit 1
1527+
;;
1528+
esac
1529+
1530+
1531+
# -------------------------------------------------------
1532+
# Step 22: Collect Splunk logs from backup operations (always, for debugging any failure)
1533+
# -------------------------------------------------------
1534+
- name: Collect Splunk logs after restore
13761535
if: always()
13771536
run: |
13781537
echo "=== Removing logs from backup steps (so we only collect restore) ==="

0 commit comments

Comments
 (0)