Skip to content

Commit 9a08de3

Browse files
committed
ci: add gated inference smoke test with nightly workflow
- bin/ci/smoke-agent-inference.sh: sends a real LLM prompt via session-control RPC, waits for turn_end, validates response. - Gated by BAUDBOT_CI_INFERENCE_SMOKE=1 (default off for PR CI). - BAUDBOT_CI_INFERENCE_SMOKE_OPTIONAL=1 for fail-open mode. - CI_ANTHROPIC_API_KEY injected into agent .env when available. - bin/ci/droplet.sh run: accepts optional KEY=VALUE env vars to forward to the remote droplet script. - .github/workflows/nightly.yml: daily schedule + manual dispatch, runs full integration + inference smoke on both Ubuntu and Arch.
1 parent 132fc8d commit 9a08de3

5 files changed

Lines changed: 337 additions & 4 deletions

File tree

.github/workflows/nightly.yml

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Nightly integration run: provisions a throwaway droplet per distro, uploads
# the working tree, runs the distro setup script (which includes the gated
# inference smoke test), and always destroys the droplet afterwards.
name: Nightly

on:
  schedule:
    # 06:00 UTC daily (1 AM EST / 10 PM PST)
    - cron: "0 6 * * *"
  workflow_dispatch:
    inputs:
      inference_smoke:
        description: "Run inference smoke test"
        type: boolean
        default: true

concurrency:
  group: nightly
  cancel-in-progress: true

# The job only needs to read the repository contents.
permissions:
  contents: read

jobs:
  inference-smoke:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - distro: ubuntu
            image: ubuntu-24-04-x64
            setup_script: bin/ci/setup-ubuntu.sh
          - distro: arch
            image: "217410218"
            setup_script: bin/ci/setup-arch.sh

    name: ${{ matrix.distro }} (inference)
    timeout-minutes: 15

    steps:
      - uses: actions/checkout@v4

      - name: Generate ephemeral SSH key
        run: |
          mkdir -p ~/.ssh
          ssh-keygen -t ed25519 -f ~/.ssh/ci_key -N "" -q

      - name: Create droplet
        id: droplet
        env:
          DO_API_TOKEN: ${{ secrets.DO_API_TOKEN }}
        run: |
          output=$(bash bin/ci/droplet.sh create \
            "nightly-${{ matrix.distro }}-${{ github.run_id }}" \
            "${{ matrix.image }}" \
            ~/.ssh/ci_key.pub)
          echo "$output" >> "$GITHUB_OUTPUT"
          echo "$output"

      - name: Wait for SSH
        env:
          DO_API_TOKEN: ${{ secrets.DO_API_TOKEN }}
        run: |
          bash bin/ci/droplet.sh wait-ssh \
            "${{ steps.droplet.outputs.DROPLET_IP }}" \
            ~/.ssh/ci_key

      - name: Upload source
        run: |
          tar czf /tmp/baudbot-src.tar.gz \
            --exclude=node_modules --exclude=.git .
          scp -o StrictHostKeyChecking=no -o BatchMode=yes \
            -i ~/.ssh/ci_key \
            /tmp/baudbot-src.tar.gz \
            "root@${{ steps.droplet.outputs.DROPLET_IP }}:/tmp/baudbot-src.tar.gz"

      - name: Setup and test (with inference smoke)
        env:
          # Scheduled runs always exercise inference; a manual dispatch honors
          # the `inference_smoke` checkbox.
          INFERENCE_SMOKE: ${{ (github.event_name != 'workflow_dispatch' || inputs.inference_smoke) && '1' || '0' }}
          # Pass the secret through the environment rather than splicing it
          # into the script text, so its content can never be parsed as shell.
          CI_ANTHROPIC_API_KEY: ${{ secrets.CI_ANTHROPIC_API_KEY }}
        # NOTE(review): OPTIONAL=1 makes an inference failure non-fatal, so
        # this job stays green when inference breaks — confirm that is intended.
        run: |
          bash bin/ci/droplet.sh run \
            "${{ steps.droplet.outputs.DROPLET_IP }}" \
            ~/.ssh/ci_key \
            "${{ matrix.setup_script }}" \
            "BAUDBOT_CI_INFERENCE_SMOKE=${INFERENCE_SMOKE}" \
            "BAUDBOT_CI_INFERENCE_SMOKE_OPTIONAL=1" \
            "CI_ANTHROPIC_API_KEY=${CI_ANTHROPIC_API_KEY}"

      - name: Cleanup
        if: always()
        env:
          DO_API_TOKEN: ${{ secrets.DO_API_TOKEN }}
        run: |
          bash bin/ci/droplet.sh destroy \
            "${{ steps.droplet.outputs.DROPLET_ID }}" \
            "${{ steps.droplet.outputs.SSH_KEY_ID }}" \
            "nightly-${{ matrix.distro }}-${{ github.run_id }}"

bin/ci/droplet.sh

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,12 +184,20 @@ cmd_wait_ssh() {
184184

185185
# ── run <ip> <ssh_private_key_file> <script> ──────────────────────────────────
186186
# Run a local script on the droplet as root over SSH.
# Arguments:
#   $1   - droplet IP address
#   $2   - path to the SSH private key file
#   $3   - path to the local script, streamed to the remote `bash -s`
#   $4.. - optional KEY=VALUE pairs, exported in the remote session before the
#          script body runs
# Returns: 1 if the script is not readable, 2 if an env argument is not a
#          valid KEY=VALUE pair; otherwise the status of the ssh pipeline.
cmd_run() {
  local ip="${1:?Usage: droplet.sh run <ip> <ssh_private_key_file> <script> [env_vars...]}"
  local key_file="${2:?}"
  local script="${3:?}"
  shift 3

  if [[ ! -r "$script" ]]; then
    printf 'droplet.sh run: cannot read script: %s\n' "$script" >&2
    return 1
  fi

  # Build the export preamble up front so a malformed argument aborts before
  # anything is sent to the remote host.
  local preamble="" assignment export_line
  local position=0
  for assignment in "$@"; do
    position=$((position + 1))
    if [[ ! "$assignment" =~ ^[A-Za-z_][A-Za-z0-9_]*= ]]; then
      # Report the position only: the argument may carry a secret value.
      printf 'droplet.sh run: env argument #%d is not a valid KEY=VALUE pair\n' \
        "$position" >&2
      return 2
    fi
    # %q shell-quotes the value so spaces, quotes, `$`, `;` etc. reach the
    # remote bash as literal data instead of being re-parsed as shell syntax.
    printf -v export_line 'export %s=%q\n' "${assignment%%=*}" "${assignment#*=}"
    preamble+="$export_line"
  done

  {
    printf '%s' "$preamble"
    cat -- "$script"
  } | ssh -o StrictHostKeyChecking=no -o BatchMode=yes \
    -i "$key_file" "root@$ip" bash -s
}
194202

195203
# ── list ──────────────────────────────────────────────────────────────────────

bin/ci/setup-arch.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,20 @@ bash /home/baudbot_admin/baudbot/bin/ci/smoke-cli.sh
7070
echo "=== Running runtime smoke checks ==="
7171
bash /home/baudbot_admin/baudbot/bin/ci/smoke-agent-runtime.sh
7272

73+
# Inference smoke is opt-in (BAUDBOT_CI_INFERENCE_SMOKE=1). When it runs, a
# failure aborts setup unless BAUDBOT_CI_INFERENCE_SMOKE_OPTIONAL=1 is set.
if [[ "${BAUDBOT_CI_INFERENCE_SMOKE:-}" != "1" ]]; then
  echo "=== Inference smoke check skipped (set BAUDBOT_CI_INFERENCE_SMOKE=1 to enable) ==="
else
  echo "=== Running inference smoke check ==="
  if bash /home/baudbot_admin/baudbot/bin/ci/smoke-agent-inference.sh; then
    echo " ✓ inference smoke passed"
  elif [[ "${BAUDBOT_CI_INFERENCE_SMOKE_OPTIONAL:-}" != "1" ]]; then
    echo " ✗ inference smoke failed"
    exit 1
  else
    echo " ⚠ inference smoke failed (optional — continuing)"
  fi
fi
86+
7387
echo "=== Installing test dependencies ==="
7488
export PATH="/home/baudbot_agent/opt/node/bin:$PATH"
7589
cd /home/baudbot_admin/baudbot

bin/ci/setup-ubuntu.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,20 @@ bash /home/baudbot_admin/baudbot/bin/ci/smoke-cli.sh
108108
echo "=== Running runtime smoke checks ==="
109109
bash /home/baudbot_admin/baudbot/bin/ci/smoke-agent-runtime.sh
110110

111+
# Inference smoke is opt-in (BAUDBOT_CI_INFERENCE_SMOKE=1). When it runs, a
# failure aborts setup unless BAUDBOT_CI_INFERENCE_SMOKE_OPTIONAL=1 is set.
if [[ "${BAUDBOT_CI_INFERENCE_SMOKE:-}" != "1" ]]; then
  echo "=== Inference smoke check skipped (set BAUDBOT_CI_INFERENCE_SMOKE=1 to enable) ==="
else
  echo "=== Running inference smoke check ==="
  if bash /home/baudbot_admin/baudbot/bin/ci/smoke-agent-inference.sh; then
    echo " ✓ inference smoke passed"
  elif [[ "${BAUDBOT_CI_INFERENCE_SMOKE_OPTIONAL:-}" != "1" ]]; then
    echo " ✗ inference smoke failed"
    exit 1
  else
    echo " ⚠ inference smoke failed (optional — continuing)"
  fi
fi
124+
111125
echo "=== Installing test dependencies ==="
112126
export PATH="/home/baudbot_agent/opt/node/bin:$PATH"
113127
cd /home/baudbot_admin/baudbot

bin/ci/smoke-agent-inference.sh

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
#!/usr/bin/env bash
2+
# Gated inference smoke-test for baudbot.
3+
#
4+
# Verifies that the control-agent can complete at least one real LLM turn
5+
# end-to-end via session-control RPC. Default OFF for PR CI; enabled by
6+
# setting BAUDBOT_CI_INFERENCE_SMOKE=1.
7+
#
8+
# Optional fail-open mode: BAUDBOT_CI_INFERENCE_SMOKE_OPTIONAL=1 logs failure
9+
# as a warning instead of failing the build (useful for non-nightly runs where
10+
# flaky provider errors shouldn't block merges).
11+
#
12+
# Expects baudbot to be already installed and stoppable via `sudo baudbot`.
13+
14+
set -Eeuo pipefail
15+
16+
# Agent account and the paths derived from its home directory.
AGENT_USER="baudbot_agent"
AGENT_HOME="/home/${AGENT_USER}"
AGENT_ENV="${AGENT_HOME}/.config/.env"
CONTROL_DIR="${AGENT_HOME}/.pi/session-control"
CONTROL_ALIAS="${CONTROL_DIR}/control-agent.alias"

# Time budgets (seconds) and the token the model is asked to echo back.
START_TIMEOUT_SECONDS=60
INFERENCE_TIMEOUT_SECONDS=120
EXPECTED_TOKEN="CI_INFERENCE_OK"

readonly AGENT_USER AGENT_HOME AGENT_ENV CONTROL_DIR CONTROL_ALIAS
readonly START_TIMEOUT_SECONDS INFERENCE_TIMEOUT_SECONDS EXPECTED_TOKEN

# Set to 1 once this script has started baudbot; read by the EXIT cleanup.
started=0
27+
# Write one line to stdout: the "[inference-smoke] " prefix followed by all
# arguments joined with spaces.
log() {
  printf '%s\n' "[inference-smoke] $*"
}
30+
31+
# EXIT handler: stop baudbot if this script started it, then exit with the
# status that was current when the handler was entered.
cleanup() {
  local status=$?
  if [[ $started -ne 1 ]]; then
    exit "$status"
  fi
  log "cleanup: stopping baudbot"
  # Best effort: a failing stop must not replace the original exit status.
  sudo baudbot stop >/dev/null 2>&1 || true
  exit "$status"
}
trap cleanup EXIT
40+
41+
# Poll once per second until CONTROL_ALIAS is a symlink whose target is an
# existing socket, or START_TIMEOUT_SECONDS have elapsed.
# Outputs: the socket path on stdout (relative link targets are anchored at
#          CONTROL_DIR).
# Returns: 0 when the socket was found, 1 on timeout.
wait_for_control_socket() {
  local give_up_at=$((SECONDS + START_TIMEOUT_SECONDS))
  local resolved=""

  until (( SECONDS >= give_up_at )); do
    resolved=""
    if [[ -L "$CONTROL_ALIAS" ]]; then
      resolved="$(readlink -- "$CONTROL_ALIAS" 2>/dev/null || true)"
    fi

    # A relative link target is interpreted relative to the control directory.
    if [[ -n "$resolved" && "$resolved" != /* ]]; then
      resolved="${CONTROL_DIR}/${resolved}"
    fi

    if [[ -n "$resolved" && -S "$resolved" ]]; then
      printf '%s\n' "$resolved"
      return 0
    fi

    sleep 1
  done

  return 1
}
63+
64+
# Print `sudo baudbot status` output (stdout and stderr merged) between two
# marker log lines. A failing status command is tolerated.
dump_diagnostics() {
  log "--- diagnostics ---"
  if ! sudo baudbot status 2>&1; then
    : # status is informational only; ignore its exit code
  fi
  log "--- end diagnostics ---"
}
69+
70+
# Send a message via session-control RPC and wait for turn_end.
71+
# Prints the assistant response content on success, exits non-zero on failure.
72+
rpc_send_wait_turn_end() {
73+
local socket_path="$1"
74+
local message="$2"
75+
local timeout_seconds="$3"
76+
77+
sudo -u "$AGENT_USER" python3 - "$socket_path" "$message" "$timeout_seconds" <<'PY'
78+
import json
79+
import socket
80+
import sys
81+
82+
sock_path = sys.argv[1]
83+
message = sys.argv[2]
84+
timeout_seconds = int(sys.argv[3])
85+
86+
send_cmd = {"type": "send", "message": message, "mode": "steer"}
87+
subscribe_cmd = {"type": "subscribe", "event": "turn_end"}
88+
89+
client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
90+
try:
91+
client.settimeout(timeout_seconds)
92+
client.connect(sock_path)
93+
94+
# Send both commands
95+
client.sendall((json.dumps(send_cmd) + "\n").encode("utf-8"))
96+
client.sendall((json.dumps(subscribe_cmd) + "\n").encode("utf-8"))
97+
98+
buf = b""
99+
send_response = None
100+
101+
while True:
102+
chunk = client.recv(8192)
103+
if not chunk:
104+
print("connection closed before turn_end", file=sys.stderr)
105+
sys.exit(1)
106+
buf += chunk
107+
108+
while b"\n" in buf:
109+
line, buf = buf.split(b"\n", 1)
110+
line = line.strip()
111+
if not line:
112+
continue
113+
114+
try:
115+
msg = json.loads(line.decode("utf-8", errors="replace"))
116+
except json.JSONDecodeError:
117+
continue
118+
119+
if msg.get("type") == "response":
120+
cmd = msg.get("command", "")
121+
if cmd == "send":
122+
if not msg.get("success", False):
123+
print(f"send failed: {msg.get('error', 'unknown')}", file=sys.stderr)
124+
sys.exit(1)
125+
send_response = msg
126+
# Ignore subscribe response
127+
continue
128+
129+
if msg.get("type") == "event" and msg.get("event") == "turn_end":
130+
if send_response is None:
131+
print("received turn_end before send response", file=sys.stderr)
132+
sys.exit(1)
133+
data = msg.get("data", {})
134+
assistant_msg = data.get("message", {})
135+
content = assistant_msg.get("content", "")
136+
if not content:
137+
print("turn completed but no assistant content", file=sys.stderr)
138+
sys.exit(1)
139+
print(content)
140+
sys.exit(0)
141+
142+
print("stream ended without turn_end event", file=sys.stderr)
143+
sys.exit(1)
144+
except socket.timeout:
145+
print("timeout waiting for inference response", file=sys.stderr)
146+
sys.exit(1)
147+
finally:
148+
client.close()
149+
PY
150+
}
151+
152+
# If CI_ANTHROPIC_API_KEY is set, write it into the agent's .env so the
# runtime can authenticate with the provider: replace an existing
# ANTHROPIC_API_KEY= line, or append one when none exists. Without a CI key,
# only warn when the .env still holds the dummy test key.
# Globals:  CI_ANTHROPIC_API_KEY (read), AGENT_ENV (read; file is modified)
# Returns:  non-zero if a key was supplied but the .env file does not exist.
inject_api_key() {
  if [[ -n "${CI_ANTHROPIC_API_KEY:-}" ]]; then
    log "injecting CI API key into agent .env"
    if [[ ! -f "$AGENT_ENV" ]]; then
      log "agent .env not found: ${AGENT_ENV}"
      return 1
    fi

    if grep -q -- '^ANTHROPIC_API_KEY=' "$AGENT_ENV"; then
      # Escape the characters that are special in a sed replacement using
      # `|` as delimiter, so the key is inserted verbatim.
      local escaped_key
      escaped_key="$(printf '%s' "$CI_ANTHROPIC_API_KEY" | sed -e 's/[\\&|]/\\&/g')"
      sed -i "s|^ANTHROPIC_API_KEY=.*|ANTHROPIC_API_KEY=${escaped_key}|" "$AGENT_ENV"
    else
      # No line to replace: append one, first terminating an unfinished
      # last line so the new entry starts on its own line.
      if [[ -s "$AGENT_ENV" && -n "$(tail -c 1 -- "$AGENT_ENV")" ]]; then
        printf '\n' >>"$AGENT_ENV"
      fi
      printf 'ANTHROPIC_API_KEY=%s\n' "$CI_ANTHROPIC_API_KEY" >>"$AGENT_ENV"
    fi
  elif grep -q "ANTHROPIC_API_KEY=sk-ant-testkey" "$AGENT_ENV" 2>/dev/null; then
    log "WARNING: agent .env has dummy API key; inference will likely fail"
    log " set CI_ANTHROPIC_API_KEY to provide a real key"
  fi
}
163+
164+
# Run the inference smoke test end to end: inject the CI API key, start
# baudbot, wait for the control socket, send one prompt, check that the reply
# contains EXPECTED_TOKEN, then stop baudbot.
# Globals:  started (written), START_TIMEOUT_SECONDS, INFERENCE_TIMEOUT_SECONDS,
#           EXPECTED_TOKEN (read)
# Returns:  0 when the reply contains the expected token; 1 otherwise.
main() {
  inject_api_key

  log "starting baudbot"
  # Mark as started BEFORE launching: if `baudbot start` fails part-way the
  # EXIT cleanup must still issue a stop instead of leaving it running.
  started=1
  sudo baudbot start

  log "waiting for control-agent socket"
  local socket_path=""
  if ! socket_path="$(wait_for_control_socket)"; then
    log "control-agent socket did not become ready within ${START_TIMEOUT_SECONDS}s"
    dump_diagnostics
    return 1
  fi
  log "control socket ready: ${socket_path}"

  log "sending inference prompt (timeout ${INFERENCE_TIMEOUT_SECONDS}s)"
  local response=""
  if ! response="$(rpc_send_wait_turn_end "$socket_path" \
    "Reply with exactly: ${EXPECTED_TOKEN}" \
    "$INFERENCE_TIMEOUT_SECONDS")"; then
    log "inference failed"
    dump_diagnostics
    return 1
  fi

  # Validate response contains expected token
  if [[ "$response" == *"$EXPECTED_TOKEN"* ]]; then
    log "inference response contains expected token"
  else
    log "unexpected response (missing '${EXPECTED_TOKEN}'):"
    # Cap the echoed response so a runaway reply does not flood the CI log.
    log " ${response:0:500}"
    dump_diagnostics
    return 1
  fi

  log "stopping baudbot"
  sudo baudbot stop
  started=0

  log "inference smoke passed"
}
206+
207+
main "$@"

0 commit comments

Comments
 (0)