-
Notifications
You must be signed in to change notification settings - Fork 365
Add job cancellation support to the debugger command relay #1262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| --- | ||
| name: debug | ||
| description: Run commands inside a remote Docker container via the file-based command relay (tools/debugger). Use when the user says "run in Docker", "run on GPU", "debug remotely", "run test in container", "check nvidia-smi", "run pytest in Docker", or needs to execute any command inside a Docker container that shares the repo filesystem. Requires the user to have started server.sh inside the container first. | ||
| --- | ||
|
|
||
| # Remote Docker Debugger | ||
|
|
||
| Execute commands inside a Docker container from the host using the file-based command relay. | ||
|
|
||
| **Read `tools/debugger/CLAUDE.md` for full usage details** — it has the protocol, examples, and troubleshooting. | ||
|
|
||
| ## Quick Reference | ||
|
|
||
| ```bash | ||
| # Check connection | ||
| bash tools/debugger/client.sh status | ||
|
|
||
| # Connect to server (user must start server.sh in Docker first) | ||
| bash tools/debugger/client.sh handshake | ||
|
|
||
| # Run a command | ||
| bash tools/debugger/client.sh run "<command>" | ||
|
|
||
| # Long-running command (default timeout is 600s) | ||
| bash tools/debugger/client.sh --timeout 1800 run "<command>" | ||
|
|
||
| # Cancel the currently running command | ||
| bash tools/debugger/client.sh cancel | ||
|
|
||
| # Reconnect after server restart | ||
| bash tools/debugger/client.sh flush | ||
| bash tools/debugger/client.sh handshake | ||
| ``` | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,10 +63,19 @@ RESULT_DIR="$RELAY_DIR/result" | |
|
|
||
| cleanup() { | ||
| echo "[server] Shutting down..." | ||
| # Kill any running command (guard all reads with || true to prevent set -e | ||
| # from aborting the trap and leaving stale marker files) | ||
| running_pid=$(cut -d: -f2 "$RELAY_DIR/running" 2>/dev/null) || true | ||
| if [[ -n "$running_pid" ]]; then | ||
| pkill -P "$running_pid" 2>/dev/null || true | ||
| kill "$running_pid" 2>/dev/null || true | ||
| fi | ||
|
Comment on lines
+66
to
+71
Contributor
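There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

**Shutdown cleanup can still orphan the detached command tree.** Lines 173-178 start the [… the remainder of this sentence was lost when the page was extracted; judging by the suggested fix below, it presumably refers to the command being launched detached via `exec setsid bash -c`, so that cleanup must signal and wait on the whole process group rather than a single PID — confirm against the original review comment.]

🔧 Suggested fix:

+CURRENT_CMD_PID=""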
+
+terminate_cmd_group() {
+ local pid="$1"
+ [[ -n "$pid" ]] || return 0
+
+ kill -- -"$pid" 2>/dev/null || kill "$pid" 2>/dev/null || true
+ for _ in $(seq 1 5); do
+ kill -0 "$pid" 2>/dev/null || return 0
+ sleep 1
+ done
+ kill -9 -- -"$pid" 2>/dev/null || kill -9 "$pid" 2>/dev/null || true
+}
+
cleanup() {
echo "[server] Shutting down..."
- running_pid=$(cut -d: -f2 "$RELAY_DIR/running" 2>/dev/null) || true
+ running_pid="${CURRENT_CMD_PID:-}"
+ [[ -n "$running_pid" ]] || running_pid=$(cut -d: -f2 "$RELAY_DIR/running" 2>/dev/null) || true
if [[ -n "$running_pid" ]]; then
- kill -- -"$running_pid" 2>/dev/null || kill "$running_pid" 2>/dev/null || true
+ terminate_cmd_group "$running_pid"
fi
# Kill any child processes in our process group
pkill -P $$ 2>/dev/null || true
@@
(cd "$WORKDIR" && exec setsid bash -c "$cmd_content") >> "$RESULT_DIR/$cmd_id.log" 2>&1 &
cmd_pid=$!
+ CURRENT_CMD_PID="$cmd_pid"
@@
rm -f "$RELAY_DIR/running"
rm -f "$RELAY_DIR/cancel"
+ CURRENT_CMD_PID=""

Also applies to: 173-178

🤖 Prompt for AI Agents |
||
| # Kill any remaining direct child processes of this server (pkill -P matches by parent PID, not by process group) | ||
| pkill -P $$ 2>/dev/null || true | ||
| rm -f "$RELAY_DIR/server.ready" | ||
| rm -f "$RELAY_DIR/handshake.done" | ||
| rm -f "$RELAY_DIR/running" | ||
| rm -f "$RELAY_DIR/cancel" | ||
| exit 0 | ||
| } | ||
| trap cleanup SIGINT SIGTERM | ||
|
|
@@ -87,17 +96,28 @@ fi | |
| rm -rf "$RELAY_DIR" | ||
| mkdir -p "$CMD_DIR" "$RESULT_DIR" | ||
|
|
||
| # Install modelopt in editable mode (skip if already editable-installed from WORKDIR) | ||
| if python -c " | ||
| import modelopt, os | ||
| assert os.path.realpath(modelopt.__path__[0]).startswith(os.path.realpath('$WORKDIR')) | ||
| " 2>/dev/null; then | ||
| # Ensure modelopt is editable-installed from WORKDIR | ||
| check_modelopt_local() { | ||
| python -c " | ||
| import modelopt, os, sys | ||
| actual = os.path.realpath(modelopt.__path__[0]) | ||
| expected = os.path.realpath('$WORKDIR') | ||
| if not actual.startswith(expected): | ||
| print(f'modelopt loaded from {actual}, expected under {expected}', file=sys.stderr) | ||
| sys.exit(1) | ||
| " 2>&1 | ||
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
|
||
| } | ||
|
|
||
| if check_modelopt_local >/dev/null 2>&1; then | ||
| echo "[server] modelopt already editable-installed from $WORKDIR, skipping pip install." | ||
| else | ||
| echo "[server] Installing modelopt (pip install -e .[dev]) ..." | ||
| (cd "$WORKDIR" && pip install -e ".[dev]") || { | ||
| echo "[server] WARNING: pip install failed (exit=$?), continuing anyway." | ||
| } | ||
| (cd "$WORKDIR" && pip install -e ".[dev]") | ||
| if ! check_modelopt_local; then | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| echo "[server] ERROR: modelopt is not running from the local folder ($WORKDIR)." | ||
| echo "[server] Try: pip install -e '.[dev]' inside the container, then restart the server." | ||
| exit 1 | ||
| fi | ||
| echo "[server] Install done." | ||
| fi | ||
|
|
||
|
|
@@ -129,19 +149,90 @@ while true; do | |
| fi | ||
|
|
||
| for cmd_file in "$CMD_DIR"/*.sh; do | ||
| cmd_id="$(basename "$cmd_file" .sh)" | ||
| echo "[server] Executing command $cmd_id..." | ||
|
|
||
| # Execute the command, tee stdout+stderr to console and result file | ||
| (cd "$WORKDIR" && bash "$cmd_file" 2>&1) | tee "$RESULT_DIR/$cmd_id.log" || true | ||
| exit_code=${PIPESTATUS[0]} | ||
| # Guard against command files deleted by the client between glob expansion | ||
| # and processing (e.g., client timeout on a queued command) | ||
| [[ -f "$cmd_file" ]] || continue | ||
|
|
||
| # Atomic write of exit code (signal to client that result is ready) | ||
| cmd_id="$(basename "$cmd_file" .sh)" | ||
| # Tolerate file disappearing between guard and read (TOCTOU with client timeout) | ||
| cmd_content=$(cat "$cmd_file" 2>/dev/null) || continue | ||
| # Remove command file immediately after reading to prevent re-execution | ||
| # and to avoid TOCTOU with client timeout deleting it during execution | ||
| rm -f "$cmd_file" | ||
| echo "[server] Executing command $cmd_id: $cmd_content" | ||
|
|
||
| # Clear any stale cancel file from a previous timed-out client | ||
| rm -f "$RELAY_DIR/cancel" | ||
|
|
||
| # Create log file and stream output to server console via tail | ||
| : > "$RESULT_DIR/$cmd_id.log" | ||
| tail -f "$RESULT_DIR/$cmd_id.log" & | ||
| tail_pid=$! | ||
|
|
||
| # Run from cmd_content (not the file) since we already removed it | ||
| (cd "$WORKDIR" && bash -c "$cmd_content") >> "$RESULT_DIR/$cmd_id.log" 2>&1 & | ||
| cmd_pid=$! | ||
|
|
||
| # Track the running command (ID and PID) — atomic write to prevent partial reads | ||
| echo "$cmd_id:$cmd_pid" > "$RELAY_DIR/running.tmp" | ||
| mv "$RELAY_DIR/running.tmp" "$RELAY_DIR/running" | ||
|
|
||
| # Wait for completion or cancellation | ||
| cancelled="" | ||
| while kill -0 "$cmd_pid" 2>/dev/null; do | ||
| if [[ -f "$RELAY_DIR/cancel" ]]; then | ||
| # Verify cancel targets this command (reject empty or mismatched signals) | ||
| cancel_target=$(cat "$RELAY_DIR/cancel" 2>/dev/null) || true | ||
| if [[ "$cancel_target" != "$cmd_id" ]]; then | ||
| rm -f "$RELAY_DIR/cancel" | ||
| sleep "$POLL_INTERVAL" | ||
| continue | ||
| fi | ||
| echo "[server] Cancelling command $cmd_id (PID $cmd_pid)..." | ||
| # Send SIGTERM to children first, then parent | ||
| pkill -P "$cmd_pid" 2>/dev/null || true | ||
|
cjluo-nv marked this conversation as resolved.
Outdated
|
||
| kill "$cmd_pid" 2>/dev/null || true | ||
| # Wait up to 5s for graceful exit, then escalate to SIGKILL | ||
| for _ in $(seq 1 5); do | ||
| kill -0 "$cmd_pid" 2>/dev/null || break | ||
| sleep 1 | ||
| done | ||
| if kill -0 "$cmd_pid" 2>/dev/null; then | ||
| echo "[server] Process $cmd_pid did not exit, sending SIGKILL..." | ||
| pkill -9 -P "$cmd_pid" 2>/dev/null || true | ||
| kill -9 "$cmd_pid" 2>/dev/null || true | ||
| fi | ||
| wait "$cmd_pid" 2>/dev/null || true | ||
| cancelled="true" | ||
| rm -f "$RELAY_DIR/cancel" | ||
| echo "[cancelled]" >> "$RESULT_DIR/$cmd_id.log" | ||
| echo "[server] Command $cmd_id cancelled." | ||
| break | ||
| fi | ||
| sleep "$POLL_INTERVAL" | ||
| done | ||
|
|
||
| # Determine exit code (|| exit_code=$? prevents set -e from killing the | ||
| # server when the command exits non-zero) | ||
| if [[ -n "$cancelled" ]]; then | ||
| exit_code=130 | ||
| else | ||
| exit_code=0 | ||
| wait "$cmd_pid" 2>/dev/null || exit_code=$? | ||
| fi | ||
|
|
||
| # Stop console streaming | ||
| kill "$tail_pid" 2>/dev/null || true | ||
| wait "$tail_pid" 2>/dev/null || true | ||
|
|
||
| # Write exit code BEFORE removing the running marker, so any observer | ||
| # that sees running disappear can immediately find the result | ||
| echo "$exit_code" > "$RESULT_DIR/$cmd_id.exit.tmp" | ||
| mv "$RESULT_DIR/$cmd_id.exit.tmp" "$RESULT_DIR/$cmd_id.exit" | ||
|
|
||
| # Remove the command file to mark it as processed | ||
| rm -f "$cmd_file" | ||
| # Now safe to remove markers | ||
| rm -f "$RELAY_DIR/running" | ||
| rm -f "$RELAY_DIR/cancel" | ||
|
|
||
| echo "[server] Command $cmd_id finished (exit=$exit_code)" | ||
| done | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.