NVIDIA · cjluo-nv · Apr 15, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 15, 2026
diff --git a/.claude/skills/debug/SKILL.md b/.claude/skills/debug/SKILL.md
@@ -0,0 +1,33 @@
+---
+name: debug
+description: Run commands inside a remote Docker container via the file-based command relay (tools/debugger). Use when the user says "run in Docker", "run on GPU", "debug remotely", "run test in container", "check nvidia-smi", "run pytest in Docker", or needs to execute any command inside a Docker container that shares the repo filesystem. Requires the user to have started server.sh inside the container first.
+---
+
+# Remote Docker Debugger
+
+Execute commands inside a Docker container from the host using the file-based command relay.
+
+**Read `tools/debugger/CLAUDE.md` for full usage details** — it has the protocol, examples, and troubleshooting.
+
+## Quick Reference
+
+```bash
+# Check connection
+bash tools/debugger/client.sh status
+
+# Connect to server (user must start server.sh in Docker first)
+bash tools/debugger/client.sh handshake
+
+# Run a command
+bash tools/debugger/client.sh run "<command>"
+
+# Long-running command (default timeout is 600s)
+bash tools/debugger/client.sh --timeout 1800 run "<command>"
+
+# Cancel the currently running command
+bash tools/debugger/client.sh cancel
+
+# Reconnect after server restart
+bash tools/debugger/client.sh flush
+bash tools/debugger/client.sh handshake
+```
diff --git a/tools/debugger/CLAUDE.md b/tools/debugger/CLAUDE.md
@@ -53,10 +53,20 @@ bash tools/debugger/client.sh run "nvidia-smi"
 bash tools/debugger/client.sh run "python script.py --model /hf-local/Qwen/Qwen3-8B"
 ```
 
+### Cancelling Commands
+
+```bash
+# Cancel the currently running command
+bash tools/debugger/client.sh cancel
+
+# A client-side timeout also auto-cancels the timed-out command if it is currently running
+```
+
 ### Important Notes
 
 - The server must be started by the user manually inside Docker before the handshake.
 - Default command timeout is 600 seconds (10 minutes). Use `--timeout` for longer tasks.
 - Commands execute sequentially — one at a time.
+- A running command can be cancelled; cancelled commands exit with code 130.
 - All commands run with the auto-detected repo root as the working directory.
 - The `.relay/` directory is ephemeral and git-ignored.
diff --git a/tools/debugger/README.md b/tools/debugger/README.md
@@ -52,6 +52,9 @@ bash tools/debugger/client.sh run "bash llm_ptq/scripts/huggingface_example.sh"
 # Run with a long timeout (default is 600s)
 bash tools/debugger/client.sh --timeout 1800 run "python my_long_test.py"
 
+# Cancel a running command
+bash tools/debugger/client.sh cancel
+
 # Check status
 bash tools/debugger/client.sh status
 ```
@@ -65,6 +68,8 @@ The relay uses a directory at `tools/debugger/.relay/` with this structure:
 ├── server.ready      # Written by server on startup
 ├── client.ready      # Written by client during handshake
 ├── handshake.done    # Written by server to confirm handshake
+├── running           # Written by server while a command is executing (cmd_id:pid)
+├── cancel            # Written by client to request cancellation of the running command (contains its cmd_id)
 ├── cmd/              # Client writes command .sh files here
 │   └── <id>.sh       # Command to execute
 └── result/           # Server writes results here
@@ -82,9 +87,16 @@ The relay uses a directory at `tools/debugger/.relay/` with this structure:
 ### Command Execution
 
 1. Client writes a command to `.relay/cmd/<timestamp>.sh`
-2. Server detects the file, runs `bash <file>` in the workdir, captures output
+2. Server reads the command and removes the `.sh` file, runs it with `bash -c` in the workdir in the background, and writes `.relay/running`
 3. Server writes `.relay/result/<timestamp>.log` and `.relay/result/<timestamp>.exit`
-4. Server removes the `.sh` file; client reads results and cleans up
+4. Server removes `.relay/running`; client reads results and cleans up
+
+### Cancellation
+
+1. Client writes `.relay/cancel`
+2. Server detects the cancel signal and sends SIGTERM to the running command and its direct children, escalating to SIGKILL after 5 seconds
+3. Server writes exit code 130 and removes `.relay/running` and `.relay/cancel`
+4. A client-side timeout also triggers cancellation automatically, but only if the running command is the one that timed out
 
 ## Options
 
@@ -107,3 +119,5 @@ The relay uses a directory at `tools/debugger/.relay/` with this structure:
 - The `.relay/` directory is in `.gitignore` — it is not checked in.
 - Only one server should run at a time (startup clears the relay directory).
 - Commands run sequentially in the order the server discovers them.
+- A running command can be cancelled via `client.sh cancel`. Cancelled commands exit with code 130.
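+- Client-side timeouts automatically cancel the timed-out command on the server if it is the one currently running; a timed-out command that is still queued is removed from the queue instead.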
diff --git a/tools/debugger/client.sh b/tools/debugger/client.sh
@@ -21,6 +21,7 @@
 # Usage:
 #   bash client.sh handshake              - Connect to server
 #   bash client.sh run <command...>        - Run a command and print output
+#   bash client.sh cancel                 - Cancel the running command
 #   bash client.sh status                  - Check server status
 #
 # Options:
@@ -91,6 +92,8 @@ case "$SUBCOMMAND" in
         # Generate a unique command ID (timestamp + PID to avoid collisions)
         cmd_id="$(date +%s%N)_$$"
 
+        echo "[client] Running: $*"
+
         # Write the command file atomically (tmp + mv)
         echo "$*" > "$CMD_DIR/$cmd_id.sh.tmp"
         mv "$CMD_DIR/$cmd_id.sh.tmp" "$CMD_DIR/$cmd_id.sh"
@@ -108,14 +111,32 @@ case "$SUBCOMMAND" in
             elapsed=$((elapsed + POLL_INTERVAL))
             if [[ $elapsed -ge $TIMEOUT ]]; then
                 echo "ERROR: Command timed out after ${TIMEOUT}s."
-                # Clean up the pending command
+                # Cancel the running command only if it is OUR command
+                if [[ -f "$RELAY_DIR/running" ]]; then
+                    running_info=$(cat "$RELAY_DIR/running" 2>/dev/null) || true
+                    running_id="${running_info%%:*}"
+                    if [[ "$running_id" == "$cmd_id" ]]; then
+                        echo "Sending cancel signal..."
+                        echo "$cmd_id" > "$RELAY_DIR/cancel"
+                        for _ in $(seq 1 10); do
+                            [[ -f "$RELAY_DIR/running" ]] || break
+                            sleep 1
+                        done
+                    fi
+                fi
+                # Clean up command and any orphaned result files
                 rm -f "$CMD_DIR/$cmd_id.sh"
+                rm -f "$RESULT_DIR/$cmd_id.exit" "$RESULT_DIR/$cmd_id.log"
                 exit 1
             fi
         done
 
         # Read and display results
         exit_code=$(cat "$RESULT_DIR/$cmd_id.exit")
+        if ! [[ "$exit_code" =~ ^[0-9]+$ ]]; then
+            echo "WARNING: Invalid exit code '$exit_code', defaulting to 1."
+            exit_code=1
+        fi
         if [[ -f "$RESULT_DIR/$cmd_id.log" ]]; then
             cat "$RESULT_DIR/$cmd_id.log"
         fi
@@ -139,6 +160,11 @@ case "$SUBCOMMAND" in
         else
             echo "Handshake: not started"
         fi
+        if [[ -f "$RELAY_DIR/running" ]]; then
+            echo "Running: $(cat "$RELAY_DIR/running")"
+        else
+            echo "Running: (idle)"
+        fi
         if [[ -d "$CMD_DIR" ]]; then
             pending=$(find "$CMD_DIR" -maxdepth 1 -type f -name '*.sh' 2>/dev/null | wc -l)
         else
@@ -148,6 +174,10 @@ case "$SUBCOMMAND" in
         ;;
 
     flush)
+        if [[ -f "$RELAY_DIR/running" ]]; then
+            echo "ERROR: A command is currently running. Cancel it first or wait for it to finish."
+            exit 1
+        fi
         if [[ -d "$RELAY_DIR" ]]; then
             # Clear handshake and command/result files, but keep server.ready
             rm -f "$RELAY_DIR/client.ready" "$RELAY_DIR/handshake.done"
@@ -159,12 +189,47 @@ case "$SUBCOMMAND" in
         fi
         ;;
 
+    cancel)
+        # .relay/running exists only while the server is executing a command ("cmd_id:pid")
+        if [[ -f "$RELAY_DIR/running" ]]; then
+            running_info=$(cat "$RELAY_DIR/running" 2>/dev/null) || true
+            running_id="${running_info%%:*}"
+            echo "Cancelling running command: $running_id"
+
+            # Write cancel signal with cmd_id so server can verify the target
+            echo "$running_id" > "$RELAY_DIR/cancel"
+
+            # The server removes .relay/running once the command has stopped; give up after 30s
+            elapsed=0
+            while [[ -f "$RELAY_DIR/running" ]]; do
+                sleep "$POLL_INTERVAL"
+                elapsed=$((elapsed + POLL_INTERVAL))
+                if [[ $elapsed -ge 30 ]]; then
+                    echo "WARNING: Cancel signal sent but command still running after 30s."
+                    exit 1
+                fi
+            done
+            echo "Command cancelled."
+        else
+            echo "No command is currently running."
+        fi
+
+        # Cancelling never touches queued commands, so remind the user if any are still pending
+        if [[ -d "$CMD_DIR" ]]; then
+            pending=$(find "$CMD_DIR" -maxdepth 1 -type f -name '*.sh' 2>/dev/null | wc -l)
+            if [[ "$pending" -gt 0 ]]; then
+                echo "$pending pending command(s) in queue. Use 'flush' to clear them."
+            fi
+        fi
+        ;;
+
     *)
         echo "Usage: $0 [--relay-dir <path>] [--timeout <secs>] <subcommand>"
         echo ""
         echo "Subcommands:"
         echo "  handshake   Connect to the server"
         echo "  run <cmd>   Execute a command on the server"
+        echo "  cancel      Cancel the currently running command"
         echo "  status      Check connection status"
         echo "  flush       Clear the relay directory"
         exit 1

diff --git a/tools/debugger/server.sh b/tools/debugger/server.sh
@@ -63,10 +63,19 @@ RESULT_DIR="$RELAY_DIR/result"
 
 cleanup() {
     echo "[server] Shutting down..."
+    # Stop the in-flight command, if any (.relay/running holds "cmd_id:pid"; field 2 is the PID).
+    # Reads are guarded with || true so set -e cannot abort the trap and leave stale marker files.
+    running_pid=$(cut -d: -f2 "$RELAY_DIR/running" 2>/dev/null) || true
+    if [[ -n "$running_pid" ]]; then
+        pkill -P "$running_pid" 2>/dev/null || true
+        kill "$running_pid" 2>/dev/null || true
+    fi
     # Kill any child processes in our process group
     pkill -P $$ 2>/dev/null || true
     rm -f "$RELAY_DIR/server.ready"
     rm -f "$RELAY_DIR/handshake.done"
+    rm -f "$RELAY_DIR/running"
+    rm -f "$RELAY_DIR/cancel"
     exit 0
 }
 trap cleanup SIGINT SIGTERM
@@ -87,17 +96,28 @@ fi
 rm -rf "$RELAY_DIR"
 mkdir -p "$CMD_DIR" "$RESULT_DIR"
 
-# Install modelopt in editable mode (skip if already editable-installed from WORKDIR)
-if python -c "
-import modelopt, os
-assert os.path.realpath(modelopt.__path__[0]).startswith(os.path.realpath('$WORKDIR'))
-" 2>/dev/null; then
+# Return 0 iff the importable modelopt package resolves to a path inside WORKDIR; otherwise print why and return non-zero
+check_modelopt_local() {
+    python -c "
+import modelopt, os, sys
+actual = os.path.realpath(modelopt.__path__[0])
+expected = os.path.realpath('$WORKDIR')
+if os.path.commonpath([actual, expected]) != expected:  # component-wise: '/repo2' is not under '/repo'
+    print(f'modelopt loaded from {actual}, expected under {expected}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1
+}
+
+if check_modelopt_local >/dev/null 2>&1; then
     echo "[server] modelopt already editable-installed from $WORKDIR, skipping pip install."
 else
     echo "[server] Installing modelopt (pip install -e .[dev]) ..."
-    (cd "$WORKDIR" && pip install -e ".[dev]") || {
-        echo "[server] WARNING: pip install failed (exit=$?), continuing anyway."
-    }
+    (cd "$WORKDIR" && pip install -e ".[dev]") || echo "[server] WARNING: pip install failed (exit=$?), verifying the install anyway."
+    if ! check_modelopt_local; then  # always verify, so a failed pip run still ends with the hint below
+        echo "[server] ERROR: modelopt is not running from the local folder ($WORKDIR)."
+        echo "[server] Try: pip install -e '.[dev]' inside the container, then restart the server."
+        exit 1
+    fi
     echo "[server] Install done."
 fi
 
@@ -129,19 +149,90 @@ while true; do
     fi
 
     for cmd_file in "$CMD_DIR"/*.sh; do
-        cmd_id="$(basename "$cmd_file" .sh)"
-        echo "[server] Executing command $cmd_id..."
-
-        # Execute the command, tee stdout+stderr to console and result file
-        (cd "$WORKDIR" && bash "$cmd_file" 2>&1) | tee "$RESULT_DIR/$cmd_id.log" || true
-        exit_code=${PIPESTATUS[0]}
+        # Guard against command files deleted by the client between glob expansion
+        # and processing (e.g., client timeout on a queued command)
+        [[ -f "$cmd_file" ]] || continue
 
-        # Atomic write of exit code (signal to client that result is ready)
+        cmd_id="$(basename "$cmd_file" .sh)"
+        # Tolerate file disappearing between guard and read (TOCTOU with client timeout)
+        cmd_content=$(cat "$cmd_file" 2>/dev/null) || continue
+        # Remove command file immediately after reading to prevent re-execution
+        # and to avoid TOCTOU with client timeout deleting it during execution
+        rm -f "$cmd_file"
+        echo "[server] Executing command $cmd_id: $cmd_content"
+
+        # Clear any stale cancel file from a previous timed-out client
+        rm -f "$RELAY_DIR/cancel"
+
+        # Create the log up front so tail -f can follow it and mirror output on the server console
+        : > "$RESULT_DIR/$cmd_id.log"
+        tail -f "$RESULT_DIR/$cmd_id.log" &
+        tail_pid=$!
+
+        # Run from cmd_content (not the file) since we already removed it
+        (cd "$WORKDIR" && bash -c "$cmd_content") >> "$RESULT_DIR/$cmd_id.log" 2>&1 &
+        cmd_pid=$!
+
+        # Track the running command (ID and PID) — atomic write to prevent partial reads
+        echo "$cmd_id:$cmd_pid" > "$RELAY_DIR/running.tmp"
+        mv "$RELAY_DIR/running.tmp" "$RELAY_DIR/running"
+
+        # Poll every POLL_INTERVAL until the command exits or a matching cancel request arrives
+        cancelled=""
+        while kill -0 "$cmd_pid" 2>/dev/null; do
+            if [[ -f "$RELAY_DIR/cancel" ]]; then
+                # Verify cancel targets this command (reject empty or mismatched signals)
+                cancel_target=$(cat "$RELAY_DIR/cancel" 2>/dev/null) || true
+                if [[ "$cancel_target" != "$cmd_id" ]]; then
+                    rm -f "$RELAY_DIR/cancel"
+                    sleep "$POLL_INTERVAL"
+                    continue
+                fi
+                echo "[server] Cancelling command $cmd_id (PID $cmd_pid)..."
+                # SIGTERM the direct children first, then the parent (pkill -P does not reach grandchildren)
+                pkill -P "$cmd_pid" 2>/dev/null || true
+                kill "$cmd_pid" 2>/dev/null || true
+                # Wait up to 5s for graceful exit, then escalate to SIGKILL
+                for _ in $(seq 1 5); do
+                    kill -0 "$cmd_pid" 2>/dev/null || break
+                    sleep 1
+                done
+                if kill -0 "$cmd_pid" 2>/dev/null; then
+                    echo "[server] Process $cmd_pid did not exit, sending SIGKILL..."
+                    pkill -9 -P "$cmd_pid" 2>/dev/null || true
+                    kill -9 "$cmd_pid" 2>/dev/null || true
+                fi
+                wait "$cmd_pid" 2>/dev/null || true
+                cancelled="true"
+                rm -f "$RELAY_DIR/cancel"
+                echo "[cancelled]" >> "$RESULT_DIR/$cmd_id.log"
+                echo "[server] Command $cmd_id cancelled."
+                break
+            fi
+            sleep "$POLL_INTERVAL"
+        done
+
+        # Determine exit code (|| exit_code=$? prevents set -e from killing the
+        # server when the command exits non-zero)
+        if [[ -n "$cancelled" ]]; then
+            exit_code=130
+        else
+            exit_code=0
+            wait "$cmd_pid" 2>/dev/null || exit_code=$?
+        fi
+
+        # Stop console streaming
+        kill "$tail_pid" 2>/dev/null || true
+        wait "$tail_pid" 2>/dev/null || true
+
+        # Write exit code BEFORE removing the running marker, so any observer
+        # that sees running disappear can immediately find the result
         echo "$exit_code" > "$RESULT_DIR/$cmd_id.exit.tmp"
         mv "$RESULT_DIR/$cmd_id.exit.tmp" "$RESULT_DIR/$cmd_id.exit"
 
-        # Remove the command file to mark it as processed
-        rm -f "$cmd_file"
+        # Now safe to remove markers
+        rm -f "$RELAY_DIR/running"
+        rm -f "$RELAY_DIR/cancel"
 
         echo "[server] Command $cmd_id finished (exit=$exit_code)"
     done