diff --git a/tools/debugger/.gitignore b/tools/debugger/.gitignore new file mode 100644 index 0000000000..d77a9917cb --- /dev/null +++ b/tools/debugger/.gitignore @@ -0,0 +1 @@ +.relay/ diff --git a/tools/debugger/CLAUDE.md b/tools/debugger/CLAUDE.md new file mode 100644 index 0000000000..ab18e2627c --- /dev/null +++ b/tools/debugger/CLAUDE.md @@ -0,0 +1,62 @@ +# Remote Command Relay + +This directory contains a file-based command relay for executing commands inside a remote Docker +container from the host machine (where Claude Code runs). + +## How to Use (for Claude Code) + +### Setup (one-time per session) + +The user must start the server inside Docker first: + +```bash +# Inside Docker container (auto-detects repo root from script location): +bash /path/to/modelopt/tools/debugger/server.sh +``` + +Then Claude Code performs the handshake: + +```bash +bash tools/debugger/client.sh handshake +``` + +### Running Commands + +```bash +# Run any command in the Docker container (workdir = auto-detected repo root): +bash tools/debugger/client.sh run "<command>" + +# For long-running tasks, increase timeout: +bash tools/debugger/client.sh --timeout 1800 run "<command>" +``` + +### Key Paths Inside Docker + +| Path | Description | +|------|-------------| +| Repo root (auto-detected) | ModelOpt source, used as workdir | +| `/hf-local` | HuggingFace model cache | + +### Examples + +```bash +# Run PTQ test +bash tools/debugger/client.sh run "bash llm_ptq/scripts/huggingface_example.sh" + +# Run pytest +bash tools/debugger/client.sh run "python -m pytest tests/gpu -k test_quantize" + +# Check GPU +bash tools/debugger/client.sh run "nvidia-smi" + +# Use HF models from local cache +bash tools/debugger/client.sh run "python script.py --model /hf-local/Qwen/Qwen3-8B" +``` + +### Important Notes + +- The server must be started by the user manually inside Docker before the handshake. +- Default command timeout is 600 seconds (10 minutes). Use `--timeout` for longer tasks. 
+- Commands execute sequentially — one at a time. +- All commands run with the auto-detected repo root as the working directory. +- The `.relay/` directory is ephemeral and git-ignored. diff --git a/tools/debugger/README.md b/tools/debugger/README.md new file mode 100644 index 0000000000..ca7a90eb05 --- /dev/null +++ b/tools/debugger/README.md @@ -0,0 +1,109 @@ +# File-Based Command Relay (Debugger) + +A lightweight client/server system for running commands inside a Docker container from the host, +using only a shared filesystem — no networking required. + +## Overview + +```text +Host (Claude Code) Docker Container +┌─────────────┐ ┌─────────────────┐ +│ client.sh │ writes cmd file │ server.sh │ +│ run "X" │ ───────────────────► │ detects cmd │ +│ │ │ executes X │ +│ reads │ writes result file │ writes result │ +│ result │ ◄─────────────────── │ │ +└─────────────┘ └─────────────────┘ + └──── shared filesystem (.relay/) ────┘ +``` + +## Assumptions + +- The ModelOpt repo is accessible from both host and container (e.g., bind-mounted) +- **HuggingFace models** are mounted at `/hf-local` +- The server auto-detects the repo root from the location of `server.sh` + +## Quick Start + +### 1. Start the server (inside Docker) + +```bash +# The server auto-detects the repo root (two levels up from tools/debugger/) +bash /path/to/modelopt/tools/debugger/server.sh +``` + +The server automatically sets the working directory to the repo root. You can override with `--workdir`. + +### 2. Connect from the host + +```bash +bash tools/debugger/client.sh handshake +``` + +### 3. 
Run commands + +```bash +# Run a simple command +bash tools/debugger/client.sh run "echo hello" + +# Run a test script +bash tools/debugger/client.sh run "bash llm_ptq/scripts/huggingface_example.sh" + +# Run with a long timeout (default is 600s) +bash tools/debugger/client.sh --timeout 1800 run "python my_long_test.py" + +# Check status +bash tools/debugger/client.sh status +``` + +## Protocol + +The relay uses a directory at `tools/debugger/.relay/` with this structure: + +```text +.relay/ +├── server.ready # Written by server on startup +├── client.ready # Written by client during handshake +├── handshake.done # Written by server to confirm handshake +├── cmd/ # Client writes command .sh files here +│ └── <id>.sh # Command to execute +└── result/ # Server writes results here + ├── <id>.log # stdout + stderr + └── <id>.exit # Exit code +``` + +### Handshake + +1. Server starts, creates `.relay/server.ready` +2. Client writes `.relay/client.ready` +3. Server detects it, writes `.relay/handshake.done` +4. Both sides are now connected + +### Command Execution + +1. Client writes a command to `.relay/cmd/<id>.sh` +2. Server detects the file, runs `bash <id>.sh` in the workdir, captures output +3. Server writes `.relay/result/<id>.log` and `.relay/result/<id>.exit` +4. Server removes the `.sh` file; client reads results and cleans up + +## Options + +### Server + +| Flag | Default | Description | +|------|---------|-------------| +| `--relay-dir` | `<script_dir>/.relay` | Relay directory path | +| `--workdir` | Auto-detected repo root | Working directory for commands | + +### Client + +| Flag | Default | Description | +|------|---------|-------------| +| `--relay-dir` | `<script_dir>/.relay` | Relay directory path | +| `--timeout` | `600` | Seconds to wait for command result | + +## Notes + +- The `.relay/` directory is in `.gitignore` — it is not checked in. +- Only one server should run at a time (startup clears the relay directory). +- Commands run sequentially in the order the server discovers them. 
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# tools/debugger/client.sh (added as a new file, mode 100755)
#
# File-based command relay client.
# Run this from the host / Claude Code side. It sends commands to the server
# running inside Docker by writing files to the shared relay directory.
#
# Usage:
#   bash client.sh handshake       - Connect to server
#   bash client.sh run <command>   - Run a command and print output
#   bash client.sh status          - Check server status
#   bash client.sh flush           - Clear handshake/command/result state
#
# Options (must precede the subcommand):
#   --relay-dir <path>   Path to relay directory (default: <script_dir>/.relay)
#   --timeout <secs>     Timeout waiting for result (default: 600)
#
# Exit status:
#   run        exits with the status the server recorded for the command, or 1
#              on a relay error (not connected, server stopped, timeout).
#   handshake  exits 1 when the server is not ready or does not answer in time.
#   Relay errors are printed on stderr so they never mix with command output.

set -euo pipefail

RELAY_DIR=""
TIMEOUT=600
POLL_INTERVAL=1
HANDSHAKE_TIMEOUT=120

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print "ERROR: <message>" on stderr and exit with status 1.
die() {
    printf 'ERROR: %s\n' "$*" >&2
    exit 1
}

# Parse global options before the subcommand. Each option needs a value; check
# for it explicitly so a trailing flag gives a clear message instead of the
# "unbound variable" abort that `set -u` would otherwise produce for "$2".
while [[ $# -gt 0 ]]; do
    case "$1" in
        --relay-dir)
            [[ $# -ge 2 ]] || die "--relay-dir requires a path argument."
            RELAY_DIR="$2"
            shift 2
            ;;
        --timeout)
            [[ $# -ge 2 ]] || die "--timeout requires a number of seconds."
            TIMEOUT="$2"
            shift 2
            ;;
        *)
            break
            ;;
    esac
done

# The timeout is compared numerically in the wait loop, so reject anything that
# is not a plain non-negative integer; 10# avoids octal parsing of "0600".
[[ "$TIMEOUT" =~ ^[0-9]+$ ]] || die "--timeout must be a non-negative integer (got '$TIMEOUT')."
TIMEOUT=$((10#$TIMEOUT))

if [[ -z "$RELAY_DIR" ]]; then
    RELAY_DIR="$SCRIPT_DIR/.relay"
fi

CMD_DIR="$RELAY_DIR/cmd"
RESULT_DIR="$RELAY_DIR/result"

SUBCOMMAND="${1:-}"
shift || true

case "$SUBCOMMAND" in
    handshake)
        # The server writes server.ready once it has finished starting up.
        [[ -f "$RELAY_DIR/server.ready" ]] \
            || die "Server not ready. Start server.sh in Docker first."
        SERVER_INFO=$(cat "$RELAY_DIR/server.ready")
        echo "Server found: $SERVER_INFO"

        # Announce ourselves; the server answers by creating handshake.done.
        echo "$(hostname):$$:$(date -Iseconds)" > "$RELAY_DIR/client.ready"

        elapsed=0
        while [[ ! -f "$RELAY_DIR/handshake.done" ]]; do
            sleep "$POLL_INTERVAL"
            elapsed=$((elapsed + POLL_INTERVAL))
            if [[ $elapsed -ge $HANDSHAKE_TIMEOUT ]]; then
                die "Handshake timed out after ${HANDSHAKE_TIMEOUT}s."
            fi
        done

        echo "Handshake complete."
        ;;

    run)
        [[ -f "$RELAY_DIR/handshake.done" ]] \
            || die "Not connected. Run 'client.sh handshake' first."

        # An empty command would still be queued and "succeed"; reject it here.
        [[ -n "$*" ]] || die "No command given. Usage: $0 run <command>"

        # Unique command ID (timestamp + PID to avoid collisions).
        cmd_id="$(date +%s%N)_$$"

        # Write under a .tmp name first and rename: the server only picks up
        # names ending in .sh, so it never sees a half-written file.
        # printf (not echo) so a command such as "-n" is written literally.
        printf '%s\n' "$*" > "$CMD_DIR/$cmd_id.sh.tmp"
        mv "$CMD_DIR/$cmd_id.sh.tmp" "$CMD_DIR/$cmd_id.sh"

        # The .exit file is the server's "result is ready" signal.
        elapsed=0
        while [[ ! -f "$RESULT_DIR/$cmd_id.exit" ]]; do
            # server.ready disappears when the server shuts down.
            if [[ ! -f "$RELAY_DIR/server.ready" ]]; then
                rm -f "$CMD_DIR/$cmd_id.sh"
                die "Server appears to have stopped."
            fi
            sleep "$POLL_INTERVAL"
            elapsed=$((elapsed + POLL_INTERVAL))
            if [[ $elapsed -ge $TIMEOUT ]]; then
                # Withdraw the pending command so it is not picked up later.
                rm -f "$CMD_DIR/$cmd_id.sh"
                die "Command timed out after ${TIMEOUT}s."
            fi
        done

        # Show the captured output, then propagate the recorded status.
        exit_code=$(cat "$RESULT_DIR/$cmd_id.exit")
        if [[ -f "$RESULT_DIR/$cmd_id.log" ]]; then
            cat "$RESULT_DIR/$cmd_id.log"
        fi

        rm -f "$RESULT_DIR/$cmd_id.exit" "$RESULT_DIR/$cmd_id.log"

        # `exit` with a non-numeric argument is an error of its own; report a
        # malformed status file explicitly instead.
        [[ "$exit_code" =~ ^[0-9]+$ ]] \
            || die "Malformed exit status from server: '$exit_code'"
        exit "$exit_code"
        ;;

    status)
        if [[ -f "$RELAY_DIR/server.ready" ]]; then
            echo "Server: $(cat "$RELAY_DIR/server.ready")"
        else
            echo "Server: not running"
        fi

        if [[ -f "$RELAY_DIR/handshake.done" ]]; then
            echo "Handshake: complete"
        elif [[ -f "$RELAY_DIR/client.ready" ]]; then
            echo "Handshake: pending"
        else
            echo "Handshake: not started"
        fi

        pending=0
        if [[ -d "$CMD_DIR" ]]; then
            # $(( ... )) strips the leading padding some wc builds emit.
            pending=$(($(find "$CMD_DIR" -maxdepth 1 -type f -name '*.sh' 2>/dev/null | wc -l)))
        fi
        echo "Pending commands: $pending"
        ;;

    flush)
        if [[ -d "$RELAY_DIR" ]]; then
            # Clear handshake and command/result files, but keep server.ready
            rm -f "$RELAY_DIR/client.ready" "$RELAY_DIR/handshake.done"
            rm -rf "$CMD_DIR" "$RESULT_DIR"
            mkdir -p "$CMD_DIR" "$RESULT_DIR"
            echo "Relay state cleared (server.ready preserved): $RELAY_DIR"
        else
            echo "Relay directory does not exist: $RELAY_DIR"
        fi
        ;;

    *)
        echo "Usage: $0 [--relay-dir <path>] [--timeout <secs>] <subcommand>"
        echo ""
        echo "Subcommands:"
        echo "  handshake        Connect to the server"
        echo "  run <command>    Execute a command on the server"
        echo "  status           Check connection status"
        echo "  flush            Clear the relay directory"
        exit 1
        ;;
esac

# ==============================================================================
# End of tools/debugger/client.sh.
# The two lines below are the head of tools/debugger/server.sh (added as a new
# file, mode 100755); the remainder of that script follows.
# ==============================================================================
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File-based command relay server.
# Run this inside the Docker container. It watches for command files from the
# client, executes them, and writes results back.
#
# Usage: bash server.sh [--relay-dir <path>] [--workdir <path>]
#
# Relay files used by this script (all under the relay directory):
#   server.ready     written once startup has finished; removed on SIGINT/SIGTERM
#   client.ready     written by the client to request a handshake
#   handshake.done   written here to acknowledge the handshake
#   cmd/*.sh         command scripts to execute
#   result/ID.log    combined stdout+stderr of command ID
#   result/ID.exit   exit status of command ID (written last, atomically)

set -euo pipefail

RELAY_DIR=""
WORKDIR=""
POLL_INTERVAL=1

# Derive the modelopt repo root from the location of this script (tools/debugger/server.sh)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_WORKDIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Print the usage text and exit.
# Arguments: $1 - exit status (default 1, so bad invocations fail)
usage() {
    echo "Usage: $0 [--relay-dir <path>] [--workdir <path>]"
    echo ""
    echo "Options:"
    echo "  --relay-dir <path>   Path to relay directory (default: <script_dir>/.relay)"
    echo "  --workdir <path>     Working directory for commands (default: auto-detected repo root)"
    exit "${1:-1}"
}

# Each option needs a value; check for it explicitly so a trailing flag prints
# the usage text instead of aborting on an unbound "$2" under `set -u`.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --relay-dir)
            [[ $# -ge 2 ]] || { echo "Option $1 requires an argument." >&2; usage; }
            RELAY_DIR="$2"
            shift 2
            ;;
        --workdir)
            [[ $# -ge 2 ]] || { echo "Option $1 requires an argument." >&2; usage; }
            WORKDIR="$2"
            shift 2
            ;;
        -h | --help)
            usage 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            ;;
    esac
done

# Default relay dir is .relay next to this script
if [[ -z "$RELAY_DIR" ]]; then
    RELAY_DIR="$SCRIPT_DIR/.relay"
fi

# Default workdir is the repo root (two levels up from tools/debugger/)
if [[ -z "$WORKDIR" ]]; then
    WORKDIR="$DEFAULT_WORKDIR"
fi

CMD_DIR="$RELAY_DIR/cmd"
RESULT_DIR="$RELAY_DIR/result"

# Signal handler: stop child processes, withdraw the ready/handshake markers so
# clients see the server as gone, and exit 0.
cleanup() {
    echo "[server] Shutting down..."
    # pkill -P matches processes whose parent is this shell (direct children)
    pkill -P $$ 2>/dev/null || true
    rm -f "$RELAY_DIR/server.ready"
    rm -f "$RELAY_DIR/handshake.done"
    exit 0
}
trap cleanup SIGINT SIGTERM

# Set environment
export PYTHONPATH="$WORKDIR"

# Check for an already-running server. server.ready holds "host:pid:timestamp";
# only a numeric PID that still answers signal 0 counts as a live server.
if [[ -f "$RELAY_DIR/server.ready" ]]; then
    old_pid=$(cut -d: -f2 "$RELAY_DIR/server.ready")
    if [[ "$old_pid" =~ ^[0-9]+$ ]] && kill -0 "$old_pid" 2>/dev/null; then
        echo "[server] ERROR: Another server (PID $old_pid) is already running."
        exit 1
    fi
fi

# Initialize relay directories (:? aborts rather than expanding to an empty path)
rm -rf -- "${RELAY_DIR:?}"
mkdir -p "$CMD_DIR" "$RESULT_DIR"

# Install modelopt in editable mode (skip if already importable from WORKDIR).
# WORKDIR is passed as an argument rather than spliced into the Python source,
# so a path containing quotes cannot break the check; commonpath compares whole
# path components, so a sibling such as "<workdir>-old" does not match.
if python -c '
import os
import sys

import modelopt

root = os.path.realpath(sys.argv[1])
pkg = os.path.realpath(modelopt.__path__[0])
sys.exit(0 if os.path.commonpath([pkg, root]) == root else 1)
' "$WORKDIR" 2>/dev/null; then
    echo "[server] modelopt already editable-installed from $WORKDIR, skipping pip install."
else
    echo "[server] Installing modelopt (pip install -e .[dev]) ..."
    (cd "$WORKDIR" && pip install -e ".[dev]") || {
        echo "[server] WARNING: pip install failed (exit=$?), continuing anyway."
    }
    echo "[server] Install done."
fi

# Signal that server is ready
echo "$(hostname):$$:$(date -Iseconds)" > "$RELAY_DIR/server.ready"
echo "[server] Ready. Relay dir: $RELAY_DIR"
echo "[server] Workdir: $WORKDIR"
echo "[server] Waiting for client handshake..."

# Wait for client handshake
while [[ ! -f "$RELAY_DIR/client.ready" ]]; do
    sleep "$POLL_INTERVAL"
done

CLIENT_INFO=$(cat "$RELAY_DIR/client.ready")
echo "[server] Client connected: $CLIENT_INFO"
echo "$(hostname):$$:$(date -Iseconds)" > "$RELAY_DIR/handshake.done"
echo "[server] Handshake complete. Listening for commands..."

# Main loop: watch for command files and re-handshake requests.
# nullglob makes the *.sh glob expand to nothing when no command is pending.
shopt -s nullglob
while true; do
    # Detect re-handshake: client.ready present but handshake.done missing
    if [[ -f "$RELAY_DIR/client.ready" && ! -f "$RELAY_DIR/handshake.done" ]]; then
        CLIENT_INFO=$(cat "$RELAY_DIR/client.ready")
        echo "[server] Client re-connected: $CLIENT_INFO"
        echo "$(hostname):$$:$(date -Iseconds)" > "$RELAY_DIR/handshake.done"
        echo "[server] Re-handshake complete."
    fi

    for cmd_file in "$CMD_DIR"/*.sh; do
        # The glob was expanded before the loop started; a file may have been
        # removed since (the client deletes its command when it gives up).
        [[ -f "$cmd_file" ]] || continue

        cmd_id="$(basename "$cmd_file" .sh)"
        echo "[server] Executing command $cmd_id..."

        # The client's flush removes and recreates result/; make sure it exists
        # so the redirections below cannot abort the server under `set -e`.
        mkdir -p "$RESULT_DIR"

        # Execute the command, tee stdout+stderr to console and result file.
        # The status is captured inside the `||` branch: with pipefail a failing
        # command makes the pipeline fail, and running any other command there
        # (for example `true`) would overwrite PIPESTATUS before it is read.
        exit_code=0
        (cd "$WORKDIR" && bash "$cmd_file" 2>&1) | tee "$RESULT_DIR/$cmd_id.log" \
            || exit_code=${PIPESTATUS[0]}

        # Atomic write of exit code (signal to client that result is ready)
        echo "$exit_code" > "$RESULT_DIR/$cmd_id.exit.tmp"
        mv "$RESULT_DIR/$cmd_id.exit.tmp" "$RESULT_DIR/$cmd_id.exit"

        # Remove the command file to mark it as processed
        rm -f "$cmd_file"

        echo "[server] Command $cmd_id finished (exit=$exit_code)"
    done

    sleep "$POLL_INTERVAL"
done