Skip to content

Commit a22c6ae

Browse files
committed
address comments
Signed-off-by: Meng Xin <mxin@nvidia.com>
1 parent efb6fef commit a22c6ae

3 files changed

Lines changed: 32 additions & 13 deletions

File tree

.claude/skills/common/remote_exec.sh

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
# REMOTE_WORKSPACE, REMOTE_GPU_TYPE, REMOTE_ENV_TYPE,
3636
# REMOTE_CONTAINER_IMAGE, REMOTE_SLURM_ACCOUNT, REMOTE_SLURM_PARTITION
3737

38-
set -euo pipefail
38+
# NOTE: This file is designed to be sourced. It does NOT set shell options
39+
# (set -euo pipefail) to avoid mutating the caller's environment.
3940

4041
# ── Helpers ──────────────────────────────────────────────────────────────────
4142

@@ -155,8 +156,7 @@ remote_start_session() {
155156
local rc=$?
156157
if (( rc == 0 )); then
157158
echo "SSH session established. All commands will reuse this connection."
158-
# Register cleanup trap
159-
trap 'remote_stop_session 2>/dev/null' EXIT
159+
echo "Call 'remote_enable_cleanup_trap' to auto-close on exit, or 'remote_stop_session' manually."
160160
else
161161
echo "WARNING: Failed to start persistent SSH session (rc=$rc). Commands will use individual connections." >&2
162162
fi
@@ -173,6 +173,18 @@ remote_stop_session() {
173173
fi
174174
}
175175

176+
remote_enable_cleanup_trap() {
    # Opt-in: register an EXIT trap that runs `remote_stop_session` so the
    # SSH session is closed automatically when the shell exits.
    #
    # Any EXIT trap the caller already installed is preserved and runs first.
    # Calling this function again after it has registered is a no-op, so the
    # cleanup command is never stacked more than once.
    #
    # Arguments: none
    # Returns:   status of the `trap` builtin (0 if already registered)
    local cleanup_cmd='remote_stop_session 2>/dev/null'
    local trap_decl existing_trap=""

    # `trap -p EXIT` prints a shell-re-readable declaration of the form
    #   trap -- '<command>' EXIT
    # in which embedded single quotes are escaped as '\''. Stripping the
    # prefix/suffix textually would leave those escapes in the command, so
    # let the shell itself undo the quoting: after `set --`, the words are
    # $1=trap, $2=--, $3=<command>, $4=EXIT. This only replaces the
    # positional parameters of this function, not the caller's.
    trap_decl=$(trap -p EXIT)
    if [[ -n "$trap_decl" ]]; then
        eval "set -- ${trap_decl}"
        existing_trap=${3-}
    fi

    # Already chained by a previous call: nothing to do.
    if [[ "$existing_trap" == *"$cleanup_cmd"* ]]; then
        return 0
    fi

    if [[ -n "$existing_trap" ]]; then
        # Join with a newline rather than ';' so an existing trap that ends
        # in ';', '&' or a comment cannot break or swallow the cleanup call.
        trap "${existing_trap}"$'\n'"${cleanup_cmd}" EXIT
    else
        trap "$cleanup_cmd" EXIT
    fi
}
187+
176188
# ── Core Functions ───────────────────────────────────────────────────────────
177189

178190
remote_load_cluster() {
@@ -278,8 +290,12 @@ remote_detect_env() {
278290
# Check Docker
279291
if command -v docker &>/dev/null; then
280292
echo 'HAS_DOCKER=yes';
281-
# Check if docker can access GPUs
282-
docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null && echo 'DOCKER_GPU=yes' || echo 'DOCKER_GPU=no';
293+
# Check for NVIDIA GPU support without pulling an image: nvidia-container-cli on PATH, or an nvidia runtime reported by `docker info`
294+
if command -v nvidia-container-cli &>/dev/null || docker info 2>/dev/null | grep -qi nvidia; then
295+
echo 'DOCKER_GPU=yes';
296+
else
297+
echo 'DOCKER_GPU=no';
298+
fi;
283299
else
284300
echo 'HAS_DOCKER=no';
285301
fi;
@@ -297,10 +313,13 @@ remote_detect_env() {
297313

298314
if echo "$info" | grep -q "HAS_SLURM=yes"; then
299315
REMOTE_ENV_TYPE="slurm"
300-
elif echo "$info" | grep -q "HAS_DOCKER=yes"; then
316+
elif echo "$info" | grep -q "DOCKER_GPU=yes"; then
301317
REMOTE_ENV_TYPE="docker"
302318
elif echo "$info" | grep -q "HAS_BARE_GPU=yes"; then
303319
REMOTE_ENV_TYPE="bare"
320+
elif echo "$info" | grep -q "HAS_DOCKER=yes"; then
321+
# Docker available but no GPU support — fall back to bare
322+
REMOTE_ENV_TYPE="bare"
304323
else
305324
REMOTE_ENV_TYPE="unknown"
306325
fi
@@ -316,7 +335,7 @@ remote_run() {
316335
# Retries up to 3 times on SSH connection failures.
317336
local cmd="$1"
318337
local ws="${REMOTE_WORKSPACE:-\$HOME}"
319-
local full_cmd="cd $ws && $cmd"
338+
local full_cmd="cd \"$ws\" && $cmd"
320339
local encoded
321340
encoded=$(printf '%s' "$full_cmd" | base64 -w0)
322341

.claude/skills/ptq/references/remote-execution.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ remote_sync_from <remote_output_subdir> /local/output/
140140
| `unix_listener: cannot bind to path ... Read-only file system` | SSH ControlMaster socket in non-writable `/tmp` | `remote_exec.sh` auto-finds writable dir; ensure `TMPDIR` or `/tmp/claude-*` exists |
141141
| `cd: /home/user/~/path: No such file or directory` | `~` not expanding on remote | Use absolute paths in `workspace` config, not `~/...` |
142142
| Login nodes resolve home dirs differently | Symlinked home dirs vary by node | Use absolute lustre/NFS paths (e.g., `/lustre/fs1/...`) in job scripts |
143-
| `#!` becomes `#\!` in scripts | Shell environment mangles shebang | Fix with `sed -i 's\|^#\\\\!\|#!\| script.sh'` after writing |
143+
| `#!` becomes `#\!` in scripts | Shell environment mangles shebang | Fix with `sed -i 's\|^#\\\\!\|#!\|' script.sh` after writing |
144144

145145
## Reference Files
146146

.claude/skills/ptq/references/unsupported-models.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@ print(type(cfg).__name__)
1818

1919
```bash
2020
python -c "
21-
import inspect
22-
from transformers import AutoConfig, AutoModel
21+
import importlib, inspect
22+
from transformers import AutoConfig
2323
cfg = AutoConfig.from_pretrained('<ckpt_path>')
24-
cls = AutoModel._model_type_to_module_name.get(cfg.model_type)
25-
import transformers; mod = getattr(transformers, cls, None)
26-
print(inspect.getfile(mod) if mod else 'not found')
24+
mod_name = 'transformers.models.' + cfg.model_type.replace('-', '_')
25+
mod = importlib.import_module(mod_name + '.modeling_' + cfg.model_type.replace('-', '_'))
26+
print(inspect.getfile(mod))
2727
"
2828
```
2929

0 commit comments

Comments
 (0)