Skip to content

Commit 7a106c3

Browse files
committed
better bot
1 parent 6770524 commit 7a106c3

6 files changed

Lines changed: 284 additions & 117 deletions

File tree

.claude/skills/ptq/SKILL.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ For MLP-only quantization (skipping attention), use configs with `MLP_ONLY` in t
144144

145145
```bash
146146
docker run --gpus all -v <model_path>:<model_path> -v <output_path>:<output_path> \
147-
nvcr.io/nvidia/tensorrt-llm/release:<version> bash -c "pip install -e <modelopt_path>[hf] --quiet && python <ptq_script.py> ..."
147+
nvcr.io/nvidia/tensorrt-llm/release:<version> bash -c "pip install --no-build-isolation -e <modelopt_path>[hf] --quiet && python <ptq_script.py> ..."
148148
```
149149

150150
- **No Docker**: set up a virtual environment with conda (preferred) or venv:
@@ -155,7 +155,7 @@ For MLP-only quantization (skipping attention), use configs with `MLP_ONLY` in t
155155
# or venv
156156
python -m venv modelopt-env && source modelopt-env/bin/activate
157157
158-
pip install nvidia-modelopt[hf]
158+
pip install --no-build-isolation nvidia-modelopt[hf]
159159
```
160160

161161
**GPU memory**: Estimate `num_params × 2 bytes` for BF16. Use `device_map="auto"` for multi-GPU. If the model exceeds single-node memory, see the FSDP2 section in `references/slurm-setup.md`.

.claude/skills/ptq/references/slurm-setup.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ Only submit the full calibration job after the smoke test exits cleanly.
103103

104104
## 5. Monitor Until Completion
105105

106-
After submitting the final job, do not stop — the goal is a finished checkpoint, not a submitted job. Poll until done:
106+
After submitting the final job, do not stop — the goal is a finished checkpoint, not a submitted job. Poll with sleep until done:
107107

108108
```bash
109109
while squeue -j $JOBID -h 2>/dev/null | grep -q .; do
@@ -113,7 +113,9 @@ echo "Job $JOBID finished"
113113
sacct -j $JOBID --format=JobID,State,ExitCode,Elapsed
114114
```
115115

116-
If the session may not stay open that long, use the `CronCreate` tool to set up a periodic check, or ask the user to check back. Once the job ends, tail the last 50 lines of the log and verify the export directory before reporting success.
116+
**IMPORTANT**: Always use `sleep`-based polling (as above) rather than `CronCreate` or background tasks. This keeps output in the current session so the user can see progress. The sleep loop will wait as long as needed — even hours — until the job completes or fails.
117+
118+
Once the job ends, tail the last 50 lines of the log and verify the export directory before reporting success.
117119

118120
---
119121

slack-bot/bot.py

Lines changed: 126 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838

3939
from job_manager import WorkspaceManager
4040
from key_store import KeyStore
41-
from session_manager import run_claude
41+
from session_manager import run_claude_streaming
4242
from user_store import UserStore
4343

4444
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -70,6 +70,9 @@
7070
onboarding_state: dict[str, str] = {}
7171
cluster_setup_state: dict[str, dict] = {}
7272

73+
# Store last full response per user for /modelopt logs
74+
_last_response: dict[str, str] = {}
75+
7376
# ─── Helpers ─────────────────────────────────────────────────────────
7477

7578

@@ -125,9 +128,13 @@ def is_dm(event: dict) -> bool:
125128
• `/modelopt setup` — onboard (auth + cluster config)
126129
• `/modelopt add-cluster` — configure a remote cluster
127130
• `/modelopt clusters` — list your configured clusters
131+
• `/modelopt set-env KEY=VALUE` — set personal env var (DM only, e.g. `HF_TOKEN`, `NGC_API_KEY`)
132+
• `/modelopt env` — list your env vars
133+
• `/modelopt unset-env KEY` — remove an env var
128134
129-
*Workspaces:*
135+
*Workspaces & Logs:*
130136
• `/modelopt workspaces` — list your workspaces
137+
• `/modelopt logs` — upload full output of last task as a file
131138
• `/modelopt cleanup` — remove old workspaces
132139
• `/modelopt status` — show your current status
133140
@@ -361,6 +368,34 @@ async def handle_slash_command(ack, command, say, respond):
361368
else:
362369
await respond(text="No clusters configured. Use `/modelopt add-cluster` to set one up.")
363370

371+
elif subcmd == "set-env":
372+
if command.get("channel_name") != "directmessage":
373+
await respond(text=":warning: Use this command in a DM with me (contains secrets).")
374+
return
375+
if not args or "=" not in args:
376+
await respond(text="Usage: `/modelopt set-env HF_TOKEN=hf_abc123...`\n\nCommon variables: `HF_TOKEN`, `NGC_API_KEY`, `DOCKER_TOKEN`")
377+
return
378+
key, _, value = args.partition("=")
379+
user_store.set_env_var(user_id, key.strip(), value.strip())
380+
await respond(text=f"`{key.strip()}` saved.")
381+
382+
elif subcmd == "env":
383+
env_vars = user_store.get_env_vars(user_id)
384+
if env_vars:
385+
lines = [f"• `{k}` = `{v}`" for k, v in env_vars.items()]
386+
await respond(text="*Your env vars* (values masked):\n" + "\n".join(lines) + "\n\nUse `/modelopt set-env KEY=VALUE` to add/update, `/modelopt unset-env KEY` to remove.")
387+
else:
388+
await respond(text="No personal env vars set.\n\nUse `/modelopt set-env HF_TOKEN=hf_abc...` to add one.")
389+
390+
elif subcmd == "unset-env":
391+
if not args:
392+
await respond(text="Usage: `/modelopt unset-env HF_TOKEN`")
393+
return
394+
if user_store.remove_env_var(user_id, args.strip()):
395+
await respond(text=f"`{args.strip()}` removed.")
396+
else:
397+
await respond(text=f"`{args.strip()}` not found.")
398+
364399
elif subcmd == "workspaces":
365400
if not user_store.is_registered(user_id):
366401
await respond(text="Not registered yet. Use `/modelopt setup` first.")
@@ -396,6 +431,18 @@ async def handle_slash_command(ack, command, say, respond):
396431
elif subcmd in ("help", ""):
397432
await respond(text=HELP_MSG)
398433

434+
elif subcmd == "logs":
435+
last = _last_response.get(user_id)
436+
if not last:
437+
await respond(text="No recent task output. Run a task first.")
438+
return
439+
await app.client.files_upload_v2(
440+
channel=channel,
441+
content=last,
442+
filename="modelopt_task_log.md",
443+
title="Last Task Output",
444+
)
445+
399446
else:
400447
# Treat as a prompt
401448
await respond(text="Processing...")
@@ -485,31 +532,95 @@ async def _run_job(user_id: str, prompt: str, say_func, channel: str, thread_ts:
485532
f"Workspace root: {ws_root} (contains per-model workspaces). "
486533
f"Upstream repo: {workspace_mgr.repo_dir} (read-only, use for fresh copies). "
487534
f"Read skills/common/workspace-management.md before creating workspaces. "
488-
f"Check existing workspaces with: ls $MODELOPT_WORKSPACE_ROOT/"
535+
f"Check existing workspaces with: ls $MODELOPT_WORKSPACE_ROOT/ "
536+
f"SAFETY: You are running unattended — no human can approve actions. "
537+
f"NEVER run destructive commands (rm -rf /, kill -9, fdisk, mkfs, etc.). "
538+
f"NEVER modify files outside your workspace ({ws_root}) or the user's remote home directory. "
539+
f"Do NOT modify the upstream repo ({workspace_mgr.repo_dir}). "
540+
f"Do NOT modify system files, global configs, or other users' data. "
541+
f"If a task seems risky or ambiguous, output a warning instead of proceeding."
489542
)
490543

491-
session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"modelopt-slack-{user_id}"))
544+
# Session per Slack thread: messages in the same thread share context,
545+
# new top-level messages start fresh sessions.
546+
# thread_ts is the parent message ts (or the message's own ts if it IS the parent).
547+
session_key = f"modelopt-slack-{user_id}-{thread_ts or 'ephemeral'}"
548+
session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, session_key))
492549

493550
if thread_ts:
494-
await say_func(text=":rocket: Working on it...", thread_ts=thread_ts)
551+
await say_func(
552+
text=":rocket: Working on it — this may take a while. I'll let you know when it's done.",
553+
thread_ts=thread_ts,
554+
)
495555

556+
# Stream internally to keep idle detection alive. Only send final result to Slack.
557+
full_response = ""
496558
try:
497-
response = await run_claude(
559+
async for chunk in run_claude_streaming(
498560
prompt=prompt,
499561
cwd=workspace,
500562
env=env,
501563
session_id=session_id,
502564
system_prompt_extra=bot_context,
503-
)
565+
):
566+
full_response += chunk.text
567+
if chunk.is_final:
568+
break
569+
504570
except Exception as e:
505-
response = f":x: Failed: {e}"
571+
full_response += f"\n\n:x: Failed: {e}"
506572
logger.error("Request failed for user %s: %s", user_id, e)
507573

574+
# Send final response
575+
if not full_response.strip():
576+
full_response = "No response from Claude."
577+
578+
# Save for /modelopt logs
579+
_last_response[user_id] = full_response
580+
508581
kwargs = {"thread_ts": thread_ts} if thread_ts else {}
509-
if channel and len(response) > MAX_SLACK_LENGTH:
510-
await send_long_response(say_func, response, thread_ts, channel)
582+
if channel and len(full_response) > MAX_SLACK_LENGTH:
583+
await send_long_response(say_func, full_response, thread_ts, channel)
511584
else:
512-
await say_func(text=truncate(response), **kwargs)
585+
await say_func(text=truncate(full_response), **kwargs)
586+
587+
588+
# ─── Auto Cleanup ────────────────────────────────────────────────────
589+
590+
SESSION_MAX_AGE_DAYS = int(os.environ.get("SESSION_MAX_AGE_DAYS", "30"))
591+
CLEANUP_INTERVAL_HOURS = int(os.environ.get("CLEANUP_INTERVAL_HOURS", "6"))
592+
593+
594+
def _remove_stale_session_dirs(sessions_dir, cutoff):
    """Delete sub-directories of *sessions_dir* last modified before *cutoff*.

    Synchronous on purpose: the caller runs it in a worker thread so the
    blocking filesystem calls do not stall the event loop.

    Args:
        sessions_dir: ``pathlib.Path`` of the directory to scan.
        cutoff: Unix timestamp; a directory whose ``st_mtime`` is lower
            than this value is removed.

    Returns:
        Number of directories handed to ``shutil.rmtree``.
    """
    import shutil

    if not sessions_dir.exists():
        return 0

    removed = 0
    for entry in sessions_dir.iterdir():
        try:
            stale = entry.is_dir() and entry.stat().st_mtime < cutoff
        except OSError:
            # The entry vanished or became unreadable mid-scan; skip it
            # instead of aborting the whole cleanup pass.
            continue
        if stale:
            shutil.rmtree(entry, ignore_errors=True)
            removed += 1
    return removed


async def _auto_cleanup_loop():
    """Periodically clean up old sessions and workspaces.

    Runs forever. Each iteration sleeps ``CLEANUP_INTERVAL_HOURS`` hours and
    then, for every user returned by ``user_store.list_users()``:

    * removes directories under ``<claude config dir>/projects`` whose mtime
      is older than ``SESSION_MAX_AGE_DAYS`` days, and
    * calls ``workspace_mgr.cleanup_old`` on the user's jobs directory with
      the same ``SESSION_MAX_AGE_DAYS`` limit.

    A failure while cleaning one user is logged with its traceback and does
    not prevent the remaining users from being processed; the loop itself
    never exits because of an ``Exception``.
    """
    import time

    while True:
        await asyncio.sleep(CLEANUP_INTERVAL_HOURS * 3600)

        cutoff = time.time() - SESSION_MAX_AGE_DAYS * 86400
        total_removed = 0

        try:
            for uid in user_store.list_users():
                try:
                    # Clean old Claude sessions (off the event loop: rmtree blocks).
                    config_dir = Path(user_store.get_claude_config_dir(uid))
                    total_removed += await asyncio.to_thread(
                        _remove_stale_session_dirs, config_dir / "projects", cutoff
                    )

                    # Clean old workspaces, using the same age limit as sessions
                    # (SESSION_MAX_AGE_DAYS) rather than cleanup_old's default.
                    ws_root = user_store.jobs_dir(uid)
                    total_removed += await workspace_mgr.cleanup_old(
                        ws_root, max_age_days=SESSION_MAX_AGE_DAYS
                    )
                except Exception as e:
                    # Isolate per-user failures so one bad directory does not
                    # skip cleanup for everyone else in this pass.
                    logger.exception("Auto-cleanup error: %s", e)
        except Exception as e:
            # Covers failures outside the per-user body (e.g. listing users).
            logger.exception("Auto-cleanup error: %s", e)

        if total_removed:
            logger.info("Auto-cleanup: removed %d old sessions/workspaces", total_removed)
513624

514625

515626
# ─── Main ────────────────────────────────────────────────────────────
@@ -538,6 +649,10 @@ async def main():
538649
logger.error("Claude CLI not found in PATH — bot will not work")
539650

540651
logger.info("Registered users: %d", len(user_store.list_users()))
652+
logger.info("Auto-cleanup: every %dh, sessions older than %dd", CLEANUP_INTERVAL_HOURS, SESSION_MAX_AGE_DAYS)
653+
654+
# Start background cleanup task
655+
asyncio.create_task(_auto_cleanup_loop())
541656

542657
handler = AsyncSocketModeHandler(app, SLACK_APP_TOKEN)
543658
await handler.start_async()

slack-bot/job_manager.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,12 @@ def list_workspaces(self, workspace_root: Path) -> list[dict]:
137137
})
138138
return result
139139

140-
async def cleanup_old(self, workspace_root: Path) -> int:
141-
"""Remove workspaces older than WORKSPACE_MAX_AGE. Returns count removed."""
142-
if WORKSPACE_MAX_AGE <= 0 or not workspace_root.exists():
140+
async def cleanup_old(self, workspace_root: Path, max_age_days: int | None = None) -> int:
141+
"""Remove workspaces older than max_age_days days (or WORKSPACE_MAX_AGE seconds when max_age_days is None or 0). Returns count removed."""
142+
max_age_secs = (max_age_days * 86400) if max_age_days else WORKSPACE_MAX_AGE
143+
if max_age_secs <= 0 or not workspace_root.exists():
143144
return 0
144-
cutoff = time.time() - WORKSPACE_MAX_AGE
145+
cutoff = time.time() - max_age_secs
145146
removed = 0
146147
for entry in sorted(workspace_root.iterdir()):
147148
if entry.is_dir() and entry.stat().st_mtime < cutoff:

0 commit comments

Comments
 (0)