better bot

mxinO · mxinO · commit 7a106c3c43f0 · 2026-03-20T04:33:05.000Z
diff --git a/.claude/skills/ptq/SKILL.md b/.claude/skills/ptq/SKILL.md
@@ -144,7 +144,7 @@ For MLP-only quantization (skipping attention), use configs with `MLP_ONLY` in t
 
     ```bash
     docker run --gpus all -v <model_path>:<model_path> -v <output_path>:<output_path> \
-        nvcr.io/nvidia/tensorrt-llm/release:<version> bash -c "pip install -e <modelopt_path>[hf] --quiet && python <ptq_script.py> ..."
+        nvcr.io/nvidia/tensorrt-llm/release:<version> bash -c "pip install --no-build-isolation -e <modelopt_path>[hf] --quiet && python <ptq_script.py> ..."
     ```
 
   - **No Docker**: set up a virtual environment with conda (preferred) or venv:
@@ -155,7 +155,7 @@ For MLP-only quantization (skipping attention), use configs with `MLP_ONLY` in t
     # or venv
     python -m venv modelopt-env && source modelopt-env/bin/activate
 
-    pip install nvidia-modelopt[hf]
+    pip install --no-build-isolation nvidia-modelopt[hf]
     ```
 
 **GPU memory**: Estimate `num_params × 2 bytes` for BF16. Use `device_map="auto"` for multi-GPU. If the model exceeds single-node memory, see the FSDP2 section in `references/slurm-setup.md`.
diff --git a/.claude/skills/ptq/references/slurm-setup.md b/.claude/skills/ptq/references/slurm-setup.md
@@ -103,7 +103,7 @@ Only submit the full calibration job after the smoke test exits cleanly.
 
 ## 5. Monitor Until Completion
 
-After submitting the final job, do not stop — the goal is a finished checkpoint, not a submitted job. Poll until done:
+After submitting the final job, do not stop — the goal is a finished checkpoint, not a submitted job. Poll with sleep until done:
 
 ```bash
 while squeue -j $JOBID -h 2>/dev/null | grep -q .; do
@@ -113,7 +113,9 @@ echo "Job $JOBID finished"
 sacct -j $JOBID --format=JobID,State,ExitCode,Elapsed
 ```
 
-If the session may not stay open that long, use the `CronCreate` tool to set up a periodic check, or ask the user to check back. Once the job ends, tail the last 50 lines of the log and verify the export directory before reporting success.
+**IMPORTANT**: Always use `sleep`-based polling (as above) rather than `CronCreate` or background tasks. This keeps output in the current session so the user can see progress. The sleep loop will wait as long as needed — even hours — until the job completes or fails.
+
+Once the job ends, tail the last 50 lines of the log and verify the export directory before reporting success.
 
 ---
 
diff --git a/slack-bot/bot.py b/slack-bot/bot.py
@@ -38,7 +38,7 @@
 
 from job_manager import WorkspaceManager
 from key_store import KeyStore
-from session_manager import run_claude
+from session_manager import run_claude_streaming
 from user_store import UserStore
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -70,6 +70,9 @@
 onboarding_state: dict[str, str] = {}
 cluster_setup_state: dict[str, dict] = {}
 
+# Store last full response per user for /modelopt logs
+_last_response: dict[str, str] = {}
+
 # ─── Helpers ─────────────────────────────────────────────────────────
 
 
@@ -125,9 +128,13 @@ def is_dm(event: dict) -> bool:
 • `/modelopt setup` — onboard (auth + cluster config)
 • `/modelopt add-cluster` — configure a remote cluster
 • `/modelopt clusters` — list your configured clusters
+• `/modelopt set-env KEY=VALUE` — set personal env var (DM only, e.g. `HF_TOKEN`, `NGC_API_KEY`)
+• `/modelopt env` — list your env vars
+• `/modelopt unset-env KEY` — remove an env var
 
-*Workspaces:*
+*Workspaces & Logs:*
 • `/modelopt workspaces` — list your workspaces
+• `/modelopt logs` — upload full output of last task as a file
 • `/modelopt cleanup` — remove old workspaces
 • `/modelopt status` — show your current status
 
@@ -361,6 +368,34 @@ async def handle_slash_command(ack, command, say, respond):
         else:
             await respond(text="No clusters configured. Use `/modelopt add-cluster` to set one up.")
 
+    elif subcmd == "set-env":
+        if command.get("channel_name") != "directmessage":
+            await respond(text=":warning: Use this command in a DM with me (contains secrets).")
+            return
+        if not args or "=" not in args:
+            await respond(text="Usage: `/modelopt set-env HF_TOKEN=hf_abc123...`\n\nCommon variables: `HF_TOKEN`, `NGC_API_KEY`, `DOCKER_TOKEN`")
+            return
+        key, _, value = args.partition("=")
+        user_store.set_env_var(user_id, key.strip(), value.strip())
+        await respond(text=f"`{key.strip()}` saved.")
+
+    elif subcmd == "env":
+        env_vars = user_store.get_env_vars(user_id)
+        if env_vars:
+            lines = [f"• `{k}` = `{v}`" for k, v in env_vars.items()]
+            await respond(text="*Your env vars* (values masked):\n" + "\n".join(lines) + "\n\nUse `/modelopt set-env KEY=VALUE` to add/update, `/modelopt unset-env KEY` to remove.")
+        else:
+            await respond(text="No personal env vars set.\n\nUse `/modelopt set-env HF_TOKEN=hf_abc...` to add one.")
+
+    elif subcmd == "unset-env":
+        if not args:
+            await respond(text="Usage: `/modelopt unset-env HF_TOKEN`")
+            return
+        if user_store.remove_env_var(user_id, args.strip()):
+            await respond(text=f"`{args.strip()}` removed.")
+        else:
+            await respond(text=f"`{args.strip()}` not found.")
+
     elif subcmd == "workspaces":
         if not user_store.is_registered(user_id):
             await respond(text="Not registered yet. Use `/modelopt setup` first.")
@@ -396,6 +431,18 @@ async def handle_slash_command(ack, command, say, respond):
     elif subcmd in ("help", ""):
         await respond(text=HELP_MSG)
 
+    elif subcmd == "logs":
+        last = _last_response.get(user_id)
+        if not last:
+            await respond(text="No recent task output. Run a task first.")
+            return
+        await app.client.files_upload_v2(
+            channel=channel,
+            content=last,
+            filename="modelopt_task_log.md",
+            title="Last Task Output",
+        )
+
     else:
         # Treat as a prompt
         await respond(text="Processing...")
@@ -485,31 +532,95 @@ async def _run_job(user_id: str, prompt: str, say_func, channel: str, thread_ts:
         f"Workspace root: {ws_root} (contains per-model workspaces). "
         f"Upstream repo: {workspace_mgr.repo_dir} (read-only, use for fresh copies). "
         f"Read skills/common/workspace-management.md before creating workspaces. "
-        f"Check existing workspaces with: ls $MODELOPT_WORKSPACE_ROOT/"
+        f"Check existing workspaces with: ls $MODELOPT_WORKSPACE_ROOT/ "
+        f"SAFETY: You are running unattended — no human can approve actions. "
+        f"NEVER run destructive commands (rm -rf /, kill -9, fdisk, mkfs, etc.). "
+        f"NEVER modify files outside your workspace ({ws_root}) or the user's remote home directory. "
+        f"Do NOT modify the upstream repo ({workspace_mgr.repo_dir}). "
+        f"Do NOT modify system files, global configs, or other users' data. "
+        f"If a task seems risky or ambiguous, output a warning instead of proceeding."
     )
 
-    session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"modelopt-slack-{user_id}"))
+    # Session per Slack thread: messages in the same thread share context,
+    # new top-level messages start fresh sessions.
+    # thread_ts is the parent message ts (or the message's own ts if it IS the parent).
+    session_key = f"modelopt-slack-{user_id}-{thread_ts or 'ephemeral'}"
+    session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, session_key))
 
     if thread_ts:
-        await say_func(text=":rocket: Working on it...", thread_ts=thread_ts)
+        await say_func(
+            text=":rocket: Working on it — this may take a while. I'll let you know when it's done.",
+            thread_ts=thread_ts,
+        )
 
+    # Stream internally to keep idle detection alive. Only send final result to Slack.
+    full_response = ""
     try:
-        response = await run_claude(
+        async for chunk in run_claude_streaming(
             prompt=prompt,
             cwd=workspace,
             env=env,
             session_id=session_id,
             system_prompt_extra=bot_context,
-        )
+        ):
+            full_response += chunk.text
+            if chunk.is_final:
+                break
+
     except Exception as e:
-        response = f":x: Failed: {e}"
+        full_response += f"\n\n:x: Failed: {e}"
         logger.error("Request failed for user %s: %s", user_id, e)
 
+    # Send final response
+    if not full_response.strip():
+        full_response = "No response from Claude."
+
+    # Save for /modelopt logs
+    _last_response[user_id] = full_response
+
     kwargs = {"thread_ts": thread_ts} if thread_ts else {}
-    if channel and len(response) > MAX_SLACK_LENGTH:
-        await send_long_response(say_func, response, thread_ts, channel)
+    if channel and len(full_response) > MAX_SLACK_LENGTH:
+        await send_long_response(say_func, full_response, thread_ts, channel)
     else:
-        await say_func(text=truncate(response), **kwargs)
+        await say_func(text=truncate(full_response), **kwargs)
+
+
+# ─── Auto Cleanup ────────────────────────────────────────────────────
+
+SESSION_MAX_AGE_DAYS = int(os.environ.get("SESSION_MAX_AGE_DAYS", "30"))
+CLEANUP_INTERVAL_HOURS = int(os.environ.get("CLEANUP_INTERVAL_HOURS", "6"))
+
+
+async def _auto_cleanup_loop():
+    """Periodically clean up old sessions and workspaces."""
+    while True:
+        await asyncio.sleep(CLEANUP_INTERVAL_HOURS * 3600)
+        try:
+            import time
+
+            cutoff = time.time() - SESSION_MAX_AGE_DAYS * 86400
+            total_removed = 0
+
+            for uid in user_store.list_users():
+                # Clean old Claude sessions
+                config_dir = Path(user_store.get_claude_config_dir(uid))
+                sessions_dir = config_dir / "projects"
+                if sessions_dir.exists():
+                    for entry in sessions_dir.iterdir():
+                        if entry.is_dir() and entry.stat().st_mtime < cutoff:
+                            import shutil
+                            shutil.rmtree(entry, ignore_errors=True)
+                            total_removed += 1
+
+                # Clean old workspaces (older than SESSION_MAX_AGE_DAYS, overriding the WORKSPACE_MAX_AGE default)
+                ws_root = user_store.jobs_dir(uid)
+                removed = await workspace_mgr.cleanup_old(ws_root, max_age_days=SESSION_MAX_AGE_DAYS)
+                total_removed += removed
+
+            if total_removed:
+                logger.info("Auto-cleanup: removed %d old sessions/workspaces", total_removed)
+        except Exception as e:
+            logger.error("Auto-cleanup error: %s", e)
 
 
 # ─── Main ────────────────────────────────────────────────────────────
@@ -538,6 +649,10 @@ async def main():
         logger.error("Claude CLI not found in PATH — bot will not work")
 
     logger.info("Registered users: %d", len(user_store.list_users()))
+    logger.info("Auto-cleanup: every %dh, sessions older than %dd", CLEANUP_INTERVAL_HOURS, SESSION_MAX_AGE_DAYS)
+
+    # Start background cleanup task
+    asyncio.create_task(_auto_cleanup_loop())
 
     handler = AsyncSocketModeHandler(app, SLACK_APP_TOKEN)
     await handler.start_async()
diff --git a/slack-bot/job_manager.py b/slack-bot/job_manager.py
@@ -137,11 +137,12 @@ def list_workspaces(self, workspace_root: Path) -> list[dict]:
             })
         return result
 
-    async def cleanup_old(self, workspace_root: Path) -> int:
-        """Remove workspaces older than WORKSPACE_MAX_AGE. Returns count removed."""
-        if WORKSPACE_MAX_AGE <= 0 or not workspace_root.exists():
+    async def cleanup_old(self, workspace_root: Path, max_age_days: int | None = None) -> int:
+        """Remove workspaces older than max_age_days (falls back to WORKSPACE_MAX_AGE seconds when None or 0). Returns count removed."""
+        max_age_secs = (max_age_days * 86400) if max_age_days else WORKSPACE_MAX_AGE
+        if max_age_secs <= 0 or not workspace_root.exists():
             return 0
-        cutoff = time.time() - WORKSPACE_MAX_AGE
+        cutoff = time.time() - max_age_secs
         removed = 0
         for entry in sorted(workspace_root.iterdir()):
             if entry.is_dir() and entry.stat().st_mtime < cutoff:
diff --git a/slack-bot/session_manager.py b/slack-bot/session_manager.py
diff --git a/slack-bot/user_store.py b/slack-bot/user_store.py