11#!/usr/bin/env python3
22
3+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+ # SPDX-License-Identifier: Apache-2.0
5+ #
6+ # Licensed under the Apache License, Version 2.0 (the "License");
7+ # you may not use this file except in compliance with the License.
8+ # You may obtain a copy of the License at
9+ #
10+ # http://www.apache.org/licenses/LICENSE-2.0
11+ #
12+ # Unless required by applicable law or agreed to in writing, software
13+ # distributed under the License is distributed on an "AS IS" BASIS,
14+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+ # See the License for the specific language governing permissions and
16+ # limitations under the License.
17+
318# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
419# SPDX-License-Identifier: Apache-2.0
520
3348import uuid
3449from pathlib import Path
3550
36- from slack_bolt .adapter .socket_mode .async_handler import AsyncSocketModeHandler
37- from slack_bolt .async_app import AsyncApp
38-
3951from job_manager import WorkspaceManager
4052from key_store import KeyStore
4153from session_manager import run_claude_streaming
54+ from slack_bolt .adapter .socket_mode .async_handler import AsyncSocketModeHandler
55+ from slack_bolt .async_app import AsyncApp
4256from user_store import UserStore
4357
4458logging .basicConfig (level = logging .INFO , format = "%(asctime)s %(levelname)s %(message)s" )
7387# Store last full response per user for /modelopt logs
7488_last_response : dict [str , str ] = {}
7589
90+ # Keep strong references to background tasks to prevent GC
91+ _background_tasks : set = set ()
92+
7693# ─── Helpers ─────────────────────────────────────────────────────────
7794
7895
7996def strip_bot_mention (text : str ) -> str :
97+ """Remove @bot mention prefix from a message."""
8098 return re .sub (r"<@[A-Z0-9]+>\s*" , "" , text ).strip ()
8199
82100
83101def truncate (text : str , limit : int = MAX_SLACK_LENGTH ) -> str :
102+ """Truncate text to the given limit, appending a notice if cut."""
84103 if len (text ) <= limit :
85104 return text
86105 return text [:limit ] + "\n \n ... (truncated, full output in job dir)"
@@ -105,6 +124,7 @@ async def send_long_response(say, text: str, thread_ts: str, channel: str):
105124
106125
107126def is_dm (event : dict ) -> bool :
127+ """Return True if the event is a direct message."""
108128 return event .get ("channel_type" ) == "im"
109129
110130
@@ -198,7 +218,10 @@ async def handle_onboarding_response(event, say):
198218 user_store .setup_shared_key (user_id )
199219 del onboarding_state [user_id ]
200220 await say (
201- text = "Using shared team key. No setup needed!\n \n Would you like to configure a remote cluster? Reply `yes` or `no`." ,
221+ text = (
222+ "Using shared team key. No setup needed!\n \n "
223+ "Would you like to configure a remote cluster? Reply `yes` or `no`."
224+ ),
202225 thread_ts = thread_ts ,
203226 )
204227 onboarding_state [user_id ] = "awaiting_cluster_choice"
@@ -214,7 +237,9 @@ async def handle_onboarding_response(event, say):
214237 session = _auth_sessions .pop (user_id , None )
215238 if not session :
216239 onboarding_state .pop (user_id , None )
217- await say (text = "Login session expired. Try `/modelopt setup` again." , thread_ts = thread_ts )
240+ await say (
241+ text = "Login session expired. Try `/modelopt setup` again." , thread_ts = thread_ts
242+ )
218243 return True
219244
220245 try :
@@ -226,7 +251,10 @@ async def handle_onboarding_response(event, say):
226251 onboarding_state .pop (user_id , None )
227252 session .close ()
228253 await say (
229- text = "Logged in successfully!\n \n Would you like to configure a remote cluster? Reply `yes` or `no`." ,
254+ text = (
255+ "Logged in successfully!\n \n "
256+ "Would you like to configure a remote cluster? Reply `yes` or `no`."
257+ ),
230258 thread_ts = thread_ts ,
231259 )
232260 onboarding_state [user_id ] = "awaiting_cluster_choice"
@@ -250,7 +278,10 @@ async def handle_onboarding_response(event, say):
250278 await start_cluster_setup (user_id , say , thread_ts )
251279 else :
252280 await say (
253- text = "All set! You can configure a cluster later with `/modelopt add-cluster`.\n \n Try: `@modelopt quantize Qwen3-0.6B with nvfp4`" ,
281+ text = (
282+ "All set! You can configure a cluster later with `/modelopt add-cluster`."
283+ "\n \n Try: `@modelopt quantize Qwen3-0.6B with nvfp4`"
284+ ),
254285 thread_ts = thread_ts ,
255286 )
256287 return True
@@ -269,7 +300,10 @@ async def start_cluster_setup(user_id, say, thread_ts):
269300 """Begin interactive cluster configuration."""
270301 cluster_setup_state [user_id ] = {"step" : "name" }
271302 await say (
272- text = "Let's set up a remote cluster.\n \n *Step 1/5:* What would you like to call this cluster? (e.g., `cw-dfw`, `selene`, `my-workstation`)" ,
303+ text = (
304+ "Let's set up a remote cluster.\n \n *Step 1/5:* What would you like to call this"
305+ " cluster? (e.g., `cw-dfw`, `selene`, `my-workstation`)"
306+ ),
273307 thread_ts = thread_ts ,
274308 )
275309
@@ -283,7 +317,10 @@ async def handle_cluster_setup_response(user_id, text, say, thread_ts):
283317 state ["name" ] = text .strip ().replace (" " , "-" )
284318 state ["step" ] = "login_node"
285319 await say (
286- text = f"Cluster alias: *{ state ['name' ]} *\n \n *Step 2/5:* Login node hostname? (e.g., `cluster-login.example.com`)" ,
320+ text = (
321+ f"Cluster alias: *{ state ['name' ]} *\n \n *Step 2/5:* Login node hostname?"
322+ " (e.g., `cluster-login.example.com`)"
323+ ),
287324 thread_ts = thread_ts ,
288325 )
289326 elif step == "login_node" :
@@ -304,7 +341,11 @@ async def handle_cluster_setup_response(user_id, text, say, thread_ts):
304341 state ["workspace" ] = text .strip ()
305342 state ["step" ] = "gpu_type"
306343 await say (
307- text = "*Step 5/5:* GPU type on this cluster? (e.g., `H100`, `B200`, `A100` — used for format recommendations. Type `skip` if unknown.)" ,
344+ text = (
345+ "*Step 5/5:* GPU type on this cluster?"
346+ " (e.g., `H100`, `B200`, `A100` — used for format recommendations."
347+ " Type `skip` if unknown.)"
348+ ),
308349 thread_ts = thread_ts ,
309350 )
310351 elif step == "gpu_type" :
@@ -332,7 +373,10 @@ async def handle_cluster_setup_response(user_id, text, say, thread_ts):
332373
333374 user_store .save_clusters_yaml (user_id , yaml_content )
334375 await say (
335- text = f"Cluster *{ name } * configured!\n \n ```{ yaml_content } ```\n You're all set. Try: `@modelopt quantize Qwen3-0.6B with nvfp4`" ,
376+ text = (
377+ f"Cluster *{ name } * configured!\n \n ```{ yaml_content } ```\n "
378+ "You're all set. Try: `@modelopt quantize Qwen3-0.6B with nvfp4`"
379+ ),
336380 thread_ts = thread_ts ,
337381 )
338382
@@ -373,7 +417,12 @@ async def handle_slash_command(ack, command, say, respond):
373417 await respond (text = ":warning: Use this command in a DM with me (contains secrets)." )
374418 return
375419 if not args or "=" not in args :
376- await respond (text = "Usage: `/modelopt set-env HF_TOKEN=hf_abc123...`\n \n Common variables: `HF_TOKEN`, `NGC_API_KEY`, `DOCKER_TOKEN`" )
420+ await respond (
421+ text = (
422+ "Usage: `/modelopt set-env HF_TOKEN=hf_abc123...`\n \n "
423+ "Common variables: `HF_TOKEN`, `NGC_API_KEY`, `DOCKER_TOKEN`"
424+ )
425+ )
377426 return
378427 key , _ , value = args .partition ("=" )
379428 user_store .set_env_var (user_id , key .strip (), value .strip ())
@@ -383,9 +432,15 @@ async def handle_slash_command(ack, command, say, respond):
383432 env_vars = user_store .get_env_vars (user_id )
384433 if env_vars :
385434 lines = [f"• `{ k } ` = `{ v } `" for k , v in env_vars .items ()]
386- await respond (text = "*Your env vars* (values masked):\n " + "\n " .join (lines ) + "\n \n Use `/modelopt set-env KEY=VALUE` to add/update, `/modelopt unset-env KEY` to remove." )
435+ await respond (
436+ text = "*Your env vars* (values masked):\n "
437+ + "\n " .join (lines )
438+ + "\n \n Use `/modelopt set-env KEY=VALUE` to add/update, `/modelopt unset-env KEY` to remove."
439+ )
387440 else :
388- await respond (text = "No personal env vars set.\n \n Use `/modelopt set-env HF_TOKEN=hf_abc...` to add one." )
441+ await respond (
442+ text = "No personal env vars set.\n \n Use `/modelopt set-env HF_TOKEN=hf_abc...` to add one."
443+ )
389444
390445 elif subcmd == "unset-env" :
391446 if not args :
@@ -403,7 +458,9 @@ async def handle_slash_command(ack, command, say, respond):
403458 ws_root = user_store .jobs_dir (user_id )
404459 workspaces = workspace_mgr .list_workspaces (ws_root )
405460 if not workspaces :
406- await respond (text = "No workspaces yet. They'll be created when you run your first task." )
461+ await respond (
462+ text = "No workspaces yet. They'll be created when you run your first task."
463+ )
407464 return
408465 lines = ["*Your workspaces:*" ]
409466 for w in workspaces [:15 ]:
@@ -423,7 +480,12 @@ async def handle_slash_command(ack, command, say, respond):
423480 if info :
424481 ws_root = user_store .jobs_dir (user_id )
425482 workspaces = workspace_mgr .list_workspaces (ws_root )
426- msg = f"*Auth:* { info ['auth_method' ]} \n *Clusters:* { 'configured' if info ['has_clusters' ] else 'none' } \n *Workspaces:* { len (workspaces )} "
483+ clusters_str = "configured" if info ["has_clusters" ] else "none"
484+ msg = (
485+ f"*Auth:* { info ['auth_method' ]} \n "
486+ f"*Clusters:* { clusters_str } \n "
487+ f"*Workspaces:* { len (workspaces )} "
488+ )
427489 await respond (text = msg )
428490 else :
429491 await respond (text = "Not registered yet. Use `/modelopt setup` first." )
@@ -461,7 +523,10 @@ async def handle_mention(event, say):
461523 thread_ts = event .get ("thread_ts" , event ["ts" ])
462524
463525 if not text :
464- await say (text = "How can I help? Try: `@modelopt quantize Qwen3-0.6B with nvfp4`" , thread_ts = thread_ts )
526+ await say (
527+ text = "How can I help? Try: `@modelopt quantize Qwen3-0.6B with nvfp4`" ,
528+ thread_ts = thread_ts ,
529+ )
465530 return
466531
467532 if not user_store .is_registered (user_id ):
@@ -579,7 +644,7 @@ async def _run_job(user_id: str, prompt: str, say_func, channel: str, thread_ts:
579644 _last_response [user_id ] = full_response
580645
581646 kwargs = {"thread_ts" : thread_ts } if thread_ts else {}
582- if channel and len (full_response ) > MAX_SLACK_LENGTH :
647+ if channel and thread_ts and len (full_response ) > MAX_SLACK_LENGTH :
583648 await send_long_response (say_func , full_response , thread_ts , channel )
584649 else :
585650 await say_func (text = truncate (full_response ), ** kwargs )
@@ -609,12 +674,15 @@ async def _auto_cleanup_loop():
609674 for entry in sessions_dir .iterdir ():
610675 if entry .is_dir () and entry .stat ().st_mtime < cutoff :
611676 import shutil
677+
612678 shutil .rmtree (entry , ignore_errors = True )
613679 total_removed += 1
614680
615681 # Clean old workspaces (older than 7 days, not the default)
616682 ws_root = user_store .jobs_dir (uid )
617- removed = await workspace_mgr .cleanup_old (ws_root , max_age_days = SESSION_MAX_AGE_DAYS )
683+ removed = await workspace_mgr .cleanup_old (
684+ ws_root , max_age_days = SESSION_MAX_AGE_DAYS
685+ )
618686 total_removed += removed
619687
620688 if total_removed :
@@ -627,6 +695,7 @@ async def _auto_cleanup_loop():
627695
628696
629697async def main ():
698+ """Start the ModelOpt Slack bot."""
630699 logger .info ("Starting ModelOpt Slack Bot..." )
631700 logger .info ("Repo dir: %s" , REPO_DIR )
632701 logger .info ("Data dir: %s" , DATA_DIR )
@@ -649,10 +718,16 @@ async def main():
649718 logger .error ("Claude CLI not found in PATH — bot will not work" )
650719
651720 logger .info ("Registered users: %d" , len (user_store .list_users ()))
652- logger .info ("Auto-cleanup: every %dh, sessions older than %dd" , CLEANUP_INTERVAL_HOURS , SESSION_MAX_AGE_DAYS )
721+ logger .info (
722+ "Auto-cleanup: every %dh, sessions older than %dd" ,
723+ CLEANUP_INTERVAL_HOURS ,
724+ SESSION_MAX_AGE_DAYS ,
725+ )
653726
654- # Start background cleanup task
655- asyncio .create_task (_auto_cleanup_loop ())
727+ # Start background cleanup task (keep reference to prevent GC)
728+ _cleanup_task = asyncio .create_task (_auto_cleanup_loop ())
729+ _background_tasks .add (_cleanup_task )
730+ _cleanup_task .add_done_callback (_background_tasks .discard )
656731
657732 handler = AsyncSocketModeHandler (app , SLACK_APP_TOKEN )
658733 await handler .start_async ()
0 commit comments