@@ -95,6 +95,18 @@ pub fn cooldown_allows_training(
9595 Ok ( elapsed_secs >= cooldown_secs)
9696}
9797
98+ pub fn training_launch_allowed ( profile : & ModerationProfile ) -> Result < bool > {
99+ let Some ( status) = profile. training_status ( ) else {
100+ return Ok ( true ) ;
101+ } ;
102+
103+ let Some ( state) = status. get ( "status" ) . and_then ( |value| value. as_str ( ) ) else {
104+ return Ok ( true ) ;
105+ } ;
106+
107+ Ok ( state != "running" )
108+ }
109+
98110pub fn build_training_subprocess_command (
99111 root_dir : & str ,
100112 profile_name : & str ,
@@ -127,6 +139,10 @@ pub async fn plan_training_round(
127139 let mut planned = Vec :: new ( ) ;
128140
129141 for scanned in scan_profiles ( root_dir) ? {
142+ if !training_launch_allowed ( & scanned. profile ) ? {
143+ continue ;
144+ }
145+
130146 if !cooldown_allows_training (
131147 & scanned. profile ,
132148 settings. training_scheduler_failure_cooldown_minutes ,
@@ -163,23 +179,51 @@ pub async fn plan_training_round(
163179 Ok ( planned)
164180}
165181
182+ pub fn spawn_detached_command (
183+ program : & str ,
184+ args : & [ String ] ,
185+ profile_name : & str ,
186+ ) -> Result < u32 > {
187+ let mut child = std:: process:: Command :: new ( program)
188+ . args ( args)
189+ . stdin ( std:: process:: Stdio :: null ( ) )
190+ . stdout ( std:: process:: Stdio :: inherit ( ) )
191+ . stderr ( std:: process:: Stdio :: inherit ( ) )
192+ . spawn ( )
193+ . with_context ( || format ! ( "failed to spawn detached command for {}" , profile_name) ) ?;
194+ let pid = child. id ( ) ;
195+ let profile_name = profile_name. to_string ( ) ;
196+ std:: thread:: spawn ( move || match child. wait ( ) {
197+ Ok ( status) => {
198+ if !status. success ( ) {
199+ warn ! (
200+ profile = %profile_name,
201+ status = ?status. code( ) ,
202+ "training subprocess exited unsuccessfully"
203+ ) ;
204+ }
205+ }
206+ Err ( err) => {
207+ warn ! (
208+ profile = %profile_name,
209+ error = %err,
210+ "failed to wait for detached training subprocess"
211+ ) ;
212+ }
213+ } ) ;
214+ Ok ( pid)
215+ }
216+
166217pub async fn spawn_training_subprocess (
167218 settings : & Settings ,
168219 profile_name : & str ,
169- ) -> Result < std :: process :: ExitStatus > {
220+ ) -> Result < u32 > {
170221 let command = build_training_subprocess_command (
171222 & settings. root_dir . display ( ) . to_string ( ) ,
172223 profile_name,
173224 & settings. training_subprocess_allowed_cpus ,
174225 ) ?;
175- let status = std:: process:: Command :: new ( & command. program )
176- . args ( & command. args )
177- . stdin ( std:: process:: Stdio :: null ( ) )
178- . stdout ( std:: process:: Stdio :: inherit ( ) )
179- . stderr ( std:: process:: Stdio :: inherit ( ) )
180- . status ( )
181- . with_context ( || format ! ( "failed to run training subprocess for {}" , profile_name) ) ?;
182- Ok ( status)
226+ spawn_detached_command ( & command. program , & command. args , profile_name)
183227}
184228
185229pub async fn run_scheduler_once ( settings : & Settings ) -> Result < Vec < PlannedTrainingAction > > {
@@ -197,14 +241,8 @@ pub async fn run_scheduler_once(settings: &Settings) -> Result<Vec<PlannedTraini
197241 "scheduler selected profile for training"
198242 ) ;
199243
200- let status = spawn_training_subprocess ( settings, & action. profile_name ) . await ?;
201- if !status. success ( ) {
202- warn ! (
203- profile = %action. profile_name,
204- status = ?status. code( ) ,
205- "training subprocess exited unsuccessfully"
206- ) ;
207- }
244+ let pid = spawn_training_subprocess ( settings, & action. profile_name ) . await ?;
245+ info ! ( profile = %action. profile_name, pid, "scheduler started detached training subprocess" ) ;
208246 }
209247
210248 Ok ( planned)
0 commit comments