@@ -21,6 +21,7 @@ import (
2121 "github.com/creack/pty"
2222 "github.com/dstackai/ansistrip"
2323 "github.com/prometheus/procfs"
24+ "github.com/sirupsen/logrus"
2425 "golang.org/x/sys/unix"
2526
2627 "github.com/dstackai/dstack/runner/consts"
@@ -61,6 +62,10 @@ type RunExecutor struct {
6162 fileArchiveDir string
6263 repoBlobDir string
6364
65+ runnerLogFile * os.File
66+ runnerLogStripper * ansistrip.Writer
67+ runnerLogger * logrus.Entry
68+
6469 run schemas.Run
6570 jobSpec schemas.JobSpec
6671 jobSubmission schemas.JobSubmission
@@ -136,14 +141,26 @@ func NewRunExecutor(tempDir string, dstackDir string, currentUser linuxuser.User
136141 }, nil
137142}
138143
144+ // GetJobInfo must be called after SetJob
145+ func (ex * RunExecutor ) GetJobInfo (ctx context.Context ) (string , string , error ) {
146+ // preRun() sets ex.jobUser and ex.jobWorkingDir
147+ if err := ex .preRun (ctx ); err != nil {
148+ return "" , "" , err
149+ }
150+ return ex .jobUser .Username , ex .jobWorkingDir , nil
151+ }
152+
139153// Run must be called after SetJob and WriteRepoBlob
140154func (ex * RunExecutor ) Run (ctx context.Context ) (err error ) {
141- runnerLogFile , err := log .CreateAppendFile (filepath .Join (ex .tempDir , consts .RunnerLogFileName ))
142- if err != nil {
143- ex .SetJobState (ctx , types .JobStateFailed )
144- return fmt .Errorf ("create runner log file: %w" , err )
155+ // If jobStateHistory is not empty, either Run() has already been called or
156+ // preRun() has already been called via GetJobInfo() and failed
157+ if len (ex .jobStateHistory ) > 0 {
158+ return errors .New ("already running or finished" )
159+ }
160+ if err := ex .preRun (ctx ); err != nil {
161+ return err
145162 }
146- defer func () { _ = runnerLogFile . Close () }( )
163+ defer ex . postRun ( ctx )
147164
148165 jobLogFile , err := log .CreateAppendFile (filepath .Join (ex .tempDir , consts .RunnerJobLogFileName ))
149166 if err != nil {
@@ -153,7 +170,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) {
153170 defer func () { _ = jobLogFile .Close () }()
154171
155172 defer func () {
156- // recover goes after runnerLogFile.Close() to keep the log
173+ // recover goes after postRun(), which closes runnerLogFile, to keep the log
157174 if r := recover (); r != nil {
158175 log .Error (ctx , "Executor PANIC" , "err" , r )
159176 ex .SetJobState (ctx , types .JobStateFailed )
@@ -171,21 +188,8 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) {
171188 }
172189 }()
173190
174- stripper := ansistrip .NewWriter (ex .runnerLogs , AnsiStripFlushInterval , AnsiStripMaxDelay , MaxBufferSize )
175- defer func () { _ = stripper .Close () }()
176- logger := io .MultiWriter (runnerLogFile , os .Stdout , stripper )
177- ctx = log .WithLogger (ctx , log .NewEntry (logger , int (log .DefaultEntry .Logger .Level ))) // todo loglevel
178- log .Info (ctx , "Run job" , "log_level" , log .GetLogger (ctx ).Logger .Level .String ())
179-
180- if err := ex .setJobUser (ctx ); err != nil {
181- ex .SetJobStateWithTerminationReason (
182- ctx ,
183- types .JobStateFailed ,
184- types .TerminationReasonExecutorError ,
185- fmt .Sprintf ("Failed to set job user (%s)" , err ),
186- )
187- return fmt .Errorf ("set job user: %w" , err )
188- }
191+ ctx = log .WithLogger (ctx , ex .runnerLogger )
192+ log .Info (ctx , "Run job" )
189193
190194 // setJobUser sets User.HomeDir to "/" if the original home dir is not set or not accessible,
191195 // in that case we skip home dir provisioning
@@ -204,16 +208,6 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) {
204208 }
205209 }
206210
207- if err := ex .setJobWorkingDir (ctx ); err != nil {
208- ex .SetJobStateWithTerminationReason (
209- ctx ,
210- types .JobStateFailed ,
211- types .TerminationReasonExecutorError ,
212- fmt .Sprintf ("Failed to set job working dir (%s)" , err ),
213- )
214- return fmt .Errorf ("set job working dir: %w" , err )
215- }
216-
217211 if err := ex .setupRepo (ctx ); err != nil {
218212 ex .SetJobStateWithTerminationReason (
219213 ctx ,
@@ -336,6 +330,66 @@ func (ex *RunExecutor) SetRunnerState(state string) {
336330 ex .state = state
337331}
338332
333+ // preRun performs actions that were once part of Run() but were moved to a separate function
334+ // to implement GetJobInfo()
335+ // preRun must not execute long-running operations, as GetJobInfo() is called synchronously
336+ // in the /api/run method
337+ func (ex * RunExecutor ) preRun (ctx context.Context ) error {
338+ // Already called once
339+ if ex .runnerLogFile != nil {
340+ return nil
341+ }
342+
343+ // logging is required for the subsequent setJob{User,WorkingDir} calls
344+ runnerLogFile , err := log .CreateAppendFile (filepath .Join (ex .tempDir , consts .RunnerLogFileName ))
345+ if err != nil {
346+ ex .SetJobState (ctx , types .JobStateFailed )
347+ return fmt .Errorf ("create runner log file: %w" , err )
348+ }
349+ ex .runnerLogFile = runnerLogFile
350+ ex .runnerLogStripper = ansistrip .NewWriter (ex .runnerLogs , AnsiStripFlushInterval , AnsiStripMaxDelay , MaxBufferSize )
351+ runnerLogWriter := io .MultiWriter (ex .runnerLogFile , os .Stdout , ex .runnerLogStripper )
352+ runnerLogLevel := log .DefaultEntry .Logger .Level
353+ ex .runnerLogger = log .NewEntry (runnerLogWriter , int (runnerLogLevel ))
354+ ctx = log .WithLogger (ctx , ex .runnerLogger )
355+ log .Info (ctx , "Logging configured" , "log_level" , runnerLogLevel .String ())
356+
357+ // jobUser and jobWorkingDir are required for GetJobInfo()
358+ if err := ex .setJobUser (ctx ); err != nil {
359+ ex .SetJobStateWithTerminationReason (
360+ ctx ,
361+ types .JobStateFailed ,
362+ types .TerminationReasonExecutorError ,
363+ fmt .Sprintf ("Failed to set job user (%s)" , err ),
364+ )
365+ return fmt .Errorf ("set job user: %w" , err )
366+ }
367+ if err := ex .setJobWorkingDir (ctx ); err != nil {
368+ ex .SetJobStateWithTerminationReason (
369+ ctx ,
370+ types .JobStateFailed ,
371+ types .TerminationReasonExecutorError ,
372+ fmt .Sprintf ("Failed to set job working dir (%s)" , err ),
373+ )
374+ return fmt .Errorf ("set job working dir: %w" , err )
375+ }
376+
377+ return nil
378+ }
379+
380+ func (ex * RunExecutor ) postRun (ctx context.Context ) {
381+ if ex .runnerLogFile != nil {
382+ if err := ex .runnerLogFile .Close (); err != nil {
383+ log .Error (ctx , "Failed to close runnerLogFile" , "err" , err )
384+ }
385+ }
386+ if ex .runnerLogStripper != nil {
387+ if err := ex .runnerLogStripper .Close (); err != nil {
388+ log .Error (ctx , "Failed to close runnerLogStripper" , "err" , err )
389+ }
390+ }
391+ }
392+
339393// setJobWorkingDir must be called from Run after setJobUser
340394func (ex * RunExecutor ) setJobWorkingDir (ctx context.Context ) error {
341395 var err error
0 commit comments