@@ -872,7 +872,7 @@ func (d *DefaultManager) DeleteWorkloads(_ context.Context, names []string) (*er
872872}
873873
874874// RestartWorkloads restarts the specified workloads by name.
875- func (d * DefaultManager ) RestartWorkloads (_ context.Context , names []string , foreground bool ) (* errgroup.Group , error ) {
875+ func (d * DefaultManager ) RestartWorkloads (ctx context.Context , names []string , foreground bool ) (* errgroup.Group , error ) {
876876 // Validate all workload names to prevent path traversal attacks
877877 for _ , name := range names {
878878 if err := types .ValidateWorkloadName (name ); err != nil {
@@ -884,7 +884,7 @@ func (d *DefaultManager) RestartWorkloads(_ context.Context, names []string, for
884884
885885 for _ , name := range names {
886886 group .Go (func () error {
887- return d .restartSingleWorkload (name , foreground )
887+ return d .restartSingleWorkload (ctx , name , foreground )
888888 })
889889 }
890890
@@ -943,39 +943,59 @@ func (d *DefaultManager) updateSingleWorkload(workloadName string, newConfig *ru
943943}
944944
945945// restartSingleWorkload handles the restart logic for a single workload
946- func (d * DefaultManager ) restartSingleWorkload (name string , foreground bool ) error {
947- // Create a child context with a longer timeout
948- childCtx , cancel := context .WithTimeout (context .Background (), AsyncOperationTimeout )
949- defer cancel ()
946+ func (d * DefaultManager ) restartSingleWorkload (ctx context.Context , name string , foreground bool ) error {
950947
951948 // First, try to load the run configuration to check if it's a remote workload
952- runConfig , err := runner .LoadState (childCtx , name )
949+ runConfig , err := runner .LoadState (ctx , name )
953950 if err != nil {
954951 // If we can't load the state, it might be a container workload or the workload doesn't exist
955952 // Try to restart it as a container workload
956- return d .restartContainerWorkload (childCtx , name , foreground )
953+ return d .restartContainerWorkload (ctx , name , foreground )
957954 }
958955
959956 // Check if this is a remote workload
960957 if runConfig .RemoteURL != "" {
961- return d .restartRemoteWorkload (childCtx , name , runConfig , foreground )
958+ return d .restartRemoteWorkload (ctx , name , runConfig , foreground )
962959 }
963960
964961 // This is a container-based workload
965- return d .restartContainerWorkload (childCtx , name , foreground )
962+ return d .restartContainerWorkload (ctx , name , foreground )
966963}
967964
968965// restartRemoteWorkload handles restarting a remote workload
966+ // It blocks until the context is cancelled or there is already a supervisor process running.
969967func (d * DefaultManager ) restartRemoteWorkload (
970968 ctx context.Context ,
971969 name string ,
972970 runConfig * runner.RunConfig ,
973971 foreground bool ,
974972) error {
973+ mcpRunner , err := d .maybeSetupRemoteWorkload (ctx , name , runConfig )
974+ if err != nil {
975+ return fmt .Errorf ("failed to setup remote workload: %w" , err )
976+ }
977+
978+ if mcpRunner == nil {
979+ return nil
980+ }
981+
982+ return d .startWorkload (ctx , name , mcpRunner , foreground )
983+ }
984+
985+ // maybeSetupRemoteWorkload performs the startup steps for a remote workload.
986+ // A runner may not be returned if the workload is already running and supervised.
987+ func (d * DefaultManager ) maybeSetupRemoteWorkload (
988+ ctx context.Context ,
989+ name string ,
990+ runConfig * runner.RunConfig ,
991+ ) (* runner.Runner , error ) {
992+ ctx , cancel := context .WithTimeout (ctx , AsyncOperationTimeout )
993+ defer cancel ()
994+
975995 // Get workload status using the status manager
976996 workload , err := d .statuses .GetWorkload (ctx , name )
977997 if err != nil && ! errors .Is (err , rt .ErrWorkloadNotFound ) {
978- return err
998+ return nil , err
979999 }
9801000
9811001 // If workload is already running, check if the supervisor process is healthy
@@ -986,7 +1006,7 @@ func (d *DefaultManager) restartRemoteWorkload(
9861006 if supervisorAlive {
9871007 // Workload is running and healthy - preserve old behavior (no-op)
9881008 logger .Infof ("Remote workload %s is already running" , name )
989- return nil
1009+ return nil , nil
9901010 }
9911011
9921012 // Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
@@ -1015,7 +1035,7 @@ func (d *DefaultManager) restartRemoteWorkload(
10151035 // Load runner configuration from state
10161036 mcpRunner , err := d .loadRunnerFromState (ctx , runConfig .BaseName )
10171037 if err != nil {
1018- return fmt .Errorf ("failed to load state for %s: %v" , runConfig .BaseName , err )
1038+ return nil , fmt .Errorf ("failed to load state for %s: %v" , runConfig .BaseName , err )
10191039 }
10201040
10211041 // Set status to starting
@@ -1024,16 +1044,31 @@ func (d *DefaultManager) restartRemoteWorkload(
10241044 }
10251045
10261046 logger .Infof ("Loaded configuration from state for %s" , runConfig .BaseName )
1047+ return mcpRunner , nil
1048+ }
1049+
1050+ // restartContainerWorkload handles restarting a container-based workload.
1051+ // It blocks until the context is cancelled or there is already a supervisor process running.
1052+ func (d * DefaultManager ) restartContainerWorkload (ctx context.Context , name string , foreground bool ) error {
1053+ workloadName , mcpRunner , err := d .maybeSetupContainerWorkload (ctx , name )
1054+ if err != nil {
1055+ return fmt .Errorf ("failed to setup container workload: %w" , err )
1056+ }
1057+
1058+ if mcpRunner == nil {
1059+ return nil
1060+ }
10271061
1028- // Start the remote workload using the loaded runner
1029- // Use background context to avoid timeout cancellation - same reasoning as container workloads
1030- return d .startWorkload (context .Background (), name , mcpRunner , foreground )
1062+ return d .startWorkload (ctx , workloadName , mcpRunner , foreground )
10311063}
10321064
1033- // restartContainerWorkload handles restarting a container-based workload
1065+ // maybeSetupContainerWorkload performs the startup steps for a container-based workload.
1066+ // A runner may not be returned if the workload is already running and supervised.
10341067//
10351068//nolint:gocyclo // Complexity is justified - handles multiple restart scenarios and edge cases
1036- func (d * DefaultManager ) restartContainerWorkload (ctx context.Context , name string , foreground bool ) error {
1069+ func (d * DefaultManager ) maybeSetupContainerWorkload (ctx context.Context , name string ) (string , * runner.Runner , error ) {
1070+ ctx , cancel := context .WithTimeout (ctx , AsyncOperationTimeout )
1071+ defer cancel ()
10371072 // Get container info to resolve partial names and extract proper workload name
10381073 var containerName string
10391074 var workloadName string
@@ -1057,7 +1092,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
10571092 // Get workload status using the status manager
10581093 workload , err := d .statuses .GetWorkload (ctx , name )
10591094 if err != nil && ! errors .Is (err , rt .ErrWorkloadNotFound ) {
1060- return err
1095+ return "" , nil , err
10611096 }
10621097
10631098 // Check if workload is running and healthy (including supervisor process)
@@ -1068,7 +1103,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
10681103 if supervisorAlive {
10691104 // Workload is running and healthy - preserve old behavior (no-op)
10701105 logger .Infof ("Container %s is already running" , containerName )
1071- return nil
1106+ return "" , nil , nil
10721107 }
10731108
10741109 // Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
@@ -1107,7 +1142,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
11071142 if statusErr := d .statuses .SetWorkloadStatus (ctx , workloadName , rt .WorkloadStatusError , err .Error ()); statusErr != nil {
11081143 logger .Warnf ("Failed to set workload %s status to error: %v" , workloadName , statusErr )
11091144 }
1110- return fmt .Errorf ("failed to stop container %s: %v" , containerName , err )
1145+ return "" , nil , fmt .Errorf ("failed to stop container %s: %v" , containerName , err )
11111146 }
11121147 logger .Infof ("Container %s stopped" , containerName )
11131148 }
@@ -1126,7 +1161,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
11261161 // Load runner configuration from state
11271162 mcpRunner , err := d .loadRunnerFromState (ctx , workloadName )
11281163 if err != nil {
1129- return fmt .Errorf ("failed to load state for %s: %v" , workloadName , err )
1164+ return "" , nil , fmt .Errorf ("failed to load state for %s: %v" , workloadName , err )
11301165 }
11311166
11321167 // Set workload status to starting - use the workload name for status operations
@@ -1135,11 +1170,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
11351170 }
11361171 logger .Infof ("Loaded configuration from state for %s" , workloadName )
11371172
1138- // Start the workload with background context to avoid timeout cancellation
1139- // The ctx with AsyncOperationTimeout is only for the restart setup operations,
1140- // but the actual workload should run indefinitely with its own lifecycle management
1141- // Use workload name for user-facing operations
1142- return d .startWorkload (context .Background (), workloadName , mcpRunner , foreground )
1173+ return workloadName , mcpRunner , nil
11431174}
11441175
11451176// startWorkload starts the workload in either foreground or background mode
0 commit comments