Commit 2b2d168

Merge branch 'main' into jerm/vmcp-o11y
2 parents: b0bedaa + 92922c6

6 files changed: +139, -98 lines

cmd/thv/app/run.go

Lines changed: 14 additions & 25 deletions
@@ -7,9 +7,7 @@ import (
     "net"
     "net/url"
     "os"
-    "os/signal"
     "strings"
-    "syscall"
     "time"
 
     "github.com/spf13/cobra"
@@ -126,7 +124,7 @@ func init() {
     AddOIDCFlags(runCmd)
 }
 
-func cleanupAndWait(workloadManager workloads.Manager, name string, cancel context.CancelFunc, errCh <-chan error) {
+func cleanupAndWait(workloadManager workloads.Manager, name string) {
     cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
     defer cleanupCancel()
 
@@ -138,13 +136,6 @@ func cleanupAndWait(workloadManager workloads.Manager, name string, cancel conte
             logger.Warnf("DeleteWorkloads group error for %q: %v", name, err)
         }
     }
-
-    cancel()
-    select {
-    case <-errCh:
-    case <-time.After(5 * time.Second):
-        logger.Warnf("Timeout waiting for workload to stop")
-    }
 }
 
 // nolint:gocyclo // This function is complex by design
@@ -304,28 +295,26 @@ func getworkloadDefaultName(_ context.Context, serverOrImage string) string {
 }
 
 func runForeground(ctx context.Context, workloadManager workloads.Manager, runnerConfig *runner.RunConfig) error {
-    ctx, cancel := context.WithCancel(ctx)
-    defer cancel()
-
-    sigCh := make(chan os.Signal, 1)
-    signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
-    defer signal.Stop(sigCh)
 
     errCh := make(chan error, 1)
     go func() {
         errCh <- workloadManager.RunWorkload(ctx, runnerConfig)
     }()
 
-    select {
-    case sig := <-sigCh:
-        if !process.IsDetached() {
-            logger.Infof("Received signal: %v, stopping server %q", sig, runnerConfig.BaseName)
-            cleanupAndWait(workloadManager, runnerConfig.BaseName, cancel, errCh)
-        }
-        return nil
-    case err := <-errCh:
-        return err
+    // workloadManager.RunWorkload will block until the context is cancelled
+    // or an unrecoverable error is returned. In either case, it will stop the server.
+    // We wait until workloadManager.RunWorkload exits before deleting the workload,
+    // so stopping and deleting don't race.
+    //
+    // There's room for improvement in the factoring here.
+    // Shutdown and cancellation logic is unnecessarily spread across two goroutines.
+    err := <-errCh
+    if !process.IsDetached() {
+        logger.Infof("RunWorkload Exited. Error: %v, stopping server %q", err, runnerConfig.BaseName)
+        cleanupAndWait(workloadManager, runnerConfig.BaseName)
     }
+    return err
+
 }
 
 func validateGroup(ctx context.Context, workloadsManager workloads.Manager, serverOrImage string) error {
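Note on the pattern above: runForeground no longer installs its own signal handler. It waits for RunWorkload to return, whether because the root context was cancelled or because of an unrecoverable error, and only then deletes the workload, so stopping and deleting cannot race. A minimal standalone sketch of that sequencing, with hypothetical runWorkload and cleanup stand-ins rather than the real ToolHive APIs:

package main

import (
	"context"
	"fmt"
	"time"
)

// runWorkload stands in for workloadManager.RunWorkload: it blocks until the
// caller's context is cancelled or an unrecoverable error occurs.
func runWorkload(ctx context.Context) error {
	<-ctx.Done()
	return ctx.Err()
}

// cleanup stands in for cleanupAndWait: it uses a fresh timeout context so the
// deletion is not cut short by the already-cancelled parent context.
func cleanup(name string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	_ = cleanupCtx // a real implementation would pass this to DeleteWorkloads
	fmt.Printf("deleted workload %q\n", name)
}

func runForeground(ctx context.Context, name string) error {
	errCh := make(chan error, 1)
	go func() { errCh <- runWorkload(ctx) }()

	// Wait for the workload to exit before deleting it, so stopping and
	// deleting never run concurrently.
	err := <-errCh
	cleanup(name)
	return err
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(100 * time.Millisecond)
		cancel() // simulate Ctrl+C propagated from main's signal handler
	}()
	fmt.Println("runForeground returned:", runForeground(ctx, "example"))
}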

cmd/thv/main.go

Lines changed: 10 additions & 5 deletions
@@ -2,6 +2,7 @@
 package main
 
 import (
+    "context"
     "os"
     "os/signal"
     "syscall"
@@ -12,7 +13,6 @@ import (
     "github.com/stacklok/toolhive/cmd/thv/app"
     "github.com/stacklok/toolhive/pkg/client"
     "github.com/stacklok/toolhive/pkg/container"
-    "github.com/stacklok/toolhive/pkg/container/runtime"
     "github.com/stacklok/toolhive/pkg/lockfile"
     "github.com/stacklok/toolhive/pkg/logger"
     "github.com/stacklok/toolhive/pkg/migration"
@@ -23,7 +23,7 @@ func main() {
     logger.Initialize()
 
     // Setup signal handling for graceful cleanup
-    setupSignalHandler()
+    ctx := setupSignalHandler()
 
     // Clean up stale lock files on startup
     cleanupStaleLockFiles()
@@ -47,8 +47,10 @@ func main() {
         migration.CheckAndPerformDefaultGroupMigration()
     }
 
+    cmd := app.NewRootCmd(!app.IsCompletionCommand(os.Args))
+
     // Skip update check for completion command or if we are running in kubernetes
-    if err := app.NewRootCmd(!app.IsCompletionCommand(os.Args) && !runtime.IsKubernetesRuntime()).Execute(); err != nil {
+    if err := cmd.ExecuteContext(ctx); err != nil {
         // Clean up any remaining lock files on error exit
         lockfile.CleanupAllLocks()
         os.Exit(1)
@@ -59,16 +61,19 @@ func main() {
 }
 
 // setupSignalHandler configures signal handling to ensure lock files are cleaned up
-func setupSignalHandler() {
+func setupSignalHandler() context.Context {
     sigCh := make(chan os.Signal, 1)
     signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
 
+    ctx, cancel := context.WithCancel(context.Background())
    go func() {
        <-sigCh
        logger.Debugf("Received signal, cleaning up lock files...")
        lockfile.CleanupAllLocks()
-        os.Exit(0)
+        cancel()
    }()
+
+    return ctx
 }
 
 // cleanupStaleLockFiles removes stale lock files from known directories on startup
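The change above moves process-wide signal handling into main: setupSignalHandler now returns a context that is cancelled on SIGINT/SIGTERM (after cleaning up lock files), and ExecuteContext threads it into every subcommand via cmd.Context(). A minimal sketch of the same wiring, using the standard-library signal.NotifyContext helper instead of the hand-rolled goroutine and omitting the lock-file cleanup:

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"

	"github.com/spf13/cobra"
)

func main() {
	// Cancel the context when SIGINT or SIGTERM arrives; this mirrors what
	// setupSignalHandler does manually (minus the lock-file cleanup).
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	root := &cobra.Command{
		Use: "demo",
		RunE: func(cmd *cobra.Command, _ []string) error {
			// Every command sees the same cancellable context via cmd.Context().
			<-cmd.Context().Done()
			fmt.Println("shutting down")
			return nil
		},
	}

	// ExecuteContext propagates ctx to cmd.Context() in all (sub)commands.
	if err := root.ExecuteContext(ctx); err != nil {
		os.Exit(1)
	}
}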

pkg/runner/runner.go

Lines changed: 8 additions & 11 deletions
@@ -8,9 +8,7 @@ import (
     "fmt"
     "net/http"
     "os"
-    "os/signal"
     "strings"
-    "syscall"
     "time"
 
     "golang.org/x/oauth2"
@@ -317,16 +315,19 @@ func (r *Runner) Run(ctx context.Context) error {
 
     // Define a function to stop the MCP server
     stopMCPServer := func(reason string) {
+        // Use a background context to avoid cancellation of the main context.
+        cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 1*time.Minute)
+        defer cleanupCancel()
         logger.Infof("Stopping MCP server: %s", reason)
 
         // Stop the transport (which also stops the container, monitoring, and handles removal)
         logger.Infof("Stopping %s transport...", r.Config.Transport)
-        if err := transportHandler.Stop(ctx); err != nil {
+        if err := transportHandler.Stop(cleanupCtx); err != nil {
            logger.Warnf("Warning: Failed to stop transport: %v", err)
        }
 
        // Cleanup telemetry provider
-        if err := r.Cleanup(ctx); err != nil {
+        if err := r.Cleanup(cleanupCtx); err != nil {
            logger.Warnf("Warning: Failed to cleanup telemetry: %v", err)
        }
 
@@ -335,7 +336,7 @@ func (r *Runner) Run(ctx context.Context) error {
        if err := process.RemovePIDFile(r.Config.BaseName); err != nil {
            logger.Warnf("Warning: Failed to remove PID file: %v", err)
        }
-        if err := r.statusManager.ResetWorkloadPID(ctx, r.Config.BaseName); err != nil {
+        if err := r.statusManager.ResetWorkloadPID(cleanupCtx, r.Config.BaseName); err != nil {
            logger.Warnf("Warning: Failed to reset workload %s PID: %v", r.Config.ContainerName, err)
        }
 
@@ -354,10 +355,6 @@ func (r *Runner) Run(ctx context.Context) error {
        logger.Info("Press Ctrl+C to stop or wait for container to exit")
    }
 
-    // Set up signal handling
-    sigCh := make(chan os.Signal, 1)
-    signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
-
    // Create a done channel to signal when the server has been stopped
    doneCh := make(chan struct{})
 
@@ -399,8 +396,8 @@ func (r *Runner) Run(ctx context.Context) error {
 
    // Wait for either a signal or the done channel to be closed
    select {
-    case sig := <-sigCh:
-        stopMCPServer(fmt.Sprintf("Received signal %s", sig))
+    case <-ctx.Done():
+        stopMCPServer("Context cancelled")
    case <-doneCh:
        // The transport has already been stopped (likely by the container exit)
        // Clean up the PID file and state
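With ctx.Done() now acting as the shutdown trigger, ctx is already cancelled by the time stopMCPServer runs, so any cleanup call that honoured it would fail immediately; the new cleanupCtx gives shutdown its own one-minute budget instead. A small self-contained sketch of that pattern, with illustrative names rather than the Runner API:

package main

import (
	"context"
	"log"
	"time"
)

// shutdown illustrates the stopMCPServer pattern: the parent context is already
// cancelled when shutdown begins, so cleanup runs on a detached context with
// its own deadline rather than on the cancelled parent.
func shutdown(parent context.Context, stop func(context.Context) error) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
	defer cancel()

	log.Printf("parent context state: %v", parent.Err()) // context canceled
	if err := stop(cleanupCtx); err != nil {
		log.Printf("warning: failed to stop cleanly: %v", err)
	}
}

func main() {
	parent, cancel := context.WithCancel(context.Background())
	cancel() // the shutdown trigger has already fired

	shutdown(parent, func(ctx context.Context) error {
		// The cleanup context is still live even though the parent is cancelled.
		return ctx.Err()
	})
}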

pkg/workloads/manager.go

Lines changed: 58 additions & 27 deletions
@@ -872,7 +872,7 @@ func (d *DefaultManager) DeleteWorkloads(_ context.Context, names []string) (*er
 }
 
 // RestartWorkloads restarts the specified workloads by name.
-func (d *DefaultManager) RestartWorkloads(_ context.Context, names []string, foreground bool) (*errgroup.Group, error) {
+func (d *DefaultManager) RestartWorkloads(ctx context.Context, names []string, foreground bool) (*errgroup.Group, error) {
     // Validate all workload names to prevent path traversal attacks
     for _, name := range names {
         if err := types.ValidateWorkloadName(name); err != nil {
@@ -884,7 +884,7 @@ func (d *DefaultManager) RestartWorkloads(_ context.Context, names []string, for
 
     for _, name := range names {
         group.Go(func() error {
-            return d.restartSingleWorkload(name, foreground)
+            return d.restartSingleWorkload(ctx, name, foreground)
         })
     }
 
@@ -943,39 +943,59 @@ func (d *DefaultManager) updateSingleWorkload(workloadName string, newConfig *ru
 }
 
 // restartSingleWorkload handles the restart logic for a single workload
-func (d *DefaultManager) restartSingleWorkload(name string, foreground bool) error {
-    // Create a child context with a longer timeout
-    childCtx, cancel := context.WithTimeout(context.Background(), AsyncOperationTimeout)
-    defer cancel()
+func (d *DefaultManager) restartSingleWorkload(ctx context.Context, name string, foreground bool) error {
 
     // First, try to load the run configuration to check if it's a remote workload
-    runConfig, err := runner.LoadState(childCtx, name)
+    runConfig, err := runner.LoadState(ctx, name)
     if err != nil {
         // If we can't load the state, it might be a container workload or the workload doesn't exist
         // Try to restart it as a container workload
-        return d.restartContainerWorkload(childCtx, name, foreground)
+        return d.restartContainerWorkload(ctx, name, foreground)
     }
 
     // Check if this is a remote workload
     if runConfig.RemoteURL != "" {
-        return d.restartRemoteWorkload(childCtx, name, runConfig, foreground)
+        return d.restartRemoteWorkload(ctx, name, runConfig, foreground)
     }
 
     // This is a container-based workload
-    return d.restartContainerWorkload(childCtx, name, foreground)
+    return d.restartContainerWorkload(ctx, name, foreground)
 }
 
 // restartRemoteWorkload handles restarting a remote workload
+// It blocks until the context is cancelled or there is already a supervisor process running.
 func (d *DefaultManager) restartRemoteWorkload(
     ctx context.Context,
     name string,
     runConfig *runner.RunConfig,
     foreground bool,
 ) error {
+    mcpRunner, err := d.maybeSetupRemoteWorkload(ctx, name, runConfig)
+    if err != nil {
+        return fmt.Errorf("failed to setup remote workload: %w", err)
+    }
+
+    if mcpRunner == nil {
+        return nil
+    }
+
+    return d.startWorkload(ctx, name, mcpRunner, foreground)
+}
+
+// maybeSetupRemoteWorkload is the startup steps for a remote workload.
+// A runner may not be returned if the workload is already running and supervised.
+func (d *DefaultManager) maybeSetupRemoteWorkload(
+    ctx context.Context,
+    name string,
+    runConfig *runner.RunConfig,
+) (*runner.Runner, error) {
+    ctx, cancel := context.WithTimeout(ctx, AsyncOperationTimeout)
+    defer cancel()
+
     // Get workload status using the status manager
     workload, err := d.statuses.GetWorkload(ctx, name)
     if err != nil && !errors.Is(err, rt.ErrWorkloadNotFound) {
-        return err
+        return nil, err
     }
 
     // If workload is already running, check if the supervisor process is healthy
@@ -986,7 +1006,7 @@ func (d *DefaultManager) restartRemoteWorkload(
     if supervisorAlive {
         // Workload is running and healthy - preserve old behavior (no-op)
         logger.Infof("Remote workload %s is already running", name)
-        return nil
+        return nil, nil
     }
 
     // Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
@@ -1015,7 +1035,7 @@ func (d *DefaultManager) restartRemoteWorkload(
     // Load runner configuration from state
     mcpRunner, err := d.loadRunnerFromState(ctx, runConfig.BaseName)
     if err != nil {
-        return fmt.Errorf("failed to load state for %s: %v", runConfig.BaseName, err)
+        return nil, fmt.Errorf("failed to load state for %s: %v", runConfig.BaseName, err)
     }
 
     // Set status to starting
@@ -1024,16 +1044,31 @@ func (d *DefaultManager) restartRemoteWorkload(
     }
 
     logger.Infof("Loaded configuration from state for %s", runConfig.BaseName)
+    return mcpRunner, nil
+}
+
+// restartContainerWorkload handles restarting a container-based workload.
+// It blocks until the context is cancelled or there is already a supervisor process running.
+func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name string, foreground bool) error {
+    workloadName, mcpRunner, err := d.maybeSetupContainerWorkload(ctx, name)
+    if err != nil {
+        return fmt.Errorf("failed to setup container workload: %w", err)
+    }
+
+    if mcpRunner == nil {
+        return nil
+    }
 
-    // Start the remote workload using the loaded runner
-    // Use background context to avoid timeout cancellation - same reasoning as container workloads
-    return d.startWorkload(context.Background(), name, mcpRunner, foreground)
+    return d.startWorkload(ctx, workloadName, mcpRunner, foreground)
 }
 
-// restartContainerWorkload handles restarting a container-based workload
+// maybeSetupContainerWorkload is the startup steps for a container-based workload.
+// A runner may not be returned if the workload is already running and supervised.
 //
 //nolint:gocyclo // Complexity is justified - handles multiple restart scenarios and edge cases
-func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name string, foreground bool) error {
+func (d *DefaultManager) maybeSetupContainerWorkload(ctx context.Context, name string) (string, *runner.Runner, error) {
+    ctx, cancel := context.WithTimeout(ctx, AsyncOperationTimeout)
+    defer cancel()
     // Get container info to resolve partial names and extract proper workload name
     var containerName string
     var workloadName string
@@ -1057,7 +1092,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
     // Get workload status using the status manager
     workload, err := d.statuses.GetWorkload(ctx, name)
     if err != nil && !errors.Is(err, rt.ErrWorkloadNotFound) {
-        return err
+        return "", nil, err
     }
 
     // Check if workload is running and healthy (including supervisor process)
@@ -1068,7 +1103,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
     if supervisorAlive {
         // Workload is running and healthy - preserve old behavior (no-op)
         logger.Infof("Container %s is already running", containerName)
-        return nil
+        return "", nil, nil
     }
 
     // Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
@@ -1107,7 +1142,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
            if statusErr := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusError, err.Error()); statusErr != nil {
                logger.Warnf("Failed to set workload %s status to error: %v", workloadName, statusErr)
            }
-            return fmt.Errorf("failed to stop container %s: %v", containerName, err)
+            return "", nil, fmt.Errorf("failed to stop container %s: %v", containerName, err)
        }
        logger.Infof("Container %s stopped", containerName)
    }
@@ -1126,7 +1161,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
    // Load runner configuration from state
    mcpRunner, err := d.loadRunnerFromState(ctx, workloadName)
    if err != nil {
-        return fmt.Errorf("failed to load state for %s: %v", workloadName, err)
+        return "", nil, fmt.Errorf("failed to load state for %s: %v", workloadName, err)
    }
 
    // Set workload status to starting - use the workload name for status operations
@@ -1135,11 +1170,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
    }
    logger.Infof("Loaded configuration from state for %s", workloadName)
 
-    // Start the workload with background context to avoid timeout cancellation
-    // The ctx with AsyncOperationTimeout is only for the restart setup operations,
-    // but the actual workload should run indefinitely with its own lifecycle management
-    // Use workload name for user-facing operations
-    return d.startWorkload(context.Background(), workloadName, mcpRunner, foreground)
+    return workloadName, mcpRunner, nil
 }
 
 // startWorkload starts the workload in either foreground or background mode
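The restart path is now split in two: a maybeSetup* phase that inherits the caller's context but bounds itself with AsyncOperationTimeout, and may return a nil runner to mean "already running and supervised", and a startWorkload phase that runs on the caller's long-lived context instead of context.Background(). A stripped-down sketch of that shape, with hypothetical names where only the structure follows the commit:

package main

import (
	"context"
	"fmt"
	"time"
)

const asyncOperationTimeout = 30 * time.Second // stand-in for AsyncOperationTimeout

type runner struct{ name string }

// maybeSetup bounds the setup steps with a timeout derived from the caller's
// context and returns (nil, nil) when the workload is already supervised.
func maybeSetup(ctx context.Context, name string, alreadyRunning bool) (*runner, error) {
	setupCtx, cancel := context.WithTimeout(ctx, asyncOperationTimeout)
	defer cancel()

	if err := setupCtx.Err(); err != nil {
		return nil, err
	}
	if alreadyRunning {
		return nil, nil // healthy supervisor: restart is a no-op
	}
	return &runner{name: name}, nil
}

// restart wires the two phases together: setup is short-lived, start inherits
// the caller's context and blocks for the workload's lifetime.
func restart(ctx context.Context, name string, alreadyRunning bool) error {
	r, err := maybeSetup(ctx, name, alreadyRunning)
	if err != nil {
		return fmt.Errorf("failed to setup workload: %w", err)
	}
	if r == nil {
		return nil
	}
	return start(ctx, r)
}

func start(ctx context.Context, r *runner) error {
	fmt.Printf("starting %s\n", r.name)
	<-ctx.Done() // runs until the caller cancels, not until the setup timeout
	return ctx.Err()
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Already-running workload: restart is a no-op and returns immediately.
	fmt.Println(restart(ctx, "example", true))
}

Because the workload now runs on the caller's context rather than context.Background(), cancelling the root context (for example on SIGTERM) reaches restarted workloads too, which is what the old code explicitly avoided.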
