Skip to content
This repository was archived by the owner on Apr 15, 2026. It is now read-only.

Commit bb7a47e

Browse files
committed
Testing
1 parent 74a6134 commit bb7a47e

2 files changed

Lines changed: 12 additions & 2 deletions

File tree

internal/checkpointer/checkpointer.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515
"strconv"
1616
"strings"
1717
"time"
18+
19+
"github.com/replicate/cog-runtime/internal/logging"
1820
)
1921

2022
const (
@@ -60,13 +62,15 @@ type checkpointer struct {
6062
hasCheckpoint bool
6163
checkpointDir string
6264
leaseFile string
65+
log *logging.SugaredLogger
6366
}
6467

65-
func NewCheckpointer(ctx context.Context) Checkpointer {
68+
func NewCheckpointer(ctx context.Context, log *logging.SugaredLogger) Checkpointer {
6669
return &checkpointer{
6770
enabled: os.Getenv(shouldCheckpointEnvVar) == "true",
6871
checkpointDir: os.Getenv(cudaCheckpointDirEnvVar),
6972
leaseFile: os.Getenv(leaseFileEnvVar),
73+
log: log,
7074
}
7175
}
7276

@@ -208,6 +212,7 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
208212
// Get the PID for the command
209213
cudaPID, err := exec.CommandContext(con, "pgrep", "-fx", string(cudaCmd)).Output()
210214
if err != nil {
215+
c.log.Errorw("failed to pgrep the CUDA command", "error", err)
211216
// If this command failed, make a best-effort attempt to kill the started process,
212217
// since we'll start a new one
213218
restoreCmd.Process.Kill() //nolint:errcheck // This is just best effort
@@ -218,6 +223,7 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
218223
// Toggle CUDA on for the restored process
219224
cmd := exec.CommandContext(con, cudaCheckpointPath, "--toggle", "--pid", string(cudaPID))
220225
if err := cmd.Run(); err != nil {
226+
c.log.Errorw("failed to toggle CUDA on", "error", err)
221227
// If this command failed, make a best-effort attempt to kill the started process,
222228
// since we'll start a new one
223229
restoreCmd.Process.Kill() //nolint:errcheck // This is just best effort

internal/runner/manager.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ func (m *Manager) createDefaultRunner(ctx context.Context) (*Runner, error) {
301301
}
302302

303303
// This returns an object that does nothing if it is not enabled.
304-
cp := checkpointer.NewCheckpointer(ctx)
304+
cp := checkpointer.NewCheckpointer(ctx, m.logger.Sugar())
305305
err := cp.Prepare(ctx)
306306
if err != nil {
307307
cp.Disable()
@@ -432,6 +432,7 @@ func (m *Manager) startRunnerFromCheckpoint(ctx context.Context, env []string, r
432432

433433
runner, err := m.setupRunner(runtimeContext, runtimeCancel, cmd, env, runnerCtx, maxConcurrency)
434434
if err != nil {
435+
m.logger.Sugar().Errorw("failed to set up runner", "error", err)
435436
return nil, fmt.Errorf("failed to set up runner: %w", err)
436437
}
437438

@@ -444,6 +445,9 @@ func (m *Manager) startRunnerFromCheckpoint(ctx context.Context, env []string, r
444445
// to the runner. We can do this by sending the SigReady signal to the current PID, as signal
445446
// mode should be on if the checkpoint exists
446447
err = syscall.Kill(syscall.Getpid(), SigReady)
448+
if err != nil {
449+
m.logger.Sugar().Errorw("failed to send SIGUSR1", "error", err)
450+
}
447451

448452
return runner, err
449453
}

0 commit comments

Comments
 (0)