@@ -15,6 +15,8 @@ import (
1515 "strconv"
1616 "strings"
1717 "time"
18+
19+ "github.com/replicate/cog-runtime/internal/logging"
1820)
1921
2022const (
@@ -60,13 +62,15 @@ type checkpointer struct {
6062 hasCheckpoint bool
6163 checkpointDir string
6264 leaseFile string
65+ log * logging.SugaredLogger
6366}
6467
65- func NewCheckpointer (ctx context.Context ) Checkpointer {
68+ func NewCheckpointer (ctx context.Context , log * logging. SugaredLogger ) Checkpointer {
6669 return & checkpointer {
6770 enabled : os .Getenv (shouldCheckpointEnvVar ) == "true" ,
6871 checkpointDir : os .Getenv (cudaCheckpointDirEnvVar ),
6972 leaseFile : os .Getenv (leaseFileEnvVar ),
73+ log : log ,
7074 }
7175}
7276
@@ -208,6 +212,7 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
208212 // Get the PID for the command
209213 cudaPID , err := exec .CommandContext (con , "pgrep" , "-fx" , string (cudaCmd )).Output ()
210214 if err != nil {
215+ c .log .Errorw ("failed to pgrep the CUDA command" , "error" , err )
211216 // If this command failed, we want to best effort try to kill the started process,
212217 // since we'll start a new one
213218 restoreCmd .Process .Kill () //nolint:errcheck // This is just best effort
@@ -218,6 +223,7 @@ func (c *checkpointer) Restore(ctx context.Context) (*exec.Cmd, func(context.Con
218223 // Toggle CUDA on for the restored process
219224 cmd := exec .CommandContext (con , cudaCheckpointPath , "--toggle" , "--pid" , string (cudaPID ))
220225 if err := cmd .Run (); err != nil {
226+ c .log .Errorw ("failed to toggle CUDA on" , "error" , err )
221227 // If this command failed, we want to best effort try to kill the started process,
222228 // since we'll start a new one
223229 restoreCmd .Process .Kill () //nolint:errcheck // This is just best effort
0 commit comments