From b3531d752c0895b2934c9ce97cb74dda34f8d7c7 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sun, 17 May 2026 01:51:53 -0700 Subject: [PATCH 1/2] obs(envd): log initLock wait + per-stage SetData durations Today an /init failure surfaces as \"envd silent\" on the orchestrator without any envd-side signal about which stage of SetData was holding initLock. Each invocation now emits one info log with init_lock_wait and set_data durations, plus debug-level per-stage durations for the CA-bundle install and the NFS setup so the next investigation can tell \"envd was queued behind a slow predecessor\" from \"envd's NFS unmount took 9 seconds\" without grepping kernel traces. --- packages/envd/internal/api/init.go | 17 ++++++++++++++++- packages/envd/pkg/version.go | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/packages/envd/internal/api/init.go b/packages/envd/internal/api/init.go index 4d0081b0a2..2f9aa300df 100644 --- a/packages/envd/internal/api/init.go +++ b/packages/envd/internal/api/init.go @@ -128,12 +128,20 @@ func (a *API) PostInit(w http.ResponseWriter, r *http.Request) { // Safe because Destroy() is nil-safe and TakeFrom clears the source. defer initRequest.AccessToken.Destroy() + lockWaitStart := time.Now() a.initLock.Lock() + lockWait := time.Since(lockWaitStart) defer a.initLock.Unlock() // Update data only if the request is newer or if there's no timestamp at all if initRequest.Timestamp == nil || a.lastSetTime.SetToGreater(initRequest.Timestamp.UnixNano()) { + setDataStart := time.Now() err = a.SetData(ctx, logger, initRequest) + setDataDur := time.Since(setDataStart) + logger.Info(). + Dur("init_lock_wait", lockWait). + Dur("set_data", setDataDur). + Msg("/init handled") if err != nil { switch { case errors.Is(err, ErrAccessTokenMismatch), errors.Is(err, ErrAccessTokenResetNotAuthorized): @@ -146,6 +154,8 @@ func (a *API) PostInit(w http.ResponseWriter, r *http.Request) { return } + } else { + logger.Debug().Dur("init_lock_wait", lockWait).Msg("/init skipped (older timestamp)") } } @@ -214,14 +224,19 @@ func (a *API) SetData(ctx context.Context, logger zerolog.Logger, data PostInitJ } if data.CaBundle != nil && *data.CaBundle != "" { + caStart := time.Now() err := a.caCertInstaller.Install(context.WithoutCancel(ctx), *data.CaBundle) + logger.Debug().Dur("ca_install", time.Since(caStart)).Msg("CA bundle install done") if err != nil { return fmt.Errorf("failed to install CA bundle: %w", err) } } if data.VolumeMounts != nil { - if err := a.setupNFS(ctx, logger, data.LifecycleID, *data.VolumeMounts); err != nil { + nfsStart := time.Now() + err := a.setupNFS(ctx, logger, data.LifecycleID, *data.VolumeMounts) + logger.Debug().Dur("nfs_setup", time.Since(nfsStart)).Msg("setupNFS done") + if err != nil { return fmt.Errorf("failed to setup NFS volumes: %w", err) } } diff --git a/packages/envd/pkg/version.go b/packages/envd/pkg/version.go index 88bc32c769..4cfbb08b29 100644 --- a/packages/envd/pkg/version.go +++ b/packages/envd/pkg/version.go @@ -1,3 +1,3 @@ package pkg -const Version = "0.5.23" +const Version = "0.5.24" From 03586e63407bf03e18b515887bcd9489fa165473 Mon Sep 17 00:00:00 2001 From: ValentaTomas Date: Sun, 17 May 2026 02:53:49 -0700 Subject: [PATCH 2/2] fix(envd): move /init timing logs after success check --- packages/envd/internal/api/init.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/envd/internal/api/init.go b/packages/envd/internal/api/init.go index 2f9aa300df..c2915469b6 100644 --- a/packages/envd/internal/api/init.go +++ b/packages/envd/internal/api/init.go @@ -138,22 +138,26 @@ func (a *API) PostInit(w http.ResponseWriter, r *http.Request) { setDataStart := time.Now() err = a.SetData(ctx, logger, initRequest) setDataDur := time.Since(setDataStart) - logger.Info(). - Dur("init_lock_wait", lockWait). - Dur("set_data", setDataDur). - Msg("/init handled") if err != nil { + logger.Error(). + Err(err). + Dur("init_lock_wait", lockWait). + Dur("set_data", setDataDur). + Msg("/init failed") switch { case errors.Is(err, ErrAccessTokenMismatch), errors.Is(err, ErrAccessTokenResetNotAuthorized): w.WriteHeader(http.StatusUnauthorized) default: - logger.Error().Msgf("Failed to set data: %v", err) w.WriteHeader(http.StatusBadRequest) } w.Write([]byte(err.Error())) return } + logger.Info(). + Dur("init_lock_wait", lockWait). + Dur("set_data", setDataDur). + Msg("/init handled") } else { logger.Debug().Dur("init_lock_wait", lockWait).Msg("/init skipped (older timestamp)") } @@ -226,19 +230,19 @@ func (a *API) SetData(ctx context.Context, logger zerolog.Logger, data PostInitJ if data.CaBundle != nil && *data.CaBundle != "" { caStart := time.Now() err := a.caCertInstaller.Install(context.WithoutCancel(ctx), *data.CaBundle) - logger.Debug().Dur("ca_install", time.Since(caStart)).Msg("CA bundle install done") if err != nil { return fmt.Errorf("failed to install CA bundle: %w", err) } + logger.Debug().Dur("ca_install", time.Since(caStart)).Msg("CA bundle install done") } if data.VolumeMounts != nil { nfsStart := time.Now() err := a.setupNFS(ctx, logger, data.LifecycleID, *data.VolumeMounts) - logger.Debug().Dur("nfs_setup", time.Since(nfsStart)).Msg("setupNFS done") if err != nil { return fmt.Errorf("failed to setup NFS volumes: %w", err) } + logger.Debug().Dur("nfs_setup", time.Since(nfsStart)).Msg("setupNFS done") } return nil