Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions pkg/manager/internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const (
defaultRenewDeadline = 10 * time.Second
defaultRetryPeriod = 2 * time.Second
defaultGracefulShutdownPeriod = 30 * time.Second
defaultHookPeriod = 15 * time.Second
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
defaultHookPeriod = 15 * time.Second
defaultHookTimeoutPeriod = 15 * time.Second

nit


defaultReadinessEndpoint = "/readyz"
defaultLivenessEndpoint = "/healthz"
Expand Down Expand Up @@ -166,6 +167,13 @@ type controllerManager struct {
// internalProceduresStop channel is used internally to the manager when coordinating
// the proper shutdown of servers. This channel is also used for dependency injection.
internalProceduresStop chan struct{}

// prestartHooks are functions that are run immediately before calling the Start functions
// of the leader election runnables.
prestartHooks []Runnable

// hookTimeout is the duration given to each hook to return successfully.
hookTimeout time.Duration
}

type hasCache interface {
Expand Down Expand Up @@ -240,6 +248,24 @@ func (cm *controllerManager) GetHTTPClient() *http.Client {
return cm.cluster.GetHTTPClient()
}

// Hook allows you to add hooks.
func (cm *controllerManager) Hook(hook HookType, runnable Runnable) error {
cm.Lock()
defer cm.Unlock()

if cm.started {
return fmt.Errorf("unable to add new hook because the manager has already been started")
}

switch hook {
case HookPrestartType:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be HookTypePrestart?

cm.prestartHooks = append(cm.prestartHooks, runnable)
return nil
}

return errors.New("hook type not supported")
}

func (cm *controllerManager) GetConfig() *rest.Config {
return cm.cluster.GetConfig()
}
Expand Down Expand Up @@ -648,6 +674,27 @@ func (cm *controllerManager) initLeaderElector() (*leaderelection.LeaderElector,
}

func (cm *controllerManager) startLeaderElectionRunnables() error {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's rename the func to startPreStartHooksAndLeaderElectionRunnables

cm.logger.Info("Running prestart hooks")
for _, hook := range cm.prestartHooks {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other than that being what you found in the original PR, what is the reason for starting the hooks sequentially and not in parallel? Couldn't the sequential starting be achieved if the user instead submitted one hook that does everything sequentlally?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably need to think about how we would collect errors from pre-start hooks on startup and use that to bail out if there are any errors, but that sounds like it would be fairly straightforward with a waitgroup

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. I think we should start them in parallel

var ctx context.Context
var cancel context.CancelFunc

if cm.hookTimeout < 0 {
ctx, cancel = context.WithCancel(cm.internalCtx)
} else {
ctx, cancel = context.WithTimeout(cm.internalCtx, cm.hookTimeout)
}

defer cancel()

if err := hook.Start(ctx); err != nil {
return err
}
}

// All the prestart hooks have ben run, clear the slice to free the underlying resources.
cm.prestartHooks = nil
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add a log here finished running hooks or such so that if this hangs, there is at least some chance to figure out it is because of the hooks?


return cm.runnables.LeaderElection.Start(cm.internalCtx)
}

Expand Down
22 changes: 22 additions & 0 deletions pkg/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ type Manager interface {
// AddReadyzCheck allows you to add Readyz checker
AddReadyzCheck(name string, check healthz.Checker) error

// Hook allows to add Runnables as hooks to modify the behavior.
Hook(hook HookType, runnable Runnable) error
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know @vincepri had suggested this name in the other PR, but I am not a huge fan because IMHO:

  • It is different from Add in that its name is what is being added rather than whats being done/the operation
  • Having a HookType argument when there is only one HookType seems needlessly complicated to me and requires us to spread out godocs explaining what this does to this method and the HookType rather than having one place where we can explain it

I would suggest AddPrestartHook since that follows the existing naming pattern and is easy to discover. If we ever add another hook, we can then name that Add${OtherType}Hook.

But lets wait for a second opinion from @sbueringer before you end up changing this back and forth

Copy link
Copy Markdown
Member

@sbueringer sbueringer Feb 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, AddPrestartHook sounds good.

Is there some prior art from other components (kube-controller-manager, other controller frameworks) that might give us an idea how likely it is that we'll have other types of hooks in the future and what these might be?

(but if we don't have more data let's go with AddPrestartHook)

Hm. Is "Prestart" precise enough to describe "before leader election runnables"? No strong opinion, but it's pretty generic.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+ "to modify the behavior" sounds strange. Let's try to explain this a bit better


// Start starts all registered Controllers and blocks until the context is cancelled.
// Returns an error if there is an error starting any controller.
//
Expand Down Expand Up @@ -281,6 +284,10 @@ type Options struct {
// +optional
Controller config.Controller

// HookTimeout is the duration given to each hook to return successfully.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WDYT about the idea of having a per-hook timeout rather than a global one for all hooks? Different hook may need different timeouts

// To use hooks without timeout, set to a negative duration, e.g. time.Duration(-1)
HookTimeout *time.Duration
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When @leontappe and I evaluated the proposed change in the old PR we decided to keep the -1 -> no-timeout in order to conform with previously decided behavior for GracefulShutdownTimeout.
Other projects also use -1 as a value to the denote absence of any timeout/ratelimiting (rest.Config.QPS)

Given that a 0 value is not logically valid, we could reset to the default here, this is also done in GracefulShotdownTimeout

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the comparisons are not great, GracefulShutdownTimeout has an additional state of disable this which is indicated by 0 and restConfig.QPS is not a pointer.

I would probably build this as a value <= 0 means no timeout

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer an explicit value meaning no timeout that we document personally, so I'd say if this is 0, it's no timeout

  • nil: no opinion, use something default
  • explicitly 0: no timeout
  • explicitly >0: use this as the timeout
  • explicitly <0: error

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What other timeout fields do we have in CR as prior art? Or do we only have GracefulShutdownTimeout? (just to end up with something somewhat consistent, it makes sense to make different decisions if the cases are different but we should minimize variations if possible)


// makeBroadcaster allows deferring the creation of the broadcaster to
// avoid leaking goroutines if we never call Start on this manager. It also
// returns whether or not this is a "owned" broadcaster, and as such should be
Expand All @@ -295,6 +302,15 @@ type Options struct {
newPprofListener func(addr string) (net.Listener, error)
}

// HookType defines hooks for use with AddHook.
type HookType int

const (
// HookPrestartType defines a hook that is run after leader election and immediately before
// calling Start on the runnables that needed leader election.
HookPrestartType HookType = iota
)

// BaseContextFunc is a function used to provide a base Context to Runnables
// managed by a Manager.
type BaseContextFunc func() context.Context
Expand Down Expand Up @@ -465,6 +481,7 @@ func New(config *rest.Config, options Options) (Manager, error) {
livenessEndpointName: options.LivenessEndpointName,
pprofListener: pprofListener,
gracefulShutdownTimeout: *options.GracefulShutdownTimeout,
hookTimeout: *options.HookTimeout,
internalProceduresStop: make(chan struct{}),
leaderElectionStopped: make(chan struct{}),
leaderElectionReleaseOnCancel: options.LeaderElectionReleaseOnCancel,
Expand Down Expand Up @@ -577,6 +594,11 @@ func setOptionsDefaults(config *rest.Config, options Options) (Options, error) {
options.GracefulShutdownTimeout = &gracefulShutdownTimeout
}

if options.HookTimeout == nil {
hookTimeout := defaultHookPeriod
options.HookTimeout = &hookTimeout
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: if we keep this let's use ptr.To()

}

if options.Logger.GetSink() == nil {
options.Logger = log.Log
}
Expand Down
115 changes: 115 additions & 0 deletions pkg/manager/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,121 @@ var _ = Describe("manger.Manager", func() {
<-managerStopDone
Expect(time.Since(beforeDone)).To(BeNumerically(">=", 1500*time.Millisecond))
})

It("should run prestart hooks before calling Start on leader election runnables", func(ctx context.Context) {
m, err := New(cfg, options)
Expect(err).NotTo(HaveOccurred())
for _, cb := range callbacks {
cb(m)
}

runnableRan := make(chan struct{})

Expect(m.Add(RunnableFunc(func(ctx context.Context) error {
close(runnableRan)
return nil
}))).ToNot(HaveOccurred())

Expect(m.Hook(HookPrestartType, RunnableFunc(func(ctx context.Context) error {
Expect(m.Elected()).ShouldNot(BeClosed())
Consistently(runnableRan).ShouldNot(BeClosed())
return nil
}))).ToNot(HaveOccurred())

ctx, cancel := context.WithCancel(ctx)
defer cancel()
go func() {
defer GinkgoRecover()
Expect(m.Elected()).ShouldNot(BeClosed())
Expect(m.Start(ctx)).To(Succeed())
}()

Eventually(m.Elected()).Should(BeClosed())
})

It("should run prestart hooks with timeout", func(ctx context.Context) {
m, err := New(cfg, options)
Expect(err).NotTo(HaveOccurred())
for _, cb := range callbacks {
cb(m)
}
m.(*controllerManager).hookTimeout = 1 * time.Nanosecond

Expect(m.Hook(HookPrestartType, RunnableFunc(func(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(1 * time.Second):
return errors.New("prestart hook timeout exceeded expected")
}
}))).ToNot(HaveOccurred())

ctx, cancel := context.WithCancel(ctx)
defer cancel()

Expect(m.Start(ctx)).Should(MatchError(context.DeadlineExceeded))
})

It("should run prestart hooks without timeout", func(ctx context.Context) {
m, err := New(cfg, options)
Expect(err).NotTo(HaveOccurred())
for _, cb := range callbacks {
cb(m)
}
m.(*controllerManager).hookTimeout = -1 * time.Second

Expect(m.Add(RunnableFunc(func(ctx context.Context) error {
return nil
}))).ToNot(HaveOccurred())

Expect(m.Hook(HookPrestartType, RunnableFunc(func(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(1 * time.Second):
fmt.Println("prestart hook returning")
return nil
}
}))).ToNot(HaveOccurred())

ctx, cancel := context.WithCancel(ctx)
defer cancel()

go func() {
defer GinkgoRecover()
Expect(m.Elected()).ShouldNot(BeClosed())
Expect(m.Start(ctx)).NotTo(HaveOccurred())
}()

<-m.Elected()
})

It("should not run leader election runnables if prestart hooks fail", func(ctx context.Context) {
m, err := New(cfg, options)
Expect(err).NotTo(HaveOccurred())
for _, cb := range callbacks {
cb(m)
}

runnableRan := make(chan struct{})

Expect(m.Add(RunnableFunc(func(ctx context.Context) error {
close(runnableRan)
return nil
}))).ToNot(HaveOccurred())

Expect(m.Hook(HookPrestartType, RunnableFunc(func(ctx context.Context) error {
Expect(m.Elected()).ShouldNot(BeClosed())
Consistently(runnableRan).ShouldNot(BeClosed())
return errors.New("prestart hook failed")
}))).ToNot(HaveOccurred())

ctx, cancel := context.WithCancel(ctx)
defer cancel()

Expect(m.Elected()).ShouldNot(BeClosed())
Expect(m.Start(ctx)).Should(MatchError(ContainSubstring("prestart hook failed")))
})
}

Context("with defaults", func() {
Expand Down
Loading