Skip to content

Commit c7d34e0

Browse files
Force transition instances in inconsistent state
If GARM is killed or restarted while creating a runner, there is a chance that runners remain in creating or deleting state. We've started checking state transitions in GARM and allow a transition when the new state makes sense in normal circumstances. However, when recovering from a crash, we may be in an inconsisten state from which we need to recover. This change added a ForceUpdateInstance() function that ignores state transition inconsistencies. For now, we only use it when spinning up a scale set and check for instance states. This change also fixes a locking issue. Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
1 parent e2d5526 commit c7d34e0

3 files changed

Lines changed: 34 additions & 14 deletions

File tree

database/common/store.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,15 @@ type InstanceStore interface {
9696
DeleteInstance(ctx context.Context, poolID string, instanceNameOrID string) error
9797
DeleteInstanceByName(ctx context.Context, instanceName string) error
9898
UpdateInstance(ctx context.Context, instanceNameOrID string, param params.UpdateInstanceParams) (params.Instance, error)
99+
// ForceUpdateInstance functions identically to UpdateInstance with one important distinction. It does not
100+
// validate the agent ID or instance status or runner status transitions. This function must only be used in
101+
// recovery scenarios, where GARM crashed or was restarted while runners were going through a transitory
102+
// state like "creating", "deleting", etc. In such cases, we cannot simply pick up where we left off, and
103+
// we must set the instance in "pending_delete" or some other state that allows us to recover.
104+
// An instance in "creating" state is currently being serviced by a call by GARM to the external provider.
105+
// If GARM is just starting up and we see one in "creating", it means that the process that was creating it
106+
// was most likely interrupted before it finished and we must clean it up.
107+
ForceUpdateInstance(ctx context.Context, instanceName string, param params.UpdateInstanceParams) (params.Instance, error)
99108

100109
// Probably a bad idea without some king of filter or at least pagination
101110
//

database/sql/instances.go

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -395,23 +395,34 @@ func (s *sqlDatabase) applyInstanceUpdates(instance *Instance, param params.Upda
395395
return nil
396396
}
397397

398+
func (s *sqlDatabase) ForceUpdateInstance(ctx context.Context, instanceName string, param params.UpdateInstanceParams) (params.Instance, error) {
399+
return s.updateInstance(ctx, instanceName, param, true)
400+
}
401+
398402
func (s *sqlDatabase) UpdateInstance(ctx context.Context, instanceName string, param params.UpdateInstanceParams) (params.Instance, error) {
403+
return s.updateInstance(ctx, instanceName, param, false)
404+
}
405+
406+
func (s *sqlDatabase) updateInstance(ctx context.Context, instanceName string, param params.UpdateInstanceParams, force bool) (params.Instance, error) {
399407
var rowsAffected int64
400408
err := s.conn.Transaction(func(tx *gorm.DB) error {
401409
instance, err := s.getInstance(ctx, tx, instanceName, "Pool", "ScaleSet")
402410
if err != nil {
403411
return fmt.Errorf("error updating instance: %w", err)
404412
}
405413

406-
// Validate transitions
407-
if err := s.validateAgentID(instance.AgentID, param.AgentID); err != nil {
408-
return err
409-
}
410-
if err := s.validateRunnerStatusTransition(instance.RunnerStatus, param.RunnerStatus); err != nil {
411-
return err
412-
}
413-
if err := s.validateInstanceStatusTransition(instance.Status, param.Status); err != nil {
414-
return err
414+
if !force {
415+
// Validate transitions
416+
if err := s.validateAgentID(instance.AgentID, param.AgentID); err != nil {
417+
return err
418+
}
419+
420+
if err := s.validateRunnerStatusTransition(instance.RunnerStatus, param.RunnerStatus); err != nil {
421+
return err
422+
}
423+
if err := s.validateInstanceStatusTransition(instance.Status, param.Status); err != nil {
424+
return err
425+
}
415426
}
416427

417428
// Apply updates

workers/scaleset/scaleset.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ func (w *Worker) Start() (err error) {
237237
// it appears that it finished booting and is now running.
238238
//
239239
// NOTE: if the instance was in creating and it managed to boot, there
240-
// is a high chance that the we do not have a provider ID for the runner
240+
// is a high chance that we do not have a provider ID for the runner
241241
// inside our database. When removing the runner, the provider will attempt
242242
// to use the instance name instead of the provider ID, the same as when
243243
// creation of the instance fails and we try to clean up any lingering resources
@@ -251,7 +251,7 @@ func (w *Worker) Start() (err error) {
251251
runnerUpdateParams := params.UpdateInstanceParams{
252252
Status: instanceState,
253253
}
254-
instance, err = w.store.UpdateInstance(w.ctx, instance.Name, runnerUpdateParams)
254+
instance, err = w.store.ForceUpdateInstance(w.ctx, instance.Name, runnerUpdateParams)
255255
if err != nil {
256256
if !errors.Is(err, runnerErrors.ErrNotFound) {
257257
locking.Unlock(instance.Name, false)
@@ -268,7 +268,7 @@ func (w *Worker) Start() (err error) {
268268
runnerUpdateParams := params.UpdateInstanceParams{
269269
Status: commonParams.InstancePendingDelete,
270270
}
271-
instance, err = w.store.UpdateInstance(w.ctx, instance.Name, runnerUpdateParams)
271+
instance, err = w.store.ForceUpdateInstance(w.ctx, instance.Name, runnerUpdateParams)
272272
if err != nil {
273273
if !errors.Is(err, runnerErrors.ErrNotFound) {
274274
locking.Unlock(instance.Name, false)
@@ -566,15 +566,16 @@ func (w *Worker) consolidateRunnerState(runners []params.RunnerReference) error
566566
slog.DebugContext(w.ctx, "runner is locked; skipping", "runner_name", runner.Name)
567567
continue
568568
}
569-
defer locking.Unlock(runner.Name, false)
570569

571570
if _, ok := providerRunnersByName[runner.Name]; !ok {
572571
// The runner is not in the provider anymore. Remove it from the DB.
573572
slog.InfoContext(w.ctx, "runner does not exist in provider; removing from database", "runner_name", runner.Name)
574573
if err := w.removeRunnerFromGithubAndSetPendingDelete(runner.Name, runner.AgentID); err != nil {
574+
locking.Unlock(runner.Name, false)
575575
return fmt.Errorf("removing runner %s: %w", runner.Name, err)
576576
}
577577
}
578+
locking.Unlock(runner.Name, false)
578579
}
579580

580581
return nil
@@ -893,7 +894,6 @@ func (w *Worker) handleScaleDown() {
893894
switch runner.RunnerStatus {
894895
case params.RunnerTerminated, params.RunnerActive:
895896
slog.DebugContext(w.ctx, "runner is not in a valid state; skipping", "runner_name", runner.Name, "runner_status", runner.RunnerStatus)
896-
locking.Unlock(runner.Name, false)
897897
continue
898898
}
899899
locked := locking.TryLock(runner.Name, w.consumerID)

0 commit comments

Comments
 (0)