diff --git a/.gitignore b/.gitignore index d2ccbb84e..64a2c1afb 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ test/results/ *.kubeconfig me out +/step me.yml me.yaml tmp/ diff --git a/cmd/step/main.go b/cmd/step/main.go index 809bcff58..d93c2e54b 100644 --- a/cmd/step/main.go +++ b/cmd/step/main.go @@ -70,6 +70,7 @@ func main() { flags.StringVarP(&step.Shell, "shell", "s", "/bin/sh", "The shell to execute the command in") flags.StringVar(&step.FailureFile, "is-failure", "", "The path of the file used to indicate failure above") flags.StringSliceVarP(&step.UploadFile, "upload", "u", []string{}, "Upload file as a kubernetes secret") + flags.StringSliceVar(&step.UploadOnErrorFile, "upload-on-error", []string{}, "Upload file as a kubernetes secret even when the command has failed") flags.StringVar(&step.WaitFile, "wait-on", "", "The path to a file to indicate this step can be run") flags.StringSliceVarP(&step.Commands, "command", "c", []string{}, "Command to execute") flags.IntVar(&step.RetryAttempts, "retry-attempts", 0, "Number of times to retry the commands") @@ -82,6 +83,11 @@ func main() { } } +// uploadRetryAttempts is the fixed number of retry attempts used for all secret upload +// operations (both success-path and error-path). A small conservative value is used +// intentionally: uploads are best-effort and should not block the primary operation. +const uploadRetryAttempts = 2 + // calculateBackoff returns a duration that includes the minimum backoff plus a random jitter func calculateBackoff(minBackoff, maxJitter time.Duration) time.Duration { if maxJitter <= 0 { @@ -92,135 +98,162 @@ func calculateBackoff(minBackoff, maxJitter time.Duration) time.Duration { return minBackoff + jitter } -// Run is called to implement the action -func Run(ctx context.Context, step Step) error { - if err := step.IsValid(); err != nil { +// waitForSignal waits for a signal file to appear before allowing execution to proceed. +// It returns an error if a failure file is found or the timeout expires. +func waitForSignal(ctx context.Context, step Step) error { + log.WithFields(log.Fields{ + "is-failure": step.FailureFile, + "on-error": step.ErrorFile, + "on-wait": step.WaitFile, + "timeout": step.Timeout.String(), + }).Info("waiting for signal to execute") + + return utils.RetryWithTimeout(ctx, step.Timeout, time.Second, func() (bool, error) { + if step.FailureFile != "" { + if found, _ := utils.FileExists(step.FailureFile); found { + return false, errors.New("found error signal file, refusing to execute") + } + } + if found, _ := utils.FileExists(step.WaitFile); found { + return true, nil + } + + return false, nil + }) +} + +// runCommand executes a single attempt of a command and returns any error. +func runCommand(ctx context.Context, step Step, index, attempt int, command string) error { + //nolint:gosec + cmd := exec.CommandContext(ctx, step.Shell, "-c", command) + cmd.Env = os.Environ() + + logger := log.WithFields(log.Fields{ + "command-index": index, + "attempt": attempt, + }) + + stdout, err := cmd.StdoutPipe() + if err != nil { + logger.WithError(err).Error("failed to acquire stdout pipe on command") + return err } - var cc client.Client - if len(step.UploadFile) > 0 { - ci, err := kubernetes.NewRuntimeClient(nil) - if err != nil { - return err - } - cc = ci + stderr, err := cmd.StderrPipe() + if err != nil { + logger.WithError(err).Error("failed to acquire stderr pipe on command") + + return err } - if step.Comment != "" { - fmt.Printf(` -======================================================= -%s -======================================================= -`, strings.ToUpper(step.Comment)) + //nolint:errcheck + go io.Copy(os.Stdout, stdout) + //nolint:errcheck + go io.Copy(os.Stdout, stderr) + + if err := cmd.Start(); err != nil { + logger.WithError(err).Error("failed to execute the command") + + return err } - if step.WaitFile != "" { - log.WithFields(log.Fields{ - "is-failure": step.FailureFile, - "on-error": step.ErrorFile, - "on-wait": step.WaitFile, - "timeout": step.Timeout.String(), - }).Info("waiting for signal to execute") - - err := utils.RetryWithTimeout(ctx, step.Timeout, time.Second, func() (bool, error) { - if step.FailureFile != "" { - if found, _ := utils.FileExists(step.FailureFile); found { - return false, errors.New("found error signal file, refusing to execute") - } - } - if found, _ := utils.FileExists(step.WaitFile); found { - return true, nil - } + if err := cmd.Wait(); err != nil { + logger.WithError(err).Error("command execution failed") - return false, nil - }) - if err != nil { - return err - } + return err } - for i, command := range step.Commands { - attempt := 0 - var lastErr error - - for attempt <= step.RetryAttempts { - if attempt > 0 { - backoff := calculateBackoff(step.RetryMinBackoff, step.RetryMaxJitter) - log.WithFields(log.Fields{ - "attempt": attempt, - "command-index": i, - "backoff": backoff, - }).Info("retrying command") - - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(backoff): - } - } + return nil +} - //nolint:gosec - cmd := exec.CommandContext(ctx, step.Shell, "-c", command) - cmd.Env = os.Environ() +// runCommandWithRetries runs a command with retry logic, returning the number of +// attempts made and any error after all retries are exhausted. +func runCommandWithRetries(ctx context.Context, step Step, index int, command string) (int, error) { + attempt := 0 + var lastErr error - logger := log.WithFields(log.Fields{ - "command-index": i, + for attempt <= step.RetryAttempts { + if attempt > 0 { + backoff := calculateBackoff(step.RetryMinBackoff, step.RetryMaxJitter) + log.WithFields(log.Fields{ "attempt": attempt, - }) - - stdout, err := cmd.StdoutPipe() - if err != nil { - logger.WithError(err).Error("failed to acquire stdout pipe on command") - return err - } - stderr, err := cmd.StderrPipe() - if err != nil { - logger.WithError(err).Error("failed to acquire stderr pipe on command") - return err + "command-index": index, + "backoff": backoff, + }).Info("retrying command") + + select { + case <-ctx.Done(): + return attempt, ctx.Err() + case <-time.After(backoff): } + } - //nolint:errcheck - go io.Copy(os.Stdout, stdout) - //nolint:errcheck - go io.Copy(os.Stdout, stderr) + if err := runCommand(ctx, step, index, attempt, command); err != nil { + lastErr = err + attempt++ - if err := cmd.Start(); err != nil { - logger.WithError(err).Error("failed to execute the command") - lastErr = err - attempt++ - continue - } + continue + } - // @step: wait for the command to finish - if err := cmd.Wait(); err != nil { - logger.WithError(err).Error("command execution failed") - lastErr = err - attempt++ - continue - } + return attempt, nil + } + + return attempt, lastErr +} + +// handleCommandError processes a command failure: touches the error file if configured, +// performs best-effort upload of any on-error files, and returns a wrapped error. +func handleCommandError(ctx context.Context, step Step, cc client.Client, attempts int, lastErr error) error { + if step.ErrorFile != "" { + if err := utils.TouchFile(step.ErrorFile); err != nil { + log.WithError(err).WithField("file", step.ErrorFile).Error("failed to create error file") - // Command succeeded, break the retry loop - lastErr = nil - break + return err } + } - // If we exhausted all retries and still have an error - if lastErr != nil { - if step.ErrorFile != "" { - if err := utils.TouchFile(step.ErrorFile); err != nil { - log.WithError(err).WithField("file", step.ErrorFile).Error("failed to create error file") - return err - } + // @step: attempt a best-effort upload of any files configured to upload on error + for name, path := range step.UploadOnErrorKeyPairs() { + if found, err := utils.FileExists(path); err != nil { + log.WithError(err).WithFields(log.Fields{ + "path": path, + "secret": name, + }).Warn("failed to check if upload-on-error file exists, skipping") + + continue + } else if !found { + log.WithFields(log.Fields{ + "path": path, + "secret": name, + }).Warn("skipping upload-on-error as file does not exist") + + continue + } + + // @step: attempt a best-effort upload with a conservative fixed retry count rather than + // the command's RetryAttempts, since this is a best-effort operation that should not + // block error reporting. + if err := utils.Retry(ctx, uploadRetryAttempts, true, 5*time.Second, func() (bool, error) { + err := uploadSecret(ctx, cc, step.Namespace, name, path) + if err == nil { + return true, nil } - return fmt.Errorf("command failed after %d attempts: %w", attempt, lastErr) + log.WithError(err).WithField("secret", name).Error("failed to upload secret on error") + + return false, nil + }); err != nil { + log.WithError(err).WithField("secret", name).Error("failed to upload secret on error, continuing") } } - log.Info("successfully executed the step") - // @step: upload any files as kubernetes secrets + return fmt.Errorf("command failed after %d attempts: %w", attempts, lastErr) +} + +// uploadSuccessFiles uploads all configured files as Kubernetes secrets after a successful run. +func uploadSuccessFiles(ctx context.Context, step Step, cc client.Client) error { for name, path := range step.UploadKeyPairs() { - err := utils.Retry(ctx, 2, true, 5*time.Second, func() (bool, error) { + err := utils.Retry(ctx, uploadRetryAttempts, true, 5*time.Second, func() (bool, error) { err := uploadSecret(ctx, cc, step.Namespace, name, path) if err == nil { return true, nil @@ -234,6 +267,51 @@ func Run(ctx context.Context, step Step) error { } } + return nil +} + +// Run is called to implement the action +func Run(ctx context.Context, step Step) error { + if err := step.IsValid(); err != nil { + return err + } + + var cc client.Client + if len(step.UploadFile) > 0 || len(step.UploadOnErrorFile) > 0 { + ci, err := kubernetes.NewRuntimeClient(nil) + if err != nil { + return err + } + cc = ci + } + + if step.Comment != "" { + fmt.Printf(` +======================================================= +%s +======================================================= +`, strings.ToUpper(step.Comment)) + } + + if step.WaitFile != "" { + if err := waitForSignal(ctx, step); err != nil { + return err + } + } + + for i, command := range step.Commands { + attempts, err := runCommandWithRetries(ctx, step, i, command) + if err != nil { + return handleCommandError(ctx, step, cc, attempts, err) + } + } + + log.Info("successfully executed the step") + + if err := uploadSuccessFiles(ctx, step, cc); err != nil { + return err + } + // @step: everything was good - lets touch the file if step.SuccessFile != "" { if err := utils.TouchFile(step.SuccessFile); err != nil { diff --git a/cmd/step/types.go b/cmd/step/types.go index 5955eb36b..6795d4988 100644 --- a/cmd/step/types.go +++ b/cmd/step/types.go @@ -50,6 +50,8 @@ type Step struct { Timeout time.Duration // UploadFile is file to upload on success of the command UploadFile []string + // UploadOnErrorFile is a list of files to upload even when the command has failed + UploadOnErrorFile []string // WaitFile is the path to a file which is wait for to run WaitFile string } @@ -83,6 +85,16 @@ func (s Step) IsValid() error { } } + if len(s.UploadOnErrorFile) > 0 && s.Namespace == "" { + return errors.New("namespace must be specified when uploading files on error") + } + + for _, x := range s.UploadOnErrorFile { + if e := strings.Split(x, "="); len(e) != 2 { + return fmt.Errorf("upload-on-error file %q must be in the format 'key=path'", x) + } + } + return nil } @@ -101,3 +113,19 @@ func (s Step) UploadKeyPairs() map[string]string { return keys } + +// UploadOnErrorKeyPairs returns a map of key pairs to upload even when the command has failed +func (s Step) UploadOnErrorKeyPairs() map[string]string { + if len(s.UploadOnErrorFile) == 0 { + return nil + } + + keys := make(map[string]string) + for _, x := range s.UploadOnErrorFile { + if e := strings.Split(x, "="); len(e) == 2 { + keys[e[0]] = e[1] + } + } + + return keys +} diff --git a/cmd/step/types_test.go b/cmd/step/types_test.go new file mode 100644 index 000000000..1791cf816 --- /dev/null +++ b/cmd/step/types_test.go @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2024 Appvia Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package main + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestIsValid(t *testing.T) { + cases := []struct { + Name string + Step Step + ExpectError bool + }{ + { + Name: "no commands", + Step: Step{}, + ExpectError: true, + }, + { + Name: "valid step with command", + Step: Step{ + Commands: []string{"echo hello"}, + Shell: "/bin/sh", + }, + ExpectError: false, + }, + { + Name: "upload file without namespace", + Step: Step{ + Commands: []string{"echo hello"}, + UploadFile: []string{"secret=/path/to/file"}, + }, + ExpectError: true, + }, + { + Name: "upload file with namespace", + Step: Step{ + Commands: []string{"echo hello"}, + UploadFile: []string{"secret=/path/to/file"}, + Namespace: "default", + }, + ExpectError: false, + }, + { + Name: "upload file with invalid format", + Step: Step{ + Commands: []string{"echo hello"}, + UploadFile: []string{"invalid-format"}, + Namespace: "default", + }, + ExpectError: true, + }, + { + Name: "upload-on-error without namespace", + Step: Step{ + Commands: []string{"echo hello"}, + UploadOnErrorFile: []string{"secret=/path/to/file"}, + }, + ExpectError: true, + }, + { + Name: "upload-on-error with namespace", + Step: Step{ + Commands: []string{"echo hello"}, + UploadOnErrorFile: []string{"secret=/path/to/file"}, + Namespace: "default", + }, + ExpectError: false, + }, + { + Name: "upload-on-error with invalid format", + Step: Step{ + Commands: []string{"echo hello"}, + UploadOnErrorFile: []string{"invalid-format"}, + Namespace: "default", + }, + ExpectError: true, + }, + { + Name: "upload-on-error with valid key=path format", + Step: Step{ + Commands: []string{"echo hello"}, + UploadOnErrorFile: []string{"my-secret=/run/tfstate"}, + Namespace: "terraform-system", + }, + ExpectError: false, + }, + } + + for _, tc := range cases { + t.Run(tc.Name, func(t *testing.T) { + err := tc.Step.IsValid() + if tc.ExpectError { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestUploadOnErrorKeyPairs(t *testing.T) { + cases := []struct { + Name string + Step Step + Expected map[string]string + }{ + { + Name: "no upload-on-error files", + Step: Step{}, + Expected: nil, + }, + { + Name: "single upload-on-error file", + Step: Step{ + UploadOnErrorFile: []string{"my-secret=/run/tfstate"}, + }, + Expected: map[string]string{ + "my-secret": "/run/tfstate", + }, + }, + { + Name: "multiple upload-on-error files", + Step: Step{ + UploadOnErrorFile: []string{ + "state-secret=/run/tfstate", + "plan-secret=/run/plan.out", + }, + }, + Expected: map[string]string{ + "state-secret": "/run/tfstate", + "plan-secret": "/run/plan.out", + }, + }, + { + Name: "upload-on-error with invalid format is skipped", + Step: Step{ + UploadOnErrorFile: []string{"invalid-format"}, + }, + Expected: map[string]string{}, + }, + } + + for _, tc := range cases { + t.Run(tc.Name, func(t *testing.T) { + result := tc.Step.UploadOnErrorKeyPairs() + assert.Equal(t, tc.Expected, result) + }) + } +} + +func TestUploadKeyPairs(t *testing.T) { + cases := []struct { + Name string + Step Step + Expected map[string]string + }{ + { + Name: "no upload files", + Step: Step{}, + Expected: nil, + }, + { + Name: "single upload file", + Step: Step{ + UploadFile: []string{"my-secret=/run/tfstate"}, + }, + Expected: map[string]string{ + "my-secret": "/run/tfstate", + }, + }, + { + Name: "multiple upload files", + Step: Step{ + UploadFile: []string{ + "state-secret=/run/tfstate", + "plan-secret=/run/plan.out", + }, + }, + Expected: map[string]string{ + "state-secret": "/run/tfstate", + "plan-secret": "/run/plan.out", + }, + }, + } + + for _, tc := range cases { + t.Run(tc.Name, func(t *testing.T) { + result := tc.Step.UploadKeyPairs() + assert.Equal(t, tc.Expected, result) + }) + } +} diff --git a/pkg/controller/configuration/delete.go b/pkg/controller/configuration/delete.go index daa86e5da..c69d91e8d 100644 --- a/pkg/controller/configuration/delete.go +++ b/pkg/controller/configuration/delete.go @@ -52,30 +52,38 @@ func (c *Controller) ensureTerraformDestroy(configuration *terraformv1alpha1.Con // else we are deleting the resource configuration.Status.ResourceStatus = terraformv1alpha1.DestroyingResources - // @step: ensure we have a status and the resource count has been defined - if configuration.Status.Resources != nil { - if ptr.Deref(configuration.Status.Resources, 0) == 0 { - c.recorder.Event(configuration, v1.EventTypeNormal, "DeletionSkipped", "Configuration had zero resources, skipping terraform destroy") - - return reconcile.Result{}, nil - } - } - - // @step: check we have a terraform state - else we can just continue - secret := &v1.Secret{} - secret.Namespace = c.ControllerNamespace - secret.Name = configuration.GetTerraformStateSecretName() - - found, err := kubernetes.GetIfExists(ctx, c.cc, secret) + // @step: check if the terraform state secret exists. We do this before the resource-count + // check because a failed apply may have left Resources == 0 in the status even though some + // cloud resources were created. If state is stored in an external backend (e.g. S3, GCS) + // the secret will also be absent, but we must still attempt the destroy. + stateSecret := &v1.Secret{} + stateSecret.Namespace = c.ControllerNamespace + stateSecret.Name = configuration.GetTerraformStateSecretName() + + stateExists, err := kubernetes.GetIfExists(ctx, c.cc, stateSecret) if err != nil { cond.Failed(err, "Failed to check for the terraform state secret") return reconcile.Result{}, err } - if !found { + + // @step: only skip the destroy when the resource count is explicitly zero AND no terraform + // state exists. If state exists we must run destroy even when Resources reports zero, + // because a failed apply can leave Resources unset while some cloud resources were created. + if shouldSkipDestroy(configuration, stateExists) { + c.recorder.Event(configuration, v1.EventTypeNormal, "DeletionSkipped", "Configuration had zero resources, skipping terraform destroy") + return reconcile.Result{}, nil } + if !stateExists { + log.WithFields(log.Fields{ + "name": configuration.GetName(), + "namespace": configuration.GetNamespace(), + "secret": stateSecret.Name, + }).Warn("terraform state secret not found, proceeding with destroy regardless (state may be held in an external backend)") + } + // @step: find any currently running destroy jobs job, found := filters.Jobs(state.jobs). WithGeneration(generation). @@ -168,6 +176,15 @@ func (c *Controller) ensureTerraformDestroy(configuration *terraformv1alpha1.Con } } +// shouldSkipDestroy returns true when we can safely skip running terraform destroy: the resource +// count is explicitly zero AND no terraform state secret exists. If state exists we must still +// attempt destroy because a failed apply can leave Resources == 0 while cloud resources were created. +func shouldSkipDestroy(configuration *terraformv1alpha1.Configuration, stateExists bool) bool { + return configuration.Status.Resources != nil && + ptr.Deref(configuration.Status.Resources, 0) == 0 && + !stateExists +} + // ensureConfigurationSecretsDeleted is responsible for deleting any associated terraform state func (c *Controller) ensureConfigurationSecretsDeleted(configuration *terraformv1alpha1.Configuration) controller.EnsureFunc { cond := controller.ConditionMgr(configuration, corev1alpha1.ConditionReady, c.recorder) diff --git a/pkg/controller/configuration/delete_test.go b/pkg/controller/configuration/delete_test.go index 440824f40..9d64cf623 100644 --- a/pkg/controller/configuration/delete_test.go +++ b/pkg/controller/configuration/delete_test.go @@ -165,6 +165,11 @@ var _ = Describe("Configuration Controller with Contexts", func() { Expect(cc.Status().Update(context.Background(), configuration)).To(Succeed()) Expect(cc.Get(context.Background(), configuration.GetNamespacedName(), configuration)).To(Succeed()) + // Delete the state secret so there is truly no evidence of any resources + stateSecret := fixtures.NewTerraformState(configuration) + stateSecret.Namespace = ctrl.ControllerNamespace + Expect(cc.Delete(context.Background(), stateSecret)).To(Succeed()) + result, _, rerr = controllertests.Roll(context.TODO(), ctrl, configuration, 0) }) @@ -185,6 +190,73 @@ var _ = Describe("Configuration Controller with Contexts", func() { Expect(list.Items).To(HaveLen(0)) }) }) + + Context("but the configuration has no resources yet terraform state secret exists", func() { + BeforeEach(func() { + configuration.Status.Resources = ptr.To(0) + Expect(cc.Status().Update(context.Background(), configuration)).To(Succeed()) + Expect(cc.Get(context.Background(), configuration.GetNamespacedName(), configuration)).To(Succeed()) + + // State secret is still present (from BeforeEach) — simulates a failed apply + // that left Resources == 0 in the status but did write some terraform state. + result, _, rerr = controllertests.Roll(context.TODO(), ctrl, configuration, 0) + }) + + It("should not return an error", func() { + Expect(rerr).ToNot(HaveOccurred()) + }) + + It("should create a destroy job despite zero resource count", func() { + list := &batchv1.JobList{} + Expect(cc.List(context.Background(), list)).To(Succeed()) + Expect(list.Items).ToNot(HaveLen(0)) + }) + + It("should not emit a DeletionSkipped event", func() { + for _, ev := range recorder.Events { + Expect(ev).ToNot(ContainSubstring("DeletionSkipped")) + } + }) + + It("should indicate destroy is running in the conditions", func() { + Expect(cc.Get(context.TODO(), configuration.GetNamespacedName(), configuration)).ToNot(HaveOccurred()) + + cond := configuration.Status.GetCondition(corev1alpha1.ConditionReady) + Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + Expect(cond.Reason).To(Equal(corev1alpha1.ReasonInProgress)) + Expect(cond.Message).To(Equal("Terraform destroy is running")) + }) + }) + }) + + Context("and the terraform state secret is missing", func() { + BeforeEach(func() { + // Delete the state secret to simulate an apply that failed before state was uploaded + state := fixtures.NewTerraformState(configuration) + state.Namespace = ctrl.ControllerNamespace + Expect(cc.Delete(context.Background(), state)).To(Succeed()) + + result, _, rerr = controllertests.Roll(context.TODO(), ctrl, configuration, 0) + }) + + It("should not return an error", func() { + Expect(rerr).ToNot(HaveOccurred()) + }) + + It("should create a destroy job even though state secret is missing", func() { + list := &batchv1.JobList{} + Expect(cc.List(context.Background(), list)).To(Succeed()) + Expect(list.Items).ToNot(HaveLen(0)) + }) + + It("should indicate the status in the conditions", func() { + Expect(cc.Get(context.TODO(), configuration.GetNamespacedName(), configuration)).ToNot(HaveOccurred()) + + cond := configuration.Status.GetCondition(corev1alpha1.ConditionReady) + Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + Expect(cond.Reason).To(Equal(corev1alpha1.ReasonInProgress)) + Expect(cond.Message).To(Equal("Terraform destroy is running")) + }) }) Context("and the provider is not ready", func() {