From d28a79170e08566828b242cce3d6c17e5a1062ec Mon Sep 17 00:00:00 2001 From: Devin Wong Date: Thu, 2 Oct 2025 12:13:51 -0700 Subject: [PATCH 1/7] return error early to avoid provision-wait timeout --- aks-node-controller/app.go | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 8a18ecccf00..9ea83ae5fba 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -52,18 +52,26 @@ func (a *App) Run(ctx context.Context, args []string) int { return exitCode } -func (a *App) run(ctx context.Context, args []string) error { +func (a *App) run(ctx context.Context, args []string) (err error) { if len(args) < 2 { return errors.New("missing command argument") } switch args[1] { case "provision": + // Notify provision-wait mode if error occurs before running CSE + defer func() { + if err != nil { + //best-effort to notify provision-wait mode so that it doesn't wait until timeout + if writeErr := os.WriteFile(provisionCompleteFilePath, []byte{}, 0644); writeErr != nil { + slog.Error("failed to write provision complete file", "error", writeErr) + } + } + }() fs := flag.NewFlagSet("provision", flag.ContinueOnError) provisionConfig := fs.String("provision-config", "", "path to the provision config file") dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") - err := fs.Parse(args[2:]) - if err != nil { - return fmt.Errorf("parse args: %w", err) + if parseErr := fs.Parse(args[2:]); parseErr != nil { + return fmt.Errorf("parse args: %w", parseErr) } if provisionConfig == nil || *provisionConfig == "" { return errors.New("--provision-config is required") @@ -71,14 +79,15 @@ func (a *App) run(ctx context.Context, args []string) error { if dryRun != nil && *dryRun { a.cmdRunner = cmdRunnerDryRun } - return a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig}) + err = a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig}) + return err case "provision-wait": provisionStatusFiles := ProvisionStatusFiles{ProvisionJSONFile: provisionJSONFilePath, ProvisionCompleteFile: provisionCompleteFilePath} - provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles) + provisionOutput, waitErr := a.ProvisionWait(ctx, provisionStatusFiles) //nolint:forbidigo // stdout is part of the interface fmt.Println(provisionOutput) slog.Info("provision-wait finished", "provisionOutput", provisionOutput) - return err + return waitErr default: return fmt.Errorf("unknown command: %s", args[1]) } @@ -116,7 +125,6 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) error { if cmd.ProcessState != nil { exitCode = cmd.ProcessState.ExitCode() } - // Is it ok to log a single line? Is it too much? slog.Info("CSE finished", "exitCode", exitCode, "stdout", stdoutBuf.String(), "stderr", stderrBuf.String(), "error", err) return err } @@ -125,7 +133,7 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) if _, err := os.Stat(filepaths.ProvisionCompleteFile); err == nil { data, err := os.ReadFile(filepaths.ProvisionJSONFile) if err != nil { - return "", fmt.Errorf("failed to read provision.json: %w", err) + return "", fmt.Errorf("failed to read provision.json: %w. One reason could be that AKSNodeConfig is not properly set.", err) } return string(data), nil } @@ -134,12 +142,9 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) if err != nil { return "", fmt.Errorf("failed to create watcher: %w", err) } - defer watcher.Close() - // Watch the directory containing the provision complete file dir := filepath.Dir(filepaths.ProvisionCompleteFile) - err = os.MkdirAll(dir, 0755) // create the directory if it doesn't exist - if err != nil { + if err = os.MkdirAll(dir, 0755); err != nil { // create the directory if it doesn't exist return "", fmt.Errorf("failed to create directory %s: %w", dir, err) } if err = watcher.Add(dir); err != nil { @@ -152,7 +157,7 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) if event.Op&fsnotify.Create == fsnotify.Create && event.Name == filepaths.ProvisionCompleteFile { data, err := os.ReadFile(filepaths.ProvisionJSONFile) if err != nil { - return "", fmt.Errorf("failed to read provision.json: %w", err) + return "", fmt.Errorf("failed to read provision.json: %w. One reason could be that AKSNodeConfig is not properly set.", err) } return string(data), nil } From 71fb3293cc25fd779be485dbffd44545414081bf Mon Sep 17 00:00:00 2001 From: Devin Wong Date: Thu, 2 Oct 2025 12:14:05 -0700 Subject: [PATCH 2/7] update readme --- aks-node-controller/README.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/aks-node-controller/README.md b/aks-node-controller/README.md index 26ea71a297e..c65f3e4daf8 100644 --- a/aks-node-controller/README.md +++ b/aks-node-controller/README.md @@ -88,20 +88,20 @@ Clients need to provide CSE and Custom Data. [nodeconfigutils](pkg/nodeconfiguti 1. Custom Data: Contains base64 encoded bootstrap configuration of type [aksnodeconfigv1.Configuration](pkg/gen/aksnodeconfig/v1) in json format which is placed on the node through cloud-init write directive. -Format: -```yaml -#cloud-config -write_files: -- path: /opt/azure/containers/aks-node-controller-config.json - permissions: "0755" - owner: root - content: !!binary | - {{ encodedAKSNodeConfig }}` -``` + Format: + ```yaml + #cloud-config + write_files: + - path: /opt/azure/containers/aks-node-controller-config.json + permissions: "0755" + owner: root + content: !!binary | + {{ encodedAKSNodeConfig }}` + ``` 2. CSE: Script used to poll bootstrap status and return exit status once complete. -CSE script: `/opt/azure/containers/aks-node-controller provision-wait` + CSE script: `/opt/azure/containers/aks-node-controller provision-wait` #### Provisioning flow diagram: @@ -134,5 +134,7 @@ Key components: 1. `aks-node-controller.service`: systemd unit that is triggered once cloud-init is complete (guaranteeing that config is present on disk) and then kickstarts bootstrapping. 2. `aks-node-controller` go binary with two modes: -- **provision**: parses the node config and triggers bootstrap process -- **provision-wait**: waits for `provision.complete` to be present and reads `provision.json` which contains the provision output of type `CSEStatus` and is returned by CSE through capturing stdout +- **provision**: parses the node config and triggers bootstrap process. + - `aks-node-controller` will try its best to parse `aksnodeconfigv1`, even with some unexpected situations, e.g., unknown fields in the config, unknown enum values, etc. This is to support backward compatibility where `aksnodeconfigv1` is newer than the one used in `aks-node-controller` in an older VHD node image. + - If `aks-node-controller` can't handle the `aksnodeconfigv1` properly, e.g., the config version is different from the expected version, it will return early by creating the `provision.complete` to notify the `provision-wait` to stop waiting and get the provision error. +- **provision-wait**: waits for `provision.complete` to be present and reads `provision.json` which contains the provision output of type `CSEStatus` and is returned by CSE through capturing stdout. From 87e22f255f0dad707cdff12b9e90e4d23a555813 Mon Sep 17 00:00:00 2001 From: Devin Wong Date: Thu, 2 Oct 2025 14:51:45 -0700 Subject: [PATCH 3/7] extract the function --- aks-node-controller/README.md | 11 ++++++++--- aks-node-controller/app.go | 32 +++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/aks-node-controller/README.md b/aks-node-controller/README.md index c65f3e4daf8..f528b5e3820 100644 --- a/aks-node-controller/README.md +++ b/aks-node-controller/README.md @@ -134,7 +134,12 @@ Key components: 1. `aks-node-controller.service`: systemd unit that is triggered once cloud-init is complete (guaranteeing that config is present on disk) and then kickstarts bootstrapping. 2. `aks-node-controller` go binary with two modes: -- **provision**: parses the node config and triggers bootstrap process. - - `aks-node-controller` will try its best to parse `aksnodeconfigv1`, even with some unexpected situations, e.g., unknown fields in the config, unknown enum values, etc. This is to support backward compatibility where `aksnodeconfigv1` is newer than the one used in `aks-node-controller` in an older VHD node image. - - If `aks-node-controller` can't handle the `aksnodeconfigv1` properly, e.g., the config version is different from the expected version, it will return early by creating the `provision.complete` to notify the `provision-wait` to stop waiting and get the provision error. +- **provision**: Parses the node configuration and starts the bootstrap sequence. + - The controller performs a tolerant (forward‑compatible) parse of `aksnodeconfigv1.Configuration`: unknown fields, additional enum values, or future‑version knobs are ignored (and may be logged) so that a newer control‑plane can talk to an older VHD image. + - If the config cannot be safely interpreted (e.g. unsupported `Version`, malformed required field, or incompatible schema change), the controller fails fast. It writes the sentinel file `provision.complete` early so the `provision-wait` process stops polling and can surface an error instead of hanging indefinitely. + - In a fail‑fast path the normal bootstrap scripts never run, therefore `provision.json` (which would contain the serialized `CSEStatus`) is never created. A typical error looks like: + ``` + failed to read provision.json: open /var/log/azure/aks/provision.json: no such file or directory. One reason could be that AKSNodeConfig is not properly set. + ``` + This indicates the controller exited before emitting `provision.json`. Most commonly the rendered AKSNodeConfig was missing, had the wrong `Version` (expected `v1`), or was written to the wrong path (`/opt/azure/containers/aks-node-controller-config.json`). Fix the config generation, redeploy, and the bootstrap scripts will then populate `provision.json`. - **provision-wait**: waits for `provision.complete` to be present and reads `provision.json` which contains the provision output of type `CSEStatus` and is returned by CSE through capturing stdout. diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 9ea83ae5fba..89975f99f35 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -58,15 +58,9 @@ func (a *App) run(ctx context.Context, args []string) (err error) { } switch args[1] { case "provision": - // Notify provision-wait mode if error occurs before running CSE - defer func() { - if err != nil { - //best-effort to notify provision-wait mode so that it doesn't wait until timeout - if writeErr := os.WriteFile(provisionCompleteFilePath, []byte{}, 0644); writeErr != nil { - slog.Error("failed to write provision complete file", "error", writeErr) - } - } - }() + // Ensure provision-wait is unblocked if we error before provisioning scripts create provision.complete + defer a.notifyProvisionFailure(&err) + fs := flag.NewFlagSet("provision", flag.ContinueOnError) provisionConfig := fs.String("provision-config", "", "path to the provision config file") dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") @@ -129,6 +123,26 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) error { return err } +// notifyProvisionFailure creates the provision.complete sentinel file if a provisioning +// error occurred before the normal provisioning scripts had a chance to write it. +// This prevents the provision-wait mode from blocking until timeout when we fail fast +// (e.g. due to invalid or unsupported configuration). +func (a *App) notifyProvisionFailure(runErr *error) { + if runErr == nil || *runErr == nil { // success path or nil pointer + return + } + // Avoid clobbering an existing file if provisioning scripts actually created it before the error surfaced. + if _, statErr := os.Stat(provisionCompleteFilePath); statErr == nil { + return // file already exists + } else if !errors.Is(statErr, os.ErrNotExist) { // unexpected stat error + slog.Error("failed to stat provision.complete file", "error", statErr) + return + } + if writeErr := os.WriteFile(provisionCompleteFilePath, []byte{}, 0644); writeErr != nil { + slog.Error("failed to write provision.complete file", "error", writeErr) + } +} + func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) (string, error) { if _, err := os.Stat(filepaths.ProvisionCompleteFile); err == nil { data, err := os.ReadFile(filepaths.ProvisionJSONFile) From da15d0c0318637dfe616859684db06b80dbc069b Mon Sep 17 00:00:00 2001 From: Devin Wong Date: Thu, 2 Oct 2025 15:06:24 -0700 Subject: [PATCH 4/7] fix linting errors --- aks-node-controller/app.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 89975f99f35..cb335ce2a15 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -52,15 +52,15 @@ func (a *App) Run(ctx context.Context, args []string) int { return exitCode } -func (a *App) run(ctx context.Context, args []string) (err error) { +func (a *App) run(ctx context.Context, args []string) error { if len(args) < 2 { return errors.New("missing command argument") } switch args[1] { case "provision": + var err error // Ensure provision-wait is unblocked if we error before provisioning scripts create provision.complete defer a.notifyProvisionFailure(&err) - fs := flag.NewFlagSet("provision", flag.ContinueOnError) provisionConfig := fs.String("provision-config", "", "path to the provision config file") dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") @@ -138,7 +138,7 @@ func (a *App) notifyProvisionFailure(runErr *error) { slog.Error("failed to stat provision.complete file", "error", statErr) return } - if writeErr := os.WriteFile(provisionCompleteFilePath, []byte{}, 0644); writeErr != nil { + if writeErr := os.WriteFile(provisionCompleteFilePath, []byte{}, 0600); writeErr != nil { slog.Error("failed to write provision.complete file", "error", writeErr) } } @@ -147,7 +147,7 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) if _, err := os.Stat(filepaths.ProvisionCompleteFile); err == nil { data, err := os.ReadFile(filepaths.ProvisionJSONFile) if err != nil { - return "", fmt.Errorf("failed to read provision.json: %w. One reason could be that AKSNodeConfig is not properly set.", err) + return "", fmt.Errorf("failed to read provision.json: %w. One reason could be that AKSNodeConfig is not properly set", err) } return string(data), nil } @@ -171,7 +171,7 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) if event.Op&fsnotify.Create == fsnotify.Create && event.Name == filepaths.ProvisionCompleteFile { data, err := os.ReadFile(filepaths.ProvisionJSONFile) if err != nil { - return "", fmt.Errorf("failed to read provision.json: %w. One reason could be that AKSNodeConfig is not properly set.", err) + return "", fmt.Errorf("failed to read provision.json: %w. One reason could be that AKSNodeConfig is not properly set", err) } return string(data), nil } From 6d939c31eaf685b339797b7979a408e8704251f5 Mon Sep 17 00:00:00 2001 From: Devin Wong Date: Thu, 2 Oct 2025 16:42:51 -0700 Subject: [PATCH 5/7] remove defer pattern --- aks-node-controller/app.go | 54 ++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index cb335ce2a15..00f9929dc50 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -58,22 +58,9 @@ func (a *App) run(ctx context.Context, args []string) error { } switch args[1] { case "provision": - var err error - // Ensure provision-wait is unblocked if we error before provisioning scripts create provision.complete - defer a.notifyProvisionFailure(&err) - fs := flag.NewFlagSet("provision", flag.ContinueOnError) - provisionConfig := fs.String("provision-config", "", "path to the provision config file") - dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") - if parseErr := fs.Parse(args[2:]); parseErr != nil { - return fmt.Errorf("parse args: %w", parseErr) - } - if provisionConfig == nil || *provisionConfig == "" { - return errors.New("--provision-config is required") - } - if dryRun != nil && *dryRun { - a.cmdRunner = cmdRunnerDryRun - } - err = a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig}) + err := a.runProvision(ctx, args[2:]) + // Always notify after provisioning attempt (success is a no-op inside notifier) + a.writeCompleteFileOnError(err) return err case "provision-wait": provisionStatusFiles := ProvisionStatusFiles{ProvisionJSONFile: provisionJSONFilePath, ProvisionCompleteFile: provisionCompleteFilePath} @@ -123,18 +110,34 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) error { return err } -// notifyProvisionFailure creates the provision.complete sentinel file if a provisioning -// error occurred before the normal provisioning scripts had a chance to write it. -// This prevents the provision-wait mode from blocking until timeout when we fail fast -// (e.g. due to invalid or unsupported configuration). -func (a *App) notifyProvisionFailure(runErr *error) { - if runErr == nil || *runErr == nil { // success path or nil pointer +// runProvision encapsulates argument parsing and execution for the "provision" subcommand. +// It returns an error describing any failure; callers should pass that error to +// writeCompleteFileOnError so the sentinel file can be written on fail-fast paths. +func (a *App) runProvision(ctx context.Context, args []string) error { + fs := flag.NewFlagSet("provision", flag.ContinueOnError) + provisionConfig := fs.String("provision-config", "", "path to the provision config file") + dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") + if err := fs.Parse(args); err != nil { + return fmt.Errorf("parse args: %w", err) + } + if *provisionConfig == "" { + return errors.New("--provision-config is required") + } + if *dryRun { + a.cmdRunner = cmdRunnerDryRun + } + return a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig}) +} + +// writeCompleteFileOnError writes the provision.complete sentinel if err is non-nil, +// allowing provision-wait mode to unblock early on fail-fast validation errors. +func (a *App) writeCompleteFileOnError(err error) { + if err == nil { return } - // Avoid clobbering an existing file if provisioning scripts actually created it before the error surfaced. if _, statErr := os.Stat(provisionCompleteFilePath); statErr == nil { - return // file already exists - } else if !errors.Is(statErr, os.ErrNotExist) { // unexpected stat error + return // already exists + } else if !errors.Is(statErr, os.ErrNotExist) { // unexpected error slog.Error("failed to stat provision.complete file", "error", statErr) return } @@ -156,6 +159,7 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) if err != nil { return "", fmt.Errorf("failed to create watcher: %w", err) } + defer watcher.Close() // Watch the directory containing the provision complete file dir := filepath.Dir(filepaths.ProvisionCompleteFile) if err = os.MkdirAll(dir, 0755); err != nil { // create the directory if it doesn't exist From ec795a55a40a0c81ea44273def1a3f0607bf4455 Mon Sep 17 00:00:00 2001 From: Devin Wong Date: Thu, 2 Oct 2025 16:47:50 -0700 Subject: [PATCH 6/7] update log --- aks-node-controller/app.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 00f9929dc50..22fc3a03c04 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -138,11 +138,11 @@ func (a *App) writeCompleteFileOnError(err error) { if _, statErr := os.Stat(provisionCompleteFilePath); statErr == nil { return // already exists } else if !errors.Is(statErr, os.ErrNotExist) { // unexpected error - slog.Error("failed to stat provision.complete file", "error", statErr) + slog.Error("failed to stat provision.complete file", "path", provisionCompleteFilePath, "error", statErr) return } if writeErr := os.WriteFile(provisionCompleteFilePath, []byte{}, 0600); writeErr != nil { - slog.Error("failed to write provision.complete file", "error", writeErr) + slog.Error("failed to write provision.complete file", "path", provisionCompleteFilePath, "error", writeErr) } } From 58797a5183f552aea14cec6a2de3289b9f6a49db Mon Sep 17 00:00:00 2001 From: Devin Wong Date: Thu, 2 Oct 2025 18:33:27 -0700 Subject: [PATCH 7/7] rename waitErr to err --- aks-node-controller/app.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 22fc3a03c04..532b90d15ed 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -64,11 +64,11 @@ func (a *App) run(ctx context.Context, args []string) error { return err case "provision-wait": provisionStatusFiles := ProvisionStatusFiles{ProvisionJSONFile: provisionJSONFilePath, ProvisionCompleteFile: provisionCompleteFilePath} - provisionOutput, waitErr := a.ProvisionWait(ctx, provisionStatusFiles) + provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles) //nolint:forbidigo // stdout is part of the interface fmt.Println(provisionOutput) slog.Info("provision-wait finished", "provisionOutput", provisionOutput) - return waitErr + return err default: return fmt.Errorf("unknown command: %s", args[1]) }