diff --git a/aks-node-controller/README.md b/aks-node-controller/README.md index 26ea71a297e..f528b5e3820 100644 --- a/aks-node-controller/README.md +++ b/aks-node-controller/README.md @@ -88,20 +88,20 @@ Clients need to provide CSE and Custom Data. [nodeconfigutils](pkg/nodeconfiguti 1. Custom Data: Contains base64 encoded bootstrap configuration of type [aksnodeconfigv1.Configuration](pkg/gen/aksnodeconfig/v1) in json format which is placed on the node through cloud-init write directive. -Format: -```yaml -#cloud-config -write_files: -- path: /opt/azure/containers/aks-node-controller-config.json - permissions: "0755" - owner: root - content: !!binary | - {{ encodedAKSNodeConfig }}` -``` + Format: + ```yaml + #cloud-config + write_files: + - path: /opt/azure/containers/aks-node-controller-config.json + permissions: "0755" + owner: root + content: !!binary | + {{ encodedAKSNodeConfig }} + ``` 2. CSE: Script used to poll bootstrap status and return exit status once complete. -CSE script: `/opt/azure/containers/aks-node-controller provision-wait` + CSE script: `/opt/azure/containers/aks-node-controller provision-wait` #### Provisioning flow diagram: @@ -134,5 +134,12 @@ Key components: 1. `aks-node-controller.service`: systemd unit that is triggered once cloud-init is complete (guaranteeing that config is present on disk) and then kickstarts bootstrapping. 2. `aks-node-controller` go binary with two modes: -- **provision**: parses the node config and triggers bootstrap process -- **provision-wait**: waits for `provision.complete` to be present and reads `provision.json` which contains the provision output of type `CSEStatus` and is returned by CSE through capturing stdout +- **provision**: Parses the node configuration and starts the bootstrap sequence. 
+ - The controller performs a tolerant (forward-compatible) parse of `aksnodeconfigv1.Configuration`: unknown fields, additional enum values, or future-version knobs are ignored (and may be logged) so that a newer control plane can talk to an older VHD image. + - If the config cannot be safely interpreted (e.g. unsupported `Version`, malformed required field, or incompatible schema change), the controller fails fast. It writes the sentinel file `provision.complete` early so the `provision-wait` process stops polling and can surface an error instead of hanging indefinitely. + - On a fail-fast path the normal bootstrap scripts never run, so `provision.json` (which would contain the serialized `CSEStatus`) is never created. A typical error looks like: + ``` + failed to read provision.json: open /var/log/azure/aks/provision.json: no such file or directory. One reason could be that AKSNodeConfig is not properly set + ``` + This indicates the controller exited before emitting `provision.json`. Most commonly the rendered AKSNodeConfig was missing, had the wrong `Version` (expected `v1`), or was written to the wrong path (`/opt/azure/containers/aks-node-controller-config.json`). Fix the config generation, redeploy, and the bootstrap scripts will then populate `provision.json`. +- **provision-wait**: Waits for `provision.complete` to be present and reads `provision.json`, which contains the provision output of type `CSEStatus` and is returned by CSE through capturing stdout. 
diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 8a18ecccf00..532b90d15ed 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -58,20 +58,10 @@ func (a *App) run(ctx context.Context, args []string) error { } switch args[1] { case "provision": - fs := flag.NewFlagSet("provision", flag.ContinueOnError) - provisionConfig := fs.String("provision-config", "", "path to the provision config file") - dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") - err := fs.Parse(args[2:]) - if err != nil { - return fmt.Errorf("parse args: %w", err) - } - if provisionConfig == nil || *provisionConfig == "" { - return errors.New("--provision-config is required") - } - if dryRun != nil && *dryRun { - a.cmdRunner = cmdRunnerDryRun - } - return a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig}) + err := a.runProvision(ctx, args[2:]) + // Always notify after provisioning attempt (success is a no-op inside notifier) + a.writeCompleteFileOnError(err) + return err case "provision-wait": provisionStatusFiles := ProvisionStatusFiles{ProvisionJSONFile: provisionJSONFilePath, ProvisionCompleteFile: provisionCompleteFilePath} provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles) @@ -116,16 +106,51 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) error { if cmd.ProcessState != nil { exitCode = cmd.ProcessState.ExitCode() } - // Is it ok to log a single line? Is it too much? slog.Info("CSE finished", "exitCode", exitCode, "stdout", stdoutBuf.String(), "stderr", stderrBuf.String(), "error", err) return err } +// runProvision encapsulates argument parsing and execution for the "provision" subcommand. +// It returns an error describing any failure; callers should pass that error to +// writeCompleteFileOnError so the sentinel file can be written on fail-fast paths. 
+func (a *App) runProvision(ctx context.Context, args []string) error { + fs := flag.NewFlagSet("provision", flag.ContinueOnError) + provisionConfig := fs.String("provision-config", "", "path to the provision config file") + dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") + if err := fs.Parse(args); err != nil { + return fmt.Errorf("parse args: %w", err) + } + if *provisionConfig == "" { + return errors.New("--provision-config is required") + } + if *dryRun { + a.cmdRunner = cmdRunnerDryRun + } + return a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig}) +} + +// writeCompleteFileOnError writes the provision.complete sentinel if err is non-nil, +// allowing provision-wait mode to unblock early on fail-fast validation errors. +func (a *App) writeCompleteFileOnError(err error) { + if err == nil { + return + } + if _, statErr := os.Stat(provisionCompleteFilePath); statErr == nil { + return // already exists + } else if !errors.Is(statErr, os.ErrNotExist) { // unexpected error + slog.Error("failed to stat provision.complete file", "path", provisionCompleteFilePath, "error", statErr) + return + } + if writeErr := os.WriteFile(provisionCompleteFilePath, []byte{}, 0600); writeErr != nil { + slog.Error("failed to write provision.complete file", "path", provisionCompleteFilePath, "error", writeErr) + } +} + func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) (string, error) { if _, err := os.Stat(filepaths.ProvisionCompleteFile); err == nil { data, err := os.ReadFile(filepaths.ProvisionJSONFile) if err != nil { - return "", fmt.Errorf("failed to read provision.json: %w", err) + return "", fmt.Errorf("failed to read provision.json: %w. 
One reason could be that AKSNodeConfig is not properly set", err) } return string(data), nil } @@ -135,11 +160,9 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) return "", fmt.Errorf("failed to create watcher: %w", err) } defer watcher.Close() - // Watch the directory containing the provision complete file dir := filepath.Dir(filepaths.ProvisionCompleteFile) - err = os.MkdirAll(dir, 0755) // create the directory if it doesn't exist - if err != nil { + if err = os.MkdirAll(dir, 0755); err != nil { // create the directory if it doesn't exist return "", fmt.Errorf("failed to create directory %s: %w", dir, err) } if err = watcher.Add(dir); err != nil { @@ -152,7 +175,7 @@ func (a *App) ProvisionWait(ctx context.Context, filepaths ProvisionStatusFiles) if event.Op&fsnotify.Create == fsnotify.Create && event.Name == filepaths.ProvisionCompleteFile { data, err := os.ReadFile(filepaths.ProvisionJSONFile) if err != nil { - return "", fmt.Errorf("failed to read provision.json: %w", err) + return "", fmt.Errorf("failed to read provision.json: %w. One reason could be that AKSNodeConfig is not properly set", err) } return string(data), nil }