diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 9e7767f6cc8..c06b71049b0 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -5,7 +5,6 @@ import ( "context" "encoding/json" "errors" - "flag" "fmt" "io" "log/slog" @@ -20,6 +19,7 @@ import ( "github.com/Azure/agentbaker/aks-node-controller/parser" "github.com/Azure/agentbaker/aks-node-controller/pkg/nodeconfigutils" "github.com/fsnotify/fsnotify" + "github.com/urfave/cli/v3" ) type App struct { @@ -36,42 +36,6 @@ type App struct { nodeCustomDataPath string } -// commandMetadata holds all metadata for a command in one place. -type commandMetadata struct { - taskName string - handler func(*App, context.Context, []string) error -} - -// getCommandRegistry returns the command registry mapping command names to their metadata. -// Adding a new command only requires adding one entry here. -func getCommandRegistry() map[string]commandMetadata { - return map[string]commandMetadata{ - "provision": { - taskName: "Provision", - handler: func(a *App, ctx context.Context, args []string) error { - provisionResult, err := a.runProvision(ctx, args[2:]) - // Always notify after provisioning attempt (success is a no-op inside notifier) - a.writeCompleteFileOnError(provisionResult, err) - return err - }, - }, - "provision-wait": { - taskName: "ProvisionWait", - handler: func(a *App, ctx context.Context, args []string) error { - provisionStatusFiles := ProvisionStatusFiles{ - ProvisionJSONFile: provisionJSONFilePath, - ProvisionCompleteFile: provisionCompleteFilePath, - } - provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles) - //nolint:forbidigo // stdout is part of the interface - fmt.Println(provisionOutput) - slog.Info("provision-wait finished", "provisionOutput", provisionOutput) - return err - }, - }, - } -} - // provision.json values are emitted as strings by the shell jq invocation. // We only care about ExitCode + Error + Output (snippet) for failure detection. type ProvisionResult struct { @@ -99,76 +63,120 @@ type ProvisionStatusFiles struct { } func (a *App) Run(ctx context.Context, args []string) int { - if handled := handleInfoCommand(args); handled { - return 0 + cmd := &cli.Command{ + Name: "aks-node-controller", + Usage: "Parse contract and run csecmd", + Version: Version, + ExitErrHandler: func(context.Context, *cli.Command, error) { + // Return errors to the caller so exit codes are derived consistently in one place. + }, + Action: func(context.Context, *cli.Command) error { + if len(args) > 1 { + return fmt.Errorf("unknown command: %s", args[1]) + } + return errors.New("missing command argument") + }, + Commands: []*cli.Command{ + { + Name: "provision", + Usage: "Run node provisioning", + Flags: []cli.Flag{ + &cli.StringFlag{Name: "provision-config", Usage: "path to the provision config file"}, + &cli.StringFlag{Name: "nbc-cmd", Usage: "path to the NBC command file"}, + &cli.BoolFlag{Name: "dry-run", Usage: "print the command that would be run without executing it"}, + }, + Action: func(ctx context.Context, cmd *cli.Command) error { + return a.runProvisionCommand(ctx, ProvisionFlags{ + ProvisionConfig: cmd.String("provision-config"), + NBCCmd: cmd.String("nbc-cmd"), + }, cmd.Bool("dry-run")) + }, + }, + { + Name: "provision-wait", + Usage: "Wait for provisioning to complete", + Action: func(ctx context.Context, cmd *cli.Command) error { + provisionStatusFiles := ProvisionStatusFiles{ + ProvisionJSONFile: provisionJSONFilePath, + ProvisionCompleteFile: provisionCompleteFilePath, + } + provisionOutput, err := a.runProvisionWaitCommand(ctx, provisionStatusFiles) + _, _ = fmt.Fprintln(cmd.Root().Writer, provisionOutput) + return err + }, + }, + { + Name: "version", + Usage: "Print the version", + Action: func(_ context.Context, cmd *cli.Command) error { + _, _ = fmt.Fprintln(cmd.Root().Writer, Version) + return nil + }, + }, + { + Name: "download-hotfix", + Usage: "Download the requested hotfix binary", + Action: func(ctx context.Context, cmd *cli.Command) error { + if len(cmd.Args().Slice()) > 0 { + return fmt.Errorf("unexpected download-hotfix arguments: %s", strings.Join(cmd.Args().Slice(), " ")) + } + return a.runDownloadHotfixCommand(ctx) + }, + }, + }, } - slog.Info("aks-node-controller started", "args", args) - err := a.run(ctx, args) - exitCode := errToExitCode(err) - if exitCode == 0 { - slog.Info("aks-node-controller finished successfully.") - } else { - slog.Error("aks-node-controller failed", "error", err) - } - return exitCode + err := cmd.Run(ctx, args) + return errToExitCode(err) } -// handleInfoCommand handles --version, version, --help, and help commands -// without logging noise. Returns true if handled. -func handleInfoCommand(args []string) bool { - if len(args) < 2 { - return false - } - switch args[1] { - case "--version", "version": - _, _ = fmt.Fprintln(os.Stdout, Version) - return true - case "--help", "help": - _, _ = fmt.Fprintln(os.Stdout, "Usage: aks-node-controller [options]") - _, _ = fmt.Fprintln(os.Stdout) - _, _ = fmt.Fprintln(os.Stdout, "Commands:") - _, _ = fmt.Fprintln(os.Stdout, " provision Run node provisioning") - _, _ = fmt.Fprintln(os.Stdout, " provision-wait Wait for provisioning to complete") - _, _ = fmt.Fprintln(os.Stdout, " version Print the version") - _, _ = fmt.Fprintln(os.Stdout, " help Print this help message") - return true - default: - return false - } -} +func (a *App) runProvisionCommand(ctx context.Context, flags ProvisionFlags, dryRun bool) error { + slog.Info("aks-node-controller started", "task", "Provision") -func (a *App) run(ctx context.Context, args []string) error { - command := "" - if len(args) >= 2 { - command = args[1] - } - if command == "" { - return errors.New("missing command argument") - } - - cmd, ok := getCommandRegistry()[command] - if !ok { - return fmt.Errorf("unknown command: %s", command) + startTime := time.Now() + a.eventLogger.LogEvent("Provision", "Starting", helpers.EventLevelInformational, startTime, startTime) + provisionResult, err := a.runProvision(ctx, flags, dryRun) + a.writeCompleteFileOnError(provisionResult, err) + endTime := time.Now() + if err != nil { + message := fmt.Sprintf("aks-node-controller exited with error %s", err.Error()) + a.eventLogger.LogEvent("Provision", message, helpers.EventLevelError, startTime, endTime) + slog.Error("aks-node-controller failed", "error", err) + } else { + a.eventLogger.LogEvent("Provision", "Completed", helpers.EventLevelInformational, startTime, endTime) + slog.Info("aks-node-controller finished successfully.") } + return err +} - // Self-update before provisioning: check for hotfix version and install if needed. - // On success, syscall.Exec replaces this process and never returns. - // On any failure, selfUpdate logs a warning and returns nil (best-effort). - a.selfUpdate(ctx) +func (a *App) runProvisionWaitCommand(ctx context.Context, provisionStatusFiles ProvisionStatusFiles) (string, error) { + slog.Info("aks-node-controller started", "task", "ProvisionWait") startTime := time.Now() - a.eventLogger.LogEvent(cmd.taskName, "Starting", helpers.EventLevelInformational, startTime, startTime) - - err := cmd.handler(a, ctx, args) + a.eventLogger.LogEvent("ProvisionWait", "Starting", helpers.EventLevelInformational, startTime, startTime) + provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles) endTime := time.Now() if err != nil { message := fmt.Sprintf("aks-node-controller exited with error %s", err.Error()) - a.eventLogger.LogEvent(cmd.taskName, message, helpers.EventLevelError, startTime, endTime) + a.eventLogger.LogEvent("ProvisionWait", message, helpers.EventLevelError, startTime, endTime) + slog.Error("aks-node-controller failed", "error", err) } else { - a.eventLogger.LogEvent(cmd.taskName, "Completed", helpers.EventLevelInformational, startTime, endTime) + a.eventLogger.LogEvent("ProvisionWait", "Completed", helpers.EventLevelInformational, startTime, endTime) + slog.Info("aks-node-controller finished successfully.") } - return err + slog.Info("provision-wait finished", "provisionOutput", provisionOutput) + return provisionOutput, err +} + +func (a *App) runDownloadHotfixCommand(ctx context.Context) error { + slog.Info("aks-node-controller hotfix download started") + err := a.downloadHotfix(ctx) + if err != nil { + slog.Error("aks-node-controller hotfix download failed", "error", err) + return err + } + slog.Info("aks-node-controller hotfix download finished") + return nil } func buildCmdFromProvisionConfig(ctx context.Context, path string) (*exec.Cmd, error) { @@ -275,10 +283,10 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) (*ProvisionRe return provisionResult, err } -// runProvision encapsulates argument parsing and execution for the "provision" subcommand. +// runProvision encapsulates execution for the "provision" subcommand after CLI parsing. // It returns an error describing any failure; callers should pass that error to // writeCompleteFileOnError so the sentinel file can be written on fail-fast paths. -func (a *App) runProvision(ctx context.Context, args []string) (*ProvisionResult, error) { +func (a *App) runProvision(ctx context.Context, flags ProvisionFlags, dryRun bool) (*ProvisionResult, error) { // Handle panics gracefully to ensure we always return a valid result provisionResult := &ProvisionResult{} var err error @@ -293,25 +301,15 @@ func (a *App) runProvision(ctx context.Context, args []string) (*ProvisionResult } }() - fs := flag.NewFlagSet("provision", flag.ContinueOnError) - provisionConfig := fs.String("provision-config", "", "path to the provision config file") - nbcCMD := fs.String("nbc-cmd", "", "path to the NBC command file") - dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it") - if parseErr := fs.Parse(args); parseErr != nil { - provisionResult.ExitCode = strconv.Itoa(240) - provisionResult.Error = fmt.Sprintf("parse args: %v", parseErr) - return provisionResult, errors.New(provisionResult.Error) - } - - if *provisionConfig == "" && *nbcCMD == "" { + if flags.ProvisionConfig == "" && flags.NBCCmd == "" { provisionResult.ExitCode = strconv.Itoa(240) provisionResult.Error = "--provision-config or --nbc-cmd is required" return provisionResult, errors.New(provisionResult.Error) } - if *dryRun { + if dryRun { a.cmdRun = cmdRunnerDryRun } - return a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig, NBCCmd: *nbcCMD}) + return a.Provision(ctx, flags) } // writeCompleteFileOnError writes the provision.complete sentinel if err is non-nil, diff --git a/aks-node-controller/app_test.go b/aks-node-controller/app_test.go index 025b8f491b4..471af6e2222 100644 --- a/aks-node-controller/app_test.go +++ b/aks-node-controller/app_test.go @@ -90,6 +90,26 @@ func TestApp_Run(t *testing.T) { assert.Equal(t, 0, exitCode) }) + t.Run("download-hotfix rejects unexpected arguments", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "download-hotfix", "extra"}) + assert.Equal(t, 1, exitCode) + }) + + t.Run("download-hotfix returns success when VHD version already matches target", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + origVersion := Version + Version = "202603.30.0-hotfix1" + defer func() { Version = origVersion }() + + configPath := filepath.Join(t.TempDir(), "hotfix-config.json") + require.NoError(t, os.WriteFile(configPath, []byte(`{"version": "202603.30.0-hotfix1"}`), 0o644)) + tt.App.hotfixVersionPath = configPath + + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "download-hotfix"}) + assert.Equal(t, 0, exitCode) + }) + t.Run("provision command with missing flag", func(t *testing.T) { tt := NewTestApp(t, TestAppConfig{}) exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "provision"}) diff --git a/aks-node-controller/go.mod b/aks-node-controller/go.mod index 7daefb5c643..fc192a38e74 100644 --- a/aks-node-controller/go.mod +++ b/aks-node-controller/go.mod @@ -9,7 +9,9 @@ require ( github.com/fsnotify/fsnotify v1.8.0 github.com/google/go-cmp v0.7.0 github.com/stretchr/testify v1.11.1 + github.com/urfave/cli/v3 v3.8.0 google.golang.org/protobuf v1.36.6 + gopkg.in/yaml.v3 v3.0.1 ) require ( @@ -29,7 +31,6 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/vincent-petithory/dataurl v1.0.0 // indirect golang.org/x/sys v0.40.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) replace github.com/Azure/agentbaker => ../ diff --git a/aks-node-controller/go.sum b/aks-node-controller/go.sum index fc744056923..2b305a2e964 100644 --- a/aks-node-controller/go.sum +++ b/aks-node-controller/go.sum @@ -51,6 +51,8 @@ github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/urfave/cli/v3 v3.8.0 h1:XqKPrm0q4P0q5JpoclYoCAv0/MIvH/jZ2umzuf8pNTI= +github.com/urfave/cli/v3 v3.8.0/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso= github.com/vincent-petithory/dataurl v1.0.0 h1:cXw+kPto8NLuJtlMsI152irrVw9fRDX8AbShPRpg2CI= github.com/vincent-petithory/dataurl v1.0.0/go.mod h1:FHafX5vmDzyP+1CQATJn7WFKc9CvnvxyvZy6I1MrG/U= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= diff --git a/aks-node-controller/selfupdate.go b/aks-node-controller/hotfix.go similarity index 80% rename from aks-node-controller/selfupdate.go rename to aks-node-controller/hotfix.go index 2179d751d4a..8b88a0c5612 100644 --- a/aks-node-controller/selfupdate.go +++ b/aks-node-controller/hotfix.go @@ -9,7 +9,6 @@ import ( "os/exec" "path/filepath" "strings" - "syscall" "time" ) @@ -23,58 +22,45 @@ const ( vhdBinaryPath = "/opt/azure/containers/aks-node-controller" // hotfixBinaryPath is where the hotfix binary is placed alongside the VHD-baked binary. // The wrapper script checks for this path and prefers it over the VHD-baked binary. - // This avoids overwriting a running binary, which is not possible on Windows. hotfixBinaryPath = "/opt/azure/containers/aks-node-controller-hotfix" // pkgBinaryPath is where apt/dnf package installs the binary. pkgBinaryPath = "/usr/bin/aks-node-controller" ) -// selfUpdate checks for a hotfix version and installs it from PMC if needed. -// It is called before command dispatch for provision and provision-wait commands. -// On successful install, it re-execs the process with the new binary and never returns. -// On any failure, it logs a warning so the VHD-baked binary proceeds. -func (a *App) selfUpdate(ctx context.Context) { +// downloadHotfix installs the requested hotfix and stages it alongside the VHD-baked binary. +// The wrapper script decides which binary to execute after this command returns. +func (a *App) downloadHotfix(ctx context.Context) error { hotfixPath := a.hotfixVersionPath if hotfixPath == "" { hotfixPath = defaultHotfixVersionPath } hotfixVersion, err := readHotfixVersion(hotfixPath) if err != nil { - slog.Error("failed to read hotfix version, proceeding with VHD-baked version", - "path", hotfixPath, "error", err) - return + return fmt.Errorf("read hotfix version from %s: %w", hotfixPath, err) } if hotfixVersion == "" { - return + slog.Info("hotfix config does not request a version, skipping download", "path", hotfixPath) + return nil } + if Version == hotfixVersion { - slog.Info("ANC already at hotfix version, skipping self-update", "version", Version) - return + slog.Info("ANC already at hotfix version, skipping download", "version", Version) + return nil } - slog.Info("ANC self-update triggered", "current", Version, "target", hotfixVersion) + slog.Info("downloading ANC hotfix", "current", Version, "target", hotfixVersion) - installErr := a.installFromPMC(ctx, hotfixVersion) - if installErr != nil { - slog.Error("failed to install hotfix, proceeding with VHD-baked version", - "target", hotfixVersion, "error", installErr) - return + if err := a.installFromPMC(ctx, hotfixVersion); err != nil { + return fmt.Errorf("install hotfix version %s: %w", hotfixVersion, err) } - // Copy the hotfix binary alongside the VHD-baked binary rather than overwriting it. - // This avoids replacing a running binary (which is not possible on Windows) and lets - // the wrapper script choose the hotfix on subsequent restarts. if err := copyBinaryAlongside(pkgBinaryPath, hotfixBinaryPath, vhdBinaryPath); err != nil { - slog.Error("failed to copy hotfix binary alongside VHD binary, proceeding with current binary", - "error", err) - return + return fmt.Errorf("stage hotfix binary: %w", err) } - if err := a.reExec(); err != nil { - slog.Error("failed to re-exec after hotfix install, proceeding with current binary", - "error", err) - } + slog.Info("downloaded ANC hotfix", "target", hotfixVersion, "path", hotfixBinaryPath) + return nil } // hotfixConfig is the JSON structure of the hotfix configuration file. @@ -292,14 +278,3 @@ func copyBinaryAlongside(src, dst, refPath string) error { slog.Info("installed hotfix binary alongside VHD binary", "src", src, "hotfixPath", dst) return nil } - -// reExec replaces the current process with the updated hotfix binary. -// On Linux this uses syscall.Exec which atomically replaces the process in-place. -// TODO(windows): syscall.Exec is not available on Windows. When Windows hotfix support -// is added, this will need a platform-specific implementation (e.g., spawn the hotfix -// binary as a child process and exit, or defer to the wrapper script for restart). -func (a *App) reExec() error { - args := append([]string{hotfixBinaryPath}, os.Args[1:]...) - slog.Info("re-executing with hotfix binary", "path", hotfixBinaryPath, "args", args) - return syscall.Exec(hotfixBinaryPath, args, os.Environ()) -} diff --git a/aks-node-controller/selfupdate_test.go b/aks-node-controller/hotfix_test.go similarity index 91% rename from aks-node-controller/selfupdate_test.go rename to aks-node-controller/hotfix_test.go index d6ab6f8ae75..4e58f12782b 100644 --- a/aks-node-controller/selfupdate_test.go +++ b/aks-node-controller/hotfix_test.go @@ -99,39 +99,36 @@ func TestResolveMicrosoftProdSourceListPath(t *testing.T) { }) } -func TestSelfUpdate_NoHotfixFile(t *testing.T) { - // When no hotfix-version file exists, selfUpdate should be a no-op. +func TestDownloadHotfix_NoHotfixFile(t *testing.T) { tt := NewTestApp(t, TestAppConfig{}) tt.App.hotfixVersionPath = filepath.Join(t.TempDir(), "nonexistent") - tt.App.selfUpdate(context.Background()) // should not panic + require.NoError(t, tt.App.downloadHotfix(context.Background())) } -func TestSelfUpdate_VersionMatch(t *testing.T) { - // When the compiled version matches the hotfix version, selfUpdate should skip. +func TestDownloadHotfix_VersionMatch(t *testing.T) { origVersion := Version Version = "202603.30.0-hotfix1" defer func() { Version = origVersion }() dir := t.TempDir() path := filepath.Join(dir, "hotfix-config.json") - require.NoError(t, os.WriteFile(path, []byte(`{"version": "202603.30.0-hotfix1"}`), 0644)) + require.NoError(t, os.WriteFile(path, []byte(`{"version": "202603.30.0-hotfix1"}`), 0o644)) tt := NewTestApp(t, TestAppConfig{}) tt.App.hotfixVersionPath = path - tt.App.selfUpdate(context.Background()) // should not panic + require.NoError(t, tt.App.downloadHotfix(context.Background())) } -func TestSelfUpdate_UnreadableFile(t *testing.T) { - // When the hotfix file exists but is unreadable, selfUpdate should log warning and continue. +func TestDownloadHotfix_UnreadableFile(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "hotfix-config.json") - require.NoError(t, os.WriteFile(path, []byte(`{"version": "1.0.0"}`), 0644)) - require.NoError(t, os.Chmod(path, 0000)) - t.Cleanup(func() { _ = os.Chmod(path, 0644) }) + require.NoError(t, os.WriteFile(path, []byte(`{"version": "1.0.0"}`), 0o644)) + require.NoError(t, os.Chmod(path, 0o000)) + t.Cleanup(func() { _ = os.Chmod(path, 0o644) }) tt := NewTestApp(t, TestAppConfig{}) tt.App.hotfixVersionPath = path - tt.App.selfUpdate(context.Background()) // should not panic, logs warning + require.Error(t, tt.App.downloadHotfix(context.Background())) } func TestRetryCommand_SuccessOnFirstAttempt(t *testing.T) { diff --git a/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh b/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh index e7898f5c599..6051688beff 100644 --- a/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh +++ b/parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh @@ -7,6 +7,7 @@ done BIN_PATH="${BIN_PATH:-/opt/azure/containers/aks-node-controller}" HOTFIX_BIN="${BIN_PATH}-hotfix" +HOTFIX_JSON="/opt/azure/containers/aks-node-controller-hotfix.json" CONFIG_PATH="${CONFIG_PATH:-/opt/azure/containers/aks-node-controller-config.json}" NBC_CMD_PATH="${NBC_CMD_PATH:-/opt/azure/containers/aks-node-controller-nbc-cmd.sh}" LOGGER_TAG="aks-node-controller-wrapper" @@ -18,6 +19,18 @@ log() { echo "$message" } +# this is to ensure that shellspec won't interpret any further lines below +${__SOURCED__:+return} + +if [ -f "$HOTFIX_JSON" ]; then + log "Downloading ANC hotfix from ${HOTFIX_JSON}" + if "$BIN_PATH" download-hotfix; then + log "Finished downloading ANC hotfix from ${HOTFIX_JSON}" + else + log "Failed to download ANC hotfix from ${HOTFIX_JSON}; falling back to staged binaries" + fi +fi + if [ -x "$HOTFIX_BIN" ]; then BIN_PATH="$HOTFIX_BIN" log "Using hotfix binary: $HOTFIX_BIN" @@ -25,9 +38,6 @@ else log "Using VHD-baked binary: $BIN_PATH" fi -# this is to ensure that shellspec won't interpret any further lines below -${__SOURCED__:+return} - command=("$BIN_PATH" provision) if [ -f "$CONFIG_PATH" ]; then log "Launching aks-node-controller with config ${CONFIG_PATH}"