Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 106 additions & 108 deletions aks-node-controller/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"log/slog"
Expand All @@ -20,6 +19,7 @@ import (
"github.com/Azure/agentbaker/aks-node-controller/parser"
"github.com/Azure/agentbaker/aks-node-controller/pkg/nodeconfigutils"
"github.com/fsnotify/fsnotify"
"github.com/urfave/cli/v3"
)

type App struct {
Expand All @@ -36,42 +36,6 @@ type App struct {
nodeCustomDataPath string
}

// commandMetadata holds all metadata for a command in one place.
type commandMetadata struct {
taskName string
handler func(*App, context.Context, []string) error
}

// getCommandRegistry returns the command registry mapping command names to their metadata.
// Adding a new command only requires adding one entry here.
func getCommandRegistry() map[string]commandMetadata {
return map[string]commandMetadata{
"provision": {
taskName: "Provision",
handler: func(a *App, ctx context.Context, args []string) error {
provisionResult, err := a.runProvision(ctx, args[2:])
// Always notify after provisioning attempt (success is a no-op inside notifier)
a.writeCompleteFileOnError(provisionResult, err)
return err
},
},
"provision-wait": {
taskName: "ProvisionWait",
handler: func(a *App, ctx context.Context, args []string) error {
provisionStatusFiles := ProvisionStatusFiles{
ProvisionJSONFile: provisionJSONFilePath,
ProvisionCompleteFile: provisionCompleteFilePath,
}
provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles)
//nolint:forbidigo // stdout is part of the interface
fmt.Println(provisionOutput)
slog.Info("provision-wait finished", "provisionOutput", provisionOutput)
return err
},
},
}
}

// provision.json values are emitted as strings by the shell jq invocation.
// We only care about ExitCode + Error + Output (snippet) for failure detection.
type ProvisionResult struct {
Expand Down Expand Up @@ -99,76 +63,120 @@ type ProvisionStatusFiles struct {
}

func (a *App) Run(ctx context.Context, args []string) int {
if handled := handleInfoCommand(args); handled {
return 0
cmd := &cli.Command{
Name: "aks-node-controller",
Usage: "Parse contract and run csecmd",
Version: Version,
ExitErrHandler: func(context.Context, *cli.Command, error) {
// Return errors to the caller so exit codes are derived consistently in one place.
},
Action: func(context.Context, *cli.Command) error {
if len(args) > 1 {
return fmt.Errorf("unknown command: %s", args[1])
}
return errors.New("missing command argument")
},
Commands: []*cli.Command{
{
Name: "provision",
Usage: "Run node provisioning",
Flags: []cli.Flag{
&cli.StringFlag{Name: "provision-config", Usage: "path to the provision config file"},
&cli.StringFlag{Name: "nbc-cmd", Usage: "path to the NBC command file"},
&cli.BoolFlag{Name: "dry-run", Usage: "print the command that would be run without executing it"},
},
Action: func(ctx context.Context, cmd *cli.Command) error {
return a.runProvisionCommand(ctx, ProvisionFlags{
ProvisionConfig: cmd.String("provision-config"),
NBCCmd: cmd.String("nbc-cmd"),
}, cmd.Bool("dry-run"))
},
},
{
Name: "provision-wait",
Usage: "Wait for provisioning to complete",
Action: func(ctx context.Context, cmd *cli.Command) error {
provisionStatusFiles := ProvisionStatusFiles{
ProvisionJSONFile: provisionJSONFilePath,
ProvisionCompleteFile: provisionCompleteFilePath,
}
provisionOutput, err := a.runProvisionWaitCommand(ctx, provisionStatusFiles)
_, _ = fmt.Fprintln(cmd.Root().Writer, provisionOutput)
return err
},
Comment on lines +66 to +106
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With urfave/cli, flag parsing errors for provision (e.g., unknown flag, missing flag value) will occur before runProvisionCommand runs, so writeCompleteFileOnError(...) is never invoked. That can leave provision.complete/provision.json unwritten and cause provision-wait to block until its context times out. Consider handling CLI parse/usage errors in ExitErrHandler (or equivalent hook) to emit a ProvisionResult (ExitCode=240 + error) and write the sentinel files for provision invocations, matching the previous fail-fast behavior.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be fine

},
{
Name: "version",
Usage: "Print the version",
Action: func(_ context.Context, cmd *cli.Command) error {
_, _ = fmt.Fprintln(cmd.Root().Writer, Version)
return nil
},
},
{
Name: "download-hotfix",
Usage: "Download the requested hotfix binary",
Action: func(ctx context.Context, cmd *cli.Command) error {
Comment thread
awesomenix marked this conversation as resolved.
if len(cmd.Args().Slice()) > 0 {
return fmt.Errorf("unexpected download-hotfix arguments: %s", strings.Join(cmd.Args().Slice(), " "))
}
return a.runDownloadHotfixCommand(ctx)
},
},
},
}

slog.Info("aks-node-controller started", "args", args)
err := a.run(ctx, args)
exitCode := errToExitCode(err)
if exitCode == 0 {
slog.Info("aks-node-controller finished successfully.")
} else {
slog.Error("aks-node-controller failed", "error", err)
}
return exitCode
err := cmd.Run(ctx, args)
return errToExitCode(err)
}

// handleInfoCommand handles --version, version, --help, and help commands
// without logging noise. Returns true if handled.
func handleInfoCommand(args []string) bool {
if len(args) < 2 {
return false
}
switch args[1] {
case "--version", "version":
_, _ = fmt.Fprintln(os.Stdout, Version)
return true
case "--help", "help":
_, _ = fmt.Fprintln(os.Stdout, "Usage: aks-node-controller <command> [options]")
_, _ = fmt.Fprintln(os.Stdout)
_, _ = fmt.Fprintln(os.Stdout, "Commands:")
_, _ = fmt.Fprintln(os.Stdout, " provision Run node provisioning")
_, _ = fmt.Fprintln(os.Stdout, " provision-wait Wait for provisioning to complete")
_, _ = fmt.Fprintln(os.Stdout, " version Print the version")
_, _ = fmt.Fprintln(os.Stdout, " help Print this help message")
return true
default:
return false
}
}
func (a *App) runProvisionCommand(ctx context.Context, flags ProvisionFlags, dryRun bool) error {
slog.Info("aks-node-controller started", "task", "Provision")

func (a *App) run(ctx context.Context, args []string) error {
command := ""
if len(args) >= 2 {
command = args[1]
}
if command == "" {
return errors.New("missing command argument")
}

cmd, ok := getCommandRegistry()[command]
if !ok {
return fmt.Errorf("unknown command: %s", command)
startTime := time.Now()
a.eventLogger.LogEvent("Provision", "Starting", helpers.EventLevelInformational, startTime, startTime)
provisionResult, err := a.runProvision(ctx, flags, dryRun)
a.writeCompleteFileOnError(provisionResult, err)
endTime := time.Now()
if err != nil {
message := fmt.Sprintf("aks-node-controller exited with error %s", err.Error())
a.eventLogger.LogEvent("Provision", message, helpers.EventLevelError, startTime, endTime)
slog.Error("aks-node-controller failed", "error", err)
} else {
a.eventLogger.LogEvent("Provision", "Completed", helpers.EventLevelInformational, startTime, endTime)
slog.Info("aks-node-controller finished successfully.")
}
return err
}

// Self-update before provisioning: check for hotfix version and install if needed.
// On success, syscall.Exec replaces this process and never returns.
// On any failure, selfUpdate logs a warning and returns nil (best-effort).
a.selfUpdate(ctx)
func (a *App) runProvisionWaitCommand(ctx context.Context, provisionStatusFiles ProvisionStatusFiles) (string, error) {
slog.Info("aks-node-controller started", "task", "ProvisionWait")

startTime := time.Now()
a.eventLogger.LogEvent(cmd.taskName, "Starting", helpers.EventLevelInformational, startTime, startTime)

err := cmd.handler(a, ctx, args)
a.eventLogger.LogEvent("ProvisionWait", "Starting", helpers.EventLevelInformational, startTime, startTime)
provisionOutput, err := a.ProvisionWait(ctx, provisionStatusFiles)
endTime := time.Now()
if err != nil {
message := fmt.Sprintf("aks-node-controller exited with error %s", err.Error())
a.eventLogger.LogEvent(cmd.taskName, message, helpers.EventLevelError, startTime, endTime)
a.eventLogger.LogEvent("ProvisionWait", message, helpers.EventLevelError, startTime, endTime)
slog.Error("aks-node-controller failed", "error", err)
} else {
a.eventLogger.LogEvent(cmd.taskName, "Completed", helpers.EventLevelInformational, startTime, endTime)
a.eventLogger.LogEvent("ProvisionWait", "Completed", helpers.EventLevelInformational, startTime, endTime)
slog.Info("aks-node-controller finished successfully.")
}
return err
slog.Info("provision-wait finished", "provisionOutput", provisionOutput)
return provisionOutput, err
}

func (a *App) runDownloadHotfixCommand(ctx context.Context) error {
slog.Info("aks-node-controller hotfix download started")
err := a.downloadHotfix(ctx)
if err != nil {
slog.Error("aks-node-controller hotfix download failed", "error", err)
return err
}
slog.Info("aks-node-controller hotfix download finished")
return nil
}

func buildCmdFromProvisionConfig(ctx context.Context, path string) (*exec.Cmd, error) {
Expand Down Expand Up @@ -275,10 +283,10 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) (*ProvisionRe
return provisionResult, err
}

// runProvision encapsulates argument parsing and execution for the "provision" subcommand.
// runProvision encapsulates execution for the "provision" subcommand after CLI parsing.
// It returns an error describing any failure; callers should pass that error to
// writeCompleteFileOnError so the sentinel file can be written on fail-fast paths.
func (a *App) runProvision(ctx context.Context, args []string) (*ProvisionResult, error) {
func (a *App) runProvision(ctx context.Context, flags ProvisionFlags, dryRun bool) (*ProvisionResult, error) {
// Handle panics gracefully to ensure we always return a valid result
provisionResult := &ProvisionResult{}
var err error
Expand All @@ -293,25 +301,15 @@ func (a *App) runProvision(ctx context.Context, args []string) (*ProvisionResult
}
}()

fs := flag.NewFlagSet("provision", flag.ContinueOnError)
provisionConfig := fs.String("provision-config", "", "path to the provision config file")
nbcCMD := fs.String("nbc-cmd", "", "path to the NBC command file")
dryRun := fs.Bool("dry-run", false, "print the command that would be run without executing it")
if parseErr := fs.Parse(args); parseErr != nil {
provisionResult.ExitCode = strconv.Itoa(240)
provisionResult.Error = fmt.Sprintf("parse args: %v", parseErr)
return provisionResult, errors.New(provisionResult.Error)
}

if *provisionConfig == "" && *nbcCMD == "" {
if flags.ProvisionConfig == "" && flags.NBCCmd == "" {
provisionResult.ExitCode = strconv.Itoa(240)
provisionResult.Error = "--provision-config or --nbc-cmd is required"
return provisionResult, errors.New(provisionResult.Error)
}
if *dryRun {
if dryRun {
a.cmdRun = cmdRunnerDryRun
}
return a.Provision(ctx, ProvisionFlags{ProvisionConfig: *provisionConfig, NBCCmd: *nbcCMD})
return a.Provision(ctx, flags)
}

// writeCompleteFileOnError writes the provision.complete sentinel if err is non-nil,
Expand Down
20 changes: 20 additions & 0 deletions aks-node-controller/app_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,26 @@ func TestApp_Run(t *testing.T) {
assert.Equal(t, 0, exitCode)
})

t.Run("download-hotfix rejects unexpected arguments", func(t *testing.T) {
tt := NewTestApp(t, TestAppConfig{})
exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "download-hotfix", "extra"})
assert.Equal(t, 1, exitCode)
})

t.Run("download-hotfix returns success when VHD version already matches target", func(t *testing.T) {
tt := NewTestApp(t, TestAppConfig{})
origVersion := Version
Version = "202603.30.0-hotfix1"
defer func() { Version = origVersion }()

configPath := filepath.Join(t.TempDir(), "hotfix-config.json")
require.NoError(t, os.WriteFile(configPath, []byte(`{"version": "202603.30.0-hotfix1"}`), 0o644))
tt.App.hotfixVersionPath = configPath

exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "download-hotfix"})
assert.Equal(t, 0, exitCode)
})

t.Run("provision command with missing flag", func(t *testing.T) {
tt := NewTestApp(t, TestAppConfig{})
exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "provision"})
Expand Down
3 changes: 2 additions & 1 deletion aks-node-controller/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ require (
github.com/fsnotify/fsnotify v1.8.0
github.com/google/go-cmp v0.7.0
github.com/stretchr/testify v1.11.1
github.com/urfave/cli/v3 v3.8.0
google.golang.org/protobuf v1.36.6
gopkg.in/yaml.v3 v3.0.1
)

require (
Expand All @@ -29,7 +31,6 @@ require (
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/vincent-petithory/dataurl v1.0.0 // indirect
golang.org/x/sys v0.40.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

replace github.com/Azure/agentbaker => ../
Expand Down
2 changes: 2 additions & 0 deletions aks-node-controller/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/urfave/cli/v3 v3.8.0 h1:XqKPrm0q4P0q5JpoclYoCAv0/MIvH/jZ2umzuf8pNTI=
github.com/urfave/cli/v3 v3.8.0/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso=
github.com/vincent-petithory/dataurl v1.0.0 h1:cXw+kPto8NLuJtlMsI152irrVw9fRDX8AbShPRpg2CI=
github.com/vincent-petithory/dataurl v1.0.0/go.mod h1:FHafX5vmDzyP+1CQATJn7WFKc9CvnvxyvZy6I1MrG/U=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
Expand Down
Loading
Loading