Skip to content

Commit 71900f9

Browse files
authored
feat: provide configurable cse timeout in seconds (#7766)
1 parent 068403d commit 71900f9

134 files changed

Lines changed: 195 additions & 143 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

aks-node-controller/parser/helper.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"github.com/Azure/agentbaker/aks-node-controller/helpers"
3333
aksnodeconfigv1 "github.com/Azure/agentbaker/aks-node-controller/pkg/gen/aksnodeconfig/v1"
3434
"github.com/Azure/agentbaker/pkg/agent"
35+
"github.com/Azure/agentbaker/pkg/agent/datamodel"
3536
"google.golang.org/protobuf/encoding/protojson"
3637
)
3738

@@ -809,3 +810,16 @@ func getLocalDnsMemoryLimitInMb(aksnodeconfig *aksnodeconfigv1.Configuration) st
809810
}
810811

811812
// ---------------------- End of localdns related helper code ----------------------//
813+
814+
// ---------------------- Start of cse timeout helper code ----------------------//
815+
816+
// getCSETimeout returns the CSE timeout value in minutes.
817+
func getCSETimeout(aksnodeconfig *aksnodeconfigv1.Configuration) string {
818+
cseTimeout := 0
819+
if aksnodeconfig != nil {
820+
cseTimeout = int(aksnodeconfig.GetCseTimeout())
821+
}
822+
return datamodel.GetCSETimeout(cseTimeout)
823+
}
824+
825+
// ---------------------- End of cse timeout helper code ----------------------//

aks-node-controller/parser/parser.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string {
179179
"SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_CLIENT_ID": config.GetServiceAccountImagePullProfile().GetDefaultClientId(),
180180
"SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_TENANT_ID": config.GetServiceAccountImagePullProfile().GetDefaultTenantId(),
181181
"IDENTITY_BINDINGS_LOCAL_AUTHORITY_SNI": config.GetServiceAccountImagePullProfile().GetLocalAuthoritySni(),
182+
"CSE_TIMEOUT": getCSETimeout(config),
182183
}
183184

184185
for i, cert := range config.CustomCaCerts {

aks-node-controller/pkg/gen/aksnodeconfig/v1/config.pb.go

Lines changed: 31 additions & 18 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

aks-node-controller/proto/aksnodeconfig/v1/config.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,4 +167,7 @@ message Configuration {
167167

168168
// Service account based image pull profile configuration
169169
ServiceAccountImagePullProfile service_account_image_pull_profile = 43;
170+
171+
// CSE timeout override in seconds. If unset, non-positive, or greater than 21600 (6 hours), the default of 900 (15 minutes) is used.
172+
optional int32 cse_timeout = 44;
170173
}

parts/linux/cloud-init/artifacts/cse_cmd.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,4 +183,5 @@ MCR_REPOSITORY_BASE="{{GetMCRRepositoryBase}}"
183183
ENABLE_IMDS_RESTRICTION="{{EnableIMDSRestriction}}"
184184
INSERT_IMDS_RESTRICTION_RULE_TO_MANGLE_TABLE="{{InsertIMDSRestrictionRuleToMangleTable}}"
185185
PRE_PROVISION_ONLY="{{GetPreProvisionOnly}}"
186+
CSE_TIMEOUT="{{GetCSETimeout}}"
186187
/usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh"

parts/linux/cloud-init/artifacts/cse_start.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export CSE_STARTTIME_SECONDS=$(date -d "$CSE_STARTTIME_FORMATTED" +%s) # Export
77
EVENTS_LOGGING_DIR=/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/
88
mkdir -p $EVENTS_LOGGING_DIR
99
# this is the "global" CSE execution timeout - we allow CSE to run for $CSE_TIMEOUT seconds (15 minutes by default) before timeout will attempt to kill the script. We exit early from some of the retry loops using `check_cse_timeout` in `cse_helpers.sh`.
10-
timeout -k5s 15m /bin/bash /opt/azure/containers/provision.sh >> /var/log/azure/cluster-provision.log 2>&1
10+
# Fall back to 900 seconds (15 minutes) when CSE_TIMEOUT is unset or empty; otherwise
# timeout would interpret /bin/bash as its duration argument and fail immediately.
timeout -k5s "${CSE_TIMEOUT:-900}" /bin/bash /opt/azure/containers/provision.sh >> /var/log/azure/cluster-provision.log 2>&1
1111
EXIT_CODE=$?
1212
systemctl --no-pager -l status kubelet >> /var/log/azure/cluster-provision-cse-output.log 2>&1
1313
OUTPUT=$(tail -c 3000 "/var/log/azure/cluster-provision.log")

pkg/agent/baker.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1222,6 +1222,7 @@ func getContainerServiceFuncMap(config *datamodel.NodeBootstrappingConfiguration
12221222
return profile.GetLocalDNSMemoryLimitInMB()
12231223
},
12241224
"GetPreProvisionOnly": func() bool { return config.PreProvisionOnly },
1225+
"GetCSETimeout": func() string { return datamodel.GetCSETimeout(config.CSETimeout) },
12251226
"BlockIptables": func() bool {
12261227
return cs.Properties.OrchestratorProfile.KubernetesConfig.BlockIptables
12271228
},

pkg/agent/datamodel/const.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,9 @@ const (
136136
EnableIPv6Only = "EnableIPv6Only"
137137
EnableWinDSR = "EnableWinDSR"
138138
)
139+
140+
// CSE timeout bounds, expressed in seconds.
const (
	// DefaultCSETimeout is the timeout used when no valid value is supplied: 15 minutes.
	DefaultCSETimeout = 15 * 60
	// MaxCSETimeout is the largest timeout accepted as valid: 6 hours.
	MaxCSETimeout = 6 * 60 * 60
)

pkg/agent/datamodel/helper.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,13 @@ func getComponentNameFromURL(downloadURL string) (string, error) {
129129
func IsMIGNode(gpuInstanceProfile string) bool {
130130
return gpuInstanceProfile != ""
131131
}
132+
133+
// returns the CSE timeout value in seconds.
134+
// if empty or invalid value is provided, it returns the default timeout value of 15minutes or 900 seconds.
135+
// Maximum allowed timeout is 360 minutes or 6 hours or 21600 seconds.
136+
func GetCSETimeout(cseTimeout int) string {
137+
if cseTimeout <= 0 || cseTimeout > MaxCSETimeout {
138+
cseTimeout = DefaultCSETimeout
139+
}
140+
return fmt.Sprintf("%d", cseTimeout)
141+
}

pkg/agent/datamodel/types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,6 +1771,9 @@ type NodeBootstrappingConfiguration struct {
17711771
// PreProvisionOnly creates a pre-provisioned image for later node spawning.
17721772
// Skips kubelet and some component configuration for image capture scenarios.
17731773
PreProvisionOnly bool
1774+
1775+
// CSETimeout specifies the CSE execution timeout in seconds.
1776+
CSETimeout int
17741777
}
17751778

17761779
func (config *NodeBootstrappingConfiguration) IsFlatcar() bool {

0 commit comments

Comments
 (0)