Skip to content

Commit fc73a0c

Browse files
authored
Merge pull request #377 from devtron-labs/feat/buildx-builder-pod-wait-timeout
feat: add env-driven timeout for buildx builder pod readiness
2 parents fbde4d5 + 260bf0d commit fc73a0c

5 files changed

Lines changed: 532 additions & 74 deletions

File tree

ci-runner/helper/DockerHelper.go

Lines changed: 78 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@ import (
2323
"encoding/json"
2424
"errors"
2525
"fmt"
26+
"io"
27+
"io/ioutil"
28+
"log"
29+
"os"
30+
"os/exec"
31+
"path"
32+
"path/filepath"
33+
"strconv"
34+
"strings"
35+
"sync"
36+
"syscall"
37+
"time"
38+
2639
"github.com/aws/aws-sdk-go/aws"
2740
"github.com/aws/aws-sdk-go/aws/credentials"
2841
"github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds"
@@ -38,18 +51,6 @@ import (
3851
"github.com/devtron-labs/common-lib/utils/dockerOperations"
3952
"github.com/devtron-labs/common-lib/utils/retryFunc"
4053
"golang.org/x/sync/errgroup"
41-
"io"
42-
"io/ioutil"
43-
"log"
44-
"os"
45-
"os/exec"
46-
"path"
47-
"path/filepath"
48-
"strconv"
49-
"strings"
50-
"sync"
51-
"syscall"
52-
"time"
5354
)
5455

5556
const (
@@ -74,15 +75,23 @@ type DockerHelper interface {
7475
GetDockerAuthConfigForPrivateRegistries(workflowRequest *CommonWorkflowRequest) *bean.DockerAuthConfig
7576
}
7677

78+
// BuildxK8sClientFactory creates a BuildxK8sInterface from deployment names.
79+
// Abstracted for testability — production code uses newBuildxK8sClient as the default.
80+
type BuildxK8sClientFactory func(deploymentNames []string) (BuildxK8sInterface, error)
81+
7782
type DockerHelperImpl struct {
7883
DockerCommandEnv []string
7984
cmdExecutor CommandExecutor
85+
k8sClientFactory BuildxK8sClientFactory
8086
}
8187

8288
func NewDockerHelperImpl(cmdExecutor CommandExecutor) *DockerHelperImpl {
8389
return &DockerHelperImpl{
8490
DockerCommandEnv: os.Environ(),
8591
cmdExecutor: cmdExecutor,
92+
k8sClientFactory: func(names []string) (BuildxK8sInterface, error) {
93+
return newBuildxK8sClient(names)
94+
},
8695
}
8796
}
8897

@@ -300,9 +309,27 @@ func (impl *DockerHelperImpl) DockerLogin(ciContext cicxt.CiContext, dockerCrede
300309
return performDockerLogin()
301310
}
302311

312+
// waitForBuilderPods waits until all buildx k8s driver pods reach Running state,
313+
// or returns an error if the deadline is exceeded. Extracted for testability.
314+
func waitForBuilderPods(ctx context.Context, k8sClient BuildxK8sInterface, duration time.Duration) error {
315+
log.Println(util.DEVTRON, fmt.Sprintf("waiting for builder pods to be ready (timeout: %v)", duration))
316+
initDone := make(chan bool, 1)
317+
initCtx, initCancel := context.WithTimeout(ctx, duration)
318+
defer initCancel()
319+
go k8sClient.WaitUntilBuilderPodLive(initCtx, initDone)
320+
select {
321+
case <-initDone:
322+
log.Println(util.DEVTRON, "builder pods are ready for build")
323+
return nil
324+
case <-initCtx.Done():
325+
return fmt.Errorf("builder pods did not reach Running state within %v", duration)
326+
}
327+
}
328+
303329
func (impl *DockerHelperImpl) executeDockerReBuild(ciContext cicxt.CiContext, k8sClient BuildxK8sInterface,
304330
useBuildxK8sDriver bool, dockerBuild string, deploymentNames []string,
305-
dockerBuildStageMetadata bean2.DockerBuildStageMetadata, reBuildLogs []any) error {
331+
dockerBuildStageMetadata bean2.DockerBuildStageMetadata, reBuildLogs []any,
332+
builderPodWaitDuration time.Duration) error {
306333
if !useBuildxK8sDriver {
307334
return nil
308335
}
@@ -311,7 +338,7 @@ func (impl *DockerHelperImpl) executeDockerReBuild(ciContext cicxt.CiContext, k8
311338
log.Println(util.DEVTRON, fmt.Sprintf(" error in RestartBuilders : %s", k8sErr.Error()))
312339
return k8sErr
313340
}
314-
k8sClient, err := newBuildxK8sClient(deploymentNames)
341+
k8sClient, err := impl.k8sClientFactory(deploymentNames)
315342
if err != nil {
316343
log.Println(util.DEVTRON, " error in creating buildxK8sClient , err : ", err.Error())
317344
return err
@@ -324,18 +351,15 @@ func (impl *DockerHelperImpl) executeDockerReBuild(ciContext cicxt.CiContext, k8
324351
rebuildImageStage := func() error {
325352
// wait for the builder pod to be up again
326353
startTime := time.Now()
327-
util.LogInfo("Waiting for builder pod to be ready,", "timeout: 2 minutes")
328-
done := make(chan bool)
329-
ctx, cancel := context.WithCancel(ciContext)
354+
util.LogInfo("Waiting for builder pod to be ready,", fmt.Sprintf("timeout: %v", builderPodWaitDuration))
355+
done := make(chan bool, 1) // buffered to prevent goroutine leak on timeout
356+
ctx, cancel := context.WithTimeout(ciContext, builderPodWaitDuration)
330357
defer cancel()
331358
go k8sClient.WaitUntilBuilderPodLive(ctx, done)
332359
select {
333360
case <-done:
334361
// builder pod is up again, continue with the build
335-
cancel()
336-
case <-time.After(2 * time.Minute):
337-
// timeout after 2 minutes
338-
cancel()
362+
case <-ctx.Done():
339363
return BuilderPodDeletedError
340364
}
341365
util.LogInfo("DONE -->", time.Since(startTime).Seconds())
@@ -403,6 +427,10 @@ func (impl *DockerHelperImpl) BuildArtifact(ciRequest *CommonWorkflowRequest) (s
403427
if err != nil {
404428
log.Println("Error while parsing environment variables", err)
405429
}
430+
builderPodWaitDuration := 2 * time.Minute // backward-compat default
431+
if ciRequest.BuildxBuilderPodWaitDurationSecs > 0 {
432+
builderPodWaitDuration = time.Duration(ciRequest.BuildxBuilderPodWaitDurationSecs) * time.Second
433+
}
406434
if ciRequest.DockerImageTag == "" {
407435
ciRequest.DockerImageTag = "latest"
408436
}
@@ -453,12 +481,12 @@ func (impl *DockerHelperImpl) BuildArtifact(ciRequest *CommonWorkflowRequest) (s
453481
}
454482
useBuildxK8sDriver, eligibleK8sDriverNodes = dockerBuildConfig.CheckForBuildXK8sDriver()
455483
if useBuildxK8sDriver {
456-
deploymentNames, err = impl.createBuildxBuilderWithK8sDriver(ciContext, ciRequest.PropagateLabelsInBuildxPod, ciRequest.DockerConnection, dockerBuildConfig.BuildxDriverImage, eligibleK8sDriverNodes, ciRequest.PipelineId, ciRequest.WorkflowId)
484+
deploymentNames, err = impl.createBuildxBuilderWithK8sDriver(ciContext, ciRequest.PropagateLabelsInBuildxPod, ciRequest.DockerConnection, dockerBuildConfig.BuildxDriverImage, eligibleK8sDriverNodes, ciRequest.PipelineId, ciRequest.WorkflowId, builderPodWaitDuration)
457485
if err != nil {
458486
log.Println(util.DEVTRON, " error in creating buildxDriver , err : ", err.Error())
459487
return err
460488
}
461-
k8sClient, err = newBuildxK8sClient(deploymentNames)
489+
k8sClient, err = impl.k8sClientFactory(deploymentNames)
462490
if err != nil {
463491
log.Println(util.DEVTRON, " error in creating buildxK8sClient , err : ", err.Error())
464492
return err
@@ -469,6 +497,11 @@ func (impl *DockerHelperImpl) BuildArtifact(ciRequest *CommonWorkflowRequest) (s
469497
log.Println(util.DEVTRON, " error in registering builder pods ", " err: ", err)
470498
return err
471499
}
500+
// Wait for builder pods to reach Running state before starting the build.
501+
// Prevents false-positive BuilderPodDeletedError from pod startup latency.
502+
if err = waitForBuilderPods(ciContext, k8sClient, builderPodWaitDuration); err != nil {
503+
return err
504+
}
472505
} else {
473506
err = impl.createBuildxBuilderForMultiArchBuild(ciContext, ciRequest.DockerConnection, dockerBuildConfig.BuildxDriverImage)
474507
if err != nil {
@@ -534,7 +567,7 @@ func (impl *DockerHelperImpl) BuildArtifact(ciRequest *CommonWorkflowRequest) (s
534567
reBuildLogs = []any{fmt.Sprintf("Starting re docker build (Attempt %d) : ", attempt), dockerBuild}
535568
}
536569
return impl.executeDockerReBuild(ciContext, k8sClient, useBuildxK8sDriver, dockerBuild,
537-
deploymentNames, dockerBuildStageMetadata, reBuildLogs)
570+
deploymentNames, dockerBuildStageMetadata, reBuildLogs, builderPodWaitDuration)
538571
}
539572
err = retryFunc.RetryWithOutLogging(callback, retryFunc.IsRetryableError, maxRetry, 1*time.Second)
540573
if err != nil {
@@ -1097,14 +1130,14 @@ func (impl *DockerHelperImpl) createBuildxBuilderForMultiArchBuild(ciContext cic
10971130
return nil
10981131
}
10991132

1100-
func (impl *DockerHelperImpl) createBuildxBuilderWithK8sDriver(ciContext cicxt.CiContext, propagateLabelsInBuildxPod bool, dockerConnection, buildxDriverImage string, builderNodes []map[string]string, ciPipelineId, ciWorkflowId int) ([]string, error) {
1133+
func (impl *DockerHelperImpl) createBuildxBuilderWithK8sDriver(ciContext cicxt.CiContext, propagateLabelsInBuildxPod bool, dockerConnection, buildxDriverImage string, builderNodes []map[string]string, ciPipelineId, ciWorkflowId int, timeout time.Duration) ([]string, error) {
11011134
deploymentNames := make([]string, 0)
11021135
if len(builderNodes) == 0 {
11031136
return deploymentNames, errors.New("atleast one node is expected for builder with kubernetes driver")
11041137
}
11051138
for i := 0; i < len(builderNodes); i++ {
11061139
nodeOpts := builderNodes[i]
1107-
builderCmd, deploymentName, err := getBuildxK8sDriverCmd(propagateLabelsInBuildxPod, dockerConnection, buildxDriverImage, nodeOpts, ciPipelineId, ciWorkflowId)
1140+
builderCmd, deploymentName, err := getBuildxK8sDriverCmd(propagateLabelsInBuildxPod, dockerConnection, buildxDriverImage, nodeOpts, ciPipelineId, ciWorkflowId, timeout)
11081141
if err != nil {
11091142
return deploymentNames, err
11101143
}
@@ -1181,7 +1214,7 @@ func (impl *DockerHelperImpl) runCmd(cmd string) (error, *bytes.Buffer) {
11811214
return err, errBuf
11821215
}
11831216

1184-
func getBuildxK8sDriverCmd(propagateLabelsInBuildxPod bool, dockerConnection, buildxDriverImage string, driverOpts map[string]string, ciPipelineId, ciWorkflowId int) (string, string, error) {
1217+
func getBuildxK8sDriverCmd(propagateLabelsInBuildxPod bool, dockerConnection, buildxDriverImage string, driverOpts map[string]string, ciPipelineId, ciWorkflowId int, timeout time.Duration) (string, string, error) {
11851218
buildxCreate := "docker buildx create --buildkitd-flags '--allow-insecure-entitlement network.host --allow-insecure-entitlement security.insecure' --name=%s --driver=kubernetes --node=%s --bootstrap "
11861219
nodeName := driverOpts["node"]
11871220
if nodeName == "" {
@@ -1203,6 +1236,9 @@ func getBuildxK8sDriverCmd(propagateLabelsInBuildxPod bool, dockerConnection, bu
12031236
}
12041237

12051238
driverOpts["driverOptions"] = getBuildXDriverOptionsWithImage(buildxDriverImage, driverOpts["driverOptions"])
1239+
if timeout > 0 {
1240+
driverOpts["driverOptions"] = getBuildXDriverOptionsWithTimeout(timeout, driverOpts["driverOptions"])
1241+
}
12061242
if len(driverOpts["driverOptions"]) > 0 {
12071243
buildxCreate += " '--driver-opt=%s' "
12081244
buildxCreate = fmt.Sprintf(buildxCreate, driverOpts["driverOptions"])
@@ -1227,6 +1263,21 @@ func getBuildXDriverOptionsWithImage(buildxDriverImage, driverOptions string) st
12271263
return driverOptions
12281264
}
12291265

1266+
func getBuildXDriverOptionsWithTimeout(timeout time.Duration, driverOptions string) string {
1267+
if strings.HasPrefix(driverOptions, "timeout=") ||
1268+
strings.Contains(driverOptions, ",timeout=") {
1269+
// if timeout is already present in driver options then do not override it, just return the existing options
1270+
return driverOptions
1271+
}
1272+
timeoutOption := fmt.Sprintf("\"timeout=%s\"", timeout.String())
1273+
if len(driverOptions) > 0 {
1274+
driverOptions += fmt.Sprintf(",%s", timeoutOption)
1275+
} else {
1276+
driverOptions = timeoutOption
1277+
}
1278+
return driverOptions
1279+
}
1280+
12301281
func getBuildXDriverOptionsWithLabelsAndAnnotations(driverOptions string) (string, error) {
12311282
// not passing annotation as of now because --driver-opt=annotations is not supported by buildx if contains quotes
12321283
labels := make(map[string]string)

ci-runner/helper/DockerHelper_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ func TestCreateBuildXK8sDriver(t *testing.T) {
4141
eligibleK8sNodes := dockerBuildConfig.GetEligibleK8sDriverNodes()
4242
impl := getDockerHelperImpl()
4343
ciContext := cicxt.BuildCiContext(context.Background(), true)
44-
_, err := impl.createBuildxBuilderWithK8sDriver(ciContext, false, "", "", eligibleK8sNodes, 1, 1)
44+
_, err := impl.createBuildxBuilderWithK8sDriver(ciContext, false, "", "", eligibleK8sNodes, 1, 1, 0)
4545
t.Cleanup(func() {
4646
buildxDelete := fmt.Sprintf("docker buildx rm %s", BUILDX_K8S_DRIVER_NAME)
4747
builderRemoveCmd := exec.Command("/bin/sh", "-c", buildxDelete)
@@ -64,7 +64,7 @@ func TestCleanBuildxK8sDriver(t *testing.T) {
6464
eligibleK8sNodes := dockerBuildConfig.GetEligibleK8sDriverNodes()
6565
impl := getDockerHelperImpl()
6666
ciContext := cicxt.BuildCiContext(context.Background(), true)
67-
_, err := impl.createBuildxBuilderWithK8sDriver(ciContext, false, "", "", eligibleK8sNodes, 1, 1)
67+
_, err := impl.createBuildxBuilderWithK8sDriver(ciContext, false, "", "", eligibleK8sNodes, 1, 1, 0)
6868
if err != nil {
6969
fmt.Println(err.Error())
7070
t.Fail()

ci-runner/helper/EventHelper.go

Lines changed: 36 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ import (
2020
"crypto/tls"
2121
"encoding/json"
2222
"fmt"
23-
bean2 "github.com/devtron-labs/common-lib/imageScan/bean"
24-
"github.com/devtron-labs/common-lib/utils/remoteConnection/bean"
2523
"log"
2624
"net/http"
2725
"strings"
@@ -31,7 +29,9 @@ import (
3129
"github.com/devtron-labs/ci-runner/pubsub"
3230
"github.com/devtron-labs/ci-runner/util"
3331
blobStorage "github.com/devtron-labs/common-lib/blob-storage"
32+
bean2 "github.com/devtron-labs/common-lib/imageScan/bean"
3433
pubSub "github.com/devtron-labs/common-lib/pubsub-lib"
34+
"github.com/devtron-labs/common-lib/utils/remoteConnection/bean"
3535
"github.com/go-resty/resty/v2"
3636
)
3737

@@ -170,39 +170,40 @@ type CommonWorkflowRequest struct {
170170
EnableSecretMasking bool `json:"enableSecretMasking"`
171171
PropagateLabelsInBuildxPod bool `json:"propagateLabelsInBuildxPod"`
172172
// Data from CD Workflow service
173-
WorkflowRunnerId int `json:"workflowRunnerId"`
174-
CdPipelineId int `json:"cdPipelineId"`
175-
StageYaml string `json:"stageYaml"`
176-
ArtifactLocation string `json:"artifactLocation"`
177-
CiArtifactDTO CiArtifactDTO `json:"ciArtifactDTO"`
178-
CdImage string `json:"cdImage"`
179-
StageType string `json:"stageType"`
180-
CdCacheLocation string `json:"cdCacheLocation"`
181-
CdCacheRegion string `json:"cdCacheRegion"`
182-
WorkflowPrefixForLog string `json:"workflowPrefixForLog"`
183-
DeploymentTriggeredBy string `json:"deploymentTriggeredBy,omitempty"`
184-
DeploymentTriggerTime time.Time `json:"deploymentTriggerTime,omitempty"`
185-
DeploymentReleaseCounter int `json:"deploymentReleaseCounter,omitempty"`
186-
PrePostDeploySteps []*StepObject `json:"prePostDeploySteps"`
187-
TaskYaml *TaskYaml `json:"-"`
188-
IsVirtualExecution bool `json:"isVirtualExecution"`
189-
CiArtifactLastFetch time.Time `json:"ciArtifactLastFetch"`
190-
CiPipelineType string `json:"CiPipelineType"`
191-
RegistryDestinationImageMap map[string][]string `json:"registryDestinationImageMap"`
192-
RegistryCredentialMap map[string]RegistryCredentials `json:"registryCredentialMap"`
193-
PluginArtifactStage string `json:"pluginArtifactStage"`
194-
PushImageBeforePostCI bool `json:"pushImageBeforePostCI"`
195-
IntermediateDockerRegistryUrl string `json:"-"` // this URL will be used for all operations and can be mutated
196-
BuildxCacheModeMin bool `json:"buildxCacheModeMin"`
197-
AsyncBuildxCacheExport bool `json:"asyncBuildxCacheExport"`
198-
BuildxInterruptionMaxRetry int `json:"buildxInterruptionMaxRetry"`
199-
UseDockerApiToGetDigest bool `json:"useDockerApiToGetDigest"`
200-
HostUrl string `json:"hostUrl"`
201-
ImageScanningSteps []*ImageScanningSteps `json:"imageScanningSteps,omitempty"`
202-
ExecuteImageScanningVia bean2.ScanExecutionMedium `json:"executeImageScanningVia,omitempty"`
203-
AwsInspectorConfig string `json:"awsInspectorConfig,omitempty"`
204-
PartSize int64 `json:"partSize,omitempty"`
205-
ConcurrencyMultiplier int `json:"concurrencyMultiplier,omitempty"`
173+
WorkflowRunnerId int `json:"workflowRunnerId"`
174+
CdPipelineId int `json:"cdPipelineId"`
175+
StageYaml string `json:"stageYaml"`
176+
ArtifactLocation string `json:"artifactLocation"`
177+
CiArtifactDTO CiArtifactDTO `json:"ciArtifactDTO"`
178+
CdImage string `json:"cdImage"`
179+
StageType string `json:"stageType"`
180+
CdCacheLocation string `json:"cdCacheLocation"`
181+
CdCacheRegion string `json:"cdCacheRegion"`
182+
WorkflowPrefixForLog string `json:"workflowPrefixForLog"`
183+
DeploymentTriggeredBy string `json:"deploymentTriggeredBy,omitempty"`
184+
DeploymentTriggerTime time.Time `json:"deploymentTriggerTime,omitempty"`
185+
DeploymentReleaseCounter int `json:"deploymentReleaseCounter,omitempty"`
186+
PrePostDeploySteps []*StepObject `json:"prePostDeploySteps"`
187+
TaskYaml *TaskYaml `json:"-"`
188+
IsVirtualExecution bool `json:"isVirtualExecution"`
189+
CiArtifactLastFetch time.Time `json:"ciArtifactLastFetch"`
190+
CiPipelineType string `json:"CiPipelineType"`
191+
RegistryDestinationImageMap map[string][]string `json:"registryDestinationImageMap"`
192+
RegistryCredentialMap map[string]RegistryCredentials `json:"registryCredentialMap"`
193+
PluginArtifactStage string `json:"pluginArtifactStage"`
194+
PushImageBeforePostCI bool `json:"pushImageBeforePostCI"`
195+
IntermediateDockerRegistryUrl string `json:"-"` // this URL will be used for all operations and can be mutated
196+
BuildxCacheModeMin bool `json:"buildxCacheModeMin"`
197+
AsyncBuildxCacheExport bool `json:"asyncBuildxCacheExport"`
198+
BuildxInterruptionMaxRetry int `json:"buildxInterruptionMaxRetry"`
199+
BuildxBuilderPodWaitDurationSecs int `json:"buildxBuilderPodWaitDurationSecs"`
200+
UseDockerApiToGetDigest bool `json:"useDockerApiToGetDigest"`
201+
HostUrl string `json:"hostUrl"`
202+
ImageScanningSteps []*ImageScanningSteps `json:"imageScanningSteps,omitempty"`
203+
ExecuteImageScanningVia bean2.ScanExecutionMedium `json:"executeImageScanningVia,omitempty"`
204+
AwsInspectorConfig string `json:"awsInspectorConfig,omitempty"`
205+
PartSize int64 `json:"partSize,omitempty"`
206+
ConcurrencyMultiplier int `json:"concurrencyMultiplier,omitempty"`
206207
}
207208

208209
func (c *CommonWorkflowRequest) IsPreCdStage() bool {

0 commit comments

Comments
 (0)