@@ -174,36 +174,26 @@ func determineBridgeGatewayIP(ctx context.Context, dockerClient client.NetworkAP
174174
175175// waitForContainerToStart waits for a container to start.
176176func waitForContainerToStart (ctx context.Context , dockerClient client.ContainerAPIClient , containerID string ) error {
177- // Unfortunately the Docker API's /containers/{id}/wait API (and the
178- // corresponding Client.ContainerWait method) don't allow waiting for
179- // container startup, so instead we'll take a polling approach.
180177 for i := 10 ; i > 0 ; i -- {
181- if status , err := dockerClient .ContainerInspect (ctx , containerID ); err != nil {
182- // There is a small gap between the time that a container ID and
183- // name are registered and the time that the container is actually
184- // created and shows up in container list and inspect requests:
185- //
186- // https://github.com/moby/moby/blob/de24c536b0ea208a09e0fff3fd896c453da6ef2e/daemon/container.go#L138-L156
187- //
188- // Given that multiple install operations tend to end up tightly
189- // synchronized by the preceeding pull operation and that this
190- // method is specifically designed to work around these race
191- // conditions, we'll allow 404 errors to pass silently (at least up
192- // until the polling time out - unfortunately we can't make the 404
193- // acceptance window any smaller than that because the CUDA-based
194- // containers are large and can take time to create).
195- if ! errdefs .IsNotFound (err ) {
196- return fmt .Errorf ("unable to inspect container (%s): %w" , containerID [:12 ], err )
197- }
198- } else {
199- switch status .State .Status {
200- case container .StateRunning :
201- return nil
202- case container .StateCreated , container .StateRestarting :
203- // wait for container to start
204- default :
205- return fmt .Errorf ("container is in unexpected state %q" , status .State .Status )
206- }
178+ err := dockerClient .ContainerStart (ctx , containerID , container.StartOptions {})
179+ if err == nil {
180+ return nil
181+ }
182+ // There is a small gap between the time that a container ID and
183+ // name are registered and the time that the container is actually
184+ // created and shows up in container list and inspect requests:
185+ //
186+ // https://github.com/moby/moby/blob/de24c536b0ea208a09e0fff3fd896c453da6ef2e/daemon/container.go#L138-L156
187+ //
188+ // Given that multiple install operations tend to end up tightly
189+ // synchronized by the preceeding pull operation and that this
190+ // method is specifically designed to work around these race
191+ // conditions, we'll allow 404 errors to pass silently (at least up
192+ // until the polling time out - unfortunately we can't make the 404
193+ // acceptance window any smaller than that because the CUDA-based
194+ // containers are large and can take time to create).
195+ if ! errdefs .IsNotFound (err ) {
196+ return err
207197 }
208198 if i > 1 {
209199 select {
@@ -278,19 +268,13 @@ func CreateControllerContainer(ctx context.Context, dockerClient *client.Client,
278268 // progress, then we wait for whichever install process creates the
279269 // container first and then wait for its container to be ready.
280270 resp , err := dockerClient .ContainerCreate (ctx , config , hostConfig , nil , nil , controllerContainerName )
281- if err != nil {
282- if errdefs .IsConflict (err ) {
283- if err := waitForContainerToStart (ctx , dockerClient , controllerContainerName ); err != nil {
284- return fmt .Errorf ("failed waiting for concurrent installation: %w" , err )
285- }
286- return nil
287- }
271+ if ! errdefs .IsConflict (err ) {
288272 return fmt .Errorf ("failed to create container %s: %w" , controllerContainerName , err )
289273 }
290274
291275 // Start the container.
292276 printer .Printf ("Starting model runner container %s...\n " , controllerContainerName )
293- if err := dockerClient . ContainerStart (ctx , resp . ID , container. StartOptions {} ); err != nil {
277+ if err := waitForContainerToStart (ctx , dockerClient , controllerContainerName ); err != nil {
294278 _ = dockerClient .ContainerRemove (ctx , resp .ID , container.RemoveOptions {Force : true })
295279 return fmt .Errorf ("failed to start container %s: %w" , controllerContainerName , err )
296280 }
0 commit comments