Skip to content

Commit 0ef723f

Browse files
committed
Move waiting logic for host orchestrator settlement to backend
While interacting with host orchestrator right after creating host API is returned, there is a possibility of getting 503 service temporarily unavailable error because host is created and booted but host orchestrator is not executed / ready yet. Handling logic for waiting host orchestrator readiness is currently in the client logic. Moving this logic into cloud orchestrator API endpoint (specifically inside WaitOperation) makes the client simpler and reduces redundant API endpoints. Context: b/501288123 TAG=agy CONV=f5f788b6-7aa2-4614-b441-87f3ad9fed4e
1 parent 6244a75 commit 0ef723f

4 files changed

Lines changed: 288 additions & 16 deletions

File tree

pkg/app/instances/docker.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"fmt"
2020
"io"
2121
"log"
22+
"net/http"
2223
"net/url"
2324
"strings"
2425
"sync"
@@ -188,6 +189,13 @@ func (m *DockerInstanceManager) waitCreateHostOperation(host string) (*apiv1.Hos
188189
return nil, fmt.Errorf("failed to inspect docker container: %w", err)
189190
}
190191
if res.State.Running {
192+
client, err := m.GetHostClient("local", host)
193+
if err != nil {
194+
return nil, err
195+
}
196+
if err := waitForHostReady(client); err != nil {
197+
return nil, err
198+
}
191199
return &apiv1.HostInstance{
192200
Name: host,
193201
}, nil
@@ -500,3 +508,18 @@ func (m *DockerInstanceManager) getRWMutex(user accounts.User) *sync.RWMutex {
500508
mu, _ := m.mutexes.LoadOrStore(user.Username(), &sync.RWMutex{})
501509
return mu.(*sync.RWMutex)
502510
}
511+
512+
func waitForHostReady(client HostClient) error {
513+
maxWait := 2 * time.Minute
514+
retryDelay := 5 * time.Second
515+
deadline := time.Now().Add(maxWait)
516+
517+
for time.Now().Before(deadline) {
518+
status, err := client.Get("/", "", nil)
519+
if err == nil && status != http.StatusBadGateway {
520+
return nil
521+
}
522+
time.Sleep(retryDelay)
523+
}
524+
return errors.NewServiceUnavailableError("wait for host orchestrator timed out", nil)
525+
}

pkg/app/instances/docker_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
package instances
1616

1717
import (
18+
"net/http"
19+
"net/http/httputil"
1820
"testing"
1921

2022
"github.com/google/go-cmp/cmp"
@@ -52,3 +54,53 @@ func TestDecodeOperationFailsMissingUnderscore(t *testing.T) {
5254
t.Errorf("expected error")
5355
}
5456
}
57+
58+
type mockHostClient struct {
59+
getFunc func(string, string, *HostResponse) (int, error)
60+
}
61+
62+
func (c *mockHostClient) Get(path, query string, res *HostResponse) (int, error) {
63+
return c.getFunc(path, query, res)
64+
}
65+
66+
func (c *mockHostClient) Post(path, query string, body any, res *HostResponse) (int, error) {
67+
return 0, nil
68+
}
69+
70+
func (c *mockHostClient) GetReverseProxy() *httputil.ReverseProxy {
71+
return nil
72+
}
73+
74+
func TestWaitForHostReadySucceeds(t *testing.T) {
75+
client := &mockHostClient{
76+
getFunc: func(path, query string, res *HostResponse) (int, error) {
77+
return http.StatusOK, nil
78+
},
79+
}
80+
81+
err := waitForHostReady(client)
82+
if err != nil {
83+
t.Errorf("unexpected error: %v", err)
84+
}
85+
}
86+
87+
func TestWaitForHostReadyRetriesOnBadGateway(t *testing.T) {
88+
calls := 0
89+
client := &mockHostClient{
90+
getFunc: func(path, query string, res *HostResponse) (int, error) {
91+
calls++
92+
if calls == 1 {
93+
return http.StatusBadGateway, nil
94+
}
95+
return http.StatusOK, nil
96+
},
97+
}
98+
99+
err := waitForHostReady(client)
100+
if err != nil {
101+
t.Errorf("unexpected error: %v", err)
102+
}
103+
if calls != 2 {
104+
t.Errorf("expected 2 calls, got %d", calls)
105+
}
106+
}

pkg/app/instances/gce.go

Lines changed: 96 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ import (
1818
"context"
1919
"fmt"
2020
"log"
21+
"net/http"
2122
"net/url"
2223
"path"
2324
"regexp"
25+
"time"
2426

2527
apiv1 "github.com/google/cloud-android-orchestration/api/v1"
2628
"github.com/google/cloud-android-orchestration/pkg/app/accounts"
@@ -41,6 +43,7 @@ type GCPIMConfig struct {
4143
UseExternalIP bool
4244
// If true, instances created should be compatible with `acloud CLI`.
4345
AcloudCompatible bool
46+
HostReadyTimeout time.Duration
4447
}
4548

4649
const (
@@ -78,28 +81,40 @@ func (m *GCEInstanceManager) ListZones() (*apiv1.ListZonesResponse, error) {
7881
}, nil
7982
}
8083

81-
func (m *GCEInstanceManager) GetHostAddr(zone string, host string) (string, error) {
82-
instance, err := m.getHostInstance(zone, host)
83-
if err != nil {
84-
return "", err
85-
}
86-
ilen := len(instance.NetworkInterfaces)
84+
func getHostAddrWithIns(ins *compute.Instance) (string, error) {
85+
ilen := len(ins.NetworkInterfaces)
8786
if ilen == 0 {
88-
log.Printf("host instance %s in zone %s is missing a network interface", host, zone)
87+
log.Printf("host instance %s in zone %s is missing a network interface", ins.Name, ins.Zone)
8988
return "", errors.NewInternalError("host instance missing a network interface", nil)
9089
}
9190
if ilen > 1 {
92-
log.Printf("host instance %s in zone %s has %d network interfaces", host, zone, ilen)
91+
log.Printf("host instance %s in zone %s has %d network interfaces", ins.Name, ins.Zone, ilen)
92+
}
93+
return ins.NetworkInterfaces[0].NetworkIP, nil
94+
}
95+
96+
func (m *GCEInstanceManager) GetHostAddr(zone string, host string) (string, error) {
97+
ins, err := m.getHostInstance(zone, host)
98+
if err != nil {
99+
return "", err
100+
}
101+
return getHostAddrWithIns(ins)
102+
}
103+
104+
func getHostURLWithIns(ins *compute.Instance, config *Config) (*url.URL, error) {
105+
addr, err := getHostAddrWithIns(ins)
106+
if err != nil {
107+
return nil, err
93108
}
94-
return instance.NetworkInterfaces[0].NetworkIP, nil
109+
return url.Parse(fmt.Sprintf("%s://%s:%d", config.HostOrchestratorProtocol, addr, config.GCP.HostOrchestratorPort))
95110
}
96111

97112
func (m *GCEInstanceManager) GetHostURL(zone string, host string) (*url.URL, error) {
98-
addr, err := m.GetHostAddr(zone, host)
113+
ins, err := m.getHostInstance(zone, host)
99114
if err != nil {
100115
return nil, err
101116
}
102-
return url.Parse(fmt.Sprintf("%s://%s:%d", m.Config.HostOrchestratorProtocol, addr, m.Config.GCP.HostOrchestratorPort))
117+
return getHostURLWithIns(ins, &m.Config)
103118
}
104119

105120
const operationStatusDone = "DONE"
@@ -253,16 +268,81 @@ func (m *GCEInstanceManager) WaitOperation(zone string, user accounts.User, name
253268
if op.Status != operationStatusDone {
254269
return nil, errors.NewServiceUnavailableError("Wait for operation timed out", nil)
255270
}
256-
getter := opResultGetter{Service: m.Service, Op: op}
257-
return getter.Get()
271+
getter := opResultGetter{
272+
Service: m.Service,
273+
Op: op,
274+
Config: &m.Config,
275+
}
276+
res, err := getter.Get()
277+
if err != nil {
278+
return nil, err
279+
}
280+
if hostInst, ok := res.(*apiv1.HostInstance); ok && op.OperationType == "insert" {
281+
return m.WaitHostAvailability(zone, user, hostInst.Name)
282+
}
283+
return res, nil
284+
}
285+
286+
func (m *GCEInstanceManager) WaitHostAvailability(zone string, user accounts.User, host string) (*apiv1.HostInstance, error) {
287+
ins, err := m.getHostInstance(zone, host)
288+
if err != nil {
289+
return nil, err
290+
}
291+
hostInstance, err := BuildHostInstance(ins)
292+
if err != nil {
293+
return nil, err
294+
}
295+
client, err := getHostClientWithIns(ins, &m.Config)
296+
if err != nil {
297+
return nil, err
298+
}
299+
if err := m.waitForOrchestrator(zone, hostInstance, client); err != nil {
300+
return nil, err
301+
}
302+
return hostInstance, nil
303+
}
304+
305+
func (m *GCEInstanceManager) waitForOrchestrator(zone string, host *apiv1.HostInstance, client HostClient) error {
306+
timeout := m.Config.GCP.HostReadyTimeout
307+
if timeout == 0 {
308+
timeout = 5 * time.Minute
309+
}
310+
retryDelay := 5 * time.Second
311+
deadline := time.Now().Add(timeout)
312+
313+
for time.Now().Before(deadline) {
314+
_, err := m.Service.Instances.Get(m.Config.GCP.ProjectID, zone, host.Name).Context(context.TODO()).Do()
315+
if err != nil {
316+
if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code == http.StatusNotFound {
317+
return errors.NewNotFoundError("Host was deleted concurrently", err)
318+
}
319+
return fmt.Errorf("failed to check host existence: %w", err)
320+
}
321+
322+
status, err := client.Get("/", "", nil)
323+
if err == nil && status != http.StatusBadGateway {
324+
return nil
325+
}
326+
327+
time.Sleep(retryDelay)
328+
}
329+
return errors.NewServiceUnavailableError("Wait for host orchestrator timed out", nil)
330+
}
331+
332+
func getHostClientWithIns(ins *compute.Instance, config *Config) (HostClient, error) {
333+
url, err := getHostURLWithIns(ins, config)
334+
if err != nil {
335+
return nil, err
336+
}
337+
return NewNetHostClient(url, config.AllowSelfSignedHostSSLCertificate), nil
258338
}
259339

260340
func (m *GCEInstanceManager) GetHostClient(zone string, host string) (HostClient, error) {
261-
url, err := m.GetHostURL(zone, host)
341+
ins, err := m.getHostInstance(zone, host)
262342
if err != nil {
263343
return nil, err
264344
}
265-
return NewNetHostClient(url, m.Config.AllowSelfSignedHostSSLCertificate), nil
345+
return getHostClientWithIns(ins, &m.Config)
266346
}
267347

268348
func (m *GCEInstanceManager) getHostInstance(zone string, host string) (*compute.Instance, error) {
@@ -326,6 +406,7 @@ var (
326406
type opResultGetter struct {
327407
Service *compute.Service
328408
Op *compute.Operation
409+
Config *Config
329410
}
330411

331412
func (g *opResultGetter) Get() (any, error) {

0 commit comments

Comments
 (0)