Skip to content

Commit c8ef926

Browse files
committed
Move waiting logic for host orchestrator settlement to backend
While interacting with host orchestrator right after creating host API is returned, there is a possibility of getting 503 service temporarily unavailable error because host it created and booted but host orchestrator is not executed / ready yet. Handling logic for waiting host orchestrator readiness is currently in the client logic. Moving this logic into cloud orchestrator API endpoint would make the client that uses host chestrator API simpler. Context: b/501288123
1 parent 7fe0489 commit c8ef926

7 files changed

Lines changed: 95 additions & 24 deletions

File tree

pkg/app/app_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ func (hc *testHostClient) GetReverseProxy() *httputil.ReverseProxy {
111111
return httputil.NewSingleHostReverseProxy(hc.url)
112112
}
113113

114+
func (hc *testHostClient) WaitForHostReady() error {
115+
return nil
116+
}
117+
114118
func TestListZonesSucceeds(t *testing.T) {
115119
controller := NewApp(&testInstanceManager{}, &testAccountManager{}, nil, nil, nil, "", nil, apiv1.InfraConfig{}, &config.Config{})
116120
ts := httptest.NewServer(controller.Handler())

pkg/app/instances/docker.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,13 @@ func (m *DockerInstanceManager) waitCreateHostOperation(host string) (*apiv1.Hos
188188
return nil, fmt.Errorf("failed to inspect docker container: %w", err)
189189
}
190190
if res.State.Running {
191+
client, err := m.GetHostClient("local", host)
192+
if err != nil {
193+
return nil, err
194+
}
195+
if err := client.WaitForHostReady(); err != nil {
196+
return nil, err
197+
}
191198
return &apiv1.HostInstance{
192199
Name: host,
193200
}, nil

pkg/app/instances/gce.go

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,16 @@ func (m *GCEInstanceManager) ListZones() (*apiv1.ListZonesResponse, error) {
7878
}, nil
7979
}
8080

81-
func (m *GCEInstanceManager) GetHostAddr(zone string, host string) (string, error) {
82-
instance, err := m.getHostInstance(zone, host)
81+
func (m *GCEInstanceManager) getHostAddrInner(zone string, host string, ins *compute.Instance) (string, error) {
82+
var instance *compute.Instance
83+
var err error
84+
85+
if ins == nil {
86+
instance, err = m.getHostInstance(zone, host)
87+
} else {
88+
instance, err = ins, nil
89+
}
90+
8391
if err != nil {
8492
return "", err
8593
}
@@ -94,14 +102,22 @@ func (m *GCEInstanceManager) GetHostAddr(zone string, host string) (string, erro
94102
return instance.NetworkInterfaces[0].NetworkIP, nil
95103
}
96104

97-
func (m *GCEInstanceManager) GetHostURL(zone string, host string) (*url.URL, error) {
98-
addr, err := m.GetHostAddr(zone, host)
105+
func (m *GCEInstanceManager) GetHostAddr(zone string, host string) (string, error) {
106+
return m.getHostAddrInner(zone, host, nil)
107+
}
108+
109+
func (m *GCEInstanceManager) getHostURLInner(zone string, host string, ins *compute.Instance) (*url.URL, error) {
110+
addr, err := m.getHostAddrInner(zone, host, nil)
99111
if err != nil {
100112
return nil, err
101113
}
102114
return url.Parse(fmt.Sprintf("%s://%s:%d", m.Config.HostOrchestratorProtocol, addr, m.Config.GCP.HostOrchestratorPort))
103115
}
104116

117+
func (m *GCEInstanceManager) GetHostURL(zone string, host string) (*url.URL, error) {
118+
return m.getHostURLInner(zone, host, nil)
119+
}
120+
105121
const operationStatusDone = "DONE"
106122

107123
func (m *GCEInstanceManager) CreateHost(zone string, req *apiv1.CreateHostRequest, user accounts.User) (*apiv1.Operation, error) {
@@ -253,18 +269,27 @@ func (m *GCEInstanceManager) WaitOperation(zone string, user accounts.User, name
253269
if op.Status != operationStatusDone {
254270
return nil, errors.NewServiceUnavailableError("Wait for operation timed out", nil)
255271
}
256-
getter := opResultGetter{Service: m.Service, Op: op}
272+
getter := opResultGetter{
273+
Service: m.Service,
274+
Op: op,
275+
InstanceManager: m,
276+
Zone: zone,
277+
}
257278
return getter.Get()
258279
}
259280

260-
func (m *GCEInstanceManager) GetHostClient(zone string, host string) (HostClient, error) {
261-
url, err := m.GetHostURL(zone, host)
281+
func (m *GCEInstanceManager) getHostClientInner(zone string, host string, ins *compute.Instance) (HostClient, error) {
282+
url, err := m.getHostURLInner(zone, host, ins)
262283
if err != nil {
263284
return nil, err
264285
}
265286
return NewNetHostClient(url, m.Config.AllowSelfSignedHostSSLCertificate), nil
266287
}
267288

289+
func (m *GCEInstanceManager) GetHostClient(zone string, host string) (HostClient, error) {
290+
return m.getHostClientInner(zone, host, nil)
291+
}
292+
268293
func (m *GCEInstanceManager) getHostInstance(zone string, host string) (*compute.Instance, error) {
269294
ins, err := m.Service.Instances.
270295
Get(m.Config.GCP.ProjectID, zone, host).
@@ -324,8 +349,10 @@ var (
324349
)
325350

326351
type opResultGetter struct {
327-
Service *compute.Service
328-
Op *compute.Operation
352+
Service *compute.Service
353+
Op *compute.Operation
354+
InstanceManager *GCEInstanceManager
355+
Zone string
329356
}
330357

331358
func (g *opResultGetter) Get() (any, error) {
@@ -362,7 +389,19 @@ func (g *opResultGetter) buildCreateInstanceResult() (*apiv1.HostInstance, error
362389
if err != nil {
363390
return nil, toAppError(err)
364391
}
365-
return BuildHostInstance(ins)
392+
393+
host, err := BuildHostInstance(ins)
394+
if err != nil {
395+
return nil, err
396+
}
397+
client, err := g.InstanceManager.getHostClientInner(g.Zone, host.Name, ins)
398+
if err != nil {
399+
return nil, err
400+
}
401+
if err := client.WaitForHostReady(); err != nil {
402+
return nil, err
403+
}
404+
return host, nil
366405
}
367406

368407
// Converts compute API errors to AppError if relevant, return the same error otherwise

pkg/app/instances/gce_test.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"net/http/httptest"
2525
"net/url"
2626
"reflect"
27+
"strconv"
2728
"strings"
2829
"testing"
2930

@@ -572,9 +573,14 @@ func TestWaitCreateInstanceOperationSucceeds(t *testing.T) {
572573
Name: "foo",
573574
MachineType: "mt",
574575
MinCpuPlatform: "mcp",
576+
NetworkInterfaces: []*compute.NetworkInterface{
577+
{NetworkIP: "127.0.0.1"},
578+
},
575579
}
576580
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
577581
switch path := r.URL.Path; path {
582+
case "/":
583+
replyJSON(w, map[string]string{})
578584
case "/projects/google.com:test-project/zones/us-central1-a/operations/operation-1/wait":
579585
replyJSON(w, operation)
580586
case "/projects/google.com:test-project/zones/us-central1-a/instances/foo":
@@ -584,7 +590,18 @@ func TestWaitCreateInstanceOperationSucceeds(t *testing.T) {
584590
}
585591
}))
586592
defer ts.Close()
587-
im := NewGCEInstanceManager(testConfig, buildTestService(t, ts), testNameGenerator)
593+
594+
tsURL, _ := url.Parse(ts.URL)
595+
port, _ := strconv.Atoi(tsURL.Port())
596+
597+
cfg := testConfig
598+
cfg.HostOrchestratorProtocol = "http"
599+
600+
gcpCfg := *cfg.GCP
601+
gcpCfg.HostOrchestratorPort = port
602+
cfg.GCP = &gcpCfg
603+
604+
im := NewGCEInstanceManager(cfg, buildTestService(t, ts), testNameGenerator)
588605

589606
res, _ := im.WaitOperation(zone, &TestUser{}, opName)
590607

pkg/app/instances/hostclient.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"net/http"
2424
"net/http/httputil"
2525
"net/url"
26+
"time"
2627

2728
apiv1 "github.com/google/cloud-android-orchestration/api/v1"
2829
)
@@ -94,6 +95,21 @@ func (c *NetHostClient) Post(path, query string, bodyJSON any, out *HostResponse
9495
return res.StatusCode, err
9596
}
9697

98+
func (c *NetHostClient) WaitForHostReady() error {
99+
maxWait := 2 * time.Minute
100+
retryDelay := 5 * time.Second
101+
deadline := time.Now().Add(maxWait)
102+
103+
for time.Now().Before(deadline) {
104+
status, err := c.Get("/", "", nil)
105+
if err == nil && status != http.StatusBadGateway {
106+
return nil
107+
}
108+
time.Sleep(retryDelay)
109+
}
110+
return fmt.Errorf("wait for host orchestrator timed out")
111+
}
112+
97113
func (c *NetHostClient) GetReverseProxy() *httputil.ReverseProxy {
98114
devProxy := httputil.NewSingleHostReverseProxy(c.url)
99115
if c.client != http.DefaultClient {

pkg/app/instances/instances.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ type HostClient interface {
4444
Get(URLPath, URLQuery string, res *HostResponse) (int, error)
4545
Post(URLPath, URLQuery string, bodyJSON any, res *HostResponse) (int, error)
4646
GetReverseProxy() *httputil.ReverseProxy
47+
WaitForHostReady() error
4748
}
4849

4950
type HostResponse struct {

pkg/client/client.go

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -126,19 +126,6 @@ func (c *clientImpl) CreateHost(req *apiv1.CreateHostRequest) (*apiv1.HostInstan
126126
return nil, err
127127
}
128128

129-
// There is a short delay between the creation of the host and the availability of the host
130-
// orchestrator. This call ensures the host orchestrator had time to start before returning
131-
// from the this function.
132-
retryOpts := hoclient.RetryOptions{
133-
StatusCodes: []int{http.StatusBadGateway},
134-
RetryDelay: 5 * time.Second,
135-
MaxWait: 2 * time.Minute,
136-
}
137-
hostPath := fmt.Sprintf("/hosts/%s/", ins.Name)
138-
if err := c.httpHelper.NewGetRequest(hostPath).JSONResDoWithRetries(nil, retryOpts); err != nil {
139-
return nil, fmt.Errorf("unable to communicate with host orchestrator: %w", err)
140-
}
141-
142129
return ins, nil
143130
}
144131

0 commit comments

Comments
 (0)