Skip to content

Commit 4dff5ee

Browse files
committed
Add waiting logic for host orchestrator settlement to backend for GCE
While interacting with host orchestrator right after creating host API is returned, there is a possibility of getting 503 service temporarily unavailable error because host is created and booted but host orchestrator is not executed / ready yet. Handling logic for waiting host orchestrator readiness is currently in the client logic. Moving this logic into cloud orchestrator API endpoint makes the client simpler and reduces redundant API endpoints. Apply it for GCE instance manager. Context: b/501288123
1 parent 8d31e14 commit 4dff5ee

2 files changed

Lines changed: 121 additions & 17 deletions

File tree

pkg/app/instances/gce.go

Lines changed: 103 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ import (
1818
"context"
1919
"fmt"
2020
"log"
21+
"net/http"
2122
"net/url"
2223
"path"
2324
"regexp"
25+
"time"
2426

2527
apiv1 "github.com/google/cloud-android-orchestration/api/v1"
2628
"github.com/google/cloud-android-orchestration/pkg/app/accounts"
@@ -79,27 +81,19 @@ func (m *GCEInstanceManager) ListZones() (*apiv1.ListZonesResponse, error) {
7981
}
8082

8183
func (m *GCEInstanceManager) GetHostAddr(zone string, host string) (string, error) {
82-
instance, err := m.getHostInstance(zone, host)
84+
ins, err := m.getHostInstance(zone, host)
8385
if err != nil {
8486
return "", err
8587
}
86-
ilen := len(instance.NetworkInterfaces)
87-
if ilen == 0 {
88-
log.Printf("host instance %s in zone %s is missing a network interface", host, zone)
89-
return "", errors.NewInternalError("host instance missing a network interface", nil)
90-
}
91-
if ilen > 1 {
92-
log.Printf("host instance %s in zone %s has %d network interfaces", host, zone, ilen)
93-
}
94-
return instance.NetworkInterfaces[0].NetworkIP, nil
88+
return getHostAddrWithIns(ins)
9589
}
9690

9791
func (m *GCEInstanceManager) GetHostURL(zone string, host string) (*url.URL, error) {
98-
addr, err := m.GetHostAddr(zone, host)
92+
ins, err := m.getHostInstance(zone, host)
9993
if err != nil {
10094
return nil, err
10195
}
102-
return url.Parse(fmt.Sprintf("%s://%s:%d", m.Config.HostOrchestratorProtocol, addr, m.Config.GCP.HostOrchestratorPort))
96+
return getHostURLWithIns(ins, &m.Config)
10397
}
10498

10599
const operationStatusDone = "DONE"
@@ -253,16 +247,108 @@ func (m *GCEInstanceManager) WaitOperation(zone string, user accounts.User, name
253247
if op.Status != operationStatusDone {
254248
return nil, errors.NewServiceUnavailableError("Wait for operation timed out", nil)
255249
}
256-
getter := opResultGetter{Service: m.Service, Op: op}
257-
return getter.Get()
250+
getter := opResultGetter{
251+
Service: m.Service,
252+
Op: op,
253+
Config: &m.Config,
254+
}
255+
res, err := getter.Get()
256+
if err != nil {
257+
return nil, err
258+
}
259+
if hostInst, ok := res.(*apiv1.HostInstance); ok && op.OperationType == "insert" {
260+
return m.WaitHostAvailability(zone, user, hostInst.Name)
261+
}
262+
return res, nil
263+
}
264+
265+
func (m *GCEInstanceManager) WaitHostAvailability(zone string, user accounts.User, host string) (*apiv1.HostInstance, error) {
266+
ins, err := m.getHostInstance(zone, host)
267+
if err != nil {
268+
return nil, err
269+
}
270+
hostInstance, err := BuildHostInstance(ins)
271+
if err != nil {
272+
return nil, err
273+
}
274+
client, err := getHostClientWithIns(ins, &m.Config)
275+
if err != nil {
276+
return nil, err
277+
}
278+
if err := m.waitForOrchestrator(zone, hostInstance, client); err != nil {
279+
return nil, err
280+
}
281+
return hostInstance, nil
282+
}
283+
284+
type gceReadinessChecker struct {
285+
checker ReadinessChecker
286+
manager *GCEInstanceManager
287+
zone string
288+
hostName string
289+
}
290+
291+
func (g *gceReadinessChecker) IsHostReady() (bool, error) {
292+
_, err := g.manager.Service.Instances.Get(g.manager.Config.GCP.ProjectID, g.zone, g.hostName).Context(context.TODO()).Do()
293+
if err != nil {
294+
if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code == http.StatusNotFound {
295+
return false, errors.NewNotFoundError("Host was deleted concurrently", err)
296+
}
297+
return false, fmt.Errorf("failed to check host existence: %w", err)
298+
}
299+
return g.checker.IsHostReady()
300+
}
301+
302+
func (m *GCEInstanceManager) waitForOrchestrator(zone string, host *apiv1.HostInstance, client HostClient) error {
303+
readinessChecker, ok := client.(ReadinessChecker)
304+
if !ok {
305+
readinessChecker = &defaultReadinessChecker{client}
306+
}
307+
308+
gceChecker := &gceReadinessChecker{
309+
checker: readinessChecker,
310+
manager: m,
311+
zone: zone,
312+
hostName: host.Name,
313+
}
314+
315+
return WaitForHostReady(gceChecker, 5*time.Minute, 5*time.Second)
316+
}
317+
318+
func getHostAddrWithIns(ins *compute.Instance) (string, error) {
319+
ilen := len(ins.NetworkInterfaces)
320+
if ilen == 0 {
321+
log.Printf("host instance %s in zone %s is missing a network interface", ins.Name, ins.Zone)
322+
return "", errors.NewInternalError("host instance missing a network interface", nil)
323+
}
324+
if ilen > 1 {
325+
log.Printf("host instance %s in zone %s has %d network interfaces", ins.Name, ins.Zone, ilen)
326+
}
327+
return ins.NetworkInterfaces[0].NetworkIP, nil
328+
}
329+
330+
func getHostURLWithIns(ins *compute.Instance, config *Config) (*url.URL, error) {
331+
addr, err := getHostAddrWithIns(ins)
332+
if err != nil {
333+
return nil, err
334+
}
335+
return url.Parse(fmt.Sprintf("%s://%s:%d", config.HostOrchestratorProtocol, addr, config.GCP.HostOrchestratorPort))
336+
}
337+
338+
func getHostClientWithIns(ins *compute.Instance, config *Config) (HostClient, error) {
339+
url, err := getHostURLWithIns(ins, config)
340+
if err != nil {
341+
return nil, err
342+
}
343+
return NewNetHostClient(url, config.AllowSelfSignedHostSSLCertificate), nil
258344
}
259345

260346
func (m *GCEInstanceManager) GetHostClient(zone string, host string) (HostClient, error) {
261-
url, err := m.GetHostURL(zone, host)
347+
ins, err := m.getHostInstance(zone, host)
262348
if err != nil {
263349
return nil, err
264350
}
265-
return NewNetHostClient(url, m.Config.AllowSelfSignedHostSSLCertificate), nil
351+
return getHostClientWithIns(ins, &m.Config)
266352
}
267353

268354
func (m *GCEInstanceManager) getHostInstance(zone string, host string) (*compute.Instance, error) {
@@ -326,6 +412,7 @@ var (
326412
type opResultGetter struct {
327413
Service *compute.Service
328414
Op *compute.Operation
415+
Config *Config
329416
}
330417

331418
func (g *opResultGetter) Get() (any, error) {

pkg/app/instances/gce_test.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"net/http/httptest"
2525
"net/url"
2626
"reflect"
27+
"strconv"
2728
"strings"
2829
"testing"
2930

@@ -572,9 +573,14 @@ func TestWaitCreateInstanceOperationSucceeds(t *testing.T) {
572573
Name: "foo",
573574
MachineType: "mt",
574575
MinCpuPlatform: "mcp",
576+
NetworkInterfaces: []*compute.NetworkInterface{
577+
{NetworkIP: "127.0.0.1"},
578+
},
575579
}
576580
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
577581
switch path := r.URL.Path; path {
582+
case "/":
583+
replyJSON(w, map[string]string{})
578584
case "/projects/google.com:test-project/zones/us-central1-a/operations/operation-1/wait":
579585
replyJSON(w, operation)
580586
case "/projects/google.com:test-project/zones/us-central1-a/instances/foo":
@@ -584,7 +590,18 @@ func TestWaitCreateInstanceOperationSucceeds(t *testing.T) {
584590
}
585591
}))
586592
defer ts.Close()
587-
im := NewGCEInstanceManager(testConfig, buildTestService(t, ts), testNameGenerator)
593+
594+
tsURL, _ := url.Parse(ts.URL)
595+
port, _ := strconv.Atoi(tsURL.Port())
596+
597+
cfg := testConfig
598+
cfg.HostOrchestratorProtocol = "http"
599+
600+
gcpCfg := *cfg.GCP
601+
gcpCfg.HostOrchestratorPort = port
602+
cfg.GCP = &gcpCfg
603+
604+
im := NewGCEInstanceManager(cfg, buildTestService(t, ts), testNameGenerator)
588605

589606
res, _ := im.WaitOperation(zone, &TestUser{}, opName)
590607

0 commit comments

Comments
 (0)