Skip to content

Commit a7c8ab3

Browse files
committed
Add waiting logic for host orchestrator settlement to backend for GCE
While interacting with host orchestrator right after creating host API is returned, there is a possibility of getting 503 service temporarily unavailable error because host is created and booted but host orchestrator is not executed / ready yet. Handling logic for waiting host orchestrator readiness is currently in the client logic. Moving this logic into cloud orchestrator API endpoint makes the client simpler and reduces redundant API endpoints. Apply it for GCE instance manager. Context: b/501288123
1 parent 549f420 commit a7c8ab3

2 files changed

Lines changed: 116 additions & 17 deletions

File tree

pkg/app/instances/gce.go

Lines changed: 98 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ import (
1818
"context"
1919
"fmt"
2020
"log"
21+
"net/http"
2122
"net/url"
2223
"path"
2324
"regexp"
25+
"time"
2426

2527
apiv1 "github.com/google/cloud-android-orchestration/api/v1"
2628
"github.com/google/cloud-android-orchestration/pkg/app/accounts"
@@ -79,27 +81,19 @@ func (m *GCEInstanceManager) ListZones() (*apiv1.ListZonesResponse, error) {
7981
}
8082

8183
func (m *GCEInstanceManager) GetHostAddr(zone string, host string) (string, error) {
82-
instance, err := m.getHostInstance(zone, host)
84+
ins, err := m.getHostInstance(zone, host)
8385
if err != nil {
8486
return "", err
8587
}
86-
ilen := len(instance.NetworkInterfaces)
87-
if ilen == 0 {
88-
log.Printf("host instance %s in zone %s is missing a network interface", host, zone)
89-
return "", errors.NewInternalError("host instance missing a network interface", nil)
90-
}
91-
if ilen > 1 {
92-
log.Printf("host instance %s in zone %s has %d network interfaces", host, zone, ilen)
93-
}
94-
return instance.NetworkInterfaces[0].NetworkIP, nil
88+
return getHostAddrWithIns(ins)
9589
}
9690

9791
func (m *GCEInstanceManager) GetHostURL(zone string, host string) (*url.URL, error) {
98-
addr, err := m.GetHostAddr(zone, host)
92+
ins, err := m.getHostInstance(zone, host)
9993
if err != nil {
10094
return nil, err
10195
}
102-
return url.Parse(fmt.Sprintf("%s://%s:%d", m.Config.HostOrchestratorProtocol, addr, m.Config.GCP.HostOrchestratorPort))
96+
return getHostURLWithIns(ins, &m.Config)
10397
}
10498

10599
const operationStatusDone = "DONE"
@@ -253,16 +247,103 @@ func (m *GCEInstanceManager) WaitOperation(zone string, user accounts.User, name
253247
if op.Status != operationStatusDone {
254248
return nil, errors.NewServiceUnavailableError("Wait for operation timed out", nil)
255249
}
256-
getter := opResultGetter{Service: m.Service, Op: op}
257-
return getter.Get()
250+
getter := opResultGetter{
251+
Service: m.Service,
252+
Op: op,
253+
Config: &m.Config,
254+
}
255+
res, err := getter.Get()
256+
if err != nil {
257+
return nil, err
258+
}
259+
if hostInst, ok := res.(*apiv1.HostInstance); ok && op.OperationType == "insert" {
260+
return m.WaitHostAvailability(zone, user, hostInst.Name)
261+
}
262+
return res, nil
263+
}
264+
265+
func (m *GCEInstanceManager) WaitHostAvailability(zone string, user accounts.User, host string) (*apiv1.HostInstance, error) {
266+
ins, err := m.getHostInstance(zone, host)
267+
if err != nil {
268+
return nil, err
269+
}
270+
hostInstance, err := BuildHostInstance(ins)
271+
if err != nil {
272+
return nil, err
273+
}
274+
client, err := getHostClientWithIns(ins, &m.Config)
275+
if err != nil {
276+
return nil, err
277+
}
278+
if err := m.waitForOrchestrator(zone, hostInstance, client); err != nil {
279+
return nil, err
280+
}
281+
return hostInstance, nil
282+
}
283+
284+
type gceHostReadinessChecker struct {
285+
checker *BasicHostReadinessChecker
286+
manager *GCEInstanceManager
287+
zone string
288+
hostName string
289+
}
290+
291+
func (g *gceHostReadinessChecker) IsHostReady() (bool, error) {
292+
_, err := g.manager.Service.Instances.Get(g.manager.Config.GCP.ProjectID, g.zone, g.hostName).Context(context.TODO()).Do()
293+
if err != nil {
294+
if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code == http.StatusNotFound {
295+
return false, errors.NewNotFoundError("Host was deleted concurrently", err)
296+
}
297+
return false, fmt.Errorf("failed to check host existence: %w", err)
298+
}
299+
return g.checker.IsHostReady()
300+
}
301+
302+
func (m *GCEInstanceManager) waitForOrchestrator(zone string, host *apiv1.HostInstance, client HostClient) error {
303+
gceChecker := &gceHostReadinessChecker{
304+
checker: &BasicHostReadinessChecker{Client: client},
305+
manager: m,
306+
zone: zone,
307+
hostName: host.Name,
308+
}
309+
310+
return WaitForHostReady(gceChecker, 5*time.Minute, 5*time.Second)
311+
}
312+
313+
func getHostAddrWithIns(ins *compute.Instance) (string, error) {
314+
ilen := len(ins.NetworkInterfaces)
315+
if ilen == 0 {
316+
log.Printf("host instance %s in zone %s is missing a network interface", ins.Name, ins.Zone)
317+
return "", errors.NewInternalError("host instance missing a network interface", nil)
318+
}
319+
if ilen > 1 {
320+
log.Printf("host instance %s in zone %s has %d network interfaces", ins.Name, ins.Zone, ilen)
321+
}
322+
return ins.NetworkInterfaces[0].NetworkIP, nil
323+
}
324+
325+
func getHostURLWithIns(ins *compute.Instance, config *Config) (*url.URL, error) {
326+
addr, err := getHostAddrWithIns(ins)
327+
if err != nil {
328+
return nil, err
329+
}
330+
return url.Parse(fmt.Sprintf("%s://%s:%d", config.HostOrchestratorProtocol, addr, config.GCP.HostOrchestratorPort))
331+
}
332+
333+
func getHostClientWithIns(ins *compute.Instance, config *Config) (HostClient, error) {
334+
url, err := getHostURLWithIns(ins, config)
335+
if err != nil {
336+
return nil, err
337+
}
338+
return NewNetHostClient(url, config.AllowSelfSignedHostSSLCertificate), nil
258339
}
259340

260341
func (m *GCEInstanceManager) GetHostClient(zone string, host string) (HostClient, error) {
261-
url, err := m.GetHostURL(zone, host)
342+
ins, err := m.getHostInstance(zone, host)
262343
if err != nil {
263344
return nil, err
264345
}
265-
return NewNetHostClient(url, m.Config.AllowSelfSignedHostSSLCertificate), nil
346+
return getHostClientWithIns(ins, &m.Config)
266347
}
267348

268349
func (m *GCEInstanceManager) getHostInstance(zone string, host string) (*compute.Instance, error) {
@@ -326,6 +407,7 @@ var (
326407
type opResultGetter struct {
327408
Service *compute.Service
328409
Op *compute.Operation
410+
Config *Config
329411
}
330412

331413
func (g *opResultGetter) Get() (any, error) {

pkg/app/instances/gce_test.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"net/http/httptest"
2525
"net/url"
2626
"reflect"
27+
"strconv"
2728
"strings"
2829
"testing"
2930

@@ -572,9 +573,14 @@ func TestWaitCreateInstanceOperationSucceeds(t *testing.T) {
572573
Name: "foo",
573574
MachineType: "mt",
574575
MinCpuPlatform: "mcp",
576+
NetworkInterfaces: []*compute.NetworkInterface{
577+
{NetworkIP: "127.0.0.1"},
578+
},
575579
}
576580
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
577581
switch path := r.URL.Path; path {
582+
case "/":
583+
replyJSON(w, map[string]string{})
578584
case "/projects/google.com:test-project/zones/us-central1-a/operations/operation-1/wait":
579585
replyJSON(w, operation)
580586
case "/projects/google.com:test-project/zones/us-central1-a/instances/foo":
@@ -584,7 +590,18 @@ func TestWaitCreateInstanceOperationSucceeds(t *testing.T) {
584590
}
585591
}))
586592
defer ts.Close()
587-
im := NewGCEInstanceManager(testConfig, buildTestService(t, ts), testNameGenerator)
593+
594+
tsURL, _ := url.Parse(ts.URL)
595+
port, _ := strconv.Atoi(tsURL.Port())
596+
597+
cfg := testConfig
598+
cfg.HostOrchestratorProtocol = "http"
599+
600+
gcpCfg := *cfg.GCP
601+
gcpCfg.HostOrchestratorPort = port
602+
cfg.GCP = &gcpCfg
603+
604+
im := NewGCEInstanceManager(cfg, buildTestService(t, ts), testNameGenerator)
588605

589606
res, _ := im.WaitOperation(zone, &TestUser{}, opName)
590607

0 commit comments

Comments
 (0)