Skip to content

Commit e2a438b

Browse files
scotwells and claude committed
feat: surface instance blocking reasons and claim instanceType vCPU/memory
Two Instance-controller correctness changes: - Blocking-reason rollup: surface the most specific provider sub-condition (ImageUnavailable, InstanceCrashing, ConfigurationError, Provisioning) and its message onto the Instance Ready condition instead of a generic "Instance has not been programmed", so e.g. an image-pull failure reads as ImageUnavailable with the real message. Adds the reason constants and ranks them in the blocking-reason priority. - Quota sizing: resolve vCPU/memory for instanceType-sized instances from a new instanceTypeCatalog (datumcloud/d1-standard-2 = 1 vCPU / 2 GiB) so the quota ResourceClaim requests vcpus + memory, not just instance count. Explicit container limits / instance requests still take precedence. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 8e55d09 commit e2a438b

3 files changed

Lines changed: 799 additions & 20 deletions

File tree

api/v1alpha/instance_types.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,28 @@ const (
463463
// InstanceReadyReasonAvailable indicates that the instance is available
464464
InstanceReadyReasonAvailable = "Available"
465465

466+
// InstanceReadyReasonImageUnavailable indicates the provider could not pull
467+
// the instance image (bad name, missing credentials, registry unreachable).
468+
// This matches the reason written by translateWaitingReason in the unikraft
469+
// provider when the container enters an image-pull waiting state.
470+
InstanceReadyReasonImageUnavailable = "ImageUnavailable"
471+
472+
// InstanceReadyReasonInstanceCrashing indicates the instance process started
473+
// but is repeatedly exiting and being restarted (CrashLoopBackOff in the
474+
// underlying runtime). This is user-actionable: the application itself is
475+
// failing, not the platform.
476+
InstanceReadyReasonInstanceCrashing = "InstanceCrashing"
477+
478+
// InstanceReadyReasonConfigurationError indicates the runtime rejected the
479+
// instance configuration before the process could start (e.g. invalid env
480+
// variable injection, missing device). User must correct the workload spec.
481+
InstanceReadyReasonConfigurationError = "ConfigurationError"
482+
483+
// InstanceReadyReasonProvisioning indicates the instance runtime is still
484+
// setting up the execution environment (container being created, image being
485+
// unpacked). This is a transient, non-actionable state.
486+
InstanceReadyReasonProvisioning = "Provisioning"
487+
466488
// InstanceAvailableReasonStopped indicates that the instance is stopped
467489
InstanceAvailableReasonStopped = "Stopped"
468490

internal/controller/instance_controller.go

Lines changed: 178 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,30 @@ const (
9696
reasonNetworkFailedToCreate = "NetworkFailedToCreate"
9797
)
9898

99+
// instanceTypeResources holds the vCPU and memory for a named instance type.
100+
type instanceTypeResources struct {
101+
// CPUMillicores is the number of CPU millicores (1000 = 1 vCPU).
102+
CPUMillicores int64
103+
// MemoryMiB is the amount of RAM in mebibytes.
104+
MemoryMiB int64
105+
}
106+
107+
// instanceTypeCatalog maps platform instance type names to their resource
108+
// dimensions used for quota accounting when the instance spec carries only an
109+
// instanceType and no explicit container Limits or instance-level Requests.
110+
//
111+
// These are the platform-declared quota sizes for the instance type, not a
112+
// derivation of any infra provider's machine type. (infra-provider-gcp separately
113+
// maps datumcloud/d1-standard-2 to the GCP n2-standard-2 machine type for VM
114+
// provisioning; that mapping does not define the quota size here.) When new
115+
// instance types are added, add them here with their vCPU/memory values.
116+
var instanceTypeCatalog = map[string]instanceTypeResources{
117+
"datumcloud/d1-standard-2": {
118+
CPUMillicores: 1000, // 1 vCPU
119+
MemoryMiB: 2048, // 2 GiB
120+
},
121+
}
122+
99123
// Quota-pending requeue backoff. The instance controller is normally re-queued by
100124
// the ResourceClaim watch when a claim is granted, but that grant event lives on
101125
// the project control plane and can be missed (informer engagement races, watch
@@ -852,8 +876,24 @@ func (r *InstanceReconciler) classifyCreateError(
852876
}, fmt.Errorf("failed creating resource claim: %w", err)
853877
}
854878

879+
// resolveInstanceResources determines the vCPU and memory amounts to claim
880+
// for an instance. Explicit sizing always takes precedence over the instance
881+
// type catalog, so a workload that overrides container limits is accounted at
882+
// its actual resource footprint rather than the catalog baseline.
883+
//
884+
// Precedence order:
885+
// 1. Sandbox container Limits (sum across all containers) — all containers
886+
// must have both cpu and memory Limits for this path to succeed.
887+
// 2. Instance-level Resources.Requests — both cpu and memory must be present.
888+
// 3. instanceTypeCatalog lookup by instanceType — used for the common case
889+
// where a workload is sized only by instanceType with no explicit limits.
890+
//
891+
// Returns (0, 0, false) when none of the above yield a complete sizing, so
892+
// the caller falls back to claiming only the instance count.
855893
func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores int64, memMiB int64, resolved bool) {
856894
rt := instance.Spec.Runtime
895+
896+
// Path 1: explicit per-container Limits — most specific, wins if fully set.
857897
if rt.Sandbox != nil {
858898
var totalCPU resource.Quantity
859899
var totalMem resource.Quantity
@@ -872,18 +912,59 @@ func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores
872912
totalCPU.Add(cpu)
873913
totalMem.Add(mem)
874914
}
875-
if !allSet || len(rt.Sandbox.Containers) == 0 {
876-
return 0, 0, false
915+
if allSet && len(rt.Sandbox.Containers) > 0 {
916+
return totalCPU.MilliValue(), totalMem.Value() / (1024 * 1024), true
877917
}
878-
return totalCPU.MilliValue(), totalMem.Value() / (1024 * 1024), true
918+
// No containers, or container limits are incomplete — do not return false
919+
// here; fall through to instance-level Requests (Path 2), then the catalog (Path 3).
879920
}
880921

922+
// Path 2: instance-level resource requests.
881923
cpu, hasCPU := rt.Resources.Requests[corev1.ResourceCPU]
882924
mem, hasMem := rt.Resources.Requests[corev1.ResourceMemory]
883-
if !hasCPU || !hasMem {
884-
return 0, 0, false
925+
if hasCPU && hasMem {
926+
return cpu.MilliValue(), mem.Value() / (1024 * 1024), true
927+
}
928+
929+
// Path 3: instanceType catalog — handles the typical production case where
930+
// instanceType is the only sizing signal and no explicit limits are set.
931+
if rt.Resources.InstanceType != "" {
932+
if spec, ok := instanceTypeCatalog[rt.Resources.InstanceType]; ok {
933+
return spec.CPUMillicores, spec.MemoryMiB, true
934+
}
935+
}
936+
937+
return 0, 0, false
938+
}
939+
940+
// instanceBlockingReasonPriority ranks Instance blocking reasons so the most
941+
// specific, user-actionable cause wins when several conditions are unsatisfied.
942+
// Higher numbers are more specific. Reasons absent from the table rank 0.
943+
//
944+
// 0 - unknown/default
945+
// 1 - Provisioning (transient runtime startup)
946+
// 3 - PendingQuota (operator action may be needed)
947+
// 5 - ImageUnavailable / InstanceCrashing / ConfigurationError
948+
// (hard runtime error, user-actionable)
949+
// 7 - NetworkFailedToCreate (hard infra error)
950+
func instanceBlockingReasonPriority(reason string) int {
951+
switch reason {
952+
case computev1alpha.InstanceReadyReasonProvisioning:
953+
return 1
954+
case computev1alpha.InstanceProgrammedReasonPendingQuota:
955+
return 3
956+
case computev1alpha.InstanceReadyReasonImageUnavailable,
957+
computev1alpha.InstanceReadyReasonInstanceCrashing,
958+
computev1alpha.InstanceReadyReasonConfigurationError:
959+
// Hard runtime errors are user-actionable (wrong image, crashing app, bad
960+
// config) and rank highest among non-infra reasons so they are not buried
961+
// under transient startup/quota reasons.
962+
return 5
963+
case reasonNetworkFailedToCreate:
964+
return 7
965+
default:
966+
return 0
885967
}
886-
return cpu.MilliValue(), mem.Value() / (1024 * 1024), true
887968
}
888969

889970
// networkFailureChecker is a function that checks if a network creation failure
@@ -967,16 +1048,88 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition(
9671048
if programmedCondition == nil || programmedCondition.Status != metav1.ConditionTrue {
9681049
logger.Info("instance is not programmed", "instance", instance.Name)
9691050

970-
readyCondition.Status = metav1.ConditionFalse
971-
readyCondition.Reason = computev1alpha.InstanceProgrammedReasonPendingProgramming
972-
if programmedCondition != nil && programmedCondition.Reason != pendingReason {
973-
readyCondition.Reason = programmedCondition.Reason
1051+
// Surface the most specific provider sub-condition rather than a generic
1052+
// "Instance has not been programmed". A provider reason like
1053+
// ImageUnavailable (set on the Available condition while Programmed is
1054+
// still Unknown) must surface on Ready with its actionable message.
1055+
//
1056+
// Two tiers are tracked:
1057+
// - best: the highest-priority candidate found (priority-table reasons rank 1-7).
1058+
// - fallbackCandidate: the Programmed condition's own reason/message when it has
1059+
// one but it is not in the priority table (e.g. a provider
1060+
// writes a custom Programmed reason otherwise unknown to
1061+
// this controller). Preserves Programmed.Reason → Ready.Reason
1062+
// pass-through behavior.
1063+
type candidate struct {
1064+
status metav1.ConditionStatus
1065+
reason string
1066+
message string
1067+
priority int
1068+
}
1069+
1070+
// Generic default — used only when nothing better is found.
1071+
fallbackCandidate := candidate{
1072+
status: metav1.ConditionFalse,
1073+
reason: computev1alpha.InstanceProgrammedReasonPendingProgramming,
1074+
message: msgNotProgrammed,
1075+
priority: -1,
1076+
}
1077+
// Promote the Programmed condition's own reason as a fallback when it is
1078+
// more specific than PendingProgramming/Pending but not in the priority
1079+
// table. Preserves pass-through for provider-written Programmed reasons.
1080+
if programmedCondition != nil && programmedCondition.Reason != pendingReason &&
1081+
programmedCondition.Reason != computev1alpha.InstanceProgrammedReasonPendingProgramming {
1082+
fallbackCandidate = candidate{
1083+
status: programmedCondition.Status,
1084+
reason: programmedCondition.Reason,
1085+
message: programmedCondition.Message,
1086+
priority: 0,
1087+
}
1088+
}
1089+
1090+
best := fallbackCandidate
1091+
consider := func(status metav1.ConditionStatus, reason, message string) {
1092+
// A generic "Pending" reason carries no actionable signal; skip it so
1093+
// it cannot displace an already-set specific reason from the provider.
1094+
if reason == pendingReason {
1095+
return
1096+
}
1097+
p := instanceBlockingReasonPriority(reason)
1098+
if p > best.priority {
1099+
best = candidate{status: status, reason: reason, message: message, priority: p}
1100+
}
9741101
}
9751102

976-
readyCondition.Message = msgNotProgrammed
977-
if programmedCondition != nil && programmedCondition.Status != metav1.ConditionUnknown {
978-
readyCondition.Message = programmedCondition.Message
1103+
// Sub-conditions set by the provider (e.g. Available=Unknown/ImageUnavailable)
1104+
// may be more specific than the Programmed condition. Consult each one so
1105+
// the highest-priority reason wins, regardless of which condition carries it.
1106+
for _, cond := range instance.Status.Conditions {
1107+
if cond.Status == metav1.ConditionTrue {
1108+
// Satisfied conditions are not blocking; skip them.
1109+
continue
1110+
}
1111+
switch cond.Type {
1112+
case computev1alpha.InstanceProgrammed,
1113+
computev1alpha.InstanceReady,
1114+
computev1alpha.InstanceQuotaGranted:
1115+
// InstanceProgrammed is handled below; InstanceReady is being set
1116+
// now. InstanceQuotaGranted is a gate-level signal evaluated before
1117+
// this branch is reached — including it here would let a transient
1118+
// PendingEvaluation reason displace the generic not-programmed
1119+
// fallback when no provider sub-condition is set yet.
1120+
continue
1121+
}
1122+
consider(cond.Status, cond.Reason, cond.Message)
9791123
}
1124+
// Also let the Programmed condition itself compete through the priority table
1125+
// in case it carries a known reason (e.g. PendingQuota).
1126+
if programmedCondition != nil {
1127+
consider(programmedCondition.Status, programmedCondition.Reason, programmedCondition.Message)
1128+
}
1129+
1130+
readyCondition.Status = best.status
1131+
readyCondition.Reason = best.reason
1132+
readyCondition.Message = best.message
9801133

9811134
return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil
9821135
}
@@ -987,16 +1140,21 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition(
9871140
if availableCondition == nil || availableCondition.Status != metav1.ConditionTrue {
9881141
logger.Info("instance is not available", "instance", instance.Name)
9891142

990-
readyCondition.Status = metav1.ConditionFalse
991-
readyCondition.Reason = pendingReason
1143+
// Propagate the Available condition's reason and message directly —
1144+
// including when the status is Unknown — so provider-set reasons like
1145+
// ImageUnavailable surface on Ready rather than a generic message.
1146+
readyStatus := metav1.ConditionFalse
1147+
readyReason := pendingReason
1148+
readyMessage := "Instance is not available"
9921149
if availableCondition != nil && availableCondition.Reason != pendingReason {
993-
readyCondition.Reason = availableCondition.Reason
1150+
readyStatus = availableCondition.Status
1151+
readyReason = availableCondition.Reason
1152+
readyMessage = availableCondition.Message
9941153
}
9951154

996-
readyCondition.Message = "Instance is not available"
997-
if availableCondition != nil && availableCondition.Status != metav1.ConditionUnknown {
998-
readyCondition.Message = availableCondition.Message
999-
}
1155+
readyCondition.Status = readyStatus
1156+
readyCondition.Reason = readyReason
1157+
readyCondition.Message = readyMessage
10001158

10011159
return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil
10021160
}

0 commit comments

Comments
 (0)