@@ -96,6 +96,30 @@ const (
9696 reasonNetworkFailedToCreate = "NetworkFailedToCreate"
9797)
9898
// instanceTypeResources describes the compute capacity associated with a named
// instance type: how much CPU and how much memory it represents.
type instanceTypeResources struct {
	// CPUMillicores is the CPU capacity expressed in millicores, where 1000
	// millicores correspond to one vCPU.
	CPUMillicores int64
	// MemoryMiB is the RAM capacity expressed in mebibytes.
	MemoryMiB int64
}
106+
107+ // instanceTypeCatalog maps platform instance type names to their resource
108+ // dimensions used for quota accounting when the instance spec carries only an
109+ // instanceType and no explicit container Limits or instance-level Requests.
110+ //
111+ // These are the platform-declared quota sizes for the instance type, not a
112+ // derivation of any infra provider's machine type. (infra-provider-gcp separately
113+ // maps datumcloud/d1-standard-2 to the GCP n2-standard-2 machine type for VM
114+ // provisioning; that mapping does not define the quota size here.) When new
115+ // instance types are added, add them here with their vCPU/memory values.
116+ var instanceTypeCatalog = map [string ]instanceTypeResources {
117+ "datumcloud/d1-standard-2" : {
118+ CPUMillicores : 1000 , // 1 vCPU
119+ MemoryMiB : 2048 , // 2 GiB
120+ },
121+ }
122+
99123// Quota-pending requeue backoff. The instance controller is normally re-queued by
100124// the ResourceClaim watch when a claim is granted, but that grant event lives on
101125// the project control plane and can be missed (informer engagement races, watch
@@ -852,8 +876,24 @@ func (r *InstanceReconciler) classifyCreateError(
852876 }, fmt .Errorf ("failed creating resource claim: %w" , err )
853877}
854878
879+ // resolveInstanceResources determines the vCPU and memory amounts to claim
880+ // for an instance. Explicit sizing always takes precedence over the instance
881+ // type catalog, so a workload that overrides container limits is accounted at
882+ // its actual resource footprint rather than the catalog baseline.
883+ //
884+ // Precedence order:
885+ // 1. Sandbox container Limits (sum across all containers) — all containers
886+ // must have both cpu and memory Limits for this path to succeed.
887+ // 2. Instance-level Resources.Requests — both cpu and memory must be present.
888+ // 3. instanceTypeCatalog lookup by instanceType — used for the common case
889+ // where a workload is sized only by instanceType with no explicit limits.
890+ //
891+ // Returns (0, 0, false) when none of the above yield a complete sizing, so
892+ // the caller falls back to claiming only the instance count.
855893func resolveInstanceResources (instance * computev1alpha.Instance ) (cpuMillicores int64 , memMiB int64 , resolved bool ) {
856894 rt := instance .Spec .Runtime
895+
896+ // Path 1: explicit per-container Limits — most specific, wins if fully set.
857897 if rt .Sandbox != nil {
858898 var totalCPU resource.Quantity
859899 var totalMem resource.Quantity
@@ -872,18 +912,59 @@ func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores
872912 totalCPU .Add (cpu )
873913 totalMem .Add (mem )
874914 }
875- if ! allSet || len (rt .Sandbox .Containers ) == 0 {
876- return 0 , 0 , false
915+ if allSet && len (rt .Sandbox .Containers ) > 0 {
916+ return totalCPU . MilliValue (), totalMem . Value () / ( 1024 * 1024 ), true
877917 }
878- return totalCPU .MilliValue (), totalMem .Value () / (1024 * 1024 ), true
918+ // Limits are incomplete or no containers exist — fall through to instance-level
919+ // Requests and then the catalog rather than returning false here.
879920 }
880921
922+ // Path 2: instance-level resource requests.
881923 cpu , hasCPU := rt .Resources .Requests [corev1 .ResourceCPU ]
882924 mem , hasMem := rt .Resources .Requests [corev1 .ResourceMemory ]
883- if ! hasCPU || ! hasMem {
884- return 0 , 0 , false
925+ if hasCPU && hasMem {
926+ return cpu .MilliValue (), mem .Value () / (1024 * 1024 ), true
927+ }
928+
929+ // Path 3: instanceType catalog — handles the typical production case where
930+ // instanceType is the only sizing signal and no explicit limits are set.
931+ if rt .Resources .InstanceType != "" {
932+ if spec , ok := instanceTypeCatalog [rt .Resources .InstanceType ]; ok {
933+ return spec .CPUMillicores , spec .MemoryMiB , true
934+ }
935+ }
936+
937+ return 0 , 0 , false
938+ }
939+
940+ // instanceBlockingReasonPriority ranks Instance blocking reasons so the most
941+ // specific, user-actionable cause wins when several conditions are unsatisfied.
942+ // Higher numbers are more specific. Reasons absent from the table rank 0.
943+ //
944+ // 0 - unknown/default
945+ // 1 - Provisioning (transient runtime startup)
946+ // 3 - PendingQuota (operator action may be needed)
947+ // 5 - ImageUnavailable / InstanceCrashing / ConfigurationError
948+ // (hard runtime error, user-actionable)
949+ // 7 - NetworkFailedToCreate (hard infra error)
950+ func instanceBlockingReasonPriority (reason string ) int {
951+ switch reason {
952+ case computev1alpha .InstanceReadyReasonProvisioning :
953+ return 1
954+ case computev1alpha .InstanceProgrammedReasonPendingQuota :
955+ return 3
956+ case computev1alpha .InstanceReadyReasonImageUnavailable ,
957+ computev1alpha .InstanceReadyReasonInstanceCrashing ,
958+ computev1alpha .InstanceReadyReasonConfigurationError :
959+ // Hard runtime errors are user-actionable (wrong image, crashing app, bad
960+ // config) and rank highest among non-infra reasons so they are not buried
961+ // under transient startup/quota reasons.
962+ return 5
963+ case reasonNetworkFailedToCreate :
964+ return 7
965+ default :
966+ return 0
885967 }
886- return cpu .MilliValue (), mem .Value () / (1024 * 1024 ), true
887968}
888969
889970// networkFailureChecker is a function that checks if a network creation failure
@@ -967,16 +1048,88 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition(
9671048 if programmedCondition == nil || programmedCondition .Status != metav1 .ConditionTrue {
9681049 logger .Info ("instance is not programmed" , "instance" , instance .Name )
9691050
970- readyCondition .Status = metav1 .ConditionFalse
971- readyCondition .Reason = computev1alpha .InstanceProgrammedReasonPendingProgramming
972- if programmedCondition != nil && programmedCondition .Reason != pendingReason {
973- readyCondition .Reason = programmedCondition .Reason
1051+ // Surface the most specific provider sub-condition rather than a generic
1052+ // "Instance has not been programmed". A provider reason like
1053+ // ImageUnavailable (set on the Available condition while Programmed is
1054+ // still Unknown) must surface on Ready with its actionable message.
1055+ //
1056+ // Two tiers are tracked:
1057+ // - best: the best candidate from the priority table (ranked 1-7).
1058+ // - fallbackCandidate: the Programmed condition's own reason/message when it has
1059+ // one but it is not in the priority table (e.g. a provider
1060+ // writes a custom Programmed reason otherwise unknown to
1061+ // this controller). Preserves Programmed.Reason → Ready.Reason
1062+ // pass-through behavior.
1063+ type candidate struct {
1064+ status metav1.ConditionStatus
1065+ reason string
1066+ message string
1067+ priority int
1068+ }
1069+
1070+ // Generic default — used only when nothing better is found.
1071+ fallbackCandidate := candidate {
1072+ status : metav1 .ConditionFalse ,
1073+ reason : computev1alpha .InstanceProgrammedReasonPendingProgramming ,
1074+ message : msgNotProgrammed ,
1075+ priority : - 1 ,
1076+ }
1077+ // Promote the Programmed condition's own reason as a fallback when it is
1078+ // more specific than PendingProgramming/Pending but not in the priority
1079+ // table. Preserves pass-through for provider-written Programmed reasons.
1080+ if programmedCondition != nil && programmedCondition .Reason != pendingReason &&
1081+ programmedCondition .Reason != computev1alpha .InstanceProgrammedReasonPendingProgramming {
1082+ fallbackCandidate = candidate {
1083+ status : programmedCondition .Status ,
1084+ reason : programmedCondition .Reason ,
1085+ message : programmedCondition .Message ,
1086+ priority : 0 ,
1087+ }
1088+ }
1089+
1090+ best := fallbackCandidate
1091+ consider := func (status metav1.ConditionStatus , reason , message string ) {
1092+ // A generic "Pending" reason carries no actionable signal; skip it so
1093+ // it cannot displace an already-set specific reason from the provider.
1094+ if reason == pendingReason {
1095+ return
1096+ }
1097+ p := instanceBlockingReasonPriority (reason )
1098+ if p > best .priority {
1099+ best = candidate {status : status , reason : reason , message : message , priority : p }
1100+ }
9741101 }
9751102
976- readyCondition .Message = msgNotProgrammed
977- if programmedCondition != nil && programmedCondition .Status != metav1 .ConditionUnknown {
978- readyCondition .Message = programmedCondition .Message
1103+ // Sub-conditions set by the provider (e.g. Available=Unknown/ImageUnavailable)
1104+ // may be more specific than the Programmed condition. Consult each one so
1105+ // the highest-priority reason wins, regardless of which condition carries it.
1106+ for _ , cond := range instance .Status .Conditions {
1107+ if cond .Status == metav1 .ConditionTrue {
1108+ // Satisfied conditions are not blocking; skip them.
1109+ continue
1110+ }
1111+ switch cond .Type {
1112+ case computev1alpha .InstanceProgrammed ,
1113+ computev1alpha .InstanceReady ,
1114+ computev1alpha .InstanceQuotaGranted :
1115+ // InstanceProgrammed is handled below; InstanceReady is being set
1116+ // now. InstanceQuotaGranted is a gate-level signal evaluated before
1117+ // this branch is reached — including it here would let a transient
1118+ // PendingEvaluation reason displace the generic not-programmed
1119+ // fallback when no provider sub-condition is set yet.
1120+ continue
1121+ }
1122+ consider (cond .Status , cond .Reason , cond .Message )
9791123 }
1124+ // Also let the Programmed condition itself compete through the priority table
1125+ // in case it carries a known reason (e.g. PendingQuota).
1126+ if programmedCondition != nil {
1127+ consider (programmedCondition .Status , programmedCondition .Reason , programmedCondition .Message )
1128+ }
1129+
1130+ readyCondition .Status = best .status
1131+ readyCondition .Reason = best .reason
1132+ readyCondition .Message = best .message
9801133
9811134 return apimeta .SetStatusCondition (& instance .Status .Conditions , * readyCondition ), nil
9821135 }
@@ -987,16 +1140,21 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition(
9871140 if availableCondition == nil || availableCondition .Status != metav1 .ConditionTrue {
9881141 logger .Info ("instance is not available" , "instance" , instance .Name )
9891142
990- readyCondition .Status = metav1 .ConditionFalse
991- readyCondition .Reason = pendingReason
1143+ // Propagate the Available condition's reason and message directly —
1144+ // including when the status is Unknown — so provider-set reasons like
1145+ // ImageUnavailable surface on Ready rather than a generic message.
1146+ readyStatus := metav1 .ConditionFalse
1147+ readyReason := pendingReason
1148+ readyMessage := "Instance is not available"
9921149 if availableCondition != nil && availableCondition .Reason != pendingReason {
993- readyCondition .Reason = availableCondition .Reason
1150+ readyStatus = availableCondition .Status
1151+ readyReason = availableCondition .Reason
1152+ readyMessage = availableCondition .Message
9941153 }
9951154
996- readyCondition .Message = "Instance is not available"
997- if availableCondition != nil && availableCondition .Status != metav1 .ConditionUnknown {
998- readyCondition .Message = availableCondition .Message
999- }
1155+ readyCondition .Status = readyStatus
1156+ readyCondition .Reason = readyReason
1157+ readyCondition .Message = readyMessage
10001158
10011159 return apimeta .SetStatusCondition (& instance .Status .Conditions , * readyCondition ), nil
10021160 }
0 commit comments