feat(azure/rhel-ai): add GPU guardrails for instance selection

rishupk · claude · rishupk · commit 724c82d296f7 · 2026-06-05T15:03:09.000+01:00
- Add GPU capability detection to VM SKU filter
- Validate that selected compute sizes are GPU-capable (ND/NC-series)
- Default GPUs=1 when unset so spot allocator targets GPU instances

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Rishabh Kothari <rkothari@redhat.com>
diff --git a/pkg/provider/azure/action/rhel-ai/rhelai.go b/pkg/provider/azure/action/rhel-ai/rhelai.go
@@ -34,16 +34,41 @@ func imageId(accelerator, version string) string {
 	return imageIdFromName(fmt.Sprintf(imageNameRegex, accelerator, version))
 }
 
+// isGPUCapableSize reports whether vmSize is an ND- or NC-series Azure VM size, the compute
+// GPU families accepted for RHEL AI. The decision is a case-insensitive name-prefix match on
+// "standard_nd" / "standard_nc" only; NV-series (visualization GPUs) and "" return false.
+func isGPUCapableSize(vmSize string) bool {
+	lower := strings.ToLower(vmSize) // fold case once so both prefix tests ignore case
+	return strings.HasPrefix(lower, "standard_nd") || strings.HasPrefix(lower, "standard_nc")
+}
+
 func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) {
-	logging.Debug("Creating RHEL Server")
+	if args == nil || args.ComputeRequest == nil {
+		return fmt.Errorf("RHEL AI: args and ComputeRequest must not be nil")
+	}
+	logging.Debug("Creating RHEL AI Server")
 	sharedImageID := imageId(args.Accelerator, args.Version)
 	if args.CustomImage != "" {
 		sharedImageID = imageIdFromName(args.CustomImage)
 	}
+	// Shallow-copy to avoid mutating the caller's ComputeRequestArgs.
+	computeReq := *args.ComputeRequest
+	// Ensure GPU-capable instance selection for auto-selection paths.
+	if computeReq.GPUs == 0 {
+		logging.Debug("RHEL AI: GPUs not set, defaulting to 1 for GPU-capable instance selection")
+		computeReq.GPUs = 1
+	}
+	// All explicitly specified sizes must be GPU-capable; a single non-GPU entry
+	// could get allocated and vllm would fail silently.
+	for _, s := range computeReq.ComputeSizes {
+		if !isGPUCapableSize(s) {
+			return fmt.Errorf("RHEL AI: %q is not GPU-capable (expected ND-series or NC-series for vllm)", s)
+		}
+	}
 	azureLinuxRequest :=
 		&azureLinux.LinuxArgs{
 			Prefix:         args.Prefix,
-			ComputeRequest: args.ComputeRequest,
+			ComputeRequest: &computeReq,
 			Spot:           args.Spot,
 			ImageRef: &data.ImageReference{
 				SharedImageID: sharedImageID,
@@ -55,7 +80,10 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
 			},
 			Username:         username,
 			ReadinessCommand: command.CommandPing}
-	return azureLinux.Create(mCtxArgs, azureLinuxRequest)
+	if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil && len(computeReq.ComputeSizes) == 0 {
+		return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err)
+	}
+	return err
 }
 
 func Destroy(mCtxArgs *maptContext.ContextArgs) error {
diff --git a/pkg/provider/azure/action/rhel-ai/rhelai_test.go b/pkg/provider/azure/action/rhel-ai/rhelai_test.go
@@ -0,0 +1,29 @@
+package rhelai
+
+import "testing"
+
+func TestIsGPUCapableSize(t *testing.T) {
+	cases := []struct { // table of VM size names and the result isGPUCapableSize must return
+		size     string
+		expected bool
+	}{
+		{"Standard_ND96asr_v4", true},
+		{"Standard_ND40rs_v2", true},
+		{"Standard_NC6s_v3", true},
+		{"Standard_NC24rs_v3", true},
+		{"standard_nd96asr_v4", true}, // all-lowercase names must be accepted too
+		{"standard_nc6s_v3", true},
+		{"Standard_D8as_v5", false}, // names outside the ND/NC prefixes are rejected
+		{"Standard_E16as_v5", false},
+		{"Standard_F32s_v2", false},
+		{"Standard_NV6", false}, // NV-series is expected to be rejected
+		{"Standard_NV36ads_A10_v5", false},
+		{"", false}, // empty name
+	}
+	for _, tc := range cases {
+		got := isGPUCapableSize(tc.size)
+		if got != tc.expected {
+			t.Errorf("isGPUCapableSize(%q) = %v, want %v", tc.size, got, tc.expected) // Errorf, not Fatalf: keep checking the remaining cases
+		}
+	}
+}
diff --git a/pkg/provider/azure/data/compute-request.go b/pkg/provider/azure/data/compute-request.go
@@ -151,6 +151,7 @@ type virtualMachine struct {
 	// Spot capable
 	LowPriorityCapable  bool
 	MaxResourceVolumeMB int32
+	GPUs                int32
 	// IaaS or PaaS
 	VMDeploymentTypes []string
 	// Fast SSD
@@ -261,6 +262,12 @@ func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
 				return nil
 			}
 			vm.MaxResourceVolumeMB = int32(disk)
+		case "GPUs":
+			gpus, err := strconv.ParseInt(*capability.Value, 10, 32)
+			if err != nil {
+				return nil
+			}
+			vm.GPUs = int32(gpus)
 		case "VMDeploymentTypes":
 			vm.VMDeploymentTypes = strings.Split(*capability.Value, ",")
 		default:
@@ -283,10 +290,22 @@ func filterCPUsAndMemory(args *cr.ComputeRequestArgs) filterFunc {
 			if args.NestedVirt && !vm.nestedVirtSupported() {
 				return
 			}
+			if args.GPUs > 0 && vm.GPUs < args.GPUs {
+				return
+			}
+			// GPU VMs (ND/NC-series) have large temp disks, so skip the
+			// local-storage check that would otherwise reject them.
+			featuresOK := false
+			if args.GPUs > 0 {
+				featuresOK = vm.AcceleratedNetworkingEnabled && vm.PremiumIO &&
+					vm.EncryptionAtHostSupported && vm.hypervGen2Supported()
+			} else {
+				featuresOK = vm.baseFeaturesSupported()
+			}
 			if vm.VCPUs >= args.CPUs &&
 				vm.Memory >= args.MemoryGib &&
 				vm.Arch == args.Arch.String() &&
-				vm.baseFeaturesSupported() {
+				featuresOK {
 				dSeries := regexp.MustCompile(lowerCpuPattern)
 				if !dSeries.Match([]byte(vm.Name)) {
 					vmCh <- vm.Name