Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 51 additions & 3 deletions pkg/provider/azure/action/rhel-ai/rhelai.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package rhelai

import (
"context"
"fmt"
"strings"

Expand Down Expand Up @@ -34,23 +35,70 @@ func imageId(accelerator, version string) string {
return imageIdFromName(fmt.Sprintf(imageNameRegex, accelerator, version))
}

// isGPUCapableSize reports whether vmSize names an ND-series or NC-series
// Azure VM size, the compute GPU families accepted for RHEL AI workloads.
// The check is a case-insensitive prefix match, so NV-series (visualization
// GPUs) and every non-GPU family are rejected on purpose.
func isGPUCapableSize(vmSize string) bool {
	normalized := strings.ToLower(vmSize)
	for _, family := range []string{"standard_nd", "standard_nc"} {
		if strings.HasPrefix(normalized, family) {
			return true
		}
	}
	return false
}

// Create provisions a RHEL AI machine by building an azureLinux.LinuxArgs
// request and delegating to azureLinux.Create.
//
// Steps performed before delegating:
//   - rejects a nil args or nil args.ComputeRequest;
//   - resolves the shared image ID from Accelerator/Version, unless CustomImage
//     is set, in which case the custom image name is used instead;
//   - works on a shallow copy of args.ComputeRequest so the caller's value is
//     not modified;
//   - when explicit ComputeSizes are given, narrows them with
//     data.FilterNoLocalStorageSizes and fails if nothing is left;
//   - defaults GPUs to 1 when it is 0;
//   - fails if any remaining explicit size is not ND/NC-series.
//
// The returned error is the validation error, the filtering error, or the
// error from azureLinux.Create (wrapped with a GPU-quota hint only when no
// explicit sizes were requested).
//
// NOTE(review): mCtxArgs is dereferenced without a nil check when explicit
// sizes are given — confirm callers always pass a non-nil value.
func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) {
logging.Debug("Creating RHEL Server")
if args == nil || args.ComputeRequest == nil {
return fmt.Errorf("RHEL AI: args and ComputeRequest must not be nil")
}
logging.Debug("Creating RHEL AI Server")
sharedImageID := imageId(args.Accelerator, args.Version)
if args.CustomImage != "" {
sharedImageID = imageIdFromName(args.CustomImage)
}
// Shallow-copy to avoid mutating the caller's ComputeRequestArgs.
computeReq := *args.ComputeRequest
if len(computeReq.ComputeSizes) > 0 {
// Fall back to a background context when the caller supplied none.
ctx := mCtxArgs.Context
if ctx == nil {
ctx = context.Background()
}
computeReq.ComputeSizes, err = data.FilterNoLocalStorageSizes(
ctx, computeReq.ComputeSizes)
if err != nil {
return err
}
// NOTE(review): the filter also drops sizes it could not find in the SKU
// catalog, so the list can end up empty for reasons other than NVMe-only
// storage — confirm whether this message should mention that.
if len(computeReq.ComputeSizes) == 0 {
return fmt.Errorf("no valid compute sizes: all provided sizes have NVMe-only local storage, incompatible with RHEL AI")
}
}
// Ensure GPU-capable instance selection for auto-selection paths.
if computeReq.GPUs == 0 {
logging.Debug("RHEL AI: GPUs not set, defaulting to 1 for GPU-capable instance selection")
computeReq.GPUs = 1
}
// All explicitly specified sizes must be GPU-capable; a single non-GPU entry
// could get allocated and vllm would fail silently.
for _, s := range computeReq.ComputeSizes {
if !isGPUCapableSize(s) {
return fmt.Errorf("RHEL AI: %q is not GPU-capable (expected ND-series or NC-series for vllm)", s)
}
}
azureLinuxRequest :=
&azureLinux.LinuxArgs{
Prefix: args.Prefix,
ComputeRequest: args.ComputeRequest,
ComputeRequest: &computeReq,
Spot: args.Spot,
ImageRef: &data.ImageReference{
SharedImageID: sharedImageID,
// Belt-and-suspenders: set SCSI explicitly so Azure never infers a
// conflicting default. resolveImageRef will also derive this from the
// gallery image's Features, but the static value protects against API
// failures or future images with multiple supported types.
DiskControllerType: "SCSI",
},
Username: username,
ReadinessCommand: command.CommandPing}
return azureLinux.Create(mCtxArgs, azureLinuxRequest)
// Only the auto-selection path (no explicit sizes) gets the GPU-quota hint;
// with explicit sizes the underlying error is returned unchanged.
if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil && len(computeReq.ComputeSizes) == 0 {
return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err)
}
return err
}

func Destroy(mCtxArgs *maptContext.ContextArgs) error {
Expand Down
29 changes: 29 additions & 0 deletions pkg/provider/azure/action/rhel-ai/rhelai_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package rhelai

import "testing"

// TestIsGPUCapableSize checks that ND- and NC-series sizes are accepted in any
// letter case, and that general-purpose sizes, NV-series sizes and the empty
// string are rejected.
func TestIsGPUCapableSize(t *testing.T) {
	type sizeCase struct {
		size     string
		expected bool
	}
	table := []sizeCase{
		// ND-series and NC-series, canonical casing.
		{size: "Standard_ND96asr_v4", expected: true},
		{size: "Standard_ND40rs_v2", expected: true},
		{size: "Standard_NC6s_v3", expected: true},
		{size: "Standard_NC24rs_v3", expected: true},
		// Lower-case input must match as well.
		{size: "standard_nd96asr_v4", expected: true},
		{size: "standard_nc6s_v3", expected: true},
		// Non-GPU families.
		{size: "Standard_D8as_v5", expected: false},
		{size: "Standard_E16as_v5", expected: false},
		{size: "Standard_F32s_v2", expected: false},
		// NV-series (visualization GPUs) is excluded.
		{size: "Standard_NV6", expected: false},
		{size: "Standard_NV36ads_A10_v5", expected: false},
		// Empty input.
		{size: "", expected: false},
	}
	for i := range table {
		entry := table[i]
		if got := isGPUCapableSize(entry.size); got != entry.expected {
			t.Errorf("isGPUCapableSize(%q) = %v, want %v", entry.size, got, entry.expected)
		}
	}
}
180 changes: 161 additions & 19 deletions pkg/provider/azure/data/compute-request.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package data

import (
"context"
"os"
"fmt"
"regexp"
"slices"
"strconv"
Expand All @@ -12,6 +12,7 @@ import (
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request"
"github.com/redhat-developer/mapt/pkg/util/logging"
)

const (
Expand All @@ -33,7 +34,13 @@ func (c *ComputeSelector) Select(ctx context.Context, args *cr.ComputeRequestArg
return getAzureVMSKUs(ctx, args)
}

func FilterComputeSizesByLocation(ctx context.Context, location *string, computeSizes []string) ([]string, error) {
// FilterComputeSizesByDiskControllerType returns the subset of computeSizes that are
// available in location AND support requiredType. Sizes without a DiskControllerTypes
// capability are assumed to support only SCSI (Azure historical default).
func FilterComputeSizesByDiskControllerType(ctx context.Context, location *string, computeSizes []string, requiredType string) ([]string, error) {
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if location == nil {
return nil, fmt.Errorf("location cannot be nil")
}
creds, subscriptionID, err := getCredentials()
if err != nil {
return nil, err
Expand All @@ -43,35 +50,139 @@ func FilterComputeSizesByLocation(ctx context.Context, location *string, compute
return nil, err
}
pager := client.NewListPager(nil)
supportedSizes := []string{}
supported := []string{}
for pager.More() {
page, err := pager.NextPage(ctx)
if err != nil {
return nil, err
}
for _, sku := range page.Value {
if sku.ResourceType != nil &&
*sku.ResourceType == string(RTVirtualMachines) {
if sku.Name != nil && slices.Contains(computeSizes, *sku.Name) {
for _, loc := range sku.Locations {
if strings.EqualFold(*loc, *location) {
supportedSizes = append(supportedSizes, *sku.Name)
break
}
}
if sku.ResourceType == nil || *sku.ResourceType != string(RTVirtualMachines) {
continue
}
if sku.Name == nil || !slices.Contains(computeSizes, *sku.Name) {
continue
}
inLocation := false
for _, loc := range sku.Locations {
if loc != nil && strings.EqualFold(*loc, *location) {
inLocation = true
break
}
}
if !inLocation {
continue
}
diskTypes := diskControllerTypesFromCapabilities(sku.Capabilities)
if diskControllerTypeSupported(diskTypes, requiredType) && !slices.Contains(supported, *sku.Name) {
supported = append(supported, *sku.Name)
}
}
}
return supported, nil
}

// diskControllerTypesFromCapabilities extracts the DiskControllerTypes value
// from a SKU's capability list.
//
// The first entry named exactly "DiskControllerTypes" that carries a non-nil
// value is passed to splitDiskControllerTypes and the result is returned. Nil
// entries in caps are skipped instead of being dereferenced, matching the
// nil-guard used when iterating SKU locations. Returns nil when the capability
// is absent.
func diskControllerTypesFromCapabilities(caps []*armcompute.ResourceSKUCapabilities) []string {
	for _, c := range caps {
		// The slice holds pointers; guard the element itself as well as its fields.
		if c == nil || c.Name == nil || c.Value == nil {
			continue
		}
		if *c.Name == "DiskControllerTypes" {
			return splitDiskControllerTypes(*c.Value)
		}
	}
	return nil
}

// diskControllerTypeSupported reports whether requiredType is covered by the
// supported disk controller types of a VM size.
//
//   - An empty requiredType imposes no restriction and always passes.
//   - An empty (or nil) supported list means the capability is absent; sizes
//     that predate NVMe default to SCSI, so only a SCSI requirement passes.
//   - Otherwise requiredType must equal one of the supported entries, compared
//     case-insensitively.
func diskControllerTypeSupported(supported []string, requiredType string) bool {
	switch {
	case requiredType == "":
		return true
	case len(supported) == 0:
		return strings.EqualFold(requiredType, "SCSI")
	}
	matched := false
	for i := 0; i < len(supported) && !matched; i++ {
		matched = strings.EqualFold(supported[i], requiredType)
	}
	return matched
}

// FilterNoLocalStorageSizes returns only the sizes from computeSizes that have no
// NVMe-only local storage (L-series). Temp disks (MaxResourceVolumeMB > 0) are allowed
// — they are ephemeral scratch space that does not interfere with RHEL AI's OS disk.
// Sizes not found in the Azure SKU catalog (typo or restricted SKU) are logged as
// warnings and excluded.
//
// The result keeps the order of computeSizes. An error is returned only when
// credentials cannot be obtained, the SKU client cannot be created, or a page
// of the SKU listing fails to load; dropped sizes are reported via warnings,
// not errors.
//
// NOTE(review): sizes are matched against SKU names with an exact,
// case-sensitive comparison, so a differently-cased size name would be
// reported as "not found" — confirm callers pass canonical names.
func FilterNoLocalStorageSizes(ctx context.Context, computeSizes []string) ([]string, error) {
creds, subscriptionID, err := getCredentials()
if err != nil {
return nil, err
}
client, err := armcompute.NewResourceSKUsClient(*subscriptionID, creds, nil)
if err != nil {
return nil, err
}
pager := client.NewListPager(nil)
// Parsed capabilities keyed by SKU name, for the requested sizes only.
capabilities := make(map[string]*virtualMachine, len(computeSizes))
for pager.More() {
page, err := pager.NextPage(ctx)
if err != nil {
return nil, err
}
for _, sku := range page.Value {
if sku.ResourceType == nil || *sku.ResourceType != string(RTVirtualMachines) {
continue
}
if sku.Name == nil || !slices.Contains(computeSizes, *sku.Name) {
continue
}
// Keep the first usable entry seen for each name; later entries with the
// same name are ignored.
if _, seen := capabilities[*sku.Name]; seen {
continue
}
// A nil conversion result is not recorded, so such a size is later
// classified as unknown unless another entry for the same name converts.
if vm := resourceSKUToVirtualMachine(sku); vm != nil {
capabilities[*sku.Name] = vm
}
}
}
return supportedSizes, nil
valid, dropped, unknown := filterNVMeStorage(computeSizes, capabilities)
for _, size := range dropped {
logging.Warnf("dropping compute size %q: has NVMe-only local storage, incompatible with RHEL AI", size)
}
for _, size := range unknown {
logging.Warnf("dropping compute size %q: not found in Azure SKU catalog (typo or restricted SKU)", size)
}
return valid, nil
}

// filterNVMeStorage partitions computeSizes, preserving input order, into:
//   - valid:   present in capabilities with no NVMe local storage;
//   - dropped: present in capabilities with NvmeDiskSizeInMiB > 0 (e.g. L-series);
//   - unknown: not present in capabilities at all.
func filterNVMeStorage(computeSizes []string, capabilities map[string]*virtualMachine) (valid, dropped, unknown []string) {
	for _, name := range computeSizes {
		spec, known := capabilities[name]
		switch {
		case !known:
			unknown = append(unknown, name)
		case spec.NvmeDiskSizeInMiB > 0:
			dropped = append(dropped, name)
		default:
			valid = append(valid, name)
		}
	}
	return valid, dropped, unknown
}

func getAzureVMSKUs(ctx context.Context, args *cr.ComputeRequestArgs) ([]string, error) {
ensureAzureEnvs()
cred, err := azidentity.NewDefaultAzureCredential(nil)
if err != nil {
return nil, err
}
subscriptionId := os.Getenv("AZURE_SUBSCRIPTION_ID")
subscriptionId := SubscriptionID()
clientFactory, err := armcompute.NewClientFactory(
subscriptionId, cred, nil)
if err != nil {
Expand Down Expand Up @@ -109,6 +220,11 @@ type virtualMachine struct {
// Spot capable
LowPriorityCapable bool
MaxResourceVolumeMB int32
GPUs int32
// L-series VMs expose NVMe storage separately from the temp disk
NvmeDiskSizeInMiB int32
// Used by the disk-controller-type fix (PR #823) to cross-reference SKU capabilities
DiskControllerTypes []string
// IaaS or PaaS
VMDeploymentTypes []string
// Fast SSD
Expand Down Expand Up @@ -144,17 +260,17 @@ func (vm *virtualMachine) hypervGen2Supported() bool {
return slices.Contains(vm.HyperVGenerations, "V2")
}

func (vm *virtualMachine) emptyDiskSupported() bool {
return vm.MaxResourceVolumeMB == 0
// noLocalStorageAttached reports whether the VM size has neither a temp
// (resource) disk nor NVMe local storage, i.e. both MaxResourceVolumeMB and
// NvmeDiskSizeInMiB are zero.
func (vm *virtualMachine) noLocalStorageAttached() bool {
return vm.MaxResourceVolumeMB == 0 && vm.NvmeDiskSizeInMiB == 0
}

// baseFeaturesSupported reports whether the VM size meets every baseline
// requirement: accelerated networking, premium IO, encryption at host, no
// local storage attached, and Hyper-V generation 2 support.
func (vm *virtualMachine) baseFeaturesSupported() bool {
return vm.AcceleratedNetworkingEnabled && vm.PremiumIO && vm.EncryptionAtHostSupported &&
vm.emptyDiskSupported() && vm.hypervGen2Supported()
vm.noLocalStorageAttached() && vm.hypervGen2Supported()
}

func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
if res.ResourceType != nil && *res.ResourceType != "virtualMachines" {
if res.ResourceType != nil && *res.ResourceType != string(RTVirtualMachines) {
return nil
}
// If the machine type has any kind of restrictions, discard it
Expand Down Expand Up @@ -219,6 +335,20 @@ func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
return nil
}
vm.MaxResourceVolumeMB = int32(disk)
case "GPUs":
gpus, err := strconv.ParseInt(*capability.Value, 10, 32)
if err != nil {
return nil
}
vm.GPUs = int32(gpus)
case "NvmeDiskSizeInMiB":
nvme, err := strconv.ParseUint(*capability.Value, 10, 32)
if err != nil {
return nil
}
vm.NvmeDiskSizeInMiB = int32(nvme)
case "DiskControllerTypes":
vm.DiskControllerTypes = strings.Split(*capability.Value, ",")
case "VMDeploymentTypes":
vm.VMDeploymentTypes = strings.Split(*capability.Value, ",")
default:
Expand All @@ -241,10 +371,22 @@ func filterCPUsAndMemory(args *cr.ComputeRequestArgs) filterFunc {
if args.NestedVirt && !vm.nestedVirtSupported() {
return
}
if args.GPUs > 0 && vm.GPUs < args.GPUs {
return
}
// GPU VMs (ND/NC-series) have large temp disks, so skip the
// local-storage check that would otherwise reject them.
featuresOK := false
if args.GPUs > 0 {
featuresOK = vm.AcceleratedNetworkingEnabled && vm.PremiumIO &&
vm.EncryptionAtHostSupported && vm.hypervGen2Supported()
} else {
featuresOK = vm.baseFeaturesSupported()
}
if vm.VCPUs >= args.CPUs &&
vm.Memory >= args.MemoryGib &&
vm.Arch == args.Arch.String() &&
vm.baseFeaturesSupported() {
featuresOK {
dSeries := regexp.MustCompile(lowerCpuPattern)
if !dSeries.Match([]byte(vm.Name)) {
vmCh <- vm.Name
Expand Down
Loading