Skip to content

Commit 1fcfe80

Browse files
rishupkclaude
andcommitted
fix(azure): filter L-series NVMe VMs from RHEL AI candidates
- Add NvmeDiskSizeInMiB and DiskControllerTypes fields to virtualMachine struct and parse them in resourceSKUToVirtualMachine(); previously both capabilities fell through to default and were discarded - Rename emptyDiskSupported() to noLocalStorageAttached() and tighten condition to MaxResourceVolumeMB == 0 && NvmeDiskSizeInMiB == 0; fixes L-series VMs (e.g. Standard_L8aos_v4) incorrectly passing the filter - Add FilterNoLocalStorageSizes() to validate explicit --compute-sizes in the RHEL AI path; sizes with local storage or absent from the SKU catalog are warned and dropped before allocation proceeds - Add 12 unit tests covering noLocalStorageAttached(), SKU capability parsing, filterLocalStorage() (including unknown-size path), and a baseFeaturesSupported() regression for L-series Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: Rishabh Kothari <rkothari@redhat.com> Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: Rishabh Kothari <rkothari@redhat.com>
1 parent 7c5d50d commit 1fcfe80

4 files changed

Lines changed: 293 additions & 6 deletions

File tree

.claude/settings.local.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"permissions": {
3+
"allow": [
4+
"WebFetch(domain:martinfowler.com)",
5+
"Bash(gh pr *)",
6+
"Bash(python3 -c \"import json,sys; d=json.load\\(sys.stdin\\); print\\(json.dumps\\(d.get\\('skills',d.get\\('skill_definitions',[]\\)\\), indent=2\\)\\)\")",
7+
"Read(//Users/rish/Developer/claudio-skills/claudio-plugin/**)"
8+
]
9+
}
10+
}

pkg/provider/azure/action/rhel-ai/rhelai.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package rhelai
22

33
import (
4+
"context"
45
"fmt"
56
"strings"
67

@@ -35,15 +36,30 @@ func imageId(accelerator, version string) string {
3536
}
3637

3738
func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) {
38-
logging.Debug("Creating RHEL Server")
39+
logging.Debug("Creating RHEL AI Server")
40+
computeReq := *args.ComputeRequest
41+
if len(computeReq.ComputeSizes) > 0 {
42+
ctx := mCtxArgs.Context
43+
if ctx == nil {
44+
ctx = context.Background()
45+
}
46+
computeReq.ComputeSizes, err = data.FilterNoLocalStorageSizes(
47+
ctx, computeReq.ComputeSizes)
48+
if err != nil {
49+
return err
50+
}
51+
if len(computeReq.ComputeSizes) == 0 {
52+
return fmt.Errorf("no valid compute sizes: all provided sizes have local NVMe/temp-disk storage, incompatible with RHEL AI")
53+
}
54+
}
3955
sharedImageID := imageId(args.Accelerator, args.Version)
4056
if args.CustomImage != "" {
4157
sharedImageID = imageIdFromName(args.CustomImage)
4258
}
4359
azureLinuxRequest :=
4460
&azureLinux.LinuxArgs{
4561
Prefix: args.Prefix,
46-
ComputeRequest: args.ComputeRequest,
62+
ComputeRequest: &computeReq,
4763
Spot: args.Spot,
4864
ImageRef: &data.ImageReference{
4965
SharedImageID: sharedImageID,

pkg/provider/azure/data/compute-request.go

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
1313
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
1414
cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request"
15+
"github.com/redhat-developer/mapt/pkg/util/logging"
1516
)
1617

1718
const (
@@ -66,6 +67,69 @@ func FilterComputeSizesByLocation(ctx context.Context, location *string, compute
6667
return supportedSizes, nil
6768
}
6869

70+
// FilterNoLocalStorageSizes returns only the sizes from computeSizes that have no local
71+
// storage (no temp disk, no NVMe). Sizes with local storage are logged as warnings and
72+
// dropped. Sizes not found in the Azure SKU catalog (typo or restricted SKU) are logged
73+
// as warnings and excluded.
74+
func FilterNoLocalStorageSizes(ctx context.Context, computeSizes []string) ([]string, error) {
75+
creds, subscriptionID, err := getCredentials()
76+
if err != nil {
77+
return nil, err
78+
}
79+
client, err := armcompute.NewResourceSKUsClient(*subscriptionID, creds, nil)
80+
if err != nil {
81+
return nil, err
82+
}
83+
pager := client.NewListPager(nil)
84+
capabilities := make(map[string]*virtualMachine, len(computeSizes))
85+
for pager.More() {
86+
page, err := pager.NextPage(ctx)
87+
if err != nil {
88+
return nil, err
89+
}
90+
for _, sku := range page.Value {
91+
if sku.ResourceType == nil || *sku.ResourceType != string(RTVirtualMachines) {
92+
continue
93+
}
94+
if sku.Name == nil || !slices.Contains(computeSizes, *sku.Name) {
95+
continue
96+
}
97+
if _, seen := capabilities[*sku.Name]; seen {
98+
continue
99+
}
100+
if vm := resourceSKUToVirtualMachine(sku); vm != nil {
101+
capabilities[*sku.Name] = vm
102+
}
103+
}
104+
}
105+
valid, dropped, unknown := filterLocalStorage(computeSizes, capabilities)
106+
for _, size := range dropped {
107+
logging.Warnf("dropping compute size %q: has local NVMe/temp-disk storage, incompatible with RHEL AI", size)
108+
}
109+
for _, size := range unknown {
110+
logging.Warnf("dropping compute size %q: not found in Azure SKU catalog (typo or restricted SKU)", size)
111+
}
112+
return valid, nil
113+
}
114+
115+
// filterLocalStorage classifies each size into valid (no local storage), dropped (has
116+
// local storage), or unknown (absent from capabilities — typo or restricted SKU).
117+
func filterLocalStorage(computeSizes []string, capabilities map[string]*virtualMachine) (valid, dropped, unknown []string) {
118+
for _, size := range computeSizes {
119+
vm, ok := capabilities[size]
120+
if !ok {
121+
unknown = append(unknown, size)
122+
continue
123+
}
124+
if vm.noLocalStorageAttached() {
125+
valid = append(valid, size)
126+
} else {
127+
dropped = append(dropped, size)
128+
}
129+
}
130+
return valid, dropped, unknown
131+
}
132+
69133
func getAzureVMSKUs(ctx context.Context, args *cr.ComputeRequestArgs) ([]string, error) {
70134
cred, err := azidentity.NewDefaultAzureCredential(nil)
71135
if err != nil {
@@ -109,6 +173,10 @@ type virtualMachine struct {
109173
// Spot capable
110174
LowPriorityCapable bool
111175
MaxResourceVolumeMB int32
176+
// L-series VMs expose NVMe storage separately from the temp disk
177+
NvmeDiskSizeInMiB int32
178+
// Used by the disk-controller-type fix (PR #823) to cross-reference SKU capabilities
179+
DiskControllerTypes []string
112180
// IaaS or PaaS
113181
VMDeploymentTypes []string
114182
// Fast SSD
@@ -144,17 +212,17 @@ func (vm *virtualMachine) hypervGen2Supported() bool {
144212
return slices.Contains(vm.HyperVGenerations, "V2")
145213
}
146214

147-
func (vm *virtualMachine) emptyDiskSupported() bool {
148-
return vm.MaxResourceVolumeMB == 0
215+
func (vm *virtualMachine) noLocalStorageAttached() bool {
216+
return vm.MaxResourceVolumeMB == 0 && vm.NvmeDiskSizeInMiB == 0
149217
}
150218

151219
func (vm *virtualMachine) baseFeaturesSupported() bool {
152220
return vm.AcceleratedNetworkingEnabled && vm.PremiumIO && vm.EncryptionAtHostSupported &&
153-
vm.emptyDiskSupported() && vm.hypervGen2Supported()
221+
vm.noLocalStorageAttached() && vm.hypervGen2Supported()
154222
}
155223

156224
func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
157-
if res.ResourceType != nil && *res.ResourceType != "virtualMachines" {
225+
if res.ResourceType != nil && *res.ResourceType != string(RTVirtualMachines) {
158226
return nil
159227
}
160228
// If Machine type has any type of restriccions discard
@@ -219,6 +287,14 @@ func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
219287
return nil
220288
}
221289
vm.MaxResourceVolumeMB = int32(disk)
290+
case "NvmeDiskSizeInMiB":
291+
nvme, err := strconv.ParseUint(*capability.Value, 10, 32)
292+
if err != nil {
293+
return nil
294+
}
295+
vm.NvmeDiskSizeInMiB = int32(nvme)
296+
case "DiskControllerTypes":
297+
vm.DiskControllerTypes = strings.Split(*capability.Value, ",")
222298
case "VMDeploymentTypes":
223299
vm.VMDeploymentTypes = strings.Split(*capability.Value, ",")
224300
default:
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
package data
2+
3+
import (
4+
"testing"
5+
6+
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
7+
)
8+
9+
func ptr[T any](v T) *T { return &v }
10+
11+
// noLocalStorageAttached tests
12+
13+
func TestNoLocalStorageAttached_NoTempDiskNoNvme(t *testing.T) {
14+
vm := &virtualMachine{MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0}
15+
if !vm.noLocalStorageAttached() {
16+
t.Error("expected true: VM with no temp disk and no NVMe should have no local storage")
17+
}
18+
}
19+
20+
func TestNoLocalStorageAttached_HasTempDisk(t *testing.T) {
21+
vm := &virtualMachine{MaxResourceVolumeMB: 512, NvmeDiskSizeInMiB: 0}
22+
if vm.noLocalStorageAttached() {
23+
t.Error("expected false: VM with temp disk should have local storage")
24+
}
25+
}
26+
27+
func TestNoLocalStorageAttached_HasNvmeDisk(t *testing.T) {
28+
// L-series bug case: MaxResourceVolumeMB=0 but NvmeDiskSizeInMiB>0
29+
vm := &virtualMachine{MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 5492736}
30+
if vm.noLocalStorageAttached() {
31+
t.Error("expected false: L-series VM with NVMe storage should have local storage")
32+
}
33+
}
34+
35+
func TestNoLocalStorageAttached_HasBoth(t *testing.T) {
36+
vm := &virtualMachine{MaxResourceVolumeMB: 512, NvmeDiskSizeInMiB: 5492736}
37+
if vm.noLocalStorageAttached() {
38+
t.Error("expected false: VM with both temp disk and NVMe should have local storage")
39+
}
40+
}
41+
42+
// resourceSKUToVirtualMachine parsing tests
43+
44+
func TestResourceSKUToVirtualMachine_ParsesNvmeDiskSizeInMiB(t *testing.T) {
45+
sku := &armcompute.ResourceSKU{
46+
ResourceType: ptr("virtualMachines"),
47+
Name: ptr("Standard_L8aos_v4"),
48+
Family: ptr("standardLasv4Family"),
49+
Capabilities: []*armcompute.ResourceSKUCapabilities{
50+
{Name: ptr("NvmeDiskSizeInMiB"), Value: ptr("5492736")},
51+
},
52+
}
53+
vm := resourceSKUToVirtualMachine(sku)
54+
if vm == nil {
55+
t.Fatal("expected non-nil virtualMachine")
56+
}
57+
if vm.NvmeDiskSizeInMiB != 5492736 {
58+
t.Errorf("NvmeDiskSizeInMiB: got %d, want 5492736", vm.NvmeDiskSizeInMiB)
59+
}
60+
}
61+
62+
func TestResourceSKUToVirtualMachine_ParsesDiskControllerTypes(t *testing.T) {
63+
sku := &armcompute.ResourceSKU{
64+
ResourceType: ptr("virtualMachines"),
65+
Name: ptr("Standard_L8aos_v4"),
66+
Family: ptr("standardLasv4Family"),
67+
Capabilities: []*armcompute.ResourceSKUCapabilities{
68+
{Name: ptr("DiskControllerTypes"), Value: ptr("NVMe,SCSI")},
69+
},
70+
}
71+
vm := resourceSKUToVirtualMachine(sku)
72+
if vm == nil {
73+
t.Fatal("expected non-nil virtualMachine")
74+
}
75+
if len(vm.DiskControllerTypes) != 2 {
76+
t.Fatalf("DiskControllerTypes: got %v, want [NVMe SCSI]", vm.DiskControllerTypes)
77+
}
78+
if vm.DiskControllerTypes[0] != "NVMe" || vm.DiskControllerTypes[1] != "SCSI" {
79+
t.Errorf("DiskControllerTypes: got %v, want [NVMe SCSI]", vm.DiskControllerTypes)
80+
}
81+
}
82+
83+
func TestResourceSKUToVirtualMachine_NvmeDiskSizeDefaultsToZero(t *testing.T) {
84+
sku := &armcompute.ResourceSKU{
85+
ResourceType: ptr("virtualMachines"),
86+
Name: ptr("Standard_D8as_v5"),
87+
Family: ptr("standardDasv5Family"),
88+
Capabilities: []*armcompute.ResourceSKUCapabilities{
89+
{Name: ptr("MaxResourceVolumeMB"), Value: ptr("307200")},
90+
},
91+
}
92+
vm := resourceSKUToVirtualMachine(sku)
93+
if vm == nil {
94+
t.Fatal("expected non-nil virtualMachine")
95+
}
96+
if vm.NvmeDiskSizeInMiB != 0 {
97+
t.Errorf("NvmeDiskSizeInMiB: got %d, want 0 for non-NVMe SKU", vm.NvmeDiskSizeInMiB)
98+
}
99+
}
100+
101+
// filterLocalStorage tests
102+
103+
func TestFilterLocalStorage_DropsNvmeSizes(t *testing.T) {
104+
capabilities := map[string]*virtualMachine{
105+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
106+
"Standard_L8aos_v4": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 5492736},
107+
}
108+
got, dropped, unknown := filterLocalStorage([]string{"Standard_D8as_v5", "Standard_L8aos_v4"}, capabilities)
109+
if len(got) != 1 || got[0] != "Standard_D8as_v5" {
110+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
111+
}
112+
if len(dropped) != 1 || dropped[0] != "Standard_L8aos_v4" {
113+
t.Errorf("dropped: got %v, want [Standard_L8aos_v4]", dropped)
114+
}
115+
if len(unknown) != 0 {
116+
t.Errorf("unknown: got %v, want []", unknown)
117+
}
118+
}
119+
120+
func TestFilterLocalStorage_DropsTempDiskSizes(t *testing.T) {
121+
capabilities := map[string]*virtualMachine{
122+
"Standard_D4s_v3": {MaxResourceVolumeMB: 32768, NvmeDiskSizeInMiB: 0},
123+
}
124+
got, dropped, unknown := filterLocalStorage([]string{"Standard_D4s_v3"}, capabilities)
125+
if len(got) != 0 {
126+
t.Errorf("filtered: got %v, want []", got)
127+
}
128+
if len(dropped) != 1 {
129+
t.Errorf("dropped: got %v, want [Standard_D4s_v3]", dropped)
130+
}
131+
if len(unknown) != 0 {
132+
t.Errorf("unknown: got %v, want []", unknown)
133+
}
134+
}
135+
136+
func TestFilterLocalStorage_PassesCleanSizes(t *testing.T) {
137+
capabilities := map[string]*virtualMachine{
138+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
139+
}
140+
got, dropped, unknown := filterLocalStorage([]string{"Standard_D8as_v5"}, capabilities)
141+
if len(got) != 1 {
142+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
143+
}
144+
if len(dropped) != 0 {
145+
t.Errorf("dropped: got %v, want []", dropped)
146+
}
147+
if len(unknown) != 0 {
148+
t.Errorf("unknown: got %v, want []", unknown)
149+
}
150+
}
151+
152+
func TestFilterLocalStorage_ReportsUnknownSizes(t *testing.T) {
153+
// Sizes absent from the catalog (typo, restricted SKU) must appear in unknown,
154+
// not silently disappear.
155+
capabilities := map[string]*virtualMachine{
156+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
157+
}
158+
got, dropped, unknown := filterLocalStorage(
159+
[]string{"Standard_D8as_v5", "Standard_Typo_v99"}, capabilities)
160+
if len(got) != 1 || got[0] != "Standard_D8as_v5" {
161+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
162+
}
163+
if len(dropped) != 0 {
164+
t.Errorf("dropped: got %v, want []", dropped)
165+
}
166+
if len(unknown) != 1 || unknown[0] != "Standard_Typo_v99" {
167+
t.Errorf("unknown: got %v, want [Standard_Typo_v99]", unknown)
168+
}
169+
}
170+
171+
// baseFeaturesSupported regression: L-series must be rejected
172+
173+
func TestBaseFeaturesSupported_LSeriesWithNvmeIsRejected(t *testing.T) {
174+
vm := &virtualMachine{
175+
MaxResourceVolumeMB: 0,
176+
NvmeDiskSizeInMiB: 5492736,
177+
AcceleratedNetworkingEnabled: true,
178+
PremiumIO: true,
179+
EncryptionAtHostSupported: true,
180+
HyperVGenerations: []string{"V1", "V2"},
181+
}
182+
if vm.baseFeaturesSupported() {
183+
t.Error("expected false: L-series VM with NVMe storage must not pass baseFeaturesSupported")
184+
}
185+
}

0 commit comments

Comments
 (0)