Skip to content

Commit 5a1f868

Browse files
rishupkclaude
andcommitted
fix(azure): harden NVMe filter and credential handling for compute-sizes
- Relax FilterNoLocalStorageSizes to only reject NVMe-only local storage (L-series). VMs with temp disks (MaxResourceVolumeMB > 0) like Standard_NC64as_T4_v3 are now allowed — temp disks are ephemeral scratch space that does not interfere with RHEL AI's OS disk. - Add ensureAzureEnvs() to map ARM_* env vars to AZURE_* before any Azure SDK call. The Azure SDK reads AZURE_TENANT_ID etc from the environment, but callers (Claudio, Pulumi) typically set ARM_* vars. Previously this mapping only happened in provider.Init(), which runs after FilterNoLocalStorageSizes — causing credential failures when --compute-sizes is used. - Centralize AZURE_SUBSCRIPTION_ID reads through SubscriptionID() helper that falls back to ARM_SUBSCRIPTION_ID. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Rishabh Kothari <rkothari@redhat.com>
1 parent 7c5d50d commit 5a1f868

7 files changed

Lines changed: 325 additions & 21 deletions

File tree

pkg/provider/azure/action/rhel-ai/rhelai.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package rhelai
22

33
import (
4+
"context"
45
"fmt"
56
"strings"
67

@@ -35,15 +36,30 @@ func imageId(accelerator, version string) string {
3536
}
3637

3738
func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) {
38-
logging.Debug("Creating RHEL Server")
39+
logging.Debug("Creating RHEL AI Server")
40+
computeReq := *args.ComputeRequest
41+
if len(computeReq.ComputeSizes) > 0 {
42+
ctx := mCtxArgs.Context
43+
if ctx == nil {
44+
ctx = context.Background()
45+
}
46+
computeReq.ComputeSizes, err = data.FilterNoLocalStorageSizes(
47+
ctx, computeReq.ComputeSizes)
48+
if err != nil {
49+
return err
50+
}
51+
if len(computeReq.ComputeSizes) == 0 {
52+
return fmt.Errorf("no valid compute sizes: all provided sizes have NVMe-only local storage, incompatible with RHEL AI")
53+
}
54+
}
3955
sharedImageID := imageId(args.Accelerator, args.Version)
4056
if args.CustomImage != "" {
4157
sharedImageID = imageIdFromName(args.CustomImage)
4258
}
4359
azureLinuxRequest :=
4460
&azureLinux.LinuxArgs{
4561
Prefix: args.Prefix,
46-
ComputeRequest: args.ComputeRequest,
62+
ComputeRequest: &computeReq,
4763
Spot: args.Spot,
4864
ImageRef: &data.ImageReference{
4965
SharedImageID: sharedImageID,

pkg/provider/azure/data/compute-request.go

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package data
22

33
import (
44
"context"
5-
"os"
65
"regexp"
76
"slices"
87
"strconv"
@@ -12,6 +11,7 @@ import (
1211
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
1312
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
1413
cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request"
14+
"github.com/redhat-developer/mapt/pkg/util/logging"
1515
)
1616

1717
const (
@@ -66,12 +66,77 @@ func FilterComputeSizesByLocation(ctx context.Context, location *string, compute
6666
return supportedSizes, nil
6767
}
6868

69+
// FilterNoLocalStorageSizes returns only the sizes from computeSizes that have no
70+
// NVMe-only local storage (L-series). Temp disks (MaxResourceVolumeMB > 0) are allowed
71+
// — they are ephemeral scratch space that does not interfere with RHEL AI's OS disk.
72+
// Sizes not found in the Azure SKU catalog (typo or restricted SKU) are logged as
73+
// warnings and excluded.
74+
func FilterNoLocalStorageSizes(ctx context.Context, computeSizes []string) ([]string, error) {
75+
creds, subscriptionID, err := getCredentials()
76+
if err != nil {
77+
return nil, err
78+
}
79+
client, err := armcompute.NewResourceSKUsClient(*subscriptionID, creds, nil)
80+
if err != nil {
81+
return nil, err
82+
}
83+
pager := client.NewListPager(nil)
84+
capabilities := make(map[string]*virtualMachine, len(computeSizes))
85+
for pager.More() {
86+
page, err := pager.NextPage(ctx)
87+
if err != nil {
88+
return nil, err
89+
}
90+
for _, sku := range page.Value {
91+
if sku.ResourceType == nil || *sku.ResourceType != string(RTVirtualMachines) {
92+
continue
93+
}
94+
if sku.Name == nil || !slices.Contains(computeSizes, *sku.Name) {
95+
continue
96+
}
97+
if _, seen := capabilities[*sku.Name]; seen {
98+
continue
99+
}
100+
if vm := resourceSKUToVirtualMachine(sku); vm != nil {
101+
capabilities[*sku.Name] = vm
102+
}
103+
}
104+
}
105+
valid, dropped, unknown := filterNVMeStorage(computeSizes, capabilities)
106+
for _, size := range dropped {
107+
logging.Warnf("dropping compute size %q: has NVMe-only local storage, incompatible with RHEL AI", size)
108+
}
109+
for _, size := range unknown {
110+
logging.Warnf("dropping compute size %q: not found in Azure SKU catalog (typo or restricted SKU)", size)
111+
}
112+
return valid, nil
113+
}
114+
115+
// filterNVMeStorage classifies each size into valid (no NVMe-only local storage),
116+
// dropped (has NVMe local storage — e.g. L-series), or unknown (absent from capabilities).
117+
func filterNVMeStorage(computeSizes []string, capabilities map[string]*virtualMachine) (valid, dropped, unknown []string) {
118+
for _, size := range computeSizes {
119+
vm, ok := capabilities[size]
120+
if !ok {
121+
unknown = append(unknown, size)
122+
continue
123+
}
124+
if vm.NvmeDiskSizeInMiB > 0 {
125+
dropped = append(dropped, size)
126+
} else {
127+
valid = append(valid, size)
128+
}
129+
}
130+
return valid, dropped, unknown
131+
}
132+
69133
func getAzureVMSKUs(ctx context.Context, args *cr.ComputeRequestArgs) ([]string, error) {
134+
ensureAzureEnvs()
70135
cred, err := azidentity.NewDefaultAzureCredential(nil)
71136
if err != nil {
72137
return nil, err
73138
}
74-
subscriptionId := os.Getenv("AZURE_SUBSCRIPTION_ID")
139+
subscriptionId := SubscriptionID()
75140
clientFactory, err := armcompute.NewClientFactory(
76141
subscriptionId, cred, nil)
77142
if err != nil {
@@ -109,6 +174,10 @@ type virtualMachine struct {
109174
// Spot capable
110175
LowPriorityCapable bool
111176
MaxResourceVolumeMB int32
177+
// L-series VMs expose NVMe storage separately from the temp disk
178+
NvmeDiskSizeInMiB int32
179+
// Used by the disk-controller-type fix (PR #823) to cross-reference SKU capabilities
180+
DiskControllerTypes []string
112181
// IaaS or PaaS
113182
VMDeploymentTypes []string
114183
// Fast SSD
@@ -144,17 +213,17 @@ func (vm *virtualMachine) hypervGen2Supported() bool {
144213
return slices.Contains(vm.HyperVGenerations, "V2")
145214
}
146215

147-
func (vm *virtualMachine) emptyDiskSupported() bool {
148-
return vm.MaxResourceVolumeMB == 0
216+
func (vm *virtualMachine) noLocalStorageAttached() bool {
217+
return vm.MaxResourceVolumeMB == 0 && vm.NvmeDiskSizeInMiB == 0
149218
}
150219

151220
func (vm *virtualMachine) baseFeaturesSupported() bool {
152221
return vm.AcceleratedNetworkingEnabled && vm.PremiumIO && vm.EncryptionAtHostSupported &&
153-
vm.emptyDiskSupported() && vm.hypervGen2Supported()
222+
vm.noLocalStorageAttached() && vm.hypervGen2Supported()
154223
}
155224

156225
func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
157-
if res.ResourceType != nil && *res.ResourceType != "virtualMachines" {
226+
if res.ResourceType != nil && *res.ResourceType != string(RTVirtualMachines) {
158227
return nil
159228
}
160229
// If Machine type has any type of restriccions discard
@@ -219,6 +288,14 @@ func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
219288
return nil
220289
}
221290
vm.MaxResourceVolumeMB = int32(disk)
291+
case "NvmeDiskSizeInMiB":
292+
nvme, err := strconv.ParseUint(*capability.Value, 10, 32)
293+
if err != nil {
294+
return nil
295+
}
296+
vm.NvmeDiskSizeInMiB = int32(nvme)
297+
case "DiskControllerTypes":
298+
vm.DiskControllerTypes = strings.Split(*capability.Value, ",")
222299
case "VMDeploymentTypes":
223300
vm.VMDeploymentTypes = strings.Split(*capability.Value, ",")
224301
default:
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
package data
2+
3+
import (
4+
"testing"
5+
6+
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
7+
)
8+
9+
func ptr[T any](v T) *T { return &v }
10+
11+
// noLocalStorageAttached tests
12+
13+
func TestNoLocalStorageAttached_NoTempDiskNoNvme(t *testing.T) {
14+
vm := &virtualMachine{MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0}
15+
if !vm.noLocalStorageAttached() {
16+
t.Error("expected true: VM with no temp disk and no NVMe should have no local storage")
17+
}
18+
}
19+
20+
func TestNoLocalStorageAttached_HasTempDisk(t *testing.T) {
21+
vm := &virtualMachine{MaxResourceVolumeMB: 512, NvmeDiskSizeInMiB: 0}
22+
if vm.noLocalStorageAttached() {
23+
t.Error("expected false: VM with temp disk should have local storage")
24+
}
25+
}
26+
27+
func TestNoLocalStorageAttached_HasNvmeDisk(t *testing.T) {
28+
// L-series bug case: MaxResourceVolumeMB=0 but NvmeDiskSizeInMiB>0
29+
vm := &virtualMachine{MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 5492736}
30+
if vm.noLocalStorageAttached() {
31+
t.Error("expected false: L-series VM with NVMe storage should have local storage")
32+
}
33+
}
34+
35+
func TestNoLocalStorageAttached_HasBoth(t *testing.T) {
36+
vm := &virtualMachine{MaxResourceVolumeMB: 512, NvmeDiskSizeInMiB: 5492736}
37+
if vm.noLocalStorageAttached() {
38+
t.Error("expected false: VM with both temp disk and NVMe should have local storage")
39+
}
40+
}
41+
42+
// resourceSKUToVirtualMachine parsing tests
43+
44+
func TestResourceSKUToVirtualMachine_ParsesNvmeDiskSizeInMiB(t *testing.T) {
45+
sku := &armcompute.ResourceSKU{
46+
ResourceType: ptr("virtualMachines"),
47+
Name: ptr("Standard_L8aos_v4"),
48+
Family: ptr("standardLasv4Family"),
49+
Capabilities: []*armcompute.ResourceSKUCapabilities{
50+
{Name: ptr("NvmeDiskSizeInMiB"), Value: ptr("5492736")},
51+
},
52+
}
53+
vm := resourceSKUToVirtualMachine(sku)
54+
if vm == nil {
55+
t.Fatal("expected non-nil virtualMachine")
56+
}
57+
if vm.NvmeDiskSizeInMiB != 5492736 {
58+
t.Errorf("NvmeDiskSizeInMiB: got %d, want 5492736", vm.NvmeDiskSizeInMiB)
59+
}
60+
}
61+
62+
func TestResourceSKUToVirtualMachine_ParsesDiskControllerTypes(t *testing.T) {
63+
sku := &armcompute.ResourceSKU{
64+
ResourceType: ptr("virtualMachines"),
65+
Name: ptr("Standard_L8aos_v4"),
66+
Family: ptr("standardLasv4Family"),
67+
Capabilities: []*armcompute.ResourceSKUCapabilities{
68+
{Name: ptr("DiskControllerTypes"), Value: ptr("NVMe,SCSI")},
69+
},
70+
}
71+
vm := resourceSKUToVirtualMachine(sku)
72+
if vm == nil {
73+
t.Fatal("expected non-nil virtualMachine")
74+
}
75+
if len(vm.DiskControllerTypes) != 2 {
76+
t.Fatalf("DiskControllerTypes: got %v, want [NVMe SCSI]", vm.DiskControllerTypes)
77+
}
78+
if vm.DiskControllerTypes[0] != "NVMe" || vm.DiskControllerTypes[1] != "SCSI" {
79+
t.Errorf("DiskControllerTypes: got %v, want [NVMe SCSI]", vm.DiskControllerTypes)
80+
}
81+
}
82+
83+
func TestResourceSKUToVirtualMachine_NvmeDiskSizeDefaultsToZero(t *testing.T) {
84+
sku := &armcompute.ResourceSKU{
85+
ResourceType: ptr("virtualMachines"),
86+
Name: ptr("Standard_D8as_v5"),
87+
Family: ptr("standardDasv5Family"),
88+
Capabilities: []*armcompute.ResourceSKUCapabilities{
89+
{Name: ptr("MaxResourceVolumeMB"), Value: ptr("307200")},
90+
},
91+
}
92+
vm := resourceSKUToVirtualMachine(sku)
93+
if vm == nil {
94+
t.Fatal("expected non-nil virtualMachine")
95+
}
96+
if vm.NvmeDiskSizeInMiB != 0 {
97+
t.Errorf("NvmeDiskSizeInMiB: got %d, want 0 for non-NVMe SKU", vm.NvmeDiskSizeInMiB)
98+
}
99+
}
100+
101+
// filterNVMeStorage tests
102+
103+
func TestFilterNVMeStorage_DropsNvmeSizes(t *testing.T) {
104+
capabilities := map[string]*virtualMachine{
105+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
106+
"Standard_L8aos_v4": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 5492736},
107+
}
108+
got, dropped, unknown := filterNVMeStorage([]string{"Standard_D8as_v5", "Standard_L8aos_v4"}, capabilities)
109+
if len(got) != 1 || got[0] != "Standard_D8as_v5" {
110+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
111+
}
112+
if len(dropped) != 1 || dropped[0] != "Standard_L8aos_v4" {
113+
t.Errorf("dropped: got %v, want [Standard_L8aos_v4]", dropped)
114+
}
115+
if len(unknown) != 0 {
116+
t.Errorf("unknown: got %v, want []", unknown)
117+
}
118+
}
119+
120+
func TestFilterNVMeStorage_AllowsTempDiskSizes(t *testing.T) {
121+
capabilities := map[string]*virtualMachine{
122+
"Standard_NC64as_T4_v3": {MaxResourceVolumeMB: 32768, NvmeDiskSizeInMiB: 0},
123+
}
124+
got, dropped, unknown := filterNVMeStorage([]string{"Standard_NC64as_T4_v3"}, capabilities)
125+
if len(got) != 1 {
126+
t.Errorf("filtered: got %v, want [Standard_NC64as_T4_v3]", got)
127+
}
128+
if len(dropped) != 0 {
129+
t.Errorf("dropped: got %v, want []", dropped)
130+
}
131+
if len(unknown) != 0 {
132+
t.Errorf("unknown: got %v, want []", unknown)
133+
}
134+
}
135+
136+
func TestFilterNVMeStorage_PassesCleanSizes(t *testing.T) {
137+
capabilities := map[string]*virtualMachine{
138+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
139+
}
140+
got, dropped, unknown := filterNVMeStorage([]string{"Standard_D8as_v5"}, capabilities)
141+
if len(got) != 1 {
142+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
143+
}
144+
if len(dropped) != 0 {
145+
t.Errorf("dropped: got %v, want []", dropped)
146+
}
147+
if len(unknown) != 0 {
148+
t.Errorf("unknown: got %v, want []", unknown)
149+
}
150+
}
151+
152+
func TestFilterNVMeStorage_ReportsUnknownSizes(t *testing.T) {
153+
capabilities := map[string]*virtualMachine{
154+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
155+
}
156+
got, dropped, unknown := filterNVMeStorage(
157+
[]string{"Standard_D8as_v5", "Standard_Typo_v99"}, capabilities)
158+
if len(got) != 1 || got[0] != "Standard_D8as_v5" {
159+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
160+
}
161+
if len(dropped) != 0 {
162+
t.Errorf("dropped: got %v, want []", dropped)
163+
}
164+
if len(unknown) != 1 || unknown[0] != "Standard_Typo_v99" {
165+
t.Errorf("unknown: got %v, want [Standard_Typo_v99]", unknown)
166+
}
167+
}
168+
169+
// baseFeaturesSupported regression: L-series must be rejected
170+
171+
func TestBaseFeaturesSupported_LSeriesWithNvmeIsRejected(t *testing.T) {
172+
vm := &virtualMachine{
173+
MaxResourceVolumeMB: 0,
174+
NvmeDiskSizeInMiB: 5492736,
175+
AcceleratedNetworkingEnabled: true,
176+
PremiumIO: true,
177+
EncryptionAtHostSupported: true,
178+
HyperVGenerations: []string{"V1", "V2"},
179+
}
180+
if vm.baseFeaturesSupported() {
181+
t.Error("expected false: L-series VM with NVMe storage must not pass baseFeaturesSupported")
182+
}
183+
}

0 commit comments

Comments
 (0)