Skip to content

Commit 92e39f9

Browse files
rishupkclaude
andcommitted
fix(azure): harden NVMe filter and credential handling for compute-sizes
- Relax FilterNoLocalStorageSizes to only reject NVMe-only local storage (L-series). VMs with temp disks (MaxResourceVolumeMB > 0) like Standard_NC64as_T4_v3 are now allowed — temp disks are ephemeral scratch space that does not interfere with RHEL AI's OS disk. - Add ensureAzureEnvs() to map ARM_* env vars to AZURE_* before any Azure SDK call. The Azure SDK reads AZURE_TENANT_ID etc from the environment, but callers (Claudio, Pulumi) typically set ARM_* vars. Previously this mapping only happened in provider.Init(), which runs after FilterNoLocalStorageSizes — causing credential failures when --compute-sizes is used. - Centralize AZURE_SUBSCRIPTION_ID reads through SubscriptionID() helper that falls back to ARM_SUBSCRIPTION_ID. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Rishabh Kothari <rkothari@redhat.com>
1 parent dcd60a4 commit 92e39f9

7 files changed

Lines changed: 321 additions & 19 deletions

File tree

pkg/provider/azure/action/rhel-ai/rhelai.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package rhelai
22

33
import (
4+
"context"
45
"fmt"
56
"strings"
67

@@ -53,6 +54,20 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
5354
}
5455
// Shallow-copy to avoid mutating the caller's ComputeRequestArgs.
5556
computeReq := *args.ComputeRequest
57+
if len(computeReq.ComputeSizes) > 0 {
58+
ctx := mCtxArgs.Context
59+
if ctx == nil {
60+
ctx = context.Background()
61+
}
62+
computeReq.ComputeSizes, err = data.FilterNoLocalStorageSizes(
63+
ctx, computeReq.ComputeSizes)
64+
if err != nil {
65+
return err
66+
}
67+
if len(computeReq.ComputeSizes) == 0 {
68+
return fmt.Errorf("no valid compute sizes: all provided sizes have NVMe-only local storage, incompatible with RHEL AI")
69+
}
70+
}
5671
// Ensure GPU-capable instance selection for auto-selection paths.
5772
if computeReq.GPUs == 0 {
5873
logging.Debug("RHEL AI: GPUs not set, defaulting to 1 for GPU-capable instance selection")

pkg/provider/azure/data/compute-request.go

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package data
33
import (
44
"context"
55
"fmt"
6-
"os"
76
"regexp"
87
"slices"
98
"strconv"
@@ -13,6 +12,7 @@ import (
1312
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
1413
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
1514
cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request"
15+
"github.com/redhat-developer/mapt/pkg/util/logging"
1616
)
1717

1818
const (
@@ -112,12 +112,77 @@ func diskControllerTypeSupported(supported []string, requiredType string) bool {
112112
return false
113113
}
114114

115+
// FilterNoLocalStorageSizes returns only the sizes from computeSizes that have no
116+
// NVMe-only local storage (L-series). Temp disks (MaxResourceVolumeMB > 0) are allowed
117+
// — they are ephemeral scratch space that does not interfere with RHEL AI's OS disk.
118+
// Sizes not found in the Azure SKU catalog (typo or restricted SKU) are logged as
119+
// warnings and excluded.
120+
func FilterNoLocalStorageSizes(ctx context.Context, computeSizes []string) ([]string, error) {
121+
creds, subscriptionID, err := getCredentials()
122+
if err != nil {
123+
return nil, err
124+
}
125+
client, err := armcompute.NewResourceSKUsClient(*subscriptionID, creds, nil)
126+
if err != nil {
127+
return nil, err
128+
}
129+
pager := client.NewListPager(nil)
130+
capabilities := make(map[string]*virtualMachine, len(computeSizes))
131+
for pager.More() {
132+
page, err := pager.NextPage(ctx)
133+
if err != nil {
134+
return nil, err
135+
}
136+
for _, sku := range page.Value {
137+
if sku.ResourceType == nil || *sku.ResourceType != string(RTVirtualMachines) {
138+
continue
139+
}
140+
if sku.Name == nil || !slices.Contains(computeSizes, *sku.Name) {
141+
continue
142+
}
143+
if _, seen := capabilities[*sku.Name]; seen {
144+
continue
145+
}
146+
if vm := resourceSKUToVirtualMachine(sku); vm != nil {
147+
capabilities[*sku.Name] = vm
148+
}
149+
}
150+
}
151+
valid, dropped, unknown := filterNVMeStorage(computeSizes, capabilities)
152+
for _, size := range dropped {
153+
logging.Warnf("dropping compute size %q: has NVMe-only local storage, incompatible with RHEL AI", size)
154+
}
155+
for _, size := range unknown {
156+
logging.Warnf("dropping compute size %q: not found in Azure SKU catalog (typo or restricted SKU)", size)
157+
}
158+
return valid, nil
159+
}
160+
161+
// filterNVMeStorage classifies each size into valid (no NVMe-only local storage),
162+
// dropped (has NVMe local storage — e.g. L-series), or unknown (absent from capabilities).
163+
func filterNVMeStorage(computeSizes []string, capabilities map[string]*virtualMachine) (valid, dropped, unknown []string) {
164+
for _, size := range computeSizes {
165+
vm, ok := capabilities[size]
166+
if !ok {
167+
unknown = append(unknown, size)
168+
continue
169+
}
170+
if vm.NvmeDiskSizeInMiB > 0 {
171+
dropped = append(dropped, size)
172+
} else {
173+
valid = append(valid, size)
174+
}
175+
}
176+
return valid, dropped, unknown
177+
}
178+
115179
func getAzureVMSKUs(ctx context.Context, args *cr.ComputeRequestArgs) ([]string, error) {
180+
ensureAzureEnvs()
116181
cred, err := azidentity.NewDefaultAzureCredential(nil)
117182
if err != nil {
118183
return nil, err
119184
}
120-
subscriptionId := os.Getenv("AZURE_SUBSCRIPTION_ID")
185+
subscriptionId := SubscriptionID()
121186
clientFactory, err := armcompute.NewClientFactory(
122187
subscriptionId, cred, nil)
123188
if err != nil {
@@ -156,6 +221,10 @@ type virtualMachine struct {
156221
LowPriorityCapable bool
157222
MaxResourceVolumeMB int32
158223
GPUs int32
224+
// L-series VMs expose NVMe storage separately from the temp disk
225+
NvmeDiskSizeInMiB int32
226+
// Used by the disk-controller-type fix (PR #823) to cross-reference SKU capabilities
227+
DiskControllerTypes []string
159228
// IaaS or PaaS
160229
VMDeploymentTypes []string
161230
// Fast SSD
@@ -191,17 +260,17 @@ func (vm *virtualMachine) hypervGen2Supported() bool {
191260
return slices.Contains(vm.HyperVGenerations, "V2")
192261
}
193262

194-
func (vm *virtualMachine) emptyDiskSupported() bool {
195-
return vm.MaxResourceVolumeMB == 0
263+
func (vm *virtualMachine) noLocalStorageAttached() bool {
264+
return vm.MaxResourceVolumeMB == 0 && vm.NvmeDiskSizeInMiB == 0
196265
}
197266

198267
func (vm *virtualMachine) baseFeaturesSupported() bool {
199268
return vm.AcceleratedNetworkingEnabled && vm.PremiumIO && vm.EncryptionAtHostSupported &&
200-
vm.emptyDiskSupported() && vm.hypervGen2Supported()
269+
vm.noLocalStorageAttached() && vm.hypervGen2Supported()
201270
}
202271

203272
func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
204-
if res.ResourceType != nil && *res.ResourceType != "virtualMachines" {
273+
if res.ResourceType != nil && *res.ResourceType != string(RTVirtualMachines) {
205274
return nil
206275
}
207276
// If Machine type has any type of restriccions discard
@@ -272,6 +341,14 @@ func resourceSKUToVirtualMachine(res *armcompute.ResourceSKU) *virtualMachine {
272341
return nil
273342
}
274343
vm.GPUs = int32(gpus)
344+
case "NvmeDiskSizeInMiB":
345+
nvme, err := strconv.ParseUint(*capability.Value, 10, 32)
346+
if err != nil {
347+
return nil
348+
}
349+
vm.NvmeDiskSizeInMiB = int32(nvme)
350+
case "DiskControllerTypes":
351+
vm.DiskControllerTypes = strings.Split(*capability.Value, ",")
275352
case "VMDeploymentTypes":
276353
vm.VMDeploymentTypes = strings.Split(*capability.Value, ",")
277354
default:
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
package data
2+
3+
import (
4+
"testing"
5+
6+
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
7+
)
8+
9+
func ptr[T any](v T) *T { return &v }
10+
11+
// noLocalStorageAttached tests
12+
13+
func TestNoLocalStorageAttached_NoTempDiskNoNvme(t *testing.T) {
14+
vm := &virtualMachine{MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0}
15+
if !vm.noLocalStorageAttached() {
16+
t.Error("expected true: VM with no temp disk and no NVMe should have no local storage")
17+
}
18+
}
19+
20+
func TestNoLocalStorageAttached_HasTempDisk(t *testing.T) {
21+
vm := &virtualMachine{MaxResourceVolumeMB: 512, NvmeDiskSizeInMiB: 0}
22+
if vm.noLocalStorageAttached() {
23+
t.Error("expected false: VM with temp disk should have local storage")
24+
}
25+
}
26+
27+
func TestNoLocalStorageAttached_HasNvmeDisk(t *testing.T) {
28+
// L-series bug case: MaxResourceVolumeMB=0 but NvmeDiskSizeInMiB>0
29+
vm := &virtualMachine{MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 5492736}
30+
if vm.noLocalStorageAttached() {
31+
t.Error("expected false: L-series VM with NVMe storage should have local storage")
32+
}
33+
}
34+
35+
func TestNoLocalStorageAttached_HasBoth(t *testing.T) {
36+
vm := &virtualMachine{MaxResourceVolumeMB: 512, NvmeDiskSizeInMiB: 5492736}
37+
if vm.noLocalStorageAttached() {
38+
t.Error("expected false: VM with both temp disk and NVMe should have local storage")
39+
}
40+
}
41+
42+
// resourceSKUToVirtualMachine parsing tests
43+
44+
func TestResourceSKUToVirtualMachine_ParsesNvmeDiskSizeInMiB(t *testing.T) {
45+
sku := &armcompute.ResourceSKU{
46+
ResourceType: ptr("virtualMachines"),
47+
Name: ptr("Standard_L8aos_v4"),
48+
Family: ptr("standardLasv4Family"),
49+
Capabilities: []*armcompute.ResourceSKUCapabilities{
50+
{Name: ptr("NvmeDiskSizeInMiB"), Value: ptr("5492736")},
51+
},
52+
}
53+
vm := resourceSKUToVirtualMachine(sku)
54+
if vm == nil {
55+
t.Fatal("expected non-nil virtualMachine")
56+
}
57+
if vm.NvmeDiskSizeInMiB != 5492736 {
58+
t.Errorf("NvmeDiskSizeInMiB: got %d, want 5492736", vm.NvmeDiskSizeInMiB)
59+
}
60+
}
61+
62+
func TestResourceSKUToVirtualMachine_ParsesDiskControllerTypes(t *testing.T) {
63+
sku := &armcompute.ResourceSKU{
64+
ResourceType: ptr("virtualMachines"),
65+
Name: ptr("Standard_L8aos_v4"),
66+
Family: ptr("standardLasv4Family"),
67+
Capabilities: []*armcompute.ResourceSKUCapabilities{
68+
{Name: ptr("DiskControllerTypes"), Value: ptr("NVMe,SCSI")},
69+
},
70+
}
71+
vm := resourceSKUToVirtualMachine(sku)
72+
if vm == nil {
73+
t.Fatal("expected non-nil virtualMachine")
74+
}
75+
if len(vm.DiskControllerTypes) != 2 {
76+
t.Fatalf("DiskControllerTypes: got %v, want [NVMe SCSI]", vm.DiskControllerTypes)
77+
}
78+
if vm.DiskControllerTypes[0] != "NVMe" || vm.DiskControllerTypes[1] != "SCSI" {
79+
t.Errorf("DiskControllerTypes: got %v, want [NVMe SCSI]", vm.DiskControllerTypes)
80+
}
81+
}
82+
83+
func TestResourceSKUToVirtualMachine_NvmeDiskSizeDefaultsToZero(t *testing.T) {
84+
sku := &armcompute.ResourceSKU{
85+
ResourceType: ptr("virtualMachines"),
86+
Name: ptr("Standard_D8as_v5"),
87+
Family: ptr("standardDasv5Family"),
88+
Capabilities: []*armcompute.ResourceSKUCapabilities{
89+
{Name: ptr("MaxResourceVolumeMB"), Value: ptr("307200")},
90+
},
91+
}
92+
vm := resourceSKUToVirtualMachine(sku)
93+
if vm == nil {
94+
t.Fatal("expected non-nil virtualMachine")
95+
}
96+
if vm.NvmeDiskSizeInMiB != 0 {
97+
t.Errorf("NvmeDiskSizeInMiB: got %d, want 0 for non-NVMe SKU", vm.NvmeDiskSizeInMiB)
98+
}
99+
}
100+
101+
// filterNVMeStorage tests
102+
103+
func TestFilterNVMeStorage_DropsNvmeSizes(t *testing.T) {
104+
capabilities := map[string]*virtualMachine{
105+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
106+
"Standard_L8aos_v4": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 5492736},
107+
}
108+
got, dropped, unknown := filterNVMeStorage([]string{"Standard_D8as_v5", "Standard_L8aos_v4"}, capabilities)
109+
if len(got) != 1 || got[0] != "Standard_D8as_v5" {
110+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
111+
}
112+
if len(dropped) != 1 || dropped[0] != "Standard_L8aos_v4" {
113+
t.Errorf("dropped: got %v, want [Standard_L8aos_v4]", dropped)
114+
}
115+
if len(unknown) != 0 {
116+
t.Errorf("unknown: got %v, want []", unknown)
117+
}
118+
}
119+
120+
func TestFilterNVMeStorage_AllowsTempDiskSizes(t *testing.T) {
121+
capabilities := map[string]*virtualMachine{
122+
"Standard_NC64as_T4_v3": {MaxResourceVolumeMB: 32768, NvmeDiskSizeInMiB: 0},
123+
}
124+
got, dropped, unknown := filterNVMeStorage([]string{"Standard_NC64as_T4_v3"}, capabilities)
125+
if len(got) != 1 {
126+
t.Errorf("filtered: got %v, want [Standard_NC64as_T4_v3]", got)
127+
}
128+
if len(dropped) != 0 {
129+
t.Errorf("dropped: got %v, want []", dropped)
130+
}
131+
if len(unknown) != 0 {
132+
t.Errorf("unknown: got %v, want []", unknown)
133+
}
134+
}
135+
136+
func TestFilterNVMeStorage_PassesCleanSizes(t *testing.T) {
137+
capabilities := map[string]*virtualMachine{
138+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
139+
}
140+
got, dropped, unknown := filterNVMeStorage([]string{"Standard_D8as_v5"}, capabilities)
141+
if len(got) != 1 {
142+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
143+
}
144+
if len(dropped) != 0 {
145+
t.Errorf("dropped: got %v, want []", dropped)
146+
}
147+
if len(unknown) != 0 {
148+
t.Errorf("unknown: got %v, want []", unknown)
149+
}
150+
}
151+
152+
func TestFilterNVMeStorage_ReportsUnknownSizes(t *testing.T) {
153+
capabilities := map[string]*virtualMachine{
154+
"Standard_D8as_v5": {MaxResourceVolumeMB: 0, NvmeDiskSizeInMiB: 0},
155+
}
156+
got, dropped, unknown := filterNVMeStorage(
157+
[]string{"Standard_D8as_v5", "Standard_Typo_v99"}, capabilities)
158+
if len(got) != 1 || got[0] != "Standard_D8as_v5" {
159+
t.Errorf("filtered: got %v, want [Standard_D8as_v5]", got)
160+
}
161+
if len(dropped) != 0 {
162+
t.Errorf("dropped: got %v, want []", dropped)
163+
}
164+
if len(unknown) != 1 || unknown[0] != "Standard_Typo_v99" {
165+
t.Errorf("unknown: got %v, want [Standard_Typo_v99]", unknown)
166+
}
167+
}
168+
169+
// baseFeaturesSupported regression: L-series must be rejected
170+
171+
func TestBaseFeaturesSupported_LSeriesWithNvmeIsRejected(t *testing.T) {
172+
vm := &virtualMachine{
173+
MaxResourceVolumeMB: 0,
174+
NvmeDiskSizeInMiB: 5492736,
175+
AcceleratedNetworkingEnabled: true,
176+
PremiumIO: true,
177+
EncryptionAtHostSupported: true,
178+
HyperVGenerations: []string{"V1", "V2"},
179+
}
180+
if vm.baseFeaturesSupported() {
181+
t.Error("expected false: L-series VM with NVMe storage must not pass baseFeaturesSupported")
182+
}
183+
}

0 commit comments

Comments
 (0)