Skip to content

Commit b044120

Browse files
committed
feat(azure/rhel-ai): add GPU guardrails for instance selection
- Add isGPUCapableSize helper matching ND/NC series (NV excluded).
- Shallow-copy ComputeRequestArgs before mutation to avoid caller side effects.
- Default ComputeRequest.GPUs to 1 so filterCPUsAndMemory auto-selects only GPU-capable instance types when no explicit GPU count is set.
- Warn when the caller explicitly provides compute sizes that are not GPU-capable (expected ND/NC series; vllm requires a GPU device).
1 parent a0709a1 commit b044120

1 file changed

Lines changed: 31 additions & 2 deletions

File tree

pkg/provider/azure/action/rhel-ai/rhelai.go

Lines changed: 31 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,16 +34,45 @@ func imageId(accelerator, version string) string {
3434
return imageIdFromName(fmt.Sprintf(imageNameRegex, accelerator, version))
3535
}
3636

37+
// isGPUCapableSize reports whether vmSize names an Azure VM size from the ND
// or NC series, the compute GPU families accepted for RHEL AI workloads.
//
// Matching is case-insensitive and purely prefix-based: any size whose name
// begins with "Standard_ND" or "Standard_NC" is accepted. NV-series sizes
// (visualization GPUs) are deliberately left out and yield false, as does
// every other name, including the empty string.
func isGPUCapableSize(vmSize string) bool {
	normalized := strings.ToLower(vmSize)
	for _, family := range []string{"standard_nd", "standard_nc"} {
		if strings.HasPrefix(normalized, family) {
			return true
		}
	}
	return false
}
44+
3745
func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) {
38-
logging.Debug("Creating RHEL Server")
46+
logging.Debug("Creating RHEL AI Server")
3947
sharedImageID := imageId(args.Accelerator, args.Version)
4048
if args.CustomImage != "" {
4149
sharedImageID = imageIdFromName(args.CustomImage)
4250
}
51+
// Shallow-copy to avoid mutating the caller's ComputeRequestArgs.
52+
computeReq := *args.ComputeRequest
53+
// Ensure GPU-capable instance selection for auto-selection paths.
54+
if computeReq.GPUs == 0 {
55+
logging.Debug("RHEL AI: GPUs not set, defaulting to 1 for GPU-capable instance selection")
56+
computeReq.GPUs = 1
57+
}
58+
// Warn when the caller explicitly specifies compute sizes that are not GPU-capable.
59+
if len(computeReq.ComputeSizes) > 0 {
60+
allNonGPU := true
61+
for _, s := range computeReq.ComputeSizes {
62+
if isGPUCapableSize(s) {
63+
allNonGPU = false
64+
break
65+
}
66+
}
67+
if allNonGPU {
68+
logging.Warnf("RHEL AI: none of the specified compute sizes %v appear to be GPU-capable "+
69+
"(expected ND-series or NC-series); vllm may not start without a GPU", computeReq.ComputeSizes)
70+
}
71+
}
4372
azureLinuxRequest :=
4473
&azureLinux.LinuxArgs{
4574
Prefix: args.Prefix,
46-
ComputeRequest: args.ComputeRequest,
75+
ComputeRequest: &computeReq,
4776
Spot: args.Spot,
4877
ImageRef: &data.ImageReference{
4978
SharedImageID: sharedImageID,

0 commit comments

Comments
 (0)