Skip to content

Commit 60dce2c

Browse files
committed
Make GPU count configurable in e2e validation functions
- Add gpuCountExpected parameter to ValidateNodeAdvertisesGPUResources() to validate exact GPU count instead of just checking > 0
- Add gpuCount parameter to ValidateGPUWorkloadSchedulable() to make GPU resource request configurable
- Update all test callers to pass expected GPU count of 1
- Improve logging to show actual vs expected GPU counts for better debugging

Signed-off-by: Suraj Deshmukh <suraj.deshmukh@microsoft.com>
1 parent 21fa006 commit 60dce2c

2 files changed

Lines changed: 15 additions & 15 deletions

File tree

e2e/scenario_gpu_managed_experience_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,10 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) {
6363
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
6464

6565
// Validate that GPU resources are advertised by the device plugin
66-
ValidateNodeAdvertisesGPUResources(ctx, s)
66+
ValidateNodeAdvertisesGPUResources(ctx, s, 1)
6767

6868
// Validate that GPU workloads can be scheduled
69-
ValidateGPUWorkloadSchedulable(ctx, s)
69+
ValidateGPUWorkloadSchedulable(ctx, s, 1)
7070

7171
// Validate that the NVIDIA DCGM packages were installed correctly
7272
for _, packageName := range getDCGMPackageNames(os) {
@@ -118,10 +118,10 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) {
118118
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
119119

120120
// Validate that GPU resources are advertised by the device plugin
121-
ValidateNodeAdvertisesGPUResources(ctx, s)
121+
ValidateNodeAdvertisesGPUResources(ctx, s, 1)
122122

123123
// Validate that GPU workloads can be scheduled
124-
ValidateGPUWorkloadSchedulable(ctx, s)
124+
ValidateGPUWorkloadSchedulable(ctx, s, 1)
125125

126126
for _, packageName := range getDCGMPackageNames(os) {
127127
versions := components.GetExpectedPackageVersions(packageName, os, osVersion)
@@ -172,10 +172,10 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
172172
ValidateNvidiaDevicePluginServiceRunning(ctx, s)
173173

174174
// Validate that GPU resources are advertised by the device plugin
175-
ValidateNodeAdvertisesGPUResources(ctx, s)
175+
ValidateNodeAdvertisesGPUResources(ctx, s, 1)
176176

177177
// Validate that GPU workloads can be scheduled
178-
ValidateGPUWorkloadSchedulable(ctx, s)
178+
ValidateGPUWorkloadSchedulable(ctx, s, 1)
179179

180180
for _, packageName := range getDCGMPackageNames(os) {
181181
versions := components.GetExpectedPackageVersions(packageName, os, osVersion)

e2e/validators.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -968,29 +968,29 @@ func ValidateNvidiaDevicePluginServiceRunning(ctx context.Context, s *Scenario)
968968
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "NVIDIA device plugin systemd service should be active and enabled")
969969
}
970970

971-
func ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario) {
971+
func ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario, gpuCountExpected int) {
972972
s.T.Helper()
973973
s.T.Logf("validating that node advertises GPU resources")
974+
resourceName := "nvidia.com/gpu"
974975

975976
// First, wait for the nvidia.com/gpu resource to be available
976-
waitUntilResourceAvailable(ctx, s, "nvidia.com/gpu")
977+
waitUntilResourceAvailable(ctx, s, resourceName)
977978

978979
// Get the node using the Kubernetes client from the test framework
979980
nodeName := s.Runtime.KubeNodeName
980981
node, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
981982
require.NoError(s.T, err, "failed to get node %q", nodeName)
982983

983984
// Check if the node advertises GPU capacity
984-
gpuCapacity, exists := node.Status.Capacity["nvidia.com/gpu"]
985-
require.True(s.T, exists, "node should advertise nvidia.com/gpu capacity")
985+
gpuCapacity, exists := node.Status.Capacity[corev1.ResourceName(resourceName)]
986+
require.True(s.T, exists, "node should advertise resource %s", resourceName)
986987

987988
gpuCount := gpuCapacity.Value()
988-
require.Greater(s.T, gpuCount, int64(0), "node should advertise at least 1 GPU, but got %d", gpuCount)
989-
990-
s.T.Logf("node %s advertises %d nvidia.com/gpu resources", nodeName, gpuCount)
989+
require.Equal(s.T, gpuCount, gpuCountExpected, "node should advertise %s=%d, but got %s=%d", resourceName, gpuCountExpected, resourceName, gpuCount)
990+
s.T.Logf("node %s advertises %s=%d resources", nodeName, resourceName, gpuCount)
991991
}
992992

993-
func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario) {
993+
func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario, gpuCount int) {
994994
s.T.Helper()
995995
s.T.Logf("validating that GPU workloads can be scheduled")
996996

@@ -1014,7 +1014,7 @@ func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario) {
10141014
},
10151015
Resources: corev1.ResourceRequirements{
10161016
Limits: corev1.ResourceList{
1017-
"nvidia.com/gpu": resource.MustParse("1"),
1017+
"nvidia.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuCount)),
10181018
},
10191019
},
10201020
},

0 commit comments

Comments
 (0)