Skip to content

Commit 1f70c48

Browse files
committed
fixes
1 parent 7affd32 commit 1f70c48

2 files changed

Lines changed: 14 additions & 3 deletions

File tree

cuda_memtest.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -575,12 +575,12 @@ main(int argc, char** argv)
575575
exit(ERR_BAD_STATE);
576576
}
577577

578-
active_update_temperature = 0;
579-
580578
for(i=0;i < num_gpus;i++){
581579
pthread_join(pid[i], NULL);
582580
}
583581

582+
active_update_temperature = 0;
583+
584584
printf("main thread: Program exits\n");
585585

586586
return 0;

misc.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,24 @@ void update_temperature(void)
88
unsigned int deviceCount;
99
NVML_CHECK(nvmlDeviceGetCount( &deviceCount ));
1010

11-
for( unsigned int devIdx = 0; devIdx < deviceCount; ++devIdx )
11+
unsigned int monitoredCount = (deviceCount > MAX_GPU_NUM) ? MAX_GPU_NUM : deviceCount;
12+
if (deviceCount > MAX_GPU_NUM) {
13+
fprintf(stderr, "WARNING: Found %u GPUs, but MAX_GPU_NUM is %u. Clamping to maximum supported.\n", deviceCount, MAX_GPU_NUM);
14+
}
15+
16+
for( unsigned int devIdx = 0; devIdx < monitoredCount; ++devIdx )
1217
{
1318
nvmlDevice_t devHandle;
1419
NVML_CHECK(nvmlDeviceGetHandleByIndex( devIdx, &devHandle ));
1520

1621
unsigned int devTemperature;
22+
#if (NVML_API_VERSION >= 13)
23+
nvmlTemperature_t temperature = {nvmlTemperature_v1, NVML_TEMPERATURE_GPU};
24+
NVML_CHECK(nvmlDeviceGetTemperatureV( devHandle, &temperature ));
25+
devTemperature = temperature.temperature;
26+
#else
1727
NVML_CHECK(nvmlDeviceGetTemperature( devHandle, NVML_TEMPERATURE_GPU, &devTemperature ));
28+
#endif
1829
gpu_temp[devIdx] = devTemperature;
1930

2031
DEBUG_PRINTF("temperature updated: (gpu %d) %d \n", devIdx, devTemperature);

0 commit comments

Comments
 (0)