@@ -162,6 +162,7 @@ def pytest_runtest_logreport(report):
 
 
 def get_free_gpu(required_memory_mb):
+    mem_needed_with_buffer = int(required_memory_mb * 1.3)  # add buffer to avoid OOM
     pynvml.nvmlInit()
     device_count = pynvml.nvmlDeviceGetCount()
     device_indices = list(range(device_count))
@@ -170,19 +171,28 @@ def get_free_gpu(required_memory_mb):
         handle = pynvml.nvmlDeviceGetHandleByIndex(i)
         info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         free_in_mb = info.free / 1024**2
-        if free_in_mb >= required_memory_mb:
-            return i, free_in_mb
-    return None, 0
+        if free_in_mb >= mem_needed_with_buffer:
+            utilization = (
+                required_memory_mb * (1024**2) / info.total if info.total else 0
+            )
+            return i, free_in_mb, utilization
+    return None, 0, 0
 
 
 @pytest.fixture(autouse=True)
 def setup_gpu_resource(request):
     marker = request.node.get_closest_marker("gpu_mem")
     if marker:
         mem_needed = marker.args[0]
-        gpu_id, free_in_mb = get_free_gpu(mem_needed)
+        gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
         if gpu_id is not None:
-            print(f"Allocating GPU {gpu_id} with {free_in_mb} MB free memory")
+            print(
+                f"Allocating GPU {gpu_id} with {free_in_mb} MB free memory, gpu utilization {gpu_utilization:.4%}"
+            )
             os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+            if gpu_utilization:
+                os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)
         else:
-            pytest.fail(
+                f"No GPU with {mem_needed} MB(+30% buffer) free memory available"
+            )
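
For context (not part of the diff itself): a minimal sketch of how a test might consume this fixture, assuming the `gpu_mem` marker is registered in the project's pytest configuration. The requested 16000 MB, the test name, and the 0.9 fallback value are illustrative assumptions only; `CUDA_VISIBLE_DEVICES` and `E2E_TEST_GPU_MEMORY_UTILIZATION` come from the fixture above.

```python
import os

import pytest


# Hypothetical test: the `gpu_mem` marker argument is the GPU memory (in MB)
# the test expects to need. The autouse fixture looks for a GPU with at least
# 1.3x that amount free, exports CUDA_VISIBLE_DEVICES for the chosen device,
# and stores the requested fraction of that GPU's total memory in
# E2E_TEST_GPU_MEMORY_UTILIZATION.
@pytest.mark.gpu_mem(16000)  # illustrative: request ~16 GB of free GPU memory
def test_runs_on_allocated_gpu():
    # The fixture has already restricted visible devices to the allocated GPU.
    assert os.environ["CUDA_VISIBLE_DEVICES"].isdigit()

    # The utilization hint is optional; downstream code could read it to cap
    # its own memory usage. 0.9 is an assumed fallback, not from the diff.
    utilization = float(os.environ.get("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.9"))
    assert 0 < utilization <= 1
```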