Skip to content

Commit 7ca5c20

Browse files
committed
reduce required GPU memory to 4GB (derive gpu_memory_utilization dynamically)
1 parent d63a467 commit 7ca5c20

4 files changed

Lines changed: 21 additions & 9 deletions

File tree

test/common/offline_inference_utils.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,6 @@ def build_llm_with_uc(
222222
"model": model_path,
223223
"kv_transfer_config": ktc,
224224
"max_model_len": 12000,
225-
"gpu_memory_utilization": 0.3, # Reduced to prevent OOM after Phase 1
226225
"max_num_batched_tokens": max_num_batched_tokens,
227226
"block_size": 128,
228227
"enforce_eager": llm_kwargs.get("enforce_eager", True),
@@ -276,11 +275,17 @@ def run_offline_inference(
276275
"""
277276
sampling_params = from_dict_for_serialization(sampling_params_dict)
278277

278+
gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))
279+
logger.info(
280+
"run offline inference with gpu memory utilization: %.4f",
281+
gpu_memory_utilization,
282+
)
283+
279284
with build_llm_with_uc(
280285
model_path=model_path,
281286
ucm_config=ucm_config,
282287
enable_prefix_caching=enable_prefix_caching,
283-
gpu_memory_utilization=0.3,
288+
gpu_memory_utilization=gpu_memory_utilization,
284289
max_num_batched_tokens=max_num_batched_tokens,
285290
enforce_eager=enforce_eager,
286291
) as llm:

test/conftest.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,18 +171,25 @@ def get_free_gpu(required_memory_mb):
171171
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
172172
free_in_mb = info.free / 1024**2
173173
if free_in_mb >= required_memory_mb:
174-
return i, free_in_mb
175-
return None, 0
174+
utilization = (
175+
required_memory_mb * (1024**2) / info.total if info.total else 0
176+
)
177+
return i, free_in_mb, utilization
178+
return None, 0, 0
176179

177180

178181
@pytest.fixture(autouse=True)
def setup_gpu_resource(request):
    """Reserve a GPU for tests marked with ``@pytest.mark.gpu_mem(<mb>)``.

    Reads the required memory (MB) from the marker, finds a device with that
    much free memory via ``get_free_gpu``, and exports ``CUDA_VISIBLE_DEVICES``
    plus a derived ``E2E_TEST_GPU_MEMORY_UTILIZATION`` fraction for the test to
    consume. Fails the test when no GPU qualifies.

    Yield-fixture form so the mutated environment variables are restored after
    the test instead of leaking into later tests in the same process.
    """
    marker = request.node.get_closest_marker("gpu_mem")
    if not marker:
        # Test did not request GPU memory; nothing to set up or tear down.
        yield
        return

    # Remember prior values so teardown can restore the exact original state.
    _keys = ("CUDA_VISIBLE_DEVICES", "E2E_TEST_GPU_MEMORY_UTILIZATION")
    saved = {key: os.environ.get(key) for key in _keys}
    try:
        mem_needed = marker.args[0]
        gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
        if gpu_id is not None:
            print(
                f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
            # A zero utilization means get_free_gpu could not compute a
            # fraction (unknown total memory); keep the consumer's default.
            if gpu_utilization:
                os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)
        else:
            pytest.fail(f"No GPU with {mem_needed}MB free memory available")
        yield
    finally:
        # Restore the environment exactly as it was before the test ran.
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

test/suites/E2E/test_offline_inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class TestBasicOfflineInference:
2525

2626
@pytest.mark.stage(1)
2727
@pytest.mark.feature("offline_inference")
28-
@pytest.mark.gpu_mem(30000)
28+
@pytest.mark.gpu_mem(4000)
2929
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
3030
@pytest.mark.parametrize("max_tokens", [200])
3131
@pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half

test/suites/E2E/test_offline_inference_sparse.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse:
2525

2626
@pytest.mark.stage(1)
2727
@pytest.mark.feature("offline_inference_sparse")
28-
@pytest.mark.gpu_mem(30000)
28+
@pytest.mark.gpu_mem(4000)
2929
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
3030
@pytest.mark.parametrize("max_tokens", [200])
3131
@pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half
@@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
229229

230230
@pytest.mark.stage(1)
231231
@pytest.mark.feature("offline_inference_sparse")
232-
@pytest.mark.gpu_mem(30000)
232+
@pytest.mark.gpu_mem(4000)
233233
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
234234
@pytest.mark.parametrize("max_tokens", [200])
235235
@pytest.mark.parametrize("enforce_eager", [False])

0 commit comments

Comments
 (0)