Commit c4f58fe

dante159753 and mag1c-h authored
[misc] Reduce gpu utilization to 6GB in test for 1.5B model (#665)
Co-authored-by: Mag1c.H <hemajun815@163.com>
1 parent d6e6a47 commit c4f58fe

4 files changed: 26 additions & 11 deletions

test/common/offline_inference_utils.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -222,7 +222,6 @@ def build_llm_with_uc(
         "model": model_path,
         "kv_transfer_config": ktc,
         "max_model_len": 12000,
-        "gpu_memory_utilization": 0.3,  # Reduced to prevent OOM after Phase 1
         "max_num_batched_tokens": max_num_batched_tokens,
         "block_size": 128,
         "enforce_eager": llm_kwargs.get("enforce_eager", True),
@@ -276,11 +275,17 @@ def run_offline_inference(
     """
     sampling_params = from_dict_for_serialization(sampling_params_dict)

+    gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))
+    logger.info(
+        "run offline inference with gpu memory utilization: %.4f",
+        gpu_memory_utilization,
+    )
+
     with build_llm_with_uc(
         model_path=model_path,
         ucm_config=ucm_config,
         enable_prefix_caching=enable_prefix_caching,
-        gpu_memory_utilization=0.3,
+        gpu_memory_utilization=gpu_memory_utilization,
         max_num_batched_tokens=max_num_batched_tokens,
         enforce_eager=enforce_eager,
     ) as llm:
```

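The fraction is now taken from the environment instead of a hard-coded 0.3. A minimal sketch of the resolution order, using the names from the diff; the 0.0732421875 figure is only illustrative (roughly 6000 MB on a hypothetical 80 GiB card):

```python
import os

# Sketch of the resolution order introduced above: the conftest fixture exports
# E2E_TEST_GPU_MEMORY_UTILIZATION when a gpu_mem marker is present; otherwise
# run_offline_inference falls back to 10% of the visible GPU.
def resolve_utilization() -> float:
    return float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))

os.environ.pop("E2E_TEST_GPU_MEMORY_UTILIZATION", None)
print(resolve_utilization())  # 0.1 (default when nothing is exported)

os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = "0.0732421875"
print(resolve_utilization())  # value computed by conftest.py for the selected GPU
```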
test/conftest.py

Lines changed: 16 additions & 6 deletions
```diff
@@ -162,6 +162,7 @@ def pytest_runtest_logreport(report):


 def get_free_gpu(required_memory_mb):
+    mem_needed_with_buffer = int(required_memory_mb * 1.3)  # add buffer to avoid OOM
     pynvml.nvmlInit()
     device_count = pynvml.nvmlDeviceGetCount()
     device_indices = list(range(device_count))
@@ -170,19 +171,28 @@ def get_free_gpu(required_memory_mb):
         handle = pynvml.nvmlDeviceGetHandleByIndex(i)
         info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         free_in_mb = info.free / 1024**2
-        if free_in_mb >= required_memory_mb:
-            return i, free_in_mb
-    return None, 0
+        if free_in_mb >= mem_needed_with_buffer:
+            utilization = (
+                required_memory_mb * (1024**2) / info.total if info.total else 0
+            )
+            return i, free_in_mb, utilization
+    return None, 0, 0


 @pytest.fixture(autouse=True)
 def setup_gpu_resource(request):
     marker = request.node.get_closest_marker("gpu_mem")
     if marker:
         mem_needed = marker.args[0]
-        gpu_id, free_in_mb = get_free_gpu(mem_needed)
+        gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
         if gpu_id is not None:
-            print(f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory")
+            print(
+                f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
+            )
             os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+            if gpu_utilization:
+                os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)
         else:
-            pytest.fail(f"No GPU with {mem_needed}MB free memory available")
+            pytest.fail(
+                f"No GPU with {mem_needed}MB(+30% buffer) free memory available"
+            )
```

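For a concrete sense of the numbers, a worked example of the utilization formula and the 30% free-memory buffer; the 24 GiB total is an assumed card size, whereas in the fixture info.total is whatever pynvml reports for the selected device:

```python
# gpu_mem(6000) on a hypothetical 24 GiB card:
required_memory_mb = 6000
total_bytes = 24 * 1024**3                               # stands in for info.total
mem_needed_with_buffer = int(required_memory_mb * 1.3)   # this much must be free
utilization = required_memory_mb * (1024**2) / total_bytes
print(mem_needed_with_buffer)   # 7800
print(f"{utilization:.4%}")     # 24.4141%
```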
test/suites/E2E/test_offline_inference.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -25,7 +25,7 @@ class TestBasicOfflineInference:

     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5])  # Split prompt in half
```

test/suites/E2E/test_offline_inference_sparse.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse:

     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5])  # Split prompt in half
@@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:

 @pytest.mark.stage(1)
 @pytest.mark.feature("offline_inference_sparse")
-@pytest.mark.gpu_mem(30000)
+@pytest.mark.gpu_mem(6000)
 @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
 @pytest.mark.parametrize("max_tokens", [200])
 @pytest.mark.parametrize("enforce_eager", [False])
```

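Putting the pieces together, a hypothetical test (not part of this commit) showing how the gpu_mem marker drives the autouse setup_gpu_resource fixture, which exports the environment variables the inference helper reads:

```python
import os

import pytest

@pytest.mark.gpu_mem(6000)  # fixture picks a GPU with at least 6000 MB * 1.3 free
def test_small_model_fits_budget():
    # setup_gpu_resource has already pinned the device and, when it could compute
    # one, exported the utilization fraction for run_offline_inference to pick up.
    assert os.environ["CUDA_VISIBLE_DEVICES"].isdigit()
    fraction = float(os.environ.get("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))
    assert 0 < fraction <= 1
```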