9 changes: 7 additions & 2 deletions test/common/offline_inference_utils.py
@@ -222,7 +222,6 @@ def build_llm_with_uc(
         "model": model_path,
         "kv_transfer_config": ktc,
         "max_model_len": 12000,
-        "gpu_memory_utilization": 0.3, # Reduced to prevent OOM after Phase 1
         "max_num_batched_tokens": max_num_batched_tokens,
         "block_size": 128,
         "enforce_eager": llm_kwargs.get("enforce_eager", True),
@@ -276,11 +275,17 @@ def run_offline_inference(
     """
     sampling_params = from_dict_for_serialization(sampling_params_dict)
 
+    gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))
+    logger.info(
+        "run offline inference with gpu memory utilization: %.4f",
+        gpu_memory_utilization,
+    )
+
     with build_llm_with_uc(
         model_path=model_path,
         ucm_config=ucm_config,
         enable_prefix_caching=enable_prefix_caching,
-        gpu_memory_utilization=0.3,
+        gpu_memory_utilization=gpu_memory_utilization,
         max_num_batched_tokens=max_num_batched_tokens,
         enforce_eager=enforce_eager,
     ) as llm:
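Note on the handoff above: run_offline_inference now reads its utilization fraction from E2E_TEST_GPU_MEMORY_UTILIZATION (exported by the conftest.py fixture below) and falls back to 0.1 when the variable is unset. A minimal standalone sketch of that round-trip follows; the helper name resolve_gpu_memory_utilization is hypothetical, only the variable name and the 0.1 default come from this diff:

import os

def resolve_gpu_memory_utilization(default: float = 0.1) -> float:
    # Prefer the fraction exported by the setup_gpu_resource fixture,
    # otherwise fall back to 10% of device memory.
    return float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", str(default)))

# E.g. the fixture exported a 6000 MB request against a 24 GiB device:
os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(6000 * 1024**2 / (24 * 1024**3))
print(resolve_gpu_memory_utilization())  # ~0.2441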
22 changes: 16 additions & 6 deletions test/conftest.py
@@ -162,6 +162,7 @@ def pytest_runtest_logreport(report):
 
 
 def get_free_gpu(required_memory_mb):
+    mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM
     pynvml.nvmlInit()
     device_count = pynvml.nvmlDeviceGetCount()
     device_indices = list(range(device_count))
@@ -170,19 +171,28 @@ def get_free_gpu(required_memory_mb):
         handle = pynvml.nvmlDeviceGetHandleByIndex(i)
         info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         free_in_mb = info.free / 1024**2
-        if free_in_mb >= required_memory_mb:
-            return i, free_in_mb
-    return None, 0
+        if free_in_mb >= mem_needed_with_buffer:
+            utilization = (
+                required_memory_mb * (1024**2) / info.total if info.total else 0
+            )
+            return i, free_in_mb, utilization
+    return None, 0, 0
 
 
 @pytest.fixture(autouse=True)
 def setup_gpu_resource(request):
     marker = request.node.get_closest_marker("gpu_mem")
     if marker:
         mem_needed = marker.args[0]
-        gpu_id, free_in_mb = get_free_gpu(mem_needed)
+        gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
         if gpu_id is not None:
-            print(f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory")
+            print(
+                f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
+            )
             os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+            if gpu_utilization:
+                os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)
         else:
-            pytest.fail(f"No GPU with {mem_needed}MB free memory available")
+            pytest.fail(
+                f"No GPU with {mem_needed}MB(+30% buffer) free memory available"
+            )
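To make the new arithmetic concrete: gpu_mem(6000) now requires a device with int(6000 * 1.3) = 7800 MB free, and the fraction the fixture exports is the requested bytes divided by the device's total bytes. A quick self-contained check, assuming an illustrative 24 GiB card (pynvml reports the real total at runtime):

required_memory_mb = 6000
mem_needed_with_buffer = int(required_memory_mb * 1.3)  # 7800 MB must be free
assert mem_needed_with_buffer == 7800

total_bytes = 24 * 1024**3  # illustrative device size, not from the diff
utilization = required_memory_mb * (1024**2) / total_bytes
print(f"{utilization:.4%}")  # -> 24.4141%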
2 changes: 1 addition & 1 deletion test/suites/E2E/test_offline_inference.py
@@ -25,7 +25,7 @@ class TestBasicOfflineInference:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half
4 changes: 2 additions & 2 deletions test/suites/E2E/test_offline_inference_sparse.py
@@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half
@@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("enforce_eager", [False])
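For reference, the lowered gpu_mem values flow through the autouse setup_gpu_resource fixture, so a test opts in simply by carrying the marker. A hedged usage sketch (it assumes gpu_mem is registered in the suite's pytest configuration, which this diff does not show; the test name is hypothetical):

import pytest

@pytest.mark.gpu_mem(6000)  # fixture picks a GPU with >= 7800 MB free
def test_small_model_inference():
    # By the time the body runs, the fixture has set CUDA_VISIBLE_DEVICES
    # and E2E_TEST_GPU_MEMORY_UTILIZATION for this process.
    ...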