diff --git a/test/common/offline_inference_utils.py b/test/common/offline_inference_utils.py
index ae3687b74..f55473e00 100644
--- a/test/common/offline_inference_utils.py
+++ b/test/common/offline_inference_utils.py
@@ -222,7 +222,6 @@ def build_llm_with_uc(
         "model": model_path,
         "kv_transfer_config": ktc,
         "max_model_len": 12000,
-        "gpu_memory_utilization": 0.3,  # Reduced to prevent OOM after Phase 1
         "max_num_batched_tokens": max_num_batched_tokens,
         "block_size": 128,
         "enforce_eager": llm_kwargs.get("enforce_eager", True),
@@ -276,11 +275,17 @@ def run_offline_inference(
     """
     sampling_params = from_dict_for_serialization(sampling_params_dict)
 
+    gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))
+    logger.info(
+        "run offline inference with gpu memory utilization: %.4f",
+        gpu_memory_utilization,
+    )
+
     with build_llm_with_uc(
         model_path=model_path,
         ucm_config=ucm_config,
         enable_prefix_caching=enable_prefix_caching,
-        gpu_memory_utilization=0.3,
+        gpu_memory_utilization=gpu_memory_utilization,
         max_num_batched_tokens=max_num_batched_tokens,
         enforce_eager=enforce_eager,
     ) as llm:
diff --git a/test/conftest.py b/test/conftest.py
index 2189094e9..784a29135 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -162,6 +162,7 @@ def pytest_runtest_logreport(report):
 
 
 def get_free_gpu(required_memory_mb):
+    mem_needed_with_buffer = int(required_memory_mb * 1.3)  # add buffer to avoid OOM
     pynvml.nvmlInit()
     device_count = pynvml.nvmlDeviceGetCount()
     device_indices = list(range(device_count))
@@ -170,9 +171,12 @@ def get_free_gpu(required_memory_mb):
         handle = pynvml.nvmlDeviceGetHandleByIndex(i)
         info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         free_in_mb = info.free / 1024**2
-        if free_in_mb >= required_memory_mb:
-            return i, free_in_mb
-    return None, 0
+        if free_in_mb >= mem_needed_with_buffer:
+            utilization = (
+                required_memory_mb * (1024**2) / info.total if info.total else 0
+            )
+            return i, free_in_mb, utilization
+    return None, 0, 0
 
 
 @pytest.fixture(autouse=True)
@@ -180,9 +184,15 @@ def setup_gpu_resource(request):
     marker = request.node.get_closest_marker("gpu_mem")
     if marker:
         mem_needed = marker.args[0]
-        gpu_id, free_in_mb = get_free_gpu(mem_needed)
+        gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
         if gpu_id is not None:
-            print(f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory")
+            print(
+                f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
+            )
             os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+            if gpu_utilization:
+                os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)
         else:
-            pytest.fail(f"No GPU with {mem_needed}MB free memory available")
+            pytest.fail(
+                f"No GPU with {mem_needed}MB(+30% buffer) free memory available"
+            )
diff --git a/test/suites/E2E/test_offline_inference.py b/test/suites/E2E/test_offline_inference.py
index 345c759e5..ced06ff04 100644
--- a/test/suites/E2E/test_offline_inference.py
+++ b/test/suites/E2E/test_offline_inference.py
@@ -25,7 +25,7 @@ class TestBasicOfflineInference:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5])  # Split prompt in half
diff --git a/test/suites/E2E/test_offline_inference_sparse.py b/test/suites/E2E/test_offline_inference_sparse.py
index 49ead5d3c..fda2532cb 100644
--- a/test/suites/E2E/test_offline_inference_sparse.py
+++ b/test/suites/E2E/test_offline_inference_sparse.py
@@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5])  # Split prompt in half
@@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("enforce_eager", [False])
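For reviewers, a standalone sketch of the GPU-selection logic this patch introduces in test/conftest.py. This is a minimal illustration, not the patched code itself: pick_gpu is a hypothetical name, and the try/finally around nvmlShutdown() is extra hygiene the patch does not add. The 1.3 buffer factor and the three-tuple return contract match the diff above.

import pynvml


def pick_gpu(required_memory_mb: int):
    """Return (gpu_id, free_mb, utilization) for the first GPU with headroom."""
    # Same 30% safety buffer as get_free_gpu() in the patch: the buffer gates
    # device selection only, it is not part of the reported utilization.
    mem_needed_with_buffer = int(required_memory_mb * 1.3)
    pynvml.nvmlInit()
    try:
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)  # .free/.total in bytes
            free_in_mb = info.free / 1024**2
            if free_in_mb >= mem_needed_with_buffer:
                # Unbuffered requirement as a fraction of total device memory;
                # this is the value the test later hands to vLLM as
                # gpu_memory_utilization.
                utilization = required_memory_mb * (1024**2) / info.total
                return i, free_in_mb, utilization
        return None, 0, 0
    finally:
        pynvml.nvmlShutdown()

Because the fraction stays tied to the marker's unbuffered request, a test marked gpu_mem(6000) reserves roughly 6 GB even on a mostly idle card, instead of the fixed 0.3 of total memory it claimed before.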
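The handshake between the fixture and the inference helper is a plain environment variable. Below is a compressed sketch of both sides; the function names are illustrative (in the patch the producer is setup_gpu_resource() in test/conftest.py and the consumer is run_offline_inference() in test/common/offline_inference_utils.py), while the variable name and the 0.1 fallback come from the diff.

import os


def export_gpu_settings(gpu_id: int, gpu_utilization: float) -> None:
    # Producer side: pin the chosen device and publish the fraction.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    if gpu_utilization:  # skip 0/None so the consumer's fallback stays active
        os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)


def read_gpu_memory_utilization() -> float:
    # Consumer side: fall back to 0.1 when the fixture did not run.
    return float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))


if __name__ == "__main__":
    # A 6000 MB request on a hypothetical 24 GiB device -> ~0.2441.
    export_gpu_settings(0, 6000 * (1024**2) / (24 * 1024**3))
    print(f"{read_gpu_memory_utilization():.4f}")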