Skip to content

Commit 7ca5c20

Browse files
committed
reduce required GPU memory to 4GB (derive gpu_memory_utilization dynamically)
1 parent d63a467 commit 7ca5c20

4 files changed

Lines changed: 21 additions & 9 deletions

File tree

test/common/offline_inference_utils.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,6 @@ def build_llm_with_uc(
222222
"model": model_path,
223223
"kv_transfer_config": ktc,
224224
"max_model_len": 12000,
225-
"gpu_memory_utilization": 0.3, # Reduced to prevent OOM after Phase 1
226225
"max_num_batched_tokens": max_num_batched_tokens,
227226
"block_size": 128,
228227
"enforce_eager": llm_kwargs.get("enforce_eager", True),
@@ -276,11 +275,17 @@ def run_offline_inference(
276275
"""
277276
sampling_params = from_dict_for_serialization(sampling_params_dict)
278277

278+
gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))
279+
logger.info(
280+
"run offline inference with gpu memory utilization: %.4f",
281+
gpu_memory_utilization,
282+
)
283+
279284
with build_llm_with_uc(
280285
model_path=model_path,
281286
ucm_config=ucm_config,
282287
enable_prefix_caching=enable_prefix_caching,
283-
gpu_memory_utilization=0.3,
288+
gpu_memory_utilization=gpu_memory_utilization,
284289
max_num_batched_tokens=max_num_batched_tokens,
285290
enforce_eager=enforce_eager,
286291
) as llm:

test/conftest.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,18 +171,25 @@ def get_free_gpu(required_memory_mb):
171171
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
172172
free_in_mb = info.free / 1024**2
173173
if free_in_mb >= required_memory_mb:
174-
return i, free_in_mb
175-
return None, 0
174+
utilization = (
175+
required_memory_mb * (1024**2) / info.total if info.total else 0
176+
)
177+
return i, free_in_mb, utilization
178+
return None, 0, 0
176179

177180

178181
@pytest.fixture(autouse=True)
def setup_gpu_resource(request):
    """Reserve a GPU for tests marked with ``@pytest.mark.gpu_mem(<mb>)``.

    Reads the required memory (MB) from the marker, finds a device with that
    much free memory via ``get_free_gpu``, and exports ``CUDA_VISIBLE_DEVICES``
    plus a derived ``E2E_TEST_GPU_MEMORY_UTILIZATION`` fraction for the test to
    consume. Fails the test when no GPU qualifies.

    Yield-fixture form so the mutated environment variables are restored after
    the test instead of leaking into later tests in the same process.
    """
    marker = request.node.get_closest_marker("gpu_mem")
    if not marker:
        # Test did not request GPU memory; nothing to set up or tear down.
        yield
        return

    # Remember prior values so teardown can restore the exact original state.
    _keys = ("CUDA_VISIBLE_DEVICES", "E2E_TEST_GPU_MEMORY_UTILIZATION")
    saved = {key: os.environ.get(key) for key in _keys}
    try:
        mem_needed = marker.args[0]
        gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
        if gpu_id is not None:
            print(
                f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
            # A zero utilization means get_free_gpu could not compute a
            # fraction (unknown total memory); keep the consumer's default.
            if gpu_utilization:
                os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)
        else:
            pytest.fail(f"No GPU with {mem_needed}MB free memory available")
        yield
    finally:
        # Restore the environment exactly as it was before the test ran.
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

test/suites/E2E/test_offline_inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class TestBasicOfflineInference:
2525

2626
@pytest.mark.stage(1)
2727
@pytest.mark.feature("offline_inference")
28-
@pytest.mark.gpu_mem(30000)
28+
@pytest.mark.gpu_mem(4000)
2929
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
3030
@pytest.mark.parametrize("max_tokens", [200])
3131
@pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half

test/suites/E2E/test_offline_inference_sparse.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse:
2525

2626
@pytest.mark.stage(1)
2727
@pytest.mark.feature("offline_inference_sparse")
28-
@pytest.mark.gpu_mem(30000)
28+
@pytest.mark.gpu_mem(4000)
2929
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
3030
@pytest.mark.parametrize("max_tokens", [200])
3131
@pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half
@@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
229229

230230
@pytest.mark.stage(1)
231231
@pytest.mark.feature("offline_inference_sparse")
232-
@pytest.mark.gpu_mem(30000)
232+
@pytest.mark.gpu_mem(4000)
233233
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
234234
@pytest.mark.parametrize("max_tokens", [200])
235235
@pytest.mark.parametrize("enforce_eager", [False])

0 commit comments

Comments
 (0)