9 changes: 7 additions & 2 deletions test/common/offline_inference_utils.py
@@ -222,7 +222,6 @@ def build_llm_with_uc(
         "model": model_path,
         "kv_transfer_config": ktc,
         "max_model_len": 12000,
-        "gpu_memory_utilization": 0.3, # Reduced to prevent OOM after Phase 1
         "max_num_batched_tokens": max_num_batched_tokens,
         "block_size": 128,
         "enforce_eager": llm_kwargs.get("enforce_eager", True),
@@ -276,11 +275,17 @@ def run_offline_inference(
     """
     sampling_params = from_dict_for_serialization(sampling_params_dict)
 
+    gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1"))
+    logger.info(
+        "run offline inference with gpu memory utilization: %.4f",
+        gpu_memory_utilization,
+    )
+
     with build_llm_with_uc(
         model_path=model_path,
         ucm_config=ucm_config,
         enable_prefix_caching=enable_prefix_caching,
-        gpu_memory_utilization=0.3,
+        gpu_memory_utilization=gpu_memory_utilization,
         max_num_batched_tokens=max_num_batched_tokens,
         enforce_eager=enforce_eager,
     ) as llm:
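Note on the handoff above: run_offline_inference now reads its utilization fraction from E2E_TEST_GPU_MEMORY_UTILIZATION (exported by the conftest.py fixture below) and falls back to 0.1 when the variable is unset. A minimal standalone sketch of that round-trip follows; the helper name resolve_gpu_memory_utilization is hypothetical, only the variable name and the 0.1 default come from this diff:

import os

def resolve_gpu_memory_utilization(default: float = 0.1) -> float:
    # Prefer the fraction exported by the setup_gpu_resource fixture,
    # otherwise fall back to 10% of device memory.
    return float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", str(default)))

# E.g. the fixture exported a 6000 MB request against a 24 GiB device:
os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(6000 * 1024**2 / (24 * 1024**3))
print(resolve_gpu_memory_utilization())  # ~0.2441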
22 changes: 16 additions & 6 deletions test/conftest.py
@@ -162,6 +162,7 @@ def pytest_runtest_logreport(report):
 
 
 def get_free_gpu(required_memory_mb):
+    mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM
     pynvml.nvmlInit()
     device_count = pynvml.nvmlDeviceGetCount()
     device_indices = list(range(device_count))
@@ -170,19 +171,28 @@ def get_free_gpu(required_memory_mb):
         handle = pynvml.nvmlDeviceGetHandleByIndex(i)
         info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         free_in_mb = info.free / 1024**2
-        if free_in_mb >= required_memory_mb:
-            return i, free_in_mb
-    return None, 0
+        if free_in_mb >= mem_needed_with_buffer:
+            utilization = (
+                required_memory_mb * (1024**2) / info.total if info.total else 0
+            )
+            return i, free_in_mb, utilization
+    return None, 0, 0
 
 
 @pytest.fixture(autouse=True)
 def setup_gpu_resource(request):
     marker = request.node.get_closest_marker("gpu_mem")
     if marker:
         mem_needed = marker.args[0]
-        gpu_id, free_in_mb = get_free_gpu(mem_needed)
+        gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
         if gpu_id is not None:
-            print(f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory")
+            print(
+                f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
+            )
             os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+            if gpu_utilization:
+                os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization)
         else:
-            pytest.fail(f"No GPU with {mem_needed}MB free memory available")
+            pytest.fail(
+                f"No GPU with {mem_needed}MB(+30% buffer) free memory available"
+            )
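To make the new arithmetic concrete: gpu_mem(6000) now requires a device with int(6000 * 1.3) = 7800 MB free, and the fraction the fixture exports is the requested bytes divided by the device's total bytes. A quick self-contained check, assuming an illustrative 24 GiB card (pynvml reports the real total at runtime):

required_memory_mb = 6000
mem_needed_with_buffer = int(required_memory_mb * 1.3)  # 7800 MB must be free
assert mem_needed_with_buffer == 7800

total_bytes = 24 * 1024**3  # illustrative device size, not from the diff
utilization = required_memory_mb * (1024**2) / total_bytes
print(f"{utilization:.4%}")  # -> 24.4141%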
2 changes: 1 addition & 1 deletion test/suites/E2E/test_offline_inference.py
@@ -25,7 +25,7 @@ class TestBasicOfflineInference:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half
4 changes: 2 additions & 2 deletions test/suites/E2E/test_offline_inference_sparse.py
@@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half
@@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
-    @pytest.mark.gpu_mem(30000)
+    @pytest.mark.gpu_mem(6000)
     @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("enforce_eager", [False])
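For reference, the lowered gpu_mem values flow through the autouse setup_gpu_resource fixture, so a test opts in simply by carrying the marker. A hedged usage sketch (it assumes gpu_mem is registered in the suite's pytest configuration, which this diff does not show; the test name is hypothetical):

import pytest

@pytest.mark.gpu_mem(6000)  # fixture picks a GPU with >= 7800 MB free
def test_small_model_inference():
    # By the time the body runs, the fixture has set CUDA_VISIBLE_DEVICES
    # and E2E_TEST_GPU_MEMORY_UTILIZATION for this process.
    ...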