Skip to content

Commit 1f1f212

Browse files
committed
use docker to build & test, update sparse attention e2e test
1 parent ea57b57 commit 1f1f212

4 files changed

Lines changed: 108 additions & 48 deletions

File tree

.github/workflows/pull-request.yml

Lines changed: 66 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -76,8 +76,6 @@ jobs:
7676
test-e2e-pc-gpu:
7777
runs-on: gpu
7878
needs: lint-and-unit-tests
79-
env:
80-
BUILD_TYPE: Release
8179
permissions:
8280
checks: write
8381
pull-requests: write
@@ -90,30 +88,54 @@ jobs:
9088
rm -rf .[!.]*
9189
fi
9290
- uses: actions/checkout@v4
91+
- name: Install Docker CLI
92+
run: |
93+
if ! command -v docker &> /dev/null; then
94+
echo "Docker CLI not found, installing..."
95+
sudo apt-get update
96+
sudo apt-get install -y docker.io
97+
else
98+
echo "Docker CLI already installed"
99+
fi
100+
- name: Generate Docker Image Version
101+
id: version
102+
run: |
103+
DATE=$(date +%Y%m%d)
104+
SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7)
105+
VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}"
106+
echo "version=${VERSION}" >> $GITHUB_OUTPUT
107+
echo "Docker image version: ${VERSION}"
93108
- name: Build
94109
run: |
95110
cd ${{github.workspace}}
96-
export PLATFORM=cuda
97-
pip install -v -e . --no-build-isolation
98-
- name: Test E2E
111+
sudo docker build -t ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} -f ./docker/Dockerfile-onlyPC ./
112+
- name: Test E2E in Docker
99113
run: |
100-
cd ${{github.workspace}}
101-
cd test
102-
pip install -r requirements.txt
103-
python3 -m pytest -x --stage=1 --feature=offline_inference --junitxml=offline-inference.xml
114+
sudo docker run --rm \
115+
-itd \
116+
--gpus all \
117+
--network=host \
118+
--ipc=host \
119+
--cap-add IPC_LOCK \
120+
-v /home/models:/home/models \
121+
-v ${{github.workspace}}:/workspace \
122+
ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} \
123+
-c "cd /workspace/test && pip install -r requirements.txt && python3 -m pytest -x --stage=1 --feature=offline_inference --junitxml=offline-inference.xml"
104124
- name: Upload pytest results
105125
uses: EnricoMi/publish-unit-test-result-action/linux@v2
106126
if: (!cancelled())
107127
with:
108128
files: |
109129
${{github.workspace}}/test/offline-inference.xml
110130
check_name: Prefix cache test results
131+
- name: Cleanup Docker Image
132+
if: always()
133+
run: |
134+
sudo docker rmi ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} || true
111135
112136
test-e2e-sparse-gpu:
113137
runs-on: gpu
114138
needs: lint-and-unit-tests
115-
env:
116-
BUILD_TYPE: Release
117139
permissions:
118140
checks: write
119141
pull-requests: write
@@ -126,23 +148,48 @@ jobs:
126148
rm -rf .[!.]*
127149
fi
128150
- uses: actions/checkout@v4
151+
- name: Install Docker CLI
152+
run: |
153+
if ! command -v docker &> /dev/null; then
154+
echo "Docker CLI not found, installing..."
155+
sudo apt-get update
156+
sudo apt-get install -y docker.io
157+
else
158+
echo "Docker CLI already installed"
159+
fi
160+
- name: Generate Docker Image Version
161+
id: version
162+
run: |
163+
DATE=$(date +%Y%m%d)
164+
SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7)
165+
VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}"
166+
echo "version=${VERSION}" >> $GITHUB_OUTPUT
167+
echo "Docker image version: ${VERSION}"
129168
- name: Build
130169
run: |
131170
cd ${{github.workspace}}
132-
export PLATFORM=cuda
133-
export ENABLE_SPARSE=TRUE
134-
pip install -v -e . --no-build-isolation
135-
- name: Test E2E
171+
sudo docker build -t ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} -f ./docker/Dockerfile ./
172+
- name: Test E2E in Docker
136173
run: |
137-
cd ${{github.workspace}}
138-
cd test
139-
pip install -r requirements.txt
140-
python3 -m pytest -x --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml
174+
sudo docker run --rm \
175+
-itd \
176+
--gpus all \
177+
--network=host \
178+
--ipc=host \
179+
--cap-add IPC_LOCK \
180+
-v /home/models:/home/models \
181+
-v ${{github.workspace}}:/workspace \
182+
ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} \
183+
-c "cd /workspace/test && pip install -r requirements.txt && python3 -m pytest -x --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml"
141184
- name: Upload pytest results
142185
uses: EnricoMi/publish-unit-test-result-action/linux@v2
143186
if: (!cancelled())
144187
with:
145188
files: |
146189
${{github.workspace}}/test/offline-inference-sparse.xml
147190
check_name: Sparse attention test results
191+
- name: Cleanup Docker Image
192+
if: always()
193+
run: |
194+
sudo docker rmi ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} || true
148195

docker/Dockerfile-onlyPC

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
# Set to other image if needed
2+
FROM vllm/vllm-openai:v0.9.2
3+
4+
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
5+
6+
WORKDIR /workspace
7+
8+
# Install unified-cache-management
9+
COPY . /workspace/unified-cache-management
10+
11+
RUN pip config set global.index-url ${PIP_INDEX_URL}
12+
13+
RUN export PLATFORM="cuda" && \
14+
pip install -v -e /workspace/unified-cache-management --no-build-isolation
15+
16+
ENTRYPOINT ["/bin/bash"]

test/conftest.py

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -164,20 +164,25 @@ def pytest_runtest_logreport(report):
164164

165165

166166
def get_free_gpu(required_memory_mb):
167-
mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM
168-
pynvml.nvmlInit()
169-
device_count = pynvml.nvmlDeviceGetCount()
170-
device_indices = list(range(device_count))
171-
random.shuffle(device_indices)
172-
for i in device_indices: # random order to reduce collisions
173-
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
174-
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
175-
free_in_mb = info.free / 1024**2
176-
if free_in_mb >= mem_needed_with_buffer:
177-
utilization = (
178-
required_memory_mb * (1024**2) / info.total if info.total else 0
179-
)
180-
return i, free_in_mb, utilization
167+
try:
168+
mem_needed_with_buffer = int(
169+
required_memory_mb * 1.3
170+
) # add buffer to avoid OOM
171+
pynvml.nvmlInit()
172+
device_count = pynvml.nvmlDeviceGetCount()
173+
device_indices = list(range(device_count))
174+
random.shuffle(device_indices)
175+
for i in device_indices: # random order to reduce collisions
176+
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
177+
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
178+
free_in_mb = info.free / 1024**2
179+
if free_in_mb >= mem_needed_with_buffer:
180+
utilization = (
181+
required_memory_mb * (1024**2) / info.total if info.total else 0
182+
)
183+
return i, free_in_mb, utilization
184+
finally:
185+
pynvml.nvmlShutdown()
181186
return None, 0, 0
182187

183188

@@ -189,7 +194,7 @@ def setup_gpu_resource(request):
189194
gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
190195
if gpu_id is not None:
191196
print(
192-
f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
197+
f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization for test {gpu_utilization:.4%}"
193198
)
194199
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
195200
if gpu_utilization:

test/suites/E2E/test_offline_inference_sparse.py

Lines changed: 6 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -215,16 +215,16 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
215215
print(f"Standard answers:\n{standard_answers}")
216216
pytest.fail("HBM + SSD Mixed Accuracy Test Failed!")
217217

218-
"""Test ESA sparse attention."""
218+
"""Test GSA sparse attention."""
219219

220220
@pytest.mark.stage(1)
221221
@pytest.mark.feature("offline_inference_sparse")
222222
@pytest.mark.gpu_mem(6000)
223223
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
224224
@pytest.mark.parametrize("max_tokens", [200])
225-
@pytest.mark.parametrize("enforce_eager", [False])
226-
@pytest.mark.parametrize("max_num_batched_tokens", [2047])
227-
def test_offline_esa(
225+
@pytest.mark.parametrize("enforce_eager", [True])
226+
@pytest.mark.parametrize("max_num_batched_tokens", [30000])
227+
def test_offline_gsa(
228228
self,
229229
model_name: str,
230230
max_tokens: int,
@@ -286,15 +286,7 @@ def test_offline_esa(
286286
},
287287
}
288288
],
289-
"ucm_sparse_config": {
290-
"ESA": {
291-
"init_window_sz": 1,
292-
"local_window_sz": 2,
293-
"min_blocks": 4,
294-
"sparse_ratio": 0.3,
295-
"retrieval_stride": 5,
296-
}
297-
},
289+
"ucm_sparse_config": {"GSAOnDevice": {}},
298290
}
299291

300292
sampling_params = SamplingParams(
@@ -315,7 +307,7 @@ def test_offline_esa(
315307
sampling_params_dict,
316308
False, # enable_prefix_caching=False
317309
enforce_eager,
318-
"ESA",
310+
"GSA",
319311
max_num_batched_tokens,
320312
timeout=180,
321313
)

0 commit comments

Comments (0)