From 8b5f58674e82fefbc86d13b6580c8453a1131698 Mon Sep 17 00:00:00 2001 From: Yanzhao Wang Date: Wed, 21 Jan 2026 15:11:54 +0800 Subject: [PATCH] add workflow runs on pr, do build/unittest/install/e2e inference add workflow check for push, do linter/unittest only allow ucm team member to modify the workflow content add test utils for offline inference --- .github/actionlint.yaml | 3 +- .github/workflows/cpp-linter.yml | 34 -- .github/workflows/e2e_test.yml | 30 -- .github/workflows/lint-and-test.yml | 78 ++++ .github/workflows/pre-commit.yml | 31 -- .github/workflows/pull-request.yml | 129 ++++++ .github/workflows/push-check.yml | 9 + .github/workflows/ucmstore.yml | 46 --- .github/workflows/unifiedcache_test.yml | 22 -- .gitignore | 1 - test/common/offline_inference_utils.py | 370 ++++++++++++++++++ test/common/path_utils.py | 44 +++ test/config.yaml | 2 +- test/conftest.py | 30 ++ test/pytest.ini | 1 + .../E2E/prompts/test_offline_inference.json | 1 + test/suites/E2E/test_offline_inference.py | 237 +++++++++++ .../E2E/test_offline_inference_sparse.py | 336 ++++++++++++++++ 18 files changed, 1238 insertions(+), 166 deletions(-) delete mode 100644 .github/workflows/cpp-linter.yml delete mode 100644 .github/workflows/e2e_test.yml create mode 100644 .github/workflows/lint-and-test.yml delete mode 100644 .github/workflows/pre-commit.yml create mode 100644 .github/workflows/pull-request.yml create mode 100644 .github/workflows/push-check.yml delete mode 100644 .github/workflows/ucmstore.yml delete mode 100644 .github/workflows/unifiedcache_test.yml create mode 100644 test/common/offline_inference_utils.py create mode 100644 test/common/path_utils.py create mode 100644 test/suites/E2E/prompts/test_offline_inference.json create mode 100644 test/suites/E2E/test_offline_inference.py create mode 100644 test/suites/E2E/test_offline_inference_sparse.py diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index b4e02e8aa..b28824d1e 100644 --- a/.github/actionlint.yaml 
+++ b/.github/actionlint.yaml @@ -2,4 +2,5 @@ self-hosted-runner: # Labels of self-hosted runner in array of strings. labels: - default - - arc-runner-ucm \ No newline at end of file + - gpu + - npu \ No newline at end of file diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml deleted file mode 100644 index 7e9525b6c..000000000 --- a/.github/workflows/cpp-linter.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: cpp-linter - -on: - push: - branches: [ "*" ] - pull_request: - branches: [ "dev*", "main", "*release", "feature*" ] - - -jobs: - cpp-linter: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - with: - persist-credentials: false - - uses: cpp-linter/cpp-linter-action@main - id: linter - continue-on-error: true - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - style: file - tidy-checks: '-*' - files-changed-only: true - lines-changed-only: diff - format-review: true - version: 20 - - - name: Fail fast?! - if: steps.linter.outputs.checks-failed != 0 - run: | - echo "some linter checks failed. 
${{ steps.linter.outputs.checks-failed }}" - exit 1 diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml deleted file mode 100644 index 434b1a1f9..000000000 --- a/.github/workflows/e2e_test.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: offline_inference_test -on: - workflow_dispatch: - -jobs: - offline-inference: - runs-on: arc-runner-ucm - steps: - - uses: actions/checkout@v4 - - run: nvidia-smi - - name: Run offline_inference in container - run: | - docker run --rm \ - --gpus all \ - -v ${{ github.workspace }}:/workspace/unified-cache-management \ - -v /home_116/models/Qwen2.5-1.5B-Instruct:/home/models/Qwen2.5-1.5B-Instruct \ - -w /workspace/unified-cache-management \ - --entrypoint /bin/bash \ - vllm/vllm-openai:v0.9.2 \ - -c " - set -euo pipefail - export PLATFORM=cuda - export MODEL_PATH=/home/models/Qwen2.5-1.5B-Instruct - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - pip install -v -e . --no-build-isolation - cd \$(pip show vllm | grep Location | awk '{print \$2}') && - git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch - cd /workspace/unified-cache-management - python3 examples/offline_inference.py - " \ No newline at end of file diff --git a/.github/workflows/lint-and-test.yml b/.github/workflows/lint-and-test.yml new file mode 100644 index 000000000..d1aa7cf95 --- /dev/null +++ b/.github/workflows/lint-and-test.yml @@ -0,0 +1,78 @@ +name: 'Lint and Unit Tests' + +on: + workflow_call: + +jobs: + cpp-linter: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - uses: cpp-linter/cpp-linter-action@main + id: linter + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + style: file + tidy-checks: '-*' + files-changed-only: true + lines-changed-only: diff + format-review: false + version: 20 + + - name: Fail fast?! 
+ if: steps.linter.outputs.checks-failed != 0 + run: | + echo "some linter checks failed. ${{ steps.linter.outputs.checks-failed }}" + exit 1 + + py-linter: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Add matchers for better error display + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + echo "::add-matcher::.github/workflows/matchers/mypy.json" + + - name: Run pre-commit checks on all files + uses: pre-commit/action@v3.0.1 + env: + SHELLCHECK_OPTS: "--exclude=SC2046,SC2006,SC2086" + with: + extra_args: --all-files --hook-stage manual + + cpp_gtest: + runs-on: ubuntu-latest + env: + BUILD_TYPE: Debug + steps: + - uses: actions/checkout@v4 + + - name: Install googletest + run: | + git clone https://github.com/google/googletest.git --depth=1 --branch=v1.17.0 + cd googletest + mkdir build && cd build + cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True .. 
+ sudo make install -j + + - name: Configure CMake + run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBUILD_UCM_SPARSE=OFF -DBUILD_UNIT_TESTS=ON -DRUNTIME_ENVIRONMENT=simu + + - name: Build + run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} -j + + - name: Test + working-directory: ${{github.workspace}}/build + run: ctest -C ${{env.BUILD_TYPE}} --output-on-failure diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml deleted file mode 100644 index 9516b8b5a..000000000 --- a/.github/workflows/pre-commit.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: pre-commit - -on: - workflow_call: - -permissions: - contents: read - -jobs: - pre-commit: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - - name: Add matchers for better error display - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - echo "::add-matcher::.github/workflows/matchers/mypy.json" - - - name: Run pre-commit checks on all files - uses: pre-commit/action@v3.0.1 - env: - SHELLCHECK_OPTS: "--exclude=SC2046,SC2006,SC2086" # Exclude SC2046, SC2006, SC2086 for actionlint - with: - extra_args: --all-files --hook-stage manual diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml new file mode 100644 index 000000000..f1b395765 --- /dev/null +++ b/.github/workflows/pull-request.yml @@ -0,0 +1,129 @@ +name: 'Pull Request Gate' + +on: + pull_request: + branches: [ "dev*", "main", "*release", "feature*" ] + +jobs: + # protect the workflows dir, only allow specific users to modify + protect-workflows-dir: + runs-on: ubuntu-latest + outputs: + allowed: ${{ steps.check.outputs.allowed }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - id: check + env: + ACTOR: ${{ github.actor }} + run: | + # get the target branch contents + git 
fetch origin ${{ github.base_ref }} + + CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD) + echo "CHANGED_FILES=$CHANGED_FILES" + + if ! echo "$CHANGED_FILES" | grep -q '^\.github/workflows/'; then + echo "No .github/workflows changes, allow." + echo "allowed=true" >> $GITHUB_OUTPUT + exit 0 + fi + + ALLOWED_USERS=("dante159753" "mag1c-h") + + echo ".github changes detected, check if user is allowed..." + ACTOR="${{ github.actor }}" + echo "PR author: $ACTOR" + + for u in "${ALLOWED_USERS[@]}"; do + if [[ "$ACTOR" == "$u" ]]; then + echo "Authorized user, allowing change." + echo "allowed=true" >> $GITHUB_OUTPUT + exit 0 + fi + done + + echo "ERROR: Only privileged users may modify .github/workflows/" + echo "allowed=false" >> $GITHUB_OUTPUT + exit 1 + + lint-and-unit-tests: + needs: protect-workflows-dir + if: needs.protect-workflows-dir.outputs.allowed == 'true' + uses: ./.github/workflows/lint-and-test.yml + + test-e2e-pc-gpu: + runs-on: gpu + needs: lint-and-unit-tests + env: + BUILD_TYPE: Release + permissions: + checks: write + pull-requests: write + steps: + - name: Clean repo + run: | + if [ -d "${{github.workspace}}" ]; then + cd ${{github.workspace}} + rm -rf ./* + rm -rf .[!.]* + fi + - uses: actions/checkout@v4 + - name: Build + run: | + cd ${{github.workspace}} + export PLATFORM=cuda + pip install -v -e . 
--no-build-isolation + - name: Test E2E + run: | + cd ${{github.workspace}} + cd test + pip install pytest pytest-cov pynvml pandas + python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml + - name: Upload pytest results + uses: EnricoMi/publish-unit-test-result-action/linux@v2 + if: (!cancelled()) + with: + files: | + ${{github.workspace}}/test/offline-inference.xml + check_name: Prefix cache test results + + test-e2e-sparse-gpu: + runs-on: gpu + needs: lint-and-unit-tests + env: + BUILD_TYPE: Release + permissions: + checks: write + pull-requests: write + steps: + - name: Clean repo + run: | + if [ -d "${{github.workspace}}" ]; then + cd ${{github.workspace}} + rm -rf ./* + rm -rf .[!.]* + fi + - uses: actions/checkout@v4 + - name: Build + run: | + cd ${{github.workspace}} + export PLATFORM=cuda + export ENABLE_SPARSE=TRUE + pip install -v -e . --no-build-isolation + - name: Test E2E + run: | + cd ${{github.workspace}} + cd test + pip install pytest pytest-cov pynvml pandas + python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml + - name: Upload pytest results + uses: EnricoMi/publish-unit-test-result-action/linux@v2 + if: (!cancelled()) + with: + files: | + ${{github.workspace}}/test/offline-inference-sparse.xml + check_name: Sparse attention test results + diff --git a/.github/workflows/push-check.yml b/.github/workflows/push-check.yml new file mode 100644 index 000000000..e39cbc3db --- /dev/null +++ b/.github/workflows/push-check.yml @@ -0,0 +1,9 @@ +name: 'Push Commit Checks' + +on: + push: + branches: [ "**" ] # ** matches all branches, while * matches only top-level branches without '/' + +jobs: + lint-and-unit-tests: + uses: ./.github/workflows/lint-and-test.yml \ No newline at end of file diff --git a/.github/workflows/ucmstore.yml b/.github/workflows/ucmstore.yml deleted file mode 100644 index 5d6ce9b47..000000000 --- a/.github/workflows/ucmstore.yml +++ /dev/null @@ -1,46 
+0,0 @@ -# This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage. -# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml -name: ucmstore - -on: - push: - branches: [ "*" ] - pull_request: - branches: [ "dev*", "main", "*release", "feature*" ] - -env: - # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) - BUILD_TYPE: Debug - -jobs: - cc_gtest: - # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. - # You can convert this to a matrix build if you need cross-platform coverage. - # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Install googletest - run: | - git clone https://github.com/google/googletest.git --depth=1 --branch=v1.17.0 - cd googletest - mkdir build && cd build - cmake -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_C_FLAGS="-fPIC" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=True .. - sudo make install -j - - - name: Configure CMake - # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. - # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type - run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBUILD_UCM_SPARSE=OFF -DBUILD_UNIT_TESTS=ON -DRUNTIME_ENVIRONMENT=simu - - - name: Build - # Build your program with the given configuration - run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} -j - - - name: Test - working-directory: ${{github.workspace}}/build - # Execute tests defined by the CMake configuration. 
- # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail - run: ctest -C ${{env.BUILD_TYPE}} --output-on-failure diff --git a/.github/workflows/unifiedcache_test.yml b/.github/workflows/unifiedcache_test.yml deleted file mode 100644 index 9fa8d1a84..000000000 --- a/.github/workflows/unifiedcache_test.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: 'ucm-lint-and-unittest' - -on: - push: - branches: - - 'main' - - 'dev*' - - '*release' - - 'feature*' - pull_request: - branches: - - 'main' - - 'dev*' - - '*release' - - 'feature*' - -jobs: - # gpu-test: - # uses: ./.github/workflows/e2e_test.yml - - call-lint: - uses: ./.github/workflows/pre-commit.yml diff --git a/.gitignore b/.gitignore index 734cf4bf1..f90ad2895 100644 --- a/.gitignore +++ b/.gitignore @@ -53,7 +53,6 @@ reports/ dataset/ logs/ -.* *.log result_outputs/ results/ diff --git a/test/common/offline_inference_utils.py b/test/common/offline_inference_utils.py new file mode 100644 index 000000000..ae3687b74 --- /dev/null +++ b/test/common/offline_inference_utils.py @@ -0,0 +1,370 @@ +""" +MULTIPROCESS FRAMEWORK: +====================== +This module provides a `run_in_spawn_subprocess` function to simplify running functions in +subprocess while handling GPU memory cleanup automatically. + +NOTE: Each offline inference test case should run with multiprocessing spawn mode to ensure GPU memory +is fully released after each test. This prevents memory accumulation across test cases. + +USAGE EXAMPLE: + # Define your test function that contains the core test logic + def my_test_logic(model_path, config, params): + # Your test logic here - no need to handle multiprocessing or GPU cleanup + with build_llm_with_uc(model_path, config) as llm: + results = llm.generate(...) 
import multiprocessing
from typing import Any, Dict

try:  # Use the project logger when running inside the UCM tree.
    from ucm.logger import init_logger

    logger = init_logger(__name__)
except ImportError:  # pragma: no cover - stdlib fallback for isolated use
    import logging

    logger = logging.getLogger(__name__)


def _run_subprocess_wrapper(func, args, kwargs, result_queue, error_queue):
    """Target executed inside the spawned worker process.

    Must be defined at module level (not as a closure) so that spawn mode
    can pickle it by reference.  Ships either ``func``'s return value or a
    ``RuntimeError`` carrying the formatted traceback back to the parent.
    """
    try:
        result_queue.put(func(*args, **kwargs))
    except Exception as exc:
        import traceback

        error_queue.put(
            RuntimeError(f"{type(exc).__name__}: {str(exc)}\n{traceback.format_exc()}")
        )


def run_in_spawn_subprocess(func, *args, timeout: int = 180, **kwargs):
    """Run ``func(*args, **kwargs)`` in a fresh spawn-mode subprocess.

    Spawn mode guarantees the child starts with a clean CUDA/NPU context,
    so accelerator memory is fully released when the child exits.

    Args:
        func: Module-level (picklable) function to run in the subprocess.
        *args: Positional arguments forwarded to ``func``.
        timeout: Seconds to wait for the child (keyword-only, default 180).
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returned in the child.

    Raises:
        RuntimeError: If the child times out or exits non-zero.
        Exception: Re-raised (as RuntimeError with traceback text) if
            ``func`` raised in the child.
    """
    ctx = multiprocessing.get_context("spawn")
    result_queue = ctx.Queue()
    error_queue = ctx.Queue()

    process = ctx.Process(
        target=_run_subprocess_wrapper,
        args=(func, args, kwargs, result_queue, error_queue),
    )
    process.start()
    process.join(timeout=timeout)

    if process.is_alive():
        # BUG FIX: escalate SIGTERM -> SIGKILL.  A worker stuck in native
        # code (e.g. a hung CUDA call) can ignore terminate(), and the old
        # unbounded join() after terminate() would hang the runner forever.
        process.terminate()
        process.join(timeout=10)
        if process.is_alive():
            process.kill()
            process.join()
        raise RuntimeError(f"Subprocess timed out after {timeout} seconds")

    if not error_queue.empty():
        raise error_queue.get()

    if not result_queue.empty():
        return result_queue.get()

    if process.exitcode != 0:
        raise RuntimeError(f"Subprocess failed with exit code {process.exitcode}")


def to_dict_for_serialization(obj: Any) -> Dict[str, Any]:
    """Convert an object to a plain dict for subprocess serialization.

    Supports dataclasses and regular objects exposing ``__dict__``
    (e.g. vLLM ``SamplingParams``).

    Returns:
        Dict with ``_type`` (dotted class path) and ``_data`` (attributes)
        suitable for :func:`from_dict_for_serialization`.

    Raises:
        ValueError: If the object exposes neither dataclass fields nor
            a ``__dict__``.
    """
    from dataclasses import asdict, is_dataclass

    try:
        if is_dataclass(obj) and not isinstance(obj, type):
            data = asdict(obj)
        elif hasattr(obj, "__dict__"):
            data = obj.__dict__.copy()
        else:
            raise ValueError(f"Cannot serialize object of type {type(obj)}")

        return {
            "_type": f"{obj.__class__.__module__}.{obj.__class__.__name__}",
            "_data": data,
        }
    except Exception as e:
        logger.warning(f"Serialization failed for {type(obj)}: {e}")
        raise


def from_dict_for_serialization(serialized: Dict[str, Any]) -> Any:
    """Recreate an object from a dict made by :func:`to_dict_for_serialization`.

    Dicts without a ``_type`` key are returned unchanged, so plain dicts
    pass through transparently.

    Raises:
        Exception: Propagated if the class cannot be imported or
            re-instantiated from ``_data``.
    """
    import importlib

    if "_type" not in serialized:
        return serialized

    type_str = serialized["_type"]
    obj_data = serialized.get("_data", {})

    try:
        module_name, class_name = type_str.rsplit(".", 1)
        cls = getattr(importlib.import_module(module_name), class_name)
        return cls(**obj_data)
    except Exception as e:
        logger.warning(f"Deserialization failed for {type_str}: {e}")
        raise
import gc
import os
import shutil

import torch


def ensure_storage_dir(storage_path: str, clear_existing: bool = False):
    """Create ``storage_path`` if needed; optionally empty it.

    Args:
        storage_path: Directory used as UCM KV-cache storage.
        clear_existing: When True, delete every entry inside the directory
            (the directory itself is kept).
    """
    os.makedirs(storage_path, exist_ok=True)
    if not clear_existing:
        return
    with os.scandir(storage_path) as entries:
        for entry in entries:
            # BUG FIX: handle symlinks first.  shutil.rmtree() raises on a
            # symlink to a directory, and following the link could delete
            # data outside the storage dir; unlink the link itself instead.
            if entry.is_symlink() or entry.is_file(follow_symlinks=False):
                os.remove(entry.path)
            elif entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)


def cleanup_gpu_memory():
    """Release cached accelerator memory (CUDA or NPU) and run the GC.

    Safe to call on CPU-only hosts: both accelerator branches are skipped
    and only ``gc.collect()`` runs.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    elif hasattr(torch, "npu") and torch.npu.is_available():
        torch.npu.empty_cache()
        torch.npu.synchronize()
    gc.collect()
"enforce_eager": llm_kwargs.get("enforce_eager", True), + "trust_remote_code": True, + "enable_prefix_caching": enable_prefix_caching, + "tensor_parallel_size": tensor_parallel_size, + } + default_args.update(llm_kwargs) + + cleanup_gpu_memory() + time.sleep(1) # Ensure memory is released before building LLM + + llm_args = EngineArgs(**default_args) + llm = LLM(**asdict(llm_args)) + + try: + yield llm + finally: + logger.info("LLM engine is exiting") + del llm + cleanup_dist_env_and_memory(shutdown_ray=False) + + +def run_offline_inference( + model_path: str, + ucm_config: Dict[str, Any], + prompts: List[str], + sampling_params_dict: Dict[str, Any], + enable_prefix_caching: bool, + enforce_eager: bool, + phase_description: str, + max_num_batched_tokens: int, +) -> List[str]: + """Run a phase in the subprocess. + + This function should be called via MultiprocessSpawner.run_in_subprocess(). + It handles the actual test logic without subprocess management. + + Args: + model_path: Path to the model + ucm_config: UCM configuration + prompts: List of prompts to send + sampling_params_dict: Sampling parameters as dict (for serialization) + enable_prefix_caching: Whether to enable HBM prefix caching + enforce_eager: Whether to enforce eager mode + phase_description: Description string for logging + max_num_batched_tokens: Max number of batched tokens + + Returns: + List of generated outputs + """ + sampling_params = from_dict_for_serialization(sampling_params_dict) + + with build_llm_with_uc( + model_path=model_path, + ucm_config=ucm_config, + enable_prefix_caching=enable_prefix_caching, + gpu_memory_utilization=0.3, + max_num_batched_tokens=max_num_batched_tokens, + enforce_eager=enforce_eager, + ) as llm: + outputs = llm.generate(prompts, sampling_params) + + generated_texts = [output.outputs[0].text for output in outputs] + + if phase_description: + logger.info(f"{phase_description} completed") + + return generated_texts + + +def split_prompt_by_tokens( + prompt: str, 
import json
from pathlib import Path
from typing import List, Optional, Tuple


def split_prompt_by_tokens(
    prompt: str, tokenizer: "AutoTokenizer", split_ratio: float = 0.5
) -> Tuple[str, str]:
    """Split ``prompt`` into two strings at a token-count boundary.

    Args:
        prompt: Text to split.
        tokenizer: Any object with ``encode``/``decode`` (e.g. an HF
            ``AutoTokenizer``).
        split_ratio: Fraction of tokens in the first part (default 0.5).

    Returns:
        (first_part, second_part); decoding keeps special tokens so the
        concatenation stays token-equivalent to the original prompt.
    """
    token_ids = tokenizer.encode(prompt)
    cut = int(len(token_ids) * split_ratio)

    first_part = tokenizer.decode(token_ids[:cut], skip_special_tokens=False)
    second_part = tokenizer.decode(token_ids[cut:], skip_special_tokens=False)

    return first_part, second_part


def load_prompt_from_file(prompt_file: Optional[Path] = None) -> Tuple[str, List[str]]:
    """Load a prompt and reference answers from a LongBench-format JSON file.

    LongBench records look like::

        {
            "input": "task input / question",
            "context": "long context / document",
            "answers": ["reference answers"],
            "length": ..., "dataset": ..., "language": ...
        }

    The combined prompt is ``context + "\\n\\n" + input`` (context first,
    question last). A top-level JSON list is accepted; its first record is
    used.

    Args:
        prompt_file: Path (or path string) to the prompt JSON file.

    Returns:
        Tuple of (combined_prompt_string, answers_list).

    Raises:
        ValueError: If ``prompt_file`` is None, the JSON is invalid/empty,
            or the record has neither ``input`` nor ``context``.
        FileNotFoundError: If the file does not exist.
    """
    # BUG FIX: the old code advertised "If None, uses default path" but
    # dereferenced prompt_file unconditionally, crashing with
    # AttributeError on the documented default. Fail explicitly instead,
    # and accept plain strings for convenience.
    if prompt_file is None:
        raise ValueError("prompt_file must be provided (no default path is configured)")
    prompt_file = Path(prompt_file)
    if not prompt_file.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_file}")

    content = prompt_file.read_text(encoding="utf-8").strip()

    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format in {prompt_file}: {e}") from e

    if isinstance(data, list):
        if len(data) == 0:
            raise ValueError(f"Empty list in {prompt_file}")
        data = data[0]

    input_text = data.get("input", "")
    context_text = data.get("context", "")

    # LongBench standard format: context (long document) + input (question)
    if context_text and input_text:
        full_prompt = f"{context_text}\n\n{input_text}"
    elif context_text:
        full_prompt = context_text
    elif input_text:
        full_prompt = input_text
    else:
        raise ValueError(f"No input or context found in {prompt_file}")

    answers = data.get("answers", [])
    if not isinstance(answers, list):
        answers = [answers] if answers else []

    return full_prompt, answers
+ + Returns: + Path: The absolute path to the /test directory + + Example: + >>> from common.path_utils import get_test_root + >>> config_file = get_test_root() / "config.yaml" + >>> prompt_file = get_test_root() / "suites" / "E2E" / "prompts" / "test.json" + """ + # Get the directory where this module is located (common/) + # Then go up one level to reach /test + return Path(__file__).resolve().parent.parent + + +def get_path_relative_to_test_root(subdir_path: str | Path) -> Path: + """Get a path relative to the /test directory. + + Args: + subdir_path: Relative path from test root (can be string or Path) + + Returns: + Path: The absolute path to the requested subdirectory/file + + Example: + >>> from common.path_utils import get_test_subdir + >>> config_file = get_test_subdir("config.yaml") + >>> prompt_file = get_test_subdir("suites/E2E/prompts/test.json") + """ + return get_test_root() / subdir_path + + +def get_path_to_model(model_name: str, config) -> str: + return os.path.join("/home/models/", model_name) diff --git a/test/config.yaml b/test/config.yaml index 2f90eba6b..d2f00de1f 100644 --- a/test/config.yaml +++ b/test/config.yaml @@ -9,7 +9,7 @@ reports: database: backup: "results/" - enabled: true + enabled: false host: "127.0.0.1" port: 5432 name: "ucm_test" diff --git a/test/conftest.py b/test/conftest.py index a1b8af404..2189094e9 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,10 +1,13 @@ from __future__ import annotations import datetime as dt +import os import platform as pf +import random import sys from pathlib import Path +import pynvml import pytest from common.config_utils import config_utils as config_instance from common.db_utils import database_connection, write_to_db @@ -156,3 +159,30 @@ def pytest_runtest_logreport(report): "error": str(report.longrepr) if report.failed else None, } write_to_db("test_case_info", test_result) + + +def get_free_gpu(required_memory_mb): + pynvml.nvmlInit() + device_count = 
def get_free_gpu(required_memory_mb):
    """Pick a GPU with at least ``required_memory_mb`` MB of free memory.

    Devices are probed in random order to reduce collisions when several
    test workers grab GPUs concurrently.

    Args:
        required_memory_mb: Minimum free memory needed, in MB.

    Returns:
        Tuple ``(index, free_mb)`` of the first suitable device, or
        ``(None, 0)`` when no device qualifies.
    """
    pynvml.nvmlInit()
    try:
        device_indices = list(range(pynvml.nvmlDeviceGetCount()))
        random.shuffle(device_indices)  # random order to reduce collisions
        for idx in device_indices:
            handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            free_in_mb = info.free / 1024**2
            if free_in_mb >= required_memory_mb:
                return idx, free_in_mb
        return None, 0
    finally:
        # BUG FIX: the NVML session was never shut down, leaking the
        # library handle on every probing call.
        pynvml.nvmlShutdown()


@pytest.fixture(autouse=True)
def setup_gpu_resource(request):
    """Pin the test to a GPU with enough free memory (``gpu_mem`` marker).

    Tests marked ``@pytest.mark.gpu_mem(N)`` get ``CUDA_VISIBLE_DEVICES``
    set to a device with at least N MB free; unmarked tests are untouched.

    BUG FIX: the old version mutated ``CUDA_VISIBLE_DEVICES`` permanently,
    leaking one test's device pinning into every later test; the variable
    is now restored after the test finishes.
    """
    marker = request.node.get_closest_marker("gpu_mem")
    if not marker:
        yield
        return

    mem_needed = marker.args[0]
    gpu_id, free_in_mb = get_free_gpu(mem_needed)
    if gpu_id is None:
        pytest.fail(f"No GPU with {mem_needed}MB free memory available")

    print(f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory")
    previous = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    try:
        yield
    finally:
        if previous is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = previous
"全国美国文学研究会\n受秘书处委托,由我向美文会会员单位的各位代表简单汇报一下全国美国文学研究会自上届(第十七届)年会召开以来所做的工作。美文会秘书处刚刚完成了教育部社团办“关于在教育部主管社会组织中开展调研工作的通知”中要求提交的“全国美国文学研究会调研报告”(2016年11月),主要内容4项:我想就把我们提交的“调研报告”中1、2两部分中的部分内容,作为我向大会汇报的“全国美国文学研究会2014-2016年工作总结”的内容。\n1、研究会现状和基本情况\n美文会现有会员单位127个(不招收个人会员),包括国内主要985与211高校,以及中国社科院等科研单位和知名出版社。会长单位是南京大学,副会长单位是南京大学,中国社会科学院,北京外国语大学,北京大学,复旦大学,山东大学,秘书处设在南京大学外国语学院,秘书长、副秘书长是南京大学赵文书、何宁。美文会正式成立于1979年7月,是我国改革开放后成立最早的高校外国文学研究机构。1992年8月18日在民政部正式注册登记,获颁《中华人民共和国社会团体登记证》。美文会挂靠南京大学,财务由南京大学财务处负责,接受南京大学审计处审计,按民政部要求,每年参加年检,年检结果均为“通过”。\n美文会秘书处聘有专职秘书,工作人员7人,包括会长,副会长,秘书长,副秘书长,常务理事等。美文会设有党小组,隶属外国语学院英语系党支部,由会长任党小组长,成员包括副会长、常务理事、副秘书长、以及参加秘书处工作的青年教师与博士生。美文会发行《全国美国文学研究会通讯》(CASAL Newsletter),现已刊出33期。美文会每年轮流召开年会和专题研讨会,迄今已经举办17届年会和11届专题研讨会。\n2、近两年主要工作和取得的成效\n1)上届年会。美文会第十七届年会于2014年10月24日至26日在中国人民大学苏州校区举行,由中国人民大学外国语学院承办。来自全国28个省市自治区175所高等院校、研究所、出版社的348名正式代表参加了该次年会。年会的主题是“全球化语境中的美国文学研究:理论与实践”,收到论文全文164篇,摘要273篇。会议期间,全国美国文学研究会第七届理事会召开第三次会议。会议讨论通过了增补美文会副会长、常务理事、理事、会员单位事宜。会议再次明确,两期不缴纳会费的单位视为自动退出。理事和常务理事连续两次无故不参加理事会会议自动取消理事和常务理事。\n2、上届专题研讨会。美文会第十一届专题研讨会于2015年10月23至25日在徐州江苏师范大学举行,由江苏师大外国语学院承办。专题研讨会的主题是“美国文学中的城市”。来自全国21个省市自治区125所高等院校、研究所、出版社的88名正式代表参加了本次研讨会。收到论文全文54篇,论文摘要73篇。会议期间,美文会召开第七届理事会第三次会议,讨论通过了美文会秘书处的提议,增补何宁为理事兼任副秘书长,提请本届年会的理事会确认。\n3、业务活动。1)继续举办“全国美国文学研究会学术成果奖”评选。美文会设立此奖项,是为了促进我国美国文学研究的繁荣与发展,每5年评选一次,迄今已经评选3次。第三届成果奖评选出“优秀专著奖”14项,在2015年10月公示,“优秀论文奖”9项,“优秀教材奖”3项,“优秀译作奖”1项。该活动不收取任何费用。2)第十七届美国戏剧研究年会。由南京师范大学外国语学院承办,2015年7月21-22日在南京举办,主题是“20-21世纪之交美国戏剧研究”。\n4、年检情况。美文会接受业务主管单位教育部的业务指导和社团管理机关民政部社团的监督管理,执行《民间非营利组织会计制度》,接受南京大学财务处、审计处的管理和督查,接受“江苏兴瑞会计师事务所有限公司”财务审计,结果报教育部、民政部。2015年3月进行年检,编制2015年度美文会工作报告书。8月民政部“中国社会组织网”公布年检结果:“合格”。\n2016年11月25日晚,全国美国文学研究会召开理事会,讨论了如下事项。\n会议申办:\n1. 19届年会(2018,浙江大学)和第12届专题研讨会(2017,河海大学),主题尚没有最终确定。\n2. 哈尔滨工业大学申办第20届年会(2020)\n新申请理事单位:\n1. 中国矿业大学,推荐王丽明副教授担任理事\n2. 哈尔滨工业大学,推荐刘克东院长任理事\n3. 南京大学,推荐何宁(美文会副秘书长)任理事\n会员单位变更:\n1. 解放军国际关系学院按照相关要求退出全国美国文学研究会,因此不再常务理事单位,方成教授退出常务理事\n2. 对外经贸大学英语学院英美文学研究所长金冰接替孙建秋担任理事\n3. 中央民族大学外国语学院朱小琳接替郭英剑担任理事\n4. 厦门理工大学张跃军担任理事(原为中南大学常务理事)\n5. 上海外语教育出版社孙静接替汪义群担任理事\n6. 
黑龙江大学推荐徐文培为常务理事\n美国族裔文学研究:空间拓展与界域重绘\n全国美国文学研究会第十八届年会(2016)纪要\n2016年11月25日至28日,全国美国文学研究会第十八届年会在厦门大学举行。本届年会由厦门大学外文学院承办,来自全国各地180余所高等院校、研究所、出版社的296位正式代表参加了会议。大会组委会共收到论文全文127篇、摘要371篇,与会代表围绕大会主题“美国族裔文学研究:空间拓展与界域重绘”(Ethnic Studies in US: Extending Interspace and Redefining Typology)展开了广泛而深入的研讨。\n11月25日晚,全国美国文学研究会召开常务理事和理事会,共有23位常务理事和理事出席,理事会主要讨论通过了以下议题:\n1. 确认2017年的专题研讨会由河海大学承办,河海大学外国语学院院长蔡斌教授在闭幕式作简单介绍。\n2. 确认2018年第十九届年会由浙江大学承办。\n3. 因政策变化,解放军国际关系学院退出美文会,方成教授不再担任常务理事。\n4. 增补黑龙江大学徐文培教授为常务理事。\n5. 增补哈尔滨工业大学(刘克东教授)、北京航空航天大学(田俊武教授)、中国矿业大学(王丽明副教授)、北京联合大学(黄宗英教授)为理事单位,山东大学李保杰教授、对外经济贸易大学金冰教授(接替孙建秋教授)、中央民族大学朱小琳教授(接替郭英剑教授)、上海外语教育出版社孙静(接替汪义群)为理事。\n6. 重新明确会员单位申请原则。美文会实行单位会议制,欢迎尚未加入协会的单位申请加入。申请方法和申请表格可以从美文会官网上下载。填写后加盖单位公章邮寄到协会秘书处。美文会秘书处收到入会申请并收到会员费之后即通报理事会并确认会员单位资格。\n7. 重新明确理事单位申请条件。第一,理事单位必须是正常缴纳会费的会员单位;第二,原则上需有英语语言文学硕士点;第三,符合以上条件单位可以申请成为美文会理事单位并推荐合适人选担任理事。\n8. 理事会决定,在美文会的年会和专题研讨会上评选会议优秀论文并颁发证书。其中,优秀论文仅在向会议提交的论文全文(未发表)中评选;作者所在单位须为美文会员单位,在向会议提交论文时,注明论文未经发表,并注明申请参加会议优秀论文评选;美文会常务理事以上(含)不参加申请。\n9. 关于本次年会优秀论文评选:已向会议提交未发表论文全文的会员单位参会代表,在12月15日前,向本会秘书处提交修改后的论文申请评选,本会将在寒假组织评选,在2017年3月公布评选结果并颁发证书。\n11月26日上午,本届年会开幕式在厦门大学科艺报告厅举行。厦门大学校长助理张建霖教授,外文学院张龙海院长,全国美国文学研究会会长朱刚教授、副会长盛宁教授、郭继德教授、杨仁敬教授、金莉教授、王守仁教授、张冲教授、申富英教授,秘书长赵文书教授及其他与会代表出席了开幕式。\n开幕式由外文学院副院长李美华教授主持。张建霖校长助理首先代表厦门大学对来自全国各地的与会者表示热烈欢迎,并对全国美国文学研究会第十八届年会的顺利召开表示衷心祝贺。外文学院张龙海院长代表承办方致欢迎辞,向与会者介绍了厦大外文学院的人才培养、学术研究等情况,以及年会的准备情况。全国美国文学研究会朱刚会长代表与会人员感谢厦门大学对本届年会的大力支持。朱会长简要回顾了美文会的历史和现状,并向与会代表汇报了研究会自第十七届年会以来的主要工作。最后,朱刚会长感谢全体参会代表及承办方对美文会工作的大力支持和对共同推动美国文学研究所做出的贡献,并对今后的工作提出了殷切希望。\n本届年会共分为大会发言、小组讨论、专题研讨(panel 
discussion)及研究生学术论坛四个部分。11月26日上午的大会发言分别由美文会副会长金莉教授和美文会前副会长、南京大学王守仁教授主持,共有5位代表发言。\n中国社科院外国文学研究所盛宁教授的发言题目是《对政治正确的文化批评的再审视》。盛宁教授指出,美国总统大选造成的国内民族分裂愈演愈烈,这一新国情使我国的族裔文学研究更具价值和意义。作为学者我们必须凸显自己的立场和价值判断,对少数族裔文学的审美价值要有清晰的认识。盛教授以第一代华裔作家代表汤婷婷和第二代华裔作家代表哈金为例,评析了两代作家迥异的“政治正确”书写策略。他认为,借助“政治正确”发音的族裔文学的审美价值会很快消失,我们应深刻反思非裔作家的代表――托尼・莫里森――的创作遗产。莫里森不只着眼于描写黑人苦难,更深入探索人性,将黑人作为“人性”的缩影进行刻画,这是她能够进入美国文学传统、流芳传世的重要原因。\n复旦大学外文学院张冲教授以《超越族裔:美国族裔文学研究的几点思考》为题,探讨我国当前族裔文学研究面临的困境及出路。张冲教授指出,国内族裔文学研究仍然面临研究角度单一与模仿、研究方法过于“理论导向”、文本“碎片化”释读等问题。他建议可从“族裔文学发展流变史”、“比较族裔文学史”以及“本土裔与中国文学文化比较”等维度,重新思考我国方兴未艾的族裔文学研究,族裔文学研究应努力超越族裔而回归文学,既要思考族裔文学的“族裔性”也需关注其“文学性”。\n在《再议作家的族裔身份问题:本质主义与自由选择》的发言中,上海外国语大学虞建华教授以斯图亚特・霍尔对“身份”的定义为出发点,对现有族裔身份的归置基准进行拷问。虞教授强调,在讨论族裔作家文化身份时,我们需聚焦常被忽视的身份的表演性和叙事性,应以社会建构理论为指导思路,走出本质主义,作家的族裔身份在全球化大势下的多元社会,应被看作一个动态、临时、杂糅的建构过程。\n南京大学英语系朱雪峰副教授的发言《重组芝加哥:拉图尔ANT理论视阈下的<克莱伯恩公园>》以社会学家布鲁诺・拉图尔的“行动者网络理论”为视角,从“流动的城市”、“行动者网络”、“蚂蚁视角与新现实主义”三个层面审视《克莱伯恩公园》中的芝加哥城市再现。朱教授认为,此剧在美国本土政治正确风潮中的接受悖论正在于它如实近距离描述了芝加哥城市地理在互动中流变的复杂性,其政治相关性在于它没有给出一个关于芝加哥社会的明晰解释或批评,而是通过不断追踪新问题联合来重组社会,以貌似传统的新现实主义风格体现了戏剧价值。\n厦门大学外文学院张龙海教授以《美国少数族裔文学研究在中国》为题,向大家勾勒了我国美国族裔文学研究的历史图景。张教授通过大量的文献研究和详细的数据,从研究的规模、研究队伍的状况、期刊报纸的刊登情况以及研究中出现的不平衡等方面详细探析美国少数族裔文学研究在中国的涌现和繁荣发展。\n11月26日下午,年会设立23个分会场进行小组讨论。代表们围绕“华裔文学研究新视野”、“亚裔文学研究新视野”、“非裔文学研究新视野”、 “犹太裔文学研究新视野”、“拉美裔文学研究新视野”、“印第安裔文学研究新视野”、“族裔文学与性别研究”、“族裔文学批评理论新动向”、“少数族裔与多元文化”、“族裔文学研究中的中国视角”、“美国文学理论研究与教学”、“美国现代派文学研究”、“早期美国文学研究”等议题,对美国族裔文学展开了多层次全方位的探讨。\n第一组(专题讨论:族裔成长小说研究)由方红、芮渝萍主持,发言人有方红(南京大学)“消声、言说与成长:《褐姑娘、褐砖房》研究”;侯金萍(华南农业大学)“华裔美国文学对成长小说的改写与创新”;芮渝萍(宁波大学)“美国华裔成长小说的特点”;谭岸青(暨南大学)“解读任碧莲《世界与小镇》的成长书写”;邹惠玲(江苏师范大学)“《飞逸》:在自省与融合之中成长”。\n第二组(华裔文学研究新视野之一)由刘永杰、戴鸿斌主持,发言人有黄明(商丘师范学院)“严歌苓小说《扶桑》对华人形象的颠覆”;霍盛亚(北京外国语大学)“华裔美国科幻作家刘宇昆小说的“复族裔化”倾向”;刘向辉(许昌学院)“谭恩美小说《喜福会》中的文学地图与民族记忆”;刘永杰(郑州大学)“‘秘密’的真相:《蝴蝶君》主人公断袖之谊探析”;史博(华北科技学院)“解读《折纸》中爱的主题”;孙坚(陕西师范大学)“新历史主义关照下的《中国佬》”;颜碧洪(福建师范大学福清分校)“论汤亭亭《中国佬》的后现代主义书写”。\n第三组(华裔文学研究新视野之二)由郭栖庆、金衡山主持,发言人有黄一畅(南京航空航天大学)“虚构的权威―《谁是爱尔兰人?》中的叙事伦理之辨”;季峥(重庆工商大学)“华裔美国作家入典原因探究”;金衡山(华东师范大学)“The Puzzling and Enlightening Racial Identity in Who’ s 
Irish?”;苏娉(中山大学)“论李翊云的非母语写作及其意义”;王芳(中央民族大学)“《无声告白》中的华裔精神生存困境探析”;王增红(厦门大学)“种族冒充、冒充叙事与混血族身份政治―威妮弗蕾德•伊顿新解”;姚红艳(武汉大学)“族群记忆、族群认同与身份建构―《接骨师之女》中的仪式书写”;周凌敏(南方医科大学)“以物为导向的本体论下的后人文主义―以《咸鱼女孩》为例”。\n第四组(族裔文学与性别研究之一)由王玉括、田俊武主持,发言人有方小莉(四川大学)“20世纪黑人女性小说叙述策略研究”;李蕊(南京大学)“论《他们眼望上苍》中珍妮的‘生成女性’特质”;毛艳华(浙江大学)“性别‘引用’视域下《秀拉》中女性主体的初现与重构”;隋红升(浙江大学)“汉斯伯里《太阳下的葡萄干》对美国男性气质的反思”;田俊武(北京航空航天大学)“回归之路―托尼•莫里森作品中的旅行叙事”;王玉括(南京邮电大学)“黑人女性主义文学批评述评”;杨艳春(哈尔滨石油学院)“生态女性主义视域下艾丽丝•沃克作品中女性族裔身份的自我认同”;朱海峰(东北师范大学)“父权、女权、后女权―论《钢琴课》中黑人的种族出路”。\n第五组(族裔文学与性别研究之二)由张跃军主持,发言人有董秋芳(广东农工商职业技术学院)“美国华裔女性主体身份流变―以华裔女作家英语创作为例”;刘兮颖(华中师范大学)“《卢布林的魔术师》中雅夏的身份危机与伦理选择”;杨静(广东外语外贸大学)“全球化时代的跨国婚姻:《追寻亚裔女性》”;姚丽梅(佳木斯大学)“论邝丽莎在《雪花秘扇》中的女性主义身份伦理观”;张跃军(厦门理工学院)“‘温和的女性主义’:华裔美国诗人陈美玲诗歌解读”;朱骅(上海海洋大学)“跨国主义的美国族裔文学建构”。\n第六组(犹太裔文学研究新视野)由刘文松主持,发言人有高莉敏(上海立信会计金融学院)“《末世之城》:大屠杀的历史记忆”;胡选恩(陕西师范大学)“E.L.多克托罗《大进军》中的历史阐释模式”;孔伟(北京外国语大学)“俄国犹太人的‘应许之地’―新移民叙事中的‘发声’策略研究”;刘文松(厦门大学)“美国犹太知识分子小说探秘”;孙璐(上海外国语大学)“菲利普•罗斯《美国牧歌》中的美国民族神话及其当代启示”;张国庆(中国人民大学)“《人性的污秽》的后人道主义解读”;赵永健(浙江工商大学)“国外美国犹太戏剧研究评述”。\n第七组(美国后现代派文学研究之一)由陈世丹、刘雪岚主持,发言人有杨仁敬(厦门大学)“略论《时间》与《达洛威夫人》的互文性”;陈世丹(中国人民大学)“后现代文学伦理学批评要义”;曾艳钰(湖南科技大学)“‘流动的爱国主义盛宴’―评美国后现代战争小说”;谷红丽(华南师范大学)“后现代主义历史叙事”;刘雪岚(社会科学院外国文学研究所)“从‘加州三部曲’看托马斯•品钦的后现代城市书写”;方凡(浙江大学)“论威廉•加斯笔下的图像与文字”;王祖友(泰州学院)“后人道主义与人道主义辨析”;陈奔(厦门大学)“美国研究背景下的后现代主义文学研究”;范小玫(厦门大学外)“德里罗小说中的全球化”。\n第八组(美国后现代派文学研究之二)由吴泽庆、陈俊松主持,发言人有陈俊松(华东师范大学)“《地下世界》:冷战阴云的文化记忆与后现代恐怖叙事”;许希夷(南京大学)“福尔‘后9/11’小说《特别响,非常近》中的历史叙事”;史菊鸿(兰州大学)“一个城市,两幅画面――库切和詹姆斯对伦敦的不同文学再现”;吴泽庆(中央民族大学)“‘恶魔的诅咒’―欧茨的《被诅咒的》中历史书写”;姚本标(广西师范学院)“《白噪音》的‘风险社会’表征”;栾天宇(南京大学)“《赛姆勒先生的行星》中的记忆伦理与美国20世纪60年代”。\n第九组(美国后现代派文学研究之三)由甘文平、杨纪平主持,发言人有甘文平(武汉理工大学)“米歇尔•福柯、共同体、美国越战文学”;崔永光(大连海洋大学)“世界文学史视域中的纳博科夫形象及其创作密码”;范湘萍(上海政法学院)“论‘9.11文学’结构主义叙事中的空间与政治”;林莉(东北师范大学)“论小说《恶棍来访》的空间叙事策略”;刘丹(大连外国语大学)“融合与分裂:《地下世界》中的种族冲突与文化政治”;王程辉(湖南科技大学)“纳博科夫《国王、王后和杰克》与福楼拜《包法利夫人》的互文性”;杨纪平、胡燕(北京邮电大学)“《X战警:第一战》中的族裔观”;张芳芳(上海电力学院)“论纳博科夫小说《普宁》中‘坐错车’的隐喻与流亡主题”;张蓝予(中央民族大学)“文明对话与身份认同:评《恐怖分子》的身份观念”。\n第十组(拉美裔文学研究新视野)由李保杰、李毅峰主持,发言人有李保杰(山东大学)“当历史的重负成为过去―《古巴之王》中的‘反流亡’书写”;李毅峰(天津商业大学)“桑德拉•西斯内罗斯对女性原型形象的重新阐释”;乔玲玲(山西大同大学)
“芒果街上的奇卡纳游荡者”;涂沙丽(中南民族大学)“论《石化鹿》中的奇卡娜形象”;王绵绵(浙江传媒学院)“加勒比裔美国移民女作家的空间意识及空间策略”。\n第十一组(美国文学理论研究与教学)由郭建辉、刘春芳主持,发言人有陈 Q(中央民族大学)“论当代反本质主义文学理论的发生因缘与中国进程”;郭建辉(四川外国语大学期刊社)“英美文学教学与审美教育”;焦敏(广东外语外贸大学)“人文主义与戏剧教学”;刘春芳(山东工商学院)“美国浪漫主义文学中的平民思想”;马特(中央财经大学)“文学批评的空间转向:空间批评的新动向”;许玉军(集美大学)“东方启蒙:西方的‘东方主义’话语”。\n第十二组(族裔文学批评理论新动向)由胡铁生、郭英剑主持,发言人有胡铁生(吉林大学)“美国少数族裔文学的演进”;郭英剑(中国人民大学)“2015美国文学:种族,还是种族问题”;洪琪(湖北第二师范学院)“美国华裔戏剧的创伤叙事”;任虎军(四川外国语大学)“性别视阈下新世纪中国的美国族裔小说研究”;王斐(集美大学)“追寻都会中的空间正义:美国非裔城市叙事嬗变初探”。\n第十三组(美国现代派文学研究之一)由黄宗英、王跃洪主持,发言人有陈秋红(青岛大学)“亨利•詹姆斯后期小说的进化叙述”;陈喜华(湘潭大学)“菲茨杰拉德的服饰书写与爵士时代美国文化”;黄宗英(北京联合大学)“‘其城/其人,一种身份’:读威廉斯的《帕特森》”;蒋贤萍(西北师范大学)“表演的自我――再读《进入黑夜的漫长旅程》”;李晶(中南财经政法大学)“生存还是生活?:凯瑟《一个迷途的女人》的伦理选择”;陶久胜(南昌大学)“无意识的种族偏见――《上帝的儿女都有翅膀》的心理原型解读”;朱晓萍(贵州大学)“追逐无的欲望――《嘉莉妹妹》的拉康式解读”。\n第十四组(美国现代派文学研究之二)由李建波、朴玉主持,发言人有朴玉(吉林大学)“科伦•麦凯恩在《光明这一面》中的城市创伤叙事”;王晓丹(哈尔滨师范大学)“阶层流动的幻灭:《纹身女孩》中的社会身份”;王跃洪、郝天昕(上海理工大学)“福柯凝视理论视角下的亨利•詹姆斯《德莫福夫人》研究”;薛丽(北京师范大学)“《布拉迪默传奇》中矛盾的女性意识形态”;姚学丽(安徽大学)“映射美国南方的‘隐约轮廓’――析《干旱的九月》碎片化叙事”;张金良(天津外国语大学)“哈贝马斯有效沟通视域下《奥利安娜》中的交流困境分析”;张小平(扬州大学)“旅行•幻梦•混沌――论麦卡锡小说《骏马》中的‘奇异吸引子’”。\n第十五组(早期美国文学研究)由张和龙、金冰主持,发言人有金冰(对外经贸大学)“美国自然主义文学的进化叙事与伦理想像”;李晋(中南财经政法大学)“19世纪美国文学市场研究综述”;李敏(山东工商学院)“《红字》通奸案的法、罚与霍桑的‘疼痛’书写”;李方木(北京外国语大学)“爱的伪装:《献给爱米丽的玫瑰》中的罗曼司及其多义性”;戚涛(安徽大学)“多数暴政下的碎片――梅尔维尔的价值困惑与身份建构”;张和龙(上海外国语大学)“中国杰克•伦敦研究中的话语模式及其历史嬗变”。\n第十六组(非裔文学研究新视野之一)由林元富主持,发言人有陈红(广东外语外贸大学)“‘所有的故事都是真的’―评怀德曼的编史元小说《法农》”;甘婷(集美大学)“第三空间理论视阈下《中间通道》的空间建构”;林元富(福建师范大学)“当代非裔美国涉奴题材小说的历史传承”;刘锦丽(湖北科技学院)“种族歧视下的身份困惑―论切斯纳特小说中的混血儿”;龙跃(湖南师范大学)“兰斯顿•休斯诗歌中的‘黑人性’”;吕春媚(大连外国语大学)“黑白的空间对峙――解读《莱尼大妈的黑臀舞》中的社会空间”;王予霞(集美大学)“美国黑人左翼文学消长的历史启示”;修树新(东北师范大学外国语学院)“论特瑞•麦克米兰小说中爱的主题”;张健然(四川外国语大学)“《他们眼望上苍》中原始性与现代性的背离与融合”。\n第十七组(非裔文学研究新视野之二)由徐文培、杜志卿主持,发言人有杜志卿(华侨大学外国语学院)“从霍妮的精神分析理论看阿契贝笔下的伊祖鲁”;蒯冲(荆楚理工学院)“非裔美国人的身份缺失与身份认同――以《阳光下的葡萄干》为例”;李美芹(浙江工商大学)“论埃里森‘文学爵士乐’美学中表达的种族政治思想”;李云瑾(华北科技学院)“论托妮•莫里森《宣叙》的含混性”;马粉英(西北师范大学)“《最蓝的眼睛》中克劳迪娅拆解行为的后殖民叙事”;唐莹(大连外国语大学)“从‘挪亚的诅咒’到非洲主义―对罗宾逊种族书写的反思”;徐文培(黑龙江大学)“《所罗门之歌》与奴隶叙事文学”;张宏薇(东北师范大学)“莫里森早晚期两部小说中‘儿童创伤’主题的对比分析”;朱小琳(中央民族大学)“悲莫悲兮伤永逝:以墓志为叙事策略的《宠儿》新解”。\n第十八组(亚裔文学研究新视野)由谷红丽、李汝成主持,发言人有
李青霜(南京审计大学)“论戏剧《耻辱》中的穆斯林文化定势”;刘喜波(齐齐哈尔大学)“列斐伏尔的身体空间理论下的《灿烂千阳》解读”;李东风(盐城师范学院)“美国印度裔离散文学中的‘家叙事’”。", "answers": ["厦门大学"], "length": 9593, "dataset": "multifieldqa_zh", "language": "zh", "all_classes": null, "_id": "5b1b8e937b83c3ff9b75ac386fae9c4575c4b9f26a4fbdad"} \ No newline at end of file diff --git a/test/suites/E2E/test_offline_inference.py b/test/suites/E2E/test_offline_inference.py new file mode 100644 index 000000000..345c759e5 --- /dev/null +++ b/test/suites/E2E/test_offline_inference.py @@ -0,0 +1,237 @@ +import os +from pathlib import Path + +import pytest +import yaml +from common.offline_inference_utils import ( + ensure_storage_dir, + load_prompt_from_file, + run_in_spawn_subprocess, + run_offline_inference, + split_prompt_by_tokens, + to_dict_for_serialization, +) +from common.path_utils import get_path_relative_to_test_root, get_path_to_model +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams + +from ucm.logger import init_logger + +logger = init_logger(__name__) + + +class TestBasicOfflineInference: + """Test basic offline inference functionality.""" + + @pytest.mark.stage(1) + @pytest.mark.feature("offline_inference") + @pytest.mark.gpu_mem(30000) + @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) + @pytest.mark.parametrize("max_tokens", [200]) + @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half + @pytest.mark.parametrize("enforce_eager", [True, False]) + @pytest.mark.parametrize("max_num_batched_tokens", [2047]) + def test_offline_accuracy_hbm_ssd_mixed( + self, + model_name: str, + max_tokens: int, + prompt_split_ratio: float, + enforce_eager: bool, + max_num_batched_tokens: int, + ): + """Test HBM + SSD mixed hit accuracy (Phase 2). + This test first runs Phase 1 to generate a baseline output, then tests Phase 2. + Test flow: + 1. Phase 1: Disable HBM PC, send full prompt -> KV cache saved to SSD (baseline) + 2. 
Phase 2: Enable HBM PC, send partial prompt (warm HBM), then send full prompt (hits both HBM and SSD) -> verify mixed hit accuracy + The prompt is loaded from prompt.json file (LongBench format). + Args: + model_name: Name of model. + max_tokens: Maximum tokens to generate. + prompt_split_ratio: Ratio to split prompt for Phase 2 (0.5 = split in half). + """ + config_file = get_path_relative_to_test_root("config.yaml") + with open(config_file, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + + # # if no model_path from parameter, fallback to config or environment + # if not model_path: + # logger.info( + # "No model_path parameter provided, checking config and environment variable" + # ) + # model_path = config.get("llm_connection", {}).get( + # "model_path" + # ) or os.getenv("MODEL_PATH") + # assert ( + # model_path is not None + # ), "model_path must be specified via parameter, config, or environment variable" + + model_path = get_path_to_model(model_name, config) + + assert os.path.exists(model_path), f"Model path does not exist: {model_path}" + + ucm_storage_dir = "/tmp/ucm_cache" + + # make sure UCM storage directory exists and is empty + ensure_storage_dir(ucm_storage_dir, clear_existing=True) + + try: + test_prompt, standard_answers = load_prompt_from_file( + get_path_relative_to_test_root( + "suites/E2E/prompts/test_offline_inference.json" + ) + ) + if not standard_answers: + pytest.fail(f"No standard answers found in prompt.json") + except Exception as e: + pytest.fail(f"Failed to load prompt from prompt.json: {e}") + + logger.info(f"Standard answers: {standard_answers}") + + tokenizer = AutoTokenizer.from_pretrained(model_path, use_chat_template=True) + + try: + messages = [ + { + "role": "system", + "content": "先读问题,再根据下面的文章内容回答问题,不要进行分析,不要重复问题,用简短的语句给出答案。\n\n例如:“全国美国文学研究会的第十八届年会在哪所大学举办的?”\n回答应该为:“xx大学”。\n\n", + }, + {"role": "user", "content": test_prompt}, + ] + formatted_full_prompt = tokenizer.apply_chat_template( + messages, + 
tokenize=False, + add_generation_prompt=True, + add_special_tokens=True, + ) + except Exception: + formatted_full_prompt = test_prompt + + prompt_first_part, prompt_second_part = split_prompt_by_tokens( + formatted_full_prompt, tokenizer, split_ratio=prompt_split_ratio + ) + + ucm_config = { + "ucm_connectors": [ + { + "ucm_connector_name": "UcmNfsStore", + "ucm_connector_config": { + "storage_backends": ucm_storage_dir, + "use_direct": False, + }, + } + ], + "load_only_first_rank": False, + } + + sampling_params = SamplingParams( + temperature=0.0, + top_p=1, + max_tokens=max_tokens, + ignore_eos=False, + ) + + logger.info(f"\n===== HBM + SSD Mixed Accuracy Test =====") + logger.info(f"Model: {model_path}") + logger.info(f"Full prompt length: {len(test_prompt)} chars") + logger.info(f"Max tokens: {max_tokens}") + logger.info(f"Temperature: 0.0 (deterministic)") + logger.info(f"UCM storage: {ucm_storage_dir}") + logger.info(f"Prompt split ratio: {prompt_split_ratio}") + logger.info(f"Enforce eager: {enforce_eager}") + logger.info(f"Max num batched tokens: {max_num_batched_tokens}") + + # ===== Phase 1: Disable HBM PC, save KV cache to SSD and load (baseline) ===== + # Run Phase 1 in a separate subprocess to ensure GPU memory is fully released + logger.info(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====") + + # Convert SamplingParams to dict for serialization, as non-picklable objects cannot be passed to subprocess + sampling_params_dict = to_dict_for_serialization(sampling_params) + + phase1_outputs = run_in_spawn_subprocess( + run_offline_inference, + model_path, + ucm_config, + [formatted_full_prompt, formatted_full_prompt], + sampling_params_dict, + False, # enable_prefix_caching=False for Phase 1 + enforce_eager, + "Phase 1 (SSD save and load)", + max_num_batched_tokens, + timeout=180, + ) + phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save + phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load + logger.info(f"Phase 1 completed 
in subprocess") + logger.info(f'Phase 1.1 output: "{phase1_1_output}"') + logger.info(f'Phase 1.2 output: "{phase1_2_output}"') + + # ===== Phase 2: Enable HBM PC, test HBM + SSD mixed hit ===== + # Run Phase 2 in a separate subprocess to ensure GPU memory is fully released + logger.info(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====") + + phase2_outputs = run_in_spawn_subprocess( + run_offline_inference, + model_path, + ucm_config, + [prompt_first_part, formatted_full_prompt], + sampling_params_dict, + True, # enable_prefix_caching=True for Phase 2 + enforce_eager, + "Phase 2 (HBM + SSD mixed)", + max_num_batched_tokens, + timeout=180, + ) + phase2_partial_output = phase2_outputs[0] + phase2_full_output = phase2_outputs[1] + logger.info(f"Phase 2 completed in subprocess") + logger.info(f"[INFO] Phase 2.1 output: {phase2_partial_output}") + logger.info(f"[INFO] Phase 2.2 output: {phase2_full_output}") + + logger.info(f"\n[INFO] ===== Accuracy Test Results =====") + + # Note: Small numerical precision differences in KV cache loading can cause + # punctuation token selection differences (e.g., full-width vs half-width comma) + def normalize_text(text: str) -> str: + """Normalize text for comparison by replacing similar punctuation.""" + text = text.replace(",", ",") + text = text.replace("。", ".") + text = text.replace("!", "!") + text = text.replace("?", "?") + text = text.replace(":", ":") + text = text.replace(";", ";") + return text.strip() + + def match_any_answer(output: str, answers: list[str]) -> bool: + """Check if output matches any of the standard answers.""" + for answer in answers: + if normalize_text(output) == normalize_text(answer): + return True + return False + + # Compare Phase 1.1 vs Phase 1.2 (SSD load accuracy) + phase1_correct = match_any_answer( + phase1_1_output, standard_answers + ) and match_any_answer(phase1_2_output, standard_answers) + if not phase1_correct: + logger.warning( + f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) 
=====" + ) + logger.warning( + f"Incorrect answer in Phase 1.1 (SSD save) or Phase 1.2 (SSD load) output!" + ) + logger.warning(f"Phase 1.1 output:\n{phase1_1_output}") + logger.warning(f"Phase 1.2 output:\n{phase1_2_output}") + logger.warning(f"Standard answers:\n{standard_answers}") + pytest.fail("SSD Load Accuracy Test Failed!") + + # Phase 2.1 should be skipped from accuracy check since it's only partial prompt + phase2_correct = match_any_answer(phase2_full_output, standard_answers) + if not phase2_correct: + logger.warning( + f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====" + ) + logger.warning(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!") + logger.warning(f"Phase 2.2 output:\n{phase2_full_output}") + logger.warning(f"Standard answers:\n{standard_answers}") + pytest.fail("HBM + SSD Mixed Accuracy Test Failed!") diff --git a/test/suites/E2E/test_offline_inference_sparse.py b/test/suites/E2E/test_offline_inference_sparse.py new file mode 100644 index 000000000..49ead5d3c --- /dev/null +++ b/test/suites/E2E/test_offline_inference_sparse.py @@ -0,0 +1,336 @@ +import os +from pathlib import Path + +import pytest +import yaml +from common.offline_inference_utils import ( + ensure_storage_dir, + load_prompt_from_file, + run_in_spawn_subprocess, + run_offline_inference, + split_prompt_by_tokens, + to_dict_for_serialization, +) +from common.path_utils import get_path_relative_to_test_root, get_path_to_model +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams + +from ucm.logger import init_logger + +logger = init_logger(__name__) + + +class TestBasicOfflineInferenceSparse: + """Test basic offline inference functionality.""" + + @pytest.mark.stage(1) + @pytest.mark.feature("offline_inference_sparse") + @pytest.mark.gpu_mem(30000) + @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) + @pytest.mark.parametrize("max_tokens", [200]) + @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split 
prompt in half + @pytest.mark.parametrize("enforce_eager", [True, False]) + @pytest.mark.parametrize("max_num_batched_tokens", [2047]) + def test_offline_accuracy_hbm_ssd_mixed_nosparse( + self, + model_name: str, + max_tokens: int, + prompt_split_ratio: float, + enforce_eager: bool, + max_num_batched_tokens: int, + ): + """Test HBM + SSD mixed hit accuracy (Phase 2). + This test first runs Phase 1 to generate a baseline output, then tests Phase 2. + Test flow: + 1. Phase 1: Disable HBM PC, send full prompt -> KV cache saved to SSD (baseline) + 2. Phase 2: Enable HBM PC, send partial prompt (warm HBM), then send full prompt (hits both HBM and SSD) -> verify mixed hit accuracy + The prompt is loaded from prompt.json file (LongBench format). + Args: + model_name: Name of the model. + max_tokens: Maximum tokens to generate. + prompt_split_ratio: Ratio to split prompt for Phase 2 (0.5 = split in half). + """ + config_file = get_path_relative_to_test_root("config.yaml") + with open(config_file, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + + model_path = get_path_to_model(model_name, config) + + assert os.path.exists(model_path), f"Model path does not exist: {model_path}" + + ucm_storage_dir = "/tmp/ucm_cache" + + # make sure UCM storage directory exists and is empty + ensure_storage_dir(ucm_storage_dir, clear_existing=True) + + try: + test_prompt, standard_answers = load_prompt_from_file( + Path(__file__).parent / "prompts" / "test_offline_inference.json" + ) + logger.info( + f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)" + ) + if standard_answers: + logger.info(f"Standard answers: {standard_answers}") + else: + pytest.fail(f"No standard answers found in prompt.json") + except Exception as e: + pytest.fail(f"Failed to load prompt from prompt.json: {e}") + + tokenizer = AutoTokenizer.from_pretrained(model_path, use_chat_template=True) + + try: + messages = [ + { + "role": "system", + "content": 
"先读问题,再根据下面的文章内容回答问题,不要进行分析,不要重复问题,用简短的语句给出答案。\n\n例如:“全国美国文学研究会的第十八届年会在哪所大学举办的?”\n回答应该为:“xx大学”。\n\n", + }, + {"role": "user", "content": test_prompt}, + ] + formatted_full_prompt = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + add_special_tokens=True, + ) + except Exception: + formatted_full_prompt = test_prompt + + prompt_first_part, prompt_second_part = split_prompt_by_tokens( + formatted_full_prompt, tokenizer, split_ratio=prompt_split_ratio + ) + + ucm_config = { + "ucm_connectors": [ + { + "ucm_connector_name": "UcmNfsStore", + "ucm_connector_config": { + "storage_backends": ucm_storage_dir, + "use_direct": False, + }, + } + ], + "load_only_first_rank": False, + } + + sampling_params = SamplingParams( + temperature=0.0, + top_p=1, + max_tokens=max_tokens, + ignore_eos=False, + ) + + logger.info(f"\n===== HBM + SSD Mixed Accuracy Test =====") + logger.info(f"Model: {model_path}") + logger.info(f"Full prompt length: {len(test_prompt)} chars") + logger.info(f"Max tokens: {max_tokens}") + logger.info(f"Temperature: 0.0 (deterministic)") + logger.info(f"UCM storage: {ucm_storage_dir}") + logger.info(f"Prompt split ratio: {prompt_split_ratio}") + logger.info(f"Enforce eager: {enforce_eager}") + logger.info(f"Max num batched tokens: {max_num_batched_tokens}") + + # ===== Phase 1: Disable HBM PC, save KV cache to SSD and load (baseline) ===== + # Run Phase 1 in a separate subprocess to ensure GPU memory is fully released + logger.info(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====") + + # Convert SamplingParams to dict for serialization, as non-picklable objects cannot be passed to subprocess + sampling_params_dict = to_dict_for_serialization(sampling_params) + + phase1_outputs = run_in_spawn_subprocess( + run_offline_inference, + model_path, + ucm_config, + [formatted_full_prompt, formatted_full_prompt], + sampling_params_dict, + False, # enable_prefix_caching=False for Phase 1 + enforce_eager, + 
"Phase 1 (SSD save and load)", + max_num_batched_tokens, + timeout=180, + ) + phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save + phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load + logger.info(f"Phase 1 completed in subprocess") + logger.info(f'Phase 1.1 output: "{phase1_1_output}"') + logger.info(f'Phase 1.2 output: "{phase1_2_output}"') + + # ===== Phase 2: Enable HBM PC, test HBM + SSD mixed hit ===== + # Run Phase 2 in a separate subprocess to ensure GPU memory is fully released + logger.info(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====") + + phase2_outputs = run_in_spawn_subprocess( + run_offline_inference, + model_path, + ucm_config, + [prompt_first_part, formatted_full_prompt], + sampling_params_dict, + True, # enable_prefix_caching=True for Phase 2 + enforce_eager, + "Phase 2 (HBM + SSD mixed)", + max_num_batched_tokens, + timeout=180, + ) + phase2_partial_output = phase2_outputs[0] + phase2_full_output = phase2_outputs[1] + logger.info(f"Phase 2 completed in subprocess") + logger.info(f"[INFO] Phase 2.1 output: {phase2_partial_output}") + logger.info(f"[INFO] Phase 2.2 output: {phase2_full_output}") + + logger.info(f"\n[INFO] ===== Accuracy Test Results =====") + + # Note: Small numerical precision differences in KV cache loading can cause + # punctuation token selection differences (e.g., full-width vs half-width comma) + def normalize_text(text: str) -> str: + """Normalize text for comparison by replacing similar punctuation.""" + text = text.replace(",", ",") + text = text.replace("。", ".") + text = text.replace("!", "!") + text = text.replace("?", "?") + text = text.replace(":", ":") + text = text.replace(";", ";") + return text.strip() + + def match_any_answer(output: str, answers: list[str]) -> bool: + """Check if output matches any of the standard answers.""" + for answer in answers: + if normalize_text(output) == normalize_text(answer): + return True + return False + + # Compare Phase 1.1 vs Phase 1.2 (SSD load accuracy) + 
phase1_correct = match_any_answer( + phase1_1_output, standard_answers + ) and match_any_answer(phase1_2_output, standard_answers) + if not phase1_correct: + logger.warning( + f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) =====" + ) + logger.warning( + f"Incorrect answer in Phase 1.1 (SSD save) or Phase 1.2 (SSD load) output!" + ) + logger.warning(f"Phase 1.1 output:\n{phase1_1_output}") + logger.warning(f"Phase 1.2 output:\n{phase1_2_output}") + logger.warning(f"Standard answers:\n{standard_answers}") + pytest.fail("SSD Load Accuracy Test Failed!") + + # Phase 2.1 should be skipped from accuracy check since it's only partial prompt + phase2_correct = match_any_answer(phase2_full_output, standard_answers) + if not phase2_correct: + logger.warning( + f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====" + ) + logger.warning(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!") + logger.warning(f"Phase 2.2 output:\n{phase2_full_output}") + logger.warning(f"Standard answers:\n{standard_answers}") + pytest.fail("HBM + SSD Mixed Accuracy Test Failed!") + + """Test ESA sparse attention.""" + + @pytest.mark.stage(1) + @pytest.mark.feature("offline_inference_sparse") + @pytest.mark.gpu_mem(30000) + @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) + @pytest.mark.parametrize("max_tokens", [200]) + @pytest.mark.parametrize("enforce_eager", [False]) + @pytest.mark.parametrize("max_num_batched_tokens", [2047]) + def test_offline_esa( + self, + model_name: str, + max_tokens: int, + enforce_eager: bool, + max_num_batched_tokens: int, + ): + config_file = get_path_relative_to_test_root("config.yaml") + with open(config_file, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + + model_path = get_path_to_model(model_name, config) + + assert os.path.exists(model_path), f"Model path does not exist: {model_path}" + + ucm_storage_dir = "/tmp/ucm_cache" + + # make sure UCM storage directory exists and is empty + 
ensure_storage_dir(ucm_storage_dir, clear_existing=True) + + try: + test_prompt, standard_answers = load_prompt_from_file( + get_path_relative_to_test_root( + "suites/E2E/prompts/test_offline_inference.json" + ) + ) + if not standard_answers: + pytest.fail(f"No standard answers found in prompt.json") + except Exception as e: + pytest.fail(f"Failed to load prompt from prompt.json: {e}") + + logger.info(f"Standard answers: {standard_answers}") + + tokenizer = AutoTokenizer.from_pretrained(model_path, use_chat_template=True) + + try: + messages = [ + { + "role": "system", + "content": "先读问题,再根据下面的文章内容回答问题,不要进行分析,不要重复问题,用简短的语句给出答案。\n\n例如:“全国美国文学研究会的第十八届年会在哪所大学举办的?”\n回答应该为:“xx大学”。\n\n", + }, + {"role": "user", "content": test_prompt}, + ] + formatted_full_prompt = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + add_special_tokens=True, + ) + except Exception: + formatted_full_prompt = test_prompt + + ucm_config = { + "ucm_connectors": [ + { + "ucm_connector_name": "UcmNfsStore", + "ucm_connector_config": { + "storage_backends": ucm_storage_dir, + "use_direct": False, + }, + } + ], + "ucm_sparse_config": { + "ESA": { + "init_window_sz": 1, + "local_window_sz": 2, + "min_blocks": 4, + "sparse_ratio": 0.3, + "retrieval_stride": 5, + } + }, + } + + sampling_params = SamplingParams( + temperature=0.0, + top_p=1, + max_tokens=max_tokens, + ignore_eos=False, + ) + + # Convert SamplingParams to dict for serialization, as non-picklable objects cannot be passed to subprocess + sampling_params_dict = to_dict_for_serialization(sampling_params) + + phase1_outputs = run_in_spawn_subprocess( + run_offline_inference, + model_path, + ucm_config, + [formatted_full_prompt, formatted_full_prompt], + sampling_params_dict, + False, # enable_prefix_caching=False + enforce_eager, + "ESA", + max_num_batched_tokens, + timeout=180, + ) + phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save + phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD 
load + logger.info(f"ESA inference completed in subprocess") + logger.info(f'Phase 1.1 output: "{phase1_1_output}"') + logger.info(f'Phase 1.2 output: "{phase1_2_output}"')