ModelEngine-Group
diff --git a/‎docker/Dockerfile-NPU‎
Lines changed: 10 additions & 2 deletions b/‎docker/Dockerfile-NPU‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎docs/source/getting-started/quickstart_vllm.md‎
Lines changed: 28 additions & 0 deletions b/‎docs/source/getting-started/quickstart_vllm.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎docs/source/getting-started/quickstart_vllm_ascend.md‎
Lines changed: 35 additions & 1 deletion b/‎docs/source/getting-started/quickstart_vllm_ascend.md‎
Lines changed: 35 additions & 1 deletion
diff --git a/‎docs/source/user-guide/prefix-cache/index.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/user-guide/prefix-cache/index.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎…user-guide/prefix-cache/pipline_store.md‎ ‎…ser-guide/prefix-cache/pipeline_store.md‎docs/source/user-guide/prefix-cache/pipline_store.md renamed to docs/source/user-guide/prefix-cache/pipeline_store.md
Lines changed: 3 additions & 3 deletions b/‎…user-guide/prefix-cache/pipline_store.md‎ ‎…ser-guide/prefix-cache/pipeline_store.md‎docs/source/user-guide/prefix-cache/pipline_store.md renamed to docs/source/user-guide/prefix-cache/pipeline_store.md
Lines changed: 3 additions & 3 deletions
diff --git a/‎examples/offline_inference_kvcomphbm.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/offline_inference_kvcomphbm.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/ucm_config_example.yaml‎
Lines changed: 1 addition & 2 deletions b/‎examples/ucm_config_example.yaml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎setup.py‎
Lines changed: 1 addition & 1 deletion b/‎setup.py‎
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # Set to other image if needed
-FROM quay.io/ascend/vllm-ascend:v0.9.2rc1
+FROM quay.io/ascend/vllm-ascend:v0.9.2rc1-openeuler
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 
@@ -10,8 +10,16 @@ COPY . /workspace/unified-cache-management
 
 RUN pip config set global.index-url ${PIP_INDEX_URL}
 
-RUN export PLATFORM="ascend" && \
+RUN export PLATFORM="ascend" ENABLE_SPARSE=true && \
     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
     pip install -v -e /workspace/unified-cache-management --no-build-isolation
 
+# Apply patch for vLLM
+RUN cd /vllm-workspace/vllm \
+    && git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+
+# Apply patch for vLLM-ascend
+RUN cd /vllm-workspace/vllm-ascend \
+    && git apply /workspace/unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
+
 CMD ["/bin/bash"]
@@ -77,6 +77,33 @@ Download the pre-built `vllm/vllm-openai:v0.9.2` docker image and build unified-
     pip install -v -e . --no-build-isolation
     ```
 
+3. Apply vLLM Integration Patches (Required)
+
+    To enable Unified Cache Management (UCM) integration with vLLM, you must **manually apply the corresponding vLLM patch**.
+
+    You may directly navigate to the vLLM source directory:
+    ```bash
+    cd <path_to_vllm>
+    ```
+    Apply the patch that matches your development needs:
+
+    - Full UCM integration (recommended):
+    ```bash
+    git apply unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+    ```
+
+    - Sparse attention only:
+    ```bash
+    git apply unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
+    ```
+
+    - ReRoPE support only:
+    ```bash
+    git apply unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt-rerope.patch
+    ```
+
+    Choose the patch according to your development needs.
+    If you are working on **sparse attention** or **ReRoPE** independently, applying only the corresponding patch is sufficient.
 
 
 ### Option 3: Install by pip
@@ -91,6 +118,7 @@ Download the pre-built `vllm/vllm-openai:v0.9.2` docker image and build unified-
     export PLATFORM=cuda
     pip install uc-manager
     ```
+> **Note:** If installing via `pip install`, you need to manually add the `config.yaml` file, similar to `unified-cache-management/examples/ucm_config_example.yaml`, because PyPI packages do not include YAML files.
 
 ## Step 2: Configuration
 
 
@@ -12,7 +12,7 @@ We offer 3 options to install UCM.
 
 ### Option 1: Build from source
 
-Follow commands below to install unified-cache-management from source code:
+1. Follow the commands below to install unified-cache-management from source code:
 **Note:** The sparse module is not compiled by default. To enable it, set the environment variable `export ENABLE_SPARSE=TRUE` before you build.
 ```bash
 # Replace <branch_or_tag_name> with the branch or tag name needed
@@ -23,13 +23,39 @@ pip install -v -e . --no-build-isolation
 cd ..
 ```
 
+2. Apply vLLM and vLLM-Ascend Integration Patches (Required)
+To enable Unified Cache Management (UCM) integration, you need to apply patches to both vLLM and vLLM-Ascend source trees.
+
+**Step 1:** Apply the vLLM Patch
+
+First, apply the standard vLLM integration patch in the vLLM source directory:
+    
+```bash
+cd <path_to_vllm>
+git apply unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-adapt.patch
+```
+    
+**Step 2:** Apply the vLLM-Ascend Patch
+
+Then, switch to the vLLM-Ascend source directory and apply the Ascend-specific patch:
+
+```bash
+cd <path_to_vllm_ascend>
+git apply unified-cache-management/ucm/integration/vllm/patch/0.9.2/vllm-ascend-adapt.patch
+```
+
+**Note:**
+    The ReRoPE algorithm is not supported on Ascend at the moment.
+    Only the standard UCM integration is applicable for vLLM-Ascend.
+
 
 ### Option 2: Install by pip
 Install by pip or find the pre-built wheels on [PyPI](https://pypi.org/project/uc-manager/).
 ```
 export PLATFORM=ascend
 pip install uc-manager
 ```
+> **Note:** If installing via `pip install`, you need to manually add the `config.yaml` file, similar to `unified-cache-management/examples/ucm_config_example.yaml`, because PyPI packages do not include YAML files.
 
 ### Option 3: Setup from docker
 Download the pre-built `vllm-ascend` docker image and build unified-cache-management docker image by commands below:
@@ -39,6 +65,14 @@ Download the pre-built `vllm-ascend` docker image and build unified-cache-manage
  cd unified-cache-management
  docker build -t ucm-vllm:latest -f ./docker/Dockerfile-NPU ./
  ```
+vllm-ascend provides two variants: **Ubuntu** and **openEuler**.  
+The `Dockerfile-NPU` uses the **openEuler** variant by default.
+
+If you want to use the **Ubuntu** variant, please remove the `-openeuler` suffix and use the following image instead:
+
+```text
+quay.io/ascend/vllm-ascend:v0.9.2rc1
+```
 Then run your container using the following command. You can add or remove Docker parameters as needed.
 ```bash
 # Update DEVICE according to your device (/dev/davinci[0-7])
 
@@ -80,4 +80,5 @@ performance.
 :::{toctree}
 :maxdepth: 1
 nfs_store
+pipeline_store
 :::
@@ -227,12 +227,12 @@ This log indicates that the **Cache Store** has received a **load or dump task**
 | `subtask_number` | Number of subtasks executed in this operation                                  |
 | `size`       | Total size of data transferred in bytes (across all tasks)                  |
 
-```test
+```text
 [UC][D] Cache task({task_id},{operation},{subtask_number},{size}) finished, cost {time}ms. [PID,TID]
 ```
 This log indicates that a load or dump task in the **Cache Store** has completed, along with its execution time **in ms**.
 
-```test
+```text
 [UC][D] Posix task({task_id},{operation},{subtask_number},{size}) dispatching. [PID,TID]
 ```
 This log indicates that the **Posix Store** has received a **load or dump task**
@@ -243,7 +243,7 @@ This log indicates that the **Posix Store** has received a **load or dump task**
 | `subtask_number` | Number of subtasks executed in this operation                                  |
 | `size`       | Total size of data transferred in bytes (across all tasks)                  |
 
-```test
+```text
 [UC][D] Posix task({task_id},{operation},{subtask_number},{size}) finished, cost {time}ms. [PID,TID]
 ```
 This log indicates that a load or dump task in the **Posix Store** has completed, along with its execution time **in ms**.
@@ -77,7 +77,7 @@ def build_llm_with_uc(module_path: str, name: str, model: str):
                     },
                 }
             ],
-            "ucm_sparse_config": {"GSA": {}},
+            "ucm_sparse_config": {"KvCompOnDevice": {}},
         },
     )
 
 
@@ -31,8 +31,7 @@ load_only_first_rank: false
   # Or for GSA:
   # GSA: {}
   # Or for KvCompOnDevice:
-  # KvCompOnDevice:
-  #   "kvcompOnDevice_config_path": "workspace/unified-cache-management/ucm/sparse/kvcomp/configs/kvcomp_qwen3_32B_config.json"
+  # KvCompOnDevice: {}
 
 
 # Whether to use layerwise loading/saving (optional, default: True for UCMConnector)
 
@@ -139,7 +139,7 @@ def build_cmake(self, ext: CMakeExtension):
 
 setup(
     name="uc-manager",
-    version="0.2.0rc2",
+    version="0.2.0",
     description="Unified Cache Management",
     author="Unified Cache Team",
     packages=find_packages(),
Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,7 @@ def build_llm_with_uc(module_path: str, name: str, model: str):`
`77`	`77`	`},`
`78`	`78`	`}`
`79`	`79`	`],`
`80`		`- "ucm_sparse_config": {"GSA": {}},`
	`80`	`+ "ucm_sparse_config": {"KvCompOnDevice": {}},`
`81`	`81`	`},`
`82`	`82`	`)`
`83`	`83`