Skip to content

Commit 1f1f212

Browse files
committed
use docker to build & test, update sparse attention e2e test
1 parent ea57b57 commit 1f1f212

4 files changed

Lines changed: 108 additions & 48 deletions

File tree

.github/workflows/pull-request.yml

Lines changed: 66 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -76,8 +76,6 @@ jobs:
7676
test-e2e-pc-gpu:
7777
runs-on: gpu
7878
needs: lint-and-unit-tests
79-
env:
80-
BUILD_TYPE: Release
8179
permissions:
8280
checks: write
8381
pull-requests: write
@@ -90,30 +88,54 @@ jobs:
9088
rm -rf .[!.]*
9189
fi
9290
- uses: actions/checkout@v4
91+
- name: Install Docker CLI
92+
run: |
93+
if ! command -v docker &> /dev/null; then
94+
echo "Docker CLI not found, installing..."
95+
sudo apt-get update
96+
sudo apt-get install -y docker.io
97+
else
98+
echo "Docker CLI already installed"
99+
fi
100+
- name: Generate Docker Image Version
101+
id: version
102+
run: |
103+
DATE=$(date +%Y%m%d)
104+
SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7)
105+
VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}"
106+
echo "version=${VERSION}" >> $GITHUB_OUTPUT
107+
echo "Docker image version: ${VERSION}"
93108
- name: Build
94109
run: |
95110
cd ${{github.workspace}}
96-
export PLATFORM=cuda
97-
pip install -v -e . --no-build-isolation
98-
- name: Test E2E
111+
sudo docker build -t ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} -f ./docker/Dockerfile-onlyPC ./
112+
- name: Test E2E in Docker
99113
run: |
100-
cd ${{github.workspace}}
101-
cd test
102-
pip install -r requirements.txt
103-
python3 -m pytest -x --stage=1 --feature=offline_inference --junitxml=offline-inference.xml
114+
sudo docker run --rm \
115+
-itd \
116+
--gpus all \
117+
--network=host \
118+
--ipc=host \
119+
--cap-add IPC_LOCK \
120+
-v /home/models:/home/models \
121+
-v ${{github.workspace}}:/workspace \
122+
ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} \
123+
-c "cd /workspace/test && pip install -r requirements.txt && python3 -m pytest -x --stage=1 --feature=offline_inference --junitxml=offline-inference.xml"
104124
- name: Upload pytest results
105125
uses: EnricoMi/publish-unit-test-result-action/linux@v2
106126
if: (!cancelled())
107127
with:
108128
files: |
109129
${{github.workspace}}/test/offline-inference.xml
110130
check_name: Prefix cache test results
131+
- name: Cleanup Docker Image
132+
if: always()
133+
run: |
134+
sudo docker rmi ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} || true
111135
112136
test-e2e-sparse-gpu:
113137
runs-on: gpu
114138
needs: lint-and-unit-tests
115-
env:
116-
BUILD_TYPE: Release
117139
permissions:
118140
checks: write
119141
pull-requests: write
@@ -126,23 +148,48 @@ jobs:
126148
rm -rf .[!.]*
127149
fi
128150
- uses: actions/checkout@v4
151+
- name: Install Docker CLI
152+
run: |
153+
if ! command -v docker &> /dev/null; then
154+
echo "Docker CLI not found, installing..."
155+
sudo apt-get update
156+
sudo apt-get install -y docker.io
157+
else
158+
echo "Docker CLI already installed"
159+
fi
160+
- name: Generate Docker Image Version
161+
id: version
162+
run: |
163+
DATE=$(date +%Y%m%d)
164+
SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7)
165+
VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}"
166+
echo "version=${VERSION}" >> $GITHUB_OUTPUT
167+
echo "Docker image version: ${VERSION}"
129168
- name: Build
130169
run: |
131170
cd ${{github.workspace}}
132-
export PLATFORM=cuda
133-
export ENABLE_SPARSE=TRUE
134-
pip install -v -e . --no-build-isolation
135-
- name: Test E2E
171+
sudo docker build -t ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} -f ./docker/Dockerfile ./
172+
- name: Test E2E in Docker
136173
run: |
137-
cd ${{github.workspace}}
138-
cd test
139-
pip install -r requirements.txt
140-
python3 -m pytest -x --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml
174+
sudo docker run --rm \
175+
-itd \
176+
--gpus all \
177+
--network=host \
178+
--ipc=host \
179+
--cap-add IPC_LOCK \
180+
-v /home/models:/home/models \
181+
-v ${{github.workspace}}:/workspace \
182+
ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} \
183+
-c "cd /workspace/test && pip install -r requirements.txt && python3 -m pytest -x --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml"
141184
- name: Upload pytest results
142185
uses: EnricoMi/publish-unit-test-result-action/linux@v2
143186
if: (!cancelled())
144187
with:
145188
files: |
146189
${{github.workspace}}/test/offline-inference-sparse.xml
147190
check_name: Sparse attention test results
191+
- name: Cleanup Docker Image
192+
if: always()
193+
run: |
194+
sudo docker rmi ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} || true
148195

docker/Dockerfile-onlyPC

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
# Set to other image if needed
2+
FROM vllm/vllm-openai:v0.9.2
3+
4+
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
5+
6+
WORKDIR /workspace
7+
8+
# Install unified-cache-management
9+
COPY . /workspace/unified-cache-management
10+
11+
RUN pip config set global.index-url ${PIP_INDEX_URL}
12+
13+
RUN export PLATFORM="cuda" && \
14+
pip install -v -e /workspace/unified-cache-management --no-build-isolation
15+
16+
ENTRYPOINT ["/bin/bash"]

test/conftest.py

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -164,20 +164,25 @@ def pytest_runtest_logreport(report):
164164

165165

166166
def get_free_gpu(required_memory_mb):
167-
mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM
168-
pynvml.nvmlInit()
169-
device_count = pynvml.nvmlDeviceGetCount()
170-
device_indices = list(range(device_count))
171-
random.shuffle(device_indices)
172-
for i in device_indices: # random order to reduce collisions
173-
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
174-
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
175-
free_in_mb = info.free / 1024**2
176-
if free_in_mb >= mem_needed_with_buffer:
177-
utilization = (
178-
required_memory_mb * (1024**2) / info.total if info.total else 0
179-
)
180-
return i, free_in_mb, utilization
167+
try:
168+
mem_needed_with_buffer = int(
169+
required_memory_mb * 1.3
170+
) # add buffer to avoid OOM
171+
pynvml.nvmlInit()
172+
device_count = pynvml.nvmlDeviceGetCount()
173+
device_indices = list(range(device_count))
174+
random.shuffle(device_indices)
175+
for i in device_indices: # random order to reduce collisions
176+
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
177+
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
178+
free_in_mb = info.free / 1024**2
179+
if free_in_mb >= mem_needed_with_buffer:
180+
utilization = (
181+
required_memory_mb * (1024**2) / info.total if info.total else 0
182+
)
183+
return i, free_in_mb, utilization
184+
finally:
185+
pynvml.nvmlShutdown()
181186
return None, 0, 0
182187

183188

@@ -189,7 +194,7 @@ def setup_gpu_resource(request):
189194
gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed)
190195
if gpu_id is not None:
191196
print(
192-
f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}"
197+
f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization for test {gpu_utilization:.4%}"
193198
)
194199
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
195200
if gpu_utilization:

test/suites/E2E/test_offline_inference_sparse.py

Lines changed: 6 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -215,16 +215,16 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
215215
print(f"Standard answers:\n{standard_answers}")
216216
pytest.fail("HBM + SSD Mixed Accuracy Test Failed!")
217217

218-
"""Test ESA sparse attention."""
218+
"""Test GSA sparse attention."""
219219

220220
@pytest.mark.stage(1)
221221
@pytest.mark.feature("offline_inference_sparse")
222222
@pytest.mark.gpu_mem(6000)
223223
@pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"])
224224
@pytest.mark.parametrize("max_tokens", [200])
225-
@pytest.mark.parametrize("enforce_eager", [False])
226-
@pytest.mark.parametrize("max_num_batched_tokens", [2047])
227-
def test_offline_esa(
225+
@pytest.mark.parametrize("enforce_eager", [True])
226+
@pytest.mark.parametrize("max_num_batched_tokens", [30000])
227+
def test_offline_gsa(
228228
self,
229229
model_name: str,
230230
max_tokens: int,
@@ -286,15 +286,7 @@ def test_offline_esa(
286286
},
287287
}
288288
],
289-
"ucm_sparse_config": {
290-
"ESA": {
291-
"init_window_sz": 1,
292-
"local_window_sz": 2,
293-
"min_blocks": 4,
294-
"sparse_ratio": 0.3,
295-
"retrieval_stride": 5,
296-
}
297-
},
289+
"ucm_sparse_config": {"GSAOnDevice": {}},
298290
}
299291

300292
sampling_params = SamplingParams(
@@ -315,7 +307,7 @@ def test_offline_esa(
315307
sampling_params_dict,
316308
False, # enable_prefix_caching=False
317309
enforce_eager,
318-
"ESA",
310+
"GSA",
319311
max_num_batched_tokens,
320312
timeout=180,
321313
)

0 commit comments

Comments (0)