File tree Expand file tree Collapse file tree
benchmarks/single_node/agentic Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -149,19 +149,6 @@ case "$OFFLOADING" in
149149 CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
150150 cd ..
151151
152- LMCACHE_ROCM_PATCH_DIR=" $RESULT_DIR /lmcache_rocm_patch"
153- write_lmcache_rocm_mp_patch " $LMCACHE_ROCM_PATCH_DIR "
154- write_chunked_connector_patch " $LMCACHE_ROCM_PATCH_DIR "
155- write_scheduler_assertion_patch " $LMCACHE_ROCM_PATCH_DIR "
156- export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
157- export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
158- export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
159- # Cap external KV tokens loaded per scheduling step to prevent GPU
160- # block exhaustion deadlock at high concurrency (c>=32). Default
161- # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to
162- # disable chunking (only safe at low concurrency).
163- export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=" ${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:- 32768} "
164- export PYTHONPATH=" $LMCACHE_ROCM_PATCH_DIR ${PYTHONPATH: +: $PYTHONPATH } "
165152 python3 -c " import lmcache.integration.vllm.lmcache_mp_connector" > /dev/null
166153
167154 # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
Original file line number Diff line number Diff line change @@ -153,19 +153,6 @@ case "$OFFLOADING" in
153153 CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
154154 cd ..
155155
156- LMCACHE_ROCM_PATCH_DIR=" $RESULT_DIR /lmcache_rocm_patch"
157- write_lmcache_rocm_mp_patch " $LMCACHE_ROCM_PATCH_DIR "
158- write_chunked_connector_patch " $LMCACHE_ROCM_PATCH_DIR "
159- write_scheduler_assertion_patch " $LMCACHE_ROCM_PATCH_DIR "
160- export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
161- export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
162- export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
163- # Cap external KV tokens loaded per scheduling step to prevent GPU
164- # block exhaustion deadlock at high concurrency (c>=32). Default
165- # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to
166- # disable chunking (only safe at low concurrency).
167- export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=" ${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:- 32768} "
168- export PYTHONPATH=" $LMCACHE_ROCM_PATCH_DIR ${PYTHONPATH: +: $PYTHONPATH } "
169156 python3 -c " import lmcache.integration.vllm.lmcache_mp_connector" > /dev/null
170157
171158 # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
Original file line number Diff line number Diff line change @@ -149,19 +149,6 @@ case "$OFFLOADING" in
149149 CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
150150 cd ..
151151
152- LMCACHE_ROCM_PATCH_DIR=" $RESULT_DIR /lmcache_rocm_patch"
153- write_lmcache_rocm_mp_patch " $LMCACHE_ROCM_PATCH_DIR "
154- write_chunked_connector_patch " $LMCACHE_ROCM_PATCH_DIR "
155- write_scheduler_assertion_patch " $LMCACHE_ROCM_PATCH_DIR "
156- export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
157- export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
158- export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
159- # Cap external KV tokens loaded per scheduling step to prevent GPU
160- # block exhaustion deadlock at high concurrency (c>=32). Default
161- # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to
162- # disable chunking (only safe at low concurrency).
163- export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=" ${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:- 32768} "
164- export PYTHONPATH=" $LMCACHE_ROCM_PATCH_DIR ${PYTHONPATH: +: $PYTHONPATH } "
165152 python3 -c " import lmcache.integration.vllm.lmcache_mp_connector" > /dev/null
166153
167154 # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
You can’t perform that action at this time.
0 commit comments