Skip to content

Commit b580d2a

Browse files
[Feat] Add UCM store compression module and compression config parameters — Feature store compress (#940)
## Purpose

This PR introduces a configurable compression module for the UCM store, designed to reduce **TTFT (Time-To-First-Token)** on SSD cache hits. By compressing stored BF16 tensors and supporting multi-threaded decompression, it aims to lower I/O overhead and speed up cache access, improving end-to-end inference latency.

## Modifications

- Added three new configuration parameters under `ucm_connector_config` to control compression behavior:
  - `compress_ratio`: Compression ratio (currently only 2.0x is supported)
  - `data_type`: Target data type (currently only BF16 is supported)
  - `decompress_thread_num`: Number of threads used for parallel decompression
- Implemented full compression/decompression pipeline based on KVfold coding
- Integrated the compression module into the existing UCM store pipeline
- Added YAML configuration support for compression feature toggling and tuning

## Test

- Model: Qwen2.5-14B-Instruct
- Hardware: Kunpeng 920 5250 + 4 × Ascend 910B4 (tensor parallel = 4)
- Test Config: 0% hit on DRAM cache, 50% / 80% / 100% hit on local SSD.
Layer_wise (use_layerwise: true):

| input tokens | output tokens | batch size | odirect | TTFT/ms, 50% hit, w/o compression | TTFT/ms, 50% hit, with compression | TTFT reduction | TTFT/ms, 80% hit, w/o compression | TTFT/ms, 80% hit, with compression | TTFT reduction | TTFT/ms, 100% hit, w/o compression | TTFT/ms, 100% hit, with compression | TTFT reduction |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 4000 | 1 | 1 | TRUE | 357.99 | 341.97 | 4.47% | 207.02 | 225.74 | -9.04% | 208.37 | 193.22 | 7.27% |
| 8000 | 1 | 1 | TRUE | 531.6 | 578.86 | -8.89% | 304.06 | 321.29 | -5.67% | 350.01 | 331.22 | 5.37% |
| 16000 | 1 | 1 | TRUE | 1236.44 | 1284.19 | -3.86% | 800.25 | 720.4 | 9.98% | 611.74 | 532.57 | 12.94% |
| 32000 | 1 | 1 | TRUE | 3278.62 | 3152.37 | 3.85% | 2077.46 | 1807.59 | 12.99% | 1204.62 | 959.57 | 20.34% |
| 4000 | 1 | 8 | TRUE | 1339.06 | 1396.03 | -4.25% | 868.46 | 839.52 | 3.33% | 1056.99 | 829.89 | 21.49% |
| 8000 | 1 | 8 | TRUE | 2353.23 | 2373.6 | -0.87% | 1436.33 | 1356.39 | 5.57% | 2024.04 | 1421.53 | 29.77% |
| 16000 | 1 | 8 | TRUE | 5311.28 | 5548.89 | -4.47% | 3433.49 | 2829.16 | 17.60% | 3808.22 | 2906.72 | 23.67% |
| 32000 | 1 | 8 | TRUE | 13796.04 | 13660.42 | 0.98% | 8353.08 | 7851.27 | 6.01% | 6939.55 | 5274.13 | 24.00% |
| 4000 | 1 | 16 | TRUE | 2280.48 | 2343.98 | -2.78% | 1428.27 | 1232.16 | 13.73% | 2154.65 | 1342.75 | 37.68% |
| 8000 | 1 | 16 | TRUE | 4414.05 | 4436.29 | -0.50% | 2639.71 | 2388.07 | 9.53% | 4170.57 | 2646 | 36.56% |
| 16000 | 1 | 16 | TRUE | 10144.97 | 9960.42 | 1.82% | 6167.56 | 5084.08 | 17.57% | 7170.8 | 4005.21 | 44.15% |
| 32000 | 1 | 16 | TRUE | 25967.02 | 24714.92 | 4.82% | 15550.23 | 12950.47 | 16.72% | 12261.77 | 7504.62 | 38.80% |

Block_wise (use_layerwise: false):

| input tokens | output tokens | batch size | odirect | TTFT/ms, 50% hit, w/o compression | TTFT/ms, 50% hit, with compression | TTFT reduction | TTFT/ms, 80% hit, w/o compression | TTFT/ms, 80% hit, with compression | TTFT reduction | TTFT/ms, 100% hit, w/o compression | TTFT/ms, 100% hit, with compression | TTFT reduction |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 4000 | 1 | 1 | TRUE | 423.87 | 406.85 | 4.02% | 307.49 | 253.9 | 17.43% | 260.84 | 198.13 | 24.04% |
| 8000 | 1 | 1 | TRUE | 688.38 | 665.83 | 3.28% | 507.52 | 419.18 | 17.41% | 460.22 | 352.85 | 23.33% |
| 16000 | 1 | 1 | TRUE | 1528.11 | 1479.79 | 3.16% | 1083.55 | 874.61 | 19.28% | 617.35 | 446.64 | 27.65% |
| 32000 | 1 | 1 | TRUE | 3678.05 | 3426.42 | 6.84% | 2327.42 | 2039.48 | 12.37% | 1387.85 | 832.33 | 40.03% |
| 4000 | 1 | 8 | TRUE | 1801.28 | 1673.41 | 7.10% | 1261.3 | 1025.12 | 18.73% | 1117.65 | 745.11 | 33.33% |
| 8000 | 1 | 8 | TRUE | 3262.38 | 2952.3 | 9.50% | 2344.01 | 1880.82 | 19.76% | 2381.43 | 1314.63 | 44.80% |
| 16000 | 1 | 8 | TRUE | 6982.66 | 6343.26 | 9.16% | 4897.66 | 3862.16 | 21.14% | 4157.12 | 2060.17 | 50.44% |
| 32000 | 1 | 8 | TRUE | 16615.58 | 14960.37 | 9.96% | 10797.82 | 8443.65 | 21.80% | 7872.14 | 3896.58 | 50.50% |
| 4000 | 1 | 16 | TRUE | 3181.24 | 2928.33 | 7.95% | 2228.06 | 1775.39 | 20.32% | 2402.1 | 1152.3 | 52.03% |
| 8000 | 1 | 16 | TRUE | 6082.47 | 5437.54 | 10.60% | 4213.28 | 3197.17 | 24.12% | 4381.51 | 2386.19 | 45.54% |
| 16000 | 1 | 16 | TRUE | 12832.2 | 11805.53 | 8.00% | 8489.46 | 6721.34 | 20.83% | 6231.51 | 3555.04 | 42.95% |
| 32000 | 1 | 16 | TRUE | 29728.23 | 28228.75 | 5.04% | 18745.21 | 15293.23 | 18.42% | 13077.52 | 6831.96 | 47.76% |

Co-authored-by: xwLearnsLLM <wangxuan154@huawei.com>
1 parent 9222bd8 commit b580d2a

33 files changed

Lines changed: 9507 additions & 2 deletions

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ repos:
66
exclude: \.(jsonl|txt)$
77
args: [
88
'--skip', 'ucm/csrc/**,ucm.egg-info/**,.github/**,ucm/sparse/gsa_on_device/csrc/**',
9-
'-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn'
9+
'-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,Collet,DElt,re-use'
1010
]
1111
- repo: https://github.com/psf/black
1212
rev: 24.4.2

ucm/store/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@ target_link_libraries(storeintf INTERFACE storedetail infra_status)
55
add_subdirectory(nfsstore)
66
add_subdirectory(pcstore)
77
add_subdirectory(ds3fs)
8+
add_subdirectory(posix)
9+
add_subdirectory(compress)
810
add_subdirectory(cache)
911
add_subdirectory(empty)
1012
add_subdirectory(fake)
1113
add_subdirectory(mooncakestore)
1214
add_subdirectory(pipeline)
13-
add_subdirectory(posix)
1415
add_subdirectory(test)

ucm/store/compress/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
file(GLOB_RECURSE UCM_COMPRESSOR_STORE_CC_SOURCE_FILES "./cc/*.cc")
2+
3+
add_library(compressor SHARED ${UCM_COMPRESSOR_STORE_CC_SOURCE_FILES})
4+
target_include_directories(compressor PUBLIC
5+
${CMAKE_CURRENT_SOURCE_DIR}/cc
6+
${CMAKE_CURRENT_SOURCE_DIR}/cc/compress_lib
7+
)
8+
target_link_libraries(compressor PUBLIC storeintf infra_logger)
9+
10+
file(RELATIVE_PATH INSTALL_REL_PATH ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
11+
install(TARGETS compressor LIBRARY DESTINATION ${INSTALL_REL_PATH} COMPONENT ucm)

0 commit comments

Comments
 (0)