@@ -16,7 +16,6 @@ limitations under the License.
1616#include " block_manager_pool.h"
1717
1818#include < algorithm>
19- #include < limits>
2019
2120#include " block_manager_impl.h"
2221#include " common/global_flags.h"
@@ -125,7 +124,7 @@ bool BlockManagerPool::allocate_embedding_id(Sequence* sequence,
125124
126125void BlockManagerPool::deallocate_embedding_id (Sequence* sequence,
127126 int32_t dp_rank) {
128- DCHECK (sequence != nullptr );
127+ CHECK (sequence != nullptr );
129128 CHECK_GE (dp_rank, 0 );
130129 CHECK_LT (static_cast <size_t >(dp_rank), embedding_managers_.size ());
131130 auto embedding_block = sequence->reset_embedding_block ();
@@ -152,7 +151,7 @@ void BlockManagerPool::deallocate(std::vector<Sequence*>& sequences) {
152151}
153152
154153void BlockManagerPool::deallocate (Sequence* sequence) {
155- DCHECK (sequence != nullptr );
154+ CHECK (sequence != nullptr );
156155 // add blocks to the prefix cache
157156 int32_t dp_rank = get_dp_rank (sequence);
158157 cache (sequence);
@@ -173,13 +172,13 @@ void BlockManagerPool::reset_transfer_infos() {
173172}
174173
175174bool BlockManagerPool::allocate (Sequence* sequence) {
176- DCHECK (sequence != nullptr );
175+ CHECK (sequence != nullptr );
177176 return allocate (sequence, sequence->num_tokens ());
178177}
179178
180179bool BlockManagerPool::allocate (std::vector<Sequence*>& sequences) {
181180 for (auto * sequence : sequences) {
182- DCHECK (sequence != nullptr );
181+ CHECK (sequence != nullptr );
183182 if (!allocate (sequence, sequence->num_tokens ())) {
184183 // Should we guarantee the atomicity of the allocation (all or nothing)?
185184 return false ;
@@ -190,7 +189,7 @@ bool BlockManagerPool::allocate(std::vector<Sequence*>& sequences) {
190189
191190bool BlockManagerPool::allocate (Sequence* sequence, size_t num_tokens) {
192191 AUTO_COUNTER (allocate_blocks_latency_seconds);
193- DCHECK (sequence != nullptr );
192+ CHECK (sequence != nullptr );
194193 int32_t dp_rank = get_dp_rank (sequence);
195194 const bool started_empty = sequence->kv_state ().num_kv_blocks () == 0 ;
196195 const bool needs_embedding_id = !sequence->has_embedding_id ();
@@ -340,10 +339,30 @@ void BlockManagerPool::allocate_shared(Sequence* sequence) {
340339}
341340
342341void BlockManagerPool::cache (Sequence* sequence) {
342+ cache (sequence, sequence->kv_state ().kv_cache_tokens_num ());
343+ }
344+
345+ void BlockManagerPool::cache (Sequence* sequence, size_t num_tokens) {
346+ CHECK (sequence != nullptr );
347+ if (!options_.enable_prefix_cache ()) {
348+ return ;
349+ }
350+
351+ const size_t block_size = static_cast <size_t >(options_.block_size ());
352+ const size_t available_tokens_num =
353+ std::min ({num_tokens,
354+ sequence->kv_state ().num_kv_blocks () * block_size,
355+ sequence->tokens ().size ()});
356+ const size_t existed_shared_blocks_num =
357+ sequence->kv_state ().shared_kv_blocks_num ();
358+ if (available_tokens_num <= existed_shared_blocks_num * block_size) {
359+ return ;
360+ }
361+
343362 int32_t dp_rank = get_dp_rank (sequence);
344- const auto token_ids = sequence->cached_tokens ( );
363+ const auto token_ids = sequence->tokens (). slice ( 0 , available_tokens_num );
345364 auto * blocks = sequence->kv_state ().mutable_kv_blocks ();
346- auto existed_shared_blocks_num = sequence-> kv_state (). shared_kv_blocks_num ( );
365+ CHECK_GE (blocks-> size (), existed_shared_blocks_num );
347366 block_managers_[dp_rank]->cache (
348367 token_ids, *blocks, existed_shared_blocks_num);
349368}
@@ -399,7 +418,7 @@ double BlockManagerPool::kv_cache_utilization() const {
399418// Currently used only for profiling, which does not need the prefix cache.
400419// If it becomes more widely used, it can be folded into the deallocate function.
401420void BlockManagerPool::deallocate_without_cache (Sequence* sequence) {
402- DCHECK (sequence != nullptr );
421+ CHECK (sequence != nullptr );
403422 int32_t dp_rank = get_dp_rank (sequence);
404423 block_managers_[dp_rank]->deallocate (sequence->kv_state ().kv_blocks ());
405424 deallocate_embedding_id (sequence, dp_rank);
0 commit comments