Skip to content

Commit e4cd0d7

Browse files
author
wangpengcheng
committed
issue/340 - patch the code to prevent errors.
1 parent a7d2c78 commit e4cd0d7

4 files changed

Lines changed: 74 additions & 26 deletions

File tree

csrc/engine/compiler/paged_compiler.cpp

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,7 @@
55
namespace infinilm::engine {
66
PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
77
: GraphCompiler(model, barrier) {
8-
for (size_t b = 1; b < 64; ++b) {
9-
decode_batch_sizes_.push_back(b);
10-
}
11-
for (size_t b = 64; b < 128; b += 16) {
12-
decode_batch_sizes_.push_back(b);
13-
}
14-
for (size_t b = 128; b < 256; b += 32) {
15-
decode_batch_sizes_.push_back(b);
16-
}
17-
for (size_t b = 256; b <= 512; b += 64) {
8+
for (size_t b = 256; b > 0; b--) {
189
decode_batch_sizes_.push_back(b);
1910
}
2011
}

csrc/engine/rank_worker.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -386,18 +386,23 @@ void RankWorker::thread_loop() {
386386
const auto &vocab_size{logits_shape[2]};
387387
const auto &total_len{logits_shape[1]};
388388
const auto &batch_size{logits_shape[0]};
389+
int32_t seq_length = static_cast<int32_t>(batch_size * total_len);
389390

390391
auto n_req = local_args.input_offsets.value()->size(0) - 1;
391392
int32_t *input_offsets = (int32_t *)local_args.input_offsets.value()->data();
393+
ASSERT(input_offsets[n_req] == seq_length);
392394

393395
auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)};
394396

395397
for (auto i{decltype(n_req)(0)}; i < n_req; ++i) {
396-
auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(input_offsets[i + 1] - 1), 1}})->view({vocab_size})};
398+
int32_t score_index = input_offsets[i + 1] - 1;
399+
ASSERT(input_offsets[i + 1] > input_offsets[i]);
400+
ASSERT(score_index >= 0 && score_index < seq_length);
401+
402+
auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(score_index), 1}})->view({vocab_size})};
397403
auto out{output_ids->narrow({{0, i, 1}})->view({})};
398404
float random_val = std::uniform_real_distribution<float>(0, 1)(rng_);
399-
infinicore::op::random_sample_(
400-
out, score, random_val, top_p, top_k, temperature);
405+
infinicore::op::random_sample_(out, score, random_val, top_p, top_k, temperature);
401406
}
402407

403408
output_ids = output_ids->to(infinicore::Device::cpu());

python/infinilm/llm/scheduler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,8 @@ def schedule(self) -> Optional[SchedulerOutput]:
261261
scheduled_requests=scheduled_requests,
262262
is_prefill=is_prefill,
263263
)
264+
# logger.info("Scheduled decode: %d", len(scheduled_requests))
265+
264266
if self.connector is not None:
265267
meta = self.connector.build_connector_meta()
266268
scheduler_output.kv_connector_metadata = meta

python/infinilm/processors/basic_llm_processor.py

Lines changed: 63 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,29 @@
44
from ..llm.scheduler import SchedulerOutput
55

66

7+
def extend_to_next_power_of_two(lst):
    """Pad ``lst`` to the next power-of-two length with ``-1``.

    Padding marks unused slots (same convention as ``block_tables``).
    Callers must ``narrow`` to the real length before passing data to kernels.

    Args:
        lst: Iterable of numeric offsets or cumulative lengths. Lists, tuples
            and other iterables are accepted; the input is never mutated.

    Returns:
        A new list. Empty input yields ``[0]``; an input whose length is
        already a power of two yields an unpadded copy.
    """
    # Copy up front: this guarantees a fresh list is returned and lets
    # non-list inputs (e.g. tuples) be padded without a TypeError.
    items = list(lst)
    if not items:
        return [0]

    n = len(items)
    # Smallest power of two >= n; for n == 1 this evaluates to 1.
    target = 1 << (n - 1).bit_length()
    items.extend([-1] * (target - n))
    return items
28+
29+
730
@register_processor("default")
831
class BasicLLMProcessor(InfinilmProcessor):
932
def __init__(self, model_dir_path: str):
@@ -35,9 +58,13 @@ def apply_chat_template(
3558
normalized_conversation = []
3659
for message in conversation:
3760
if isinstance(message["content"], list):
38-
assert len(message["content"]) == 1, "Only one content item supported in list"
61+
assert len(message["content"]) == 1, (
62+
"Only one content item supported in list"
63+
)
3964
content_item = message["content"][0]
40-
assert "type" in content_item and "text" in content_item, "Content dict must have 'type' and 'text' keys"
65+
assert "type" in content_item and "text" in content_item, (
66+
"Content dict must have 'type' and 'text' keys"
67+
)
4168
normalized_conversation.append(
4269
{"role": message["role"], "content": content_item["text"]}
4370
)
@@ -229,21 +256,44 @@ def _build_model_input_from_batch_scheduler_output(
229256
block_tables.append(padded_block_table)
230257
cu_seqlens.append(cu_seqlens[-1] + seq_len)
231258

232-
return {
233-
"input_ids": infinicore.from_list([tokens], dtype=infinicore.int64),
234-
"position_ids": infinicore.from_list(position_ids, dtype=infinicore.int64),
235-
"past_kv_lengths": infinicore.from_list(
236-
cached_lens, dtype=infinicore.int32
237-
),
238-
"total_kv_lengths": infinicore.from_list(seq_lens, dtype=infinicore.int32),
239-
"input_offsets": infinicore.from_list(seq_offsets, dtype=infinicore.int32),
240-
"cu_seqlens": infinicore.from_list(cu_seqlens, dtype=infinicore.int32),
241-
"block_tables": infinicore.from_list(block_tables, dtype=infinicore.int32),
242-
"slot_mapping": infinicore.from_list(slot_mapping, dtype=infinicore.int64),
259+
assert seq_offsets[-1] == len(tokens), (
260+
f"seq_offsets[-1]={seq_offsets[-1]} != len(tokens)={len(tokens)}"
261+
)
262+
263+
length = len(seq_offsets)
264+
seq_offsets = extend_to_next_power_of_two(seq_offsets)
265+
cu_seqlens = extend_to_next_power_of_two(cu_seqlens)
266+
267+
input_ids = infinicore.from_list([tokens], dtype=infinicore.int64)
268+
position_ids = infinicore.from_list(position_ids, dtype=infinicore.int64)
269+
past_kv_lengths = infinicore.from_list(cached_lens, dtype=infinicore.int32)
270+
total_kv_lengths = infinicore.from_list(seq_lens, dtype=infinicore.int32)
271+
272+
input_offsets = infinicore.from_list(
273+
seq_offsets, dtype=infinicore.int32
274+
).narrow(0, 0, length)
275+
276+
cu_seqlens = infinicore.from_list(cu_seqlens, dtype=infinicore.int32).narrow(
277+
0, 0, length
278+
)
279+
280+
block_tables = infinicore.from_list(block_tables, dtype=infinicore.int32)
281+
slot_mapping = infinicore.from_list(slot_mapping, dtype=infinicore.int64)
282+
283+
return_dict = {
284+
"input_ids": input_ids,
285+
"position_ids": position_ids,
286+
"past_kv_lengths": past_kv_lengths,
287+
"total_kv_lengths": total_kv_lengths,
288+
"input_offsets": input_offsets,
289+
"cu_seqlens": cu_seqlens,
290+
"block_tables": block_tables,
291+
"slot_mapping": slot_mapping,
243292
"temperature": temperature,
244293
"top_k": top_k,
245294
"top_p": top_p,
246295
}
296+
return return_dict
247297

248298
def get_tokenizer(self):
249299
return self.tokenizer

0 commit comments

Comments
 (0)