Skip to content

Commit e0e10cc

Browse files
authored
Fix dangling pointer in TextTokenGenerator non-kv-cache path (#18725)
Differential Revision: D99408541 Pull Request resolved: #18725
1 parent 19bbeac commit e0e10cc

File tree

2 files changed

+66
-2
lines changed

extension/llm/runner/test/test_text_llm_runner.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,4 +497,56 @@ TEST_F(RunnerTest, GenerateEmptyWithoutPrefillFails) {
497497
EXPECT_EQ(err, Error::InvalidState);
498498
}
499499

500+
// Test that TextTokenGenerator works correctly in non-kv-cache mode.
501+
// Exercises the code path fixed by reserving capacity before from_blob:
502+
// without reserve(), vector reallocation would invalidate the data pointer.
503+
// Regression test for the non-kv-cache path of TextTokenGenerator: the fix
// reserves vector capacity before handing the buffer to from_blob, so the
// tensor's data pointer must stay valid while tokens are appended.
TEST_F(RunnerTest, NonKvCacheGenerateCompletesSuccessfully) {
  auto tokenizer = createMockTokenizer();
  auto text_decoder_runner = createMockTextDecoderRunner();

  // In non-kv-cache mode the input tensor grows by one token per step.
  // Dereferencing the buffer on every call surfaces a dangling pointer
  // (e.g. under ASan) if a reallocation invalidated it.
  int step_count = 0;
  ON_CALL(*text_decoder_runner, step)
      .WillByDefault(
          [&](executorch::extension::TensorPtr& tokens_tensor, int64_t) {
            // Start with 4 tokens (prompt 1,2,3 + prefill token 4); each
            // completed step appends exactly one more before the next call.
            const int64_t expected_size = 4 + step_count;
            EXPECT_EQ(tokens_tensor->size(1), expected_size);

            // Read through the pointer to verify it is still valid.
            const auto* data = tokens_tensor->const_data_ptr<int64_t>();
            EXPECT_EQ(data[0], 1); // first prompt token
            EXPECT_EQ(data[1], 2);
            EXPECT_EQ(data[2], 3);
            EXPECT_EQ(data[3], 4); // prefill token

            ++step_count;
            return Result<executorch::aten::Tensor>(tensor);
          });

  Stats stats;
  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
      std::unordered_set<uint64_t>{100});
  TextTokenGenerator generator(
      tokenizer.get(),
      text_decoder_runner.get(),
      /*use_kv_cache=*/false,
      std::move(eos_ids),
      &stats);

  // Seed tokens: prompt (1, 2, 3) plus the prefill token (4).
  std::vector<uint64_t> tokens = {1, 2, 3, 4};
  // Generate enough tokens that the vector would reallocate without the
  // reserve() call in the fix.
  const int32_t max_new_tokens = 20;

  auto result = generator.generate(
      tokens, 4, max_new_tokens, 0.0f, [](const std::string&) {});

  EXPECT_TRUE(result.ok());
  EXPECT_EQ(result.get(), max_new_tokens);
  EXPECT_EQ(step_count, max_new_tokens);
}
551+
500552
} // namespace

extension/llm/runner/text_token_generator.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,23 @@ class ET_EXPERIMENTAL TextTokenGenerator {
7777
} else {
7878
token_data = tokens;
7979
token_shape = {1, static_cast<int>(tokens.size())};
80+
// Prevent reallocation that would invalidate from_blob's data pointer.
81+
token_data.reserve(token_data.size() + max_new_tokens);
8082
}
8183

82-
// initialize tensor wrappers
84+
// Create tensor wrapper. For non-kv-cache, use max capacity shape so
85+
// numel_bound_ is large enough for subsequent resize_tensor_ptr calls,
86+
// then resize down to the actual token count.
87+
auto max_shape = use_kv_cache_
88+
? token_shape
89+
: std::vector<executorch::aten::SizesType>{
90+
1, static_cast<int>(tokens.size() + max_new_tokens)};
8391
auto tokens_managed = from_blob(
84-
token_data.data(), token_shape, executorch::aten::ScalarType::Long);
92+
token_data.data(), max_shape, executorch::aten::ScalarType::Long);
93+
if (!use_kv_cache_) {
94+
ET_CHECK_OK_OR_RETURN_ERROR(
95+
resize_tensor_ptr(tokens_managed, token_shape));
96+
}
8597

8698
should_stop_ = false;
8799

0 commit comments

Comments
 (0)