Skip to content

Commit 818fb0a

Browse files
committed
[Executorch][LLM] Use caching allocator for runner
Pull Request resolved: #15730. We observed that on iOS this improves performance by 6% because the SDPA op performs temporary allocations. There is no significant difference on Android, though. ghstack-source-id: 368353572 @exported-using-ghexport Differential Revision: [D86120038](https://our.internmc.facebook.com/intern/diff/D86120038/)
1 parent a43675c commit 818fb0a

4 files changed

Lines changed: 25 additions & 4 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,8 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
11241124
endif()
11251125

11261126
if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
1127+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator)
1128+
list(APPEND _executorch_extensions extension_memory_allocator)
11271129
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
11281130
list(APPEND _executorch_extensions extension_llm_runner)
11291131
endif()

extension/llm/runner/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@ add_subdirectory(
3939
${CMAKE_CURRENT_BINARY_DIR}/../sampler
4040
)
4141

42-
set(runner_deps executorch_core extension_module extension_tensor
43-
extension_llm_sampler tokenizers::tokenizers
42+
set(runner_deps
43+
executorch_core extension_module extension_tensor extension_llm_sampler
44+
extension_memory_allocator tokenizers::tokenizers
4445
)
4546

4647
# depend on arange_utils

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <executorch/extension/llm/runner/text_llm_runner.h>
1818
#include <executorch/extension/llm/runner/text_prefiller.h>
1919
#include <executorch/extension/llm/runner/text_token_generator.h>
20+
#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
2021
#include <executorch/runtime/core/result.h>
2122
#include <executorch/runtime/platform/runtime.h>
2223
#include <pytorch/tokenizers/hf_tokenizer.h>
@@ -226,12 +227,28 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
226227

227228
// Create the Module
228229
std::unique_ptr<Module> module;
230+
uint32_t max_cached_memory_size_bytes_ = 1024 * 1024 * 10; // 10MB
229231
if (data_files.size() > 0) {
230232
module = std::make_unique<Module>(
231-
model_path, data_files, load_mode, std::move(event_tracer));
233+
model_path,
234+
data_files,
235+
load_mode,
236+
std::move(event_tracer),
237+
nullptr, // memory allocator
238+
std::make_unique<
239+
executorch::extension::CPUCachingAllocator>( // temp memory
240+
// allocator
241+
max_cached_memory_size_bytes_));
232242
} else {
233243
module = std::make_unique<Module>(
234-
model_path, load_mode, std::move(event_tracer));
244+
model_path,
245+
load_mode,
246+
std::move(event_tracer), // event tracer
247+
nullptr, // memory allocator
248+
std::make_unique<
249+
executorch::extension::CPUCachingAllocator>( // temp memory
250+
// allocator
251+
max_cached_memory_size_bytes_));
235252
}
236253

237254
// Get metadata from Module

extension/llm/runner/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def define_common_targets():
132132
":text_prefiller" + aten_suffix,
133133
":text_token_generator" + aten_suffix,
134134
"//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix,
135+
"//executorch/extension/memory_allocator:cpu_caching_allocator",
135136
"//pytorch/tokenizers:hf_tokenizer",
136137
"//pytorch/tokenizers:llama2c_tokenizer",
137138
"//pytorch/tokenizers:sentencepiece",

0 commit comments

Comments
 (0)