Skip to content

Commit 6ebb435

Browse files
kimishpatel authored and facebook-github-bot committed
Use caching allocator for runner (#15730)
Summary: We observed that on iOS this improves perf by 6%, because the SDPA op does temp allocations. No significant difference on Android, though. ghstack-source-id: 328001114 exported-using-ghexport Reviewed By: navsud, derekdixu Differential Revision: D86120038
1 parent 3466332 commit 6ebb435

4 files changed

Lines changed: 23 additions & 3 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,6 +1098,8 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
10981098
endif()
10991099

11001100
if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
1101+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator)
1102+
list(APPEND _executorch_extensions extension_memory_allocator)
11011103
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
11021104
list(APPEND _executorch_extensions extension_llm_runner)
11031105
endif()

extension/llm/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ add_subdirectory(
4040
)
4141

4242
set(runner_deps executorch_core extension_module extension_tensor
43-
extension_llm_sampler tokenizers::tokenizers
43+
extension_llm_sampler extension_memory_allocator tokenizers::tokenizers
4444
)
4545

4646
# depend on arange_utils

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <executorch/extension/llm/runner/text_llm_runner.h>
1818
#include <executorch/extension/llm/runner/text_prefiller.h>
1919
#include <executorch/extension/llm/runner/text_token_generator.h>
20+
#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
2021
#include <executorch/runtime/core/result.h>
2122
#include <executorch/runtime/platform/runtime.h>
2223
#include <pytorch/tokenizers/hf_tokenizer.h>
@@ -223,12 +224,28 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
223224

224225
// Create the Module
225226
std::unique_ptr<Module> module;
227+
uint32_t max_cached_memory_size_bytes_ = 1024 * 1024 * 10; // 10MB
226228
if (data_files.size() > 0) {
227229
module = std::make_unique<Module>(
228-
model_path, data_files, load_mode, std::move(event_tracer));
230+
model_path,
231+
data_files,
232+
load_mode,
233+
std::move(event_tracer),
234+
nullptr, // memory allocator
235+
std::make_unique<
236+
executorch::extension::CPUCachingAllocator>( // temp memory
237+
// allocator
238+
max_cached_memory_size_bytes_));
229239
} else {
230240
module = std::make_unique<Module>(
231-
model_path, load_mode, std::move(event_tracer));
241+
model_path,
242+
load_mode,
243+
std::move(event_tracer), // event tracer
244+
nullptr, // memory allocator
245+
std::make_unique<
246+
executorch::extension::CPUCachingAllocator>( // temp memory
247+
// allocator
248+
max_cached_memory_size_bytes_));
232249
}
233250

234251
// Get metadata from Module

extension/llm/runner/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def define_common_targets():
132132
":text_prefiller" + aten_suffix,
133133
":text_token_generator" + aten_suffix,
134134
"//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix,
135+
"//executorch/extension/memory_allocator:cpu_caching_allocator",
135136
"//pytorch/tokenizers:hf_tokenizer",
136137
"//pytorch/tokenizers:llama2c_tokenizer",
137138
"//pytorch/tokenizers:sentencepiece",

0 commit comments

Comments (0)