Use caching allocator for runner (#15730)

kimishpatel · web-flow · commit cb4e5ae6b969 · 2026-04-27T21:40:35.000Z
Differential Revision: D86120038
Pull Request resolved: #15730
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1124,6 +1124,8 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/memory_allocator)
+  list(APPEND _executorch_extensions extension_memory_allocator)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
   list(APPEND _executorch_extensions extension_llm_runner)
 endif()
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
@@ -39,8 +39,9 @@ add_subdirectory(
   ${CMAKE_CURRENT_BINARY_DIR}/../sampler
 )
 
-set(runner_deps executorch_core extension_module extension_tensor
-                extension_llm_sampler tokenizers::tokenizers
+set(runner_deps
+    executorch_core extension_module extension_tensor extension_llm_sampler
+    extension_memory_allocator tokenizers::tokenizers
 )
 
 # depend on arange_utils
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
@@ -17,6 +17,7 @@
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
@@ -226,12 +227,28 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 
   // Create the Module
   std::unique_ptr<Module> module;
+  uint32_t max_cached_memory_size_bytes_ = 1024 * 1024 * 10; // 10MB
   if (data_files.size() > 0) {
     module = std::make_unique<Module>(
-        model_path, data_files, load_mode, std::move(event_tracer));
+        model_path,
+        data_files,
+        load_mode,
+        std::move(event_tracer),
+        nullptr, // memory allocator
+        std::make_unique<
+            executorch::extension::CPUCachingAllocator>( // temp memory
+                                                         // allocator
+            max_cached_memory_size_bytes_));
   } else {
     module = std::make_unique<Module>(
-        model_path, load_mode, std::move(event_tracer));
+        model_path,
+        load_mode,
+        std::move(event_tracer), // event tracer
+        nullptr, // memory allocator
+        std::make_unique<
+            executorch::extension::CPUCachingAllocator>( // temp memory
+                                                         // allocator
+            max_cached_memory_size_bytes_));
   }
 
   // Get metadata from Module
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
@@ -132,6 +132,7 @@ def define_common_targets():
                 ":text_prefiller" + aten_suffix,
                 ":text_token_generator" + aten_suffix,
                 "//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix,
+                "//executorch/extension/memory_allocator:cpu_caching_allocator",
                 "//pytorch/tokenizers:hf_tokenizer",
                 "//pytorch/tokenizers:llama2c_tokenizer",
                 "//pytorch/tokenizers:sentencepiece",

Original file line number	Diff line number	Diff line change
`@@ -39,8 +39,9 @@ add_subdirectory(`
`39`	`39`	`${CMAKE_CURRENT_BINARY_DIR}/../sampler`
`40`	`40`	`)`
`41`	`41`
`42`		`-set(runner_deps executorch_core extension_module extension_tensor`
`43`		`- extension_llm_sampler tokenizers::tokenizers`
	`42`	`+set(runner_deps`
	`43`	`+ executorch_core extension_module extension_tensor extension_llm_sampler`
	`44`	`+ extension_memory_allocator tokenizers::tokenizers`
`44`	`45`	`)`
`45`	`46`
`46`	`47`	`# depend on arange_utils`