Skip to content

Commit 879f4a7

Browse files
Metal backend: compute init/execute times (pytorch#16639)
This pull request introduces detailed performance tracking and reporting for the Metal backend, focusing on timing statistics for both initialization and execution phases. It adds infrastructure to collect, reset, and print timing data, and integrates statistics output into the Parakeet model example. Additionally, a preprocessor macro is defined to signal Metal backend availability. **Metal Backend Performance Statistics:** * Added timing statistics collection for `init()` and `execute()` calls in the Metal backend, including total time, call count, and per-method breakdowns. Accessor and reset functions are provided in `stats.h` and implemented in `metal_backend.cpp`. * Introduced a new `stats.cpp` file with a function to print all collected Metal backend statistics, including per-method breakdowns for both initialization and execution. **Build and Integration Improvements:** * Added `runtime/stats.cpp` to the Metal backend build sources and defined the `ET_BUILD_METAL` preprocessor macro to indicate Metal backend support. **Example Model Enhancements:** * Updated the Parakeet example (`main.cpp`) to show Metal backend timing stats after model execution if Metal is enabled. These changes provide better visibility into Metal backend performance and make it easier to profile and optimize model execution on Apple devices.
1 parent a20cb0d commit 879f4a7

7 files changed

Lines changed: 375 additions & 1 deletion

File tree

CMakePresets.json

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,8 @@
167167
"llm-release"
168168
],
169169
"cacheVariables": {
170-
"EXECUTORCH_BUILD_METAL": "ON"
170+
"EXECUTORCH_BUILD_METAL": "ON",
171+
"EXECUTORCH_METAL_COLLECT_STATS": "OFF"
171172
},
172173
"condition": {
173174
"lhs": "${hostSystemName}",
@@ -216,6 +217,23 @@
216217
"rhs": "Darwin"
217218
}
218219
},
220+
{
221+
"name": "llm-metal-stats",
222+
"displayName": "LLM Metal build with stats collection and logging",
223+
"inherits": [
224+
"llm-release-metal"
225+
],
226+
"cacheVariables": {
227+
"EXECUTORCH_METAL_COLLECT_STATS": "ON",
228+
"EXECUTORCH_ENABLE_LOGGING": "ON",
229+
"EXECUTORCH_LOG_LEVEL": "Info"
230+
},
231+
"condition": {
232+
"lhs": "${hostSystemName}",
233+
"type": "equals",
234+
"rhs": "Darwin"
235+
}
236+
},
219237
{
220238
"name": "profiling",
221239
"displayName": "Build ExecuTorch with Profiling Enabled",
@@ -320,6 +338,15 @@
320338
"install"
321339
],
322340
"jobs": 0
341+
},
342+
{
343+
"name": "llm-metal-stats-install",
344+
"displayName": "Build and install LLM extension artifacts with Metal stats",
345+
"configurePreset": "llm-metal-stats",
346+
"targets": [
347+
"install"
348+
],
349+
"jobs": 0
323350
}
324351
],
325352
"workflowPresets": [
@@ -406,6 +433,20 @@
406433
"name": "llm-debug-metal-install"
407434
}
408435
]
436+
},
437+
{
438+
"name": "llm-metal-stats",
439+
"displayName": "Configure, build and install ExecuTorch LLM extension with Metal stats and logging",
440+
"steps": [
441+
{
442+
"type": "configure",
443+
"name": "llm-metal-stats"
444+
},
445+
{
446+
"type": "build",
447+
"name": "llm-metal-stats-install"
448+
}
449+
]
409450
}
410451
]
411452
}

backends/apple/metal/CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ find_package_torch()
3636

3737
set(_aoti_metal_sources
3838
runtime/metal_backend.cpp
39+
runtime/stats.cpp
3940
runtime/shims/memory.cpp
4041
runtime/shims/et_metal.mm
4142
runtime/shims/et_metal_ops.mm
@@ -68,6 +69,17 @@ target_link_libraries(
6869

6970
target_compile_options(metal_backend PUBLIC -fexceptions -frtti -fPIC)
7071

72+
# Define C++ preprocessor macro for Metal backend availability
73+
target_compile_definitions(metal_backend PUBLIC ET_BUILD_METAL)
74+
75+
# Define macro for stats collection if enabled
76+
if(EXECUTORCH_METAL_COLLECT_STATS)
77+
target_compile_definitions(
78+
metal_backend PUBLIC EXECUTORCH_METAL_COLLECT_STATS
79+
)
80+
message(STATUS "Metal backend stats collection enabled")
81+
endif()
82+
7183
target_link_options(metal_backend PUBLIC -Wl,-export_dynamic)
7284

7385
# Find PyTorch's OpenMP library specifically for libtorch-less AOTI

backends/apple/metal/runtime/metal_backend.cpp

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212
#include <executorch/runtime/core/evalue.h>
1313
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
1414
#include <unistd.h>
15+
#include <chrono>
1516
#include <cstdio>
1617

1718
#include <filesystem>
1819
#include <fstream>
20+
#include <mutex>
1921
#include <string>
2022
#include <vector>
2123

@@ -29,9 +31,96 @@
2931
#include <executorch/backends/apple/metal/runtime/shims/shim_mps.h>
3032
#include <executorch/backends/apple/metal/runtime/shims/tensor_attribute.h>
3133
#include <executorch/backends/apple/metal/runtime/shims/utils.h>
34+
#include <executorch/backends/apple/metal/runtime/stats.h>
3235

3336
namespace executorch::backends::metal {
3437

38+
#ifdef EXECUTORCH_METAL_COLLECT_STATS
39+
40+
// Per-method timing statistics
41+
struct MethodStats {
42+
double total_ms = 0.0;
43+
int64_t call_count = 0;
44+
};
45+
46+
// Singleton struct containing all timing statistics and mutex
47+
struct StatsData {
48+
std::mutex mutex;
49+
double execute_total_ms = 0.0;
50+
int64_t execute_call_count = 0;
51+
double init_total_ms = 0.0;
52+
int64_t init_call_count = 0;
53+
std::unordered_map<std::string, MethodStats> method_stats;
54+
std::unordered_map<std::string, MethodStats> init_method_stats;
55+
};
56+
57+
// Thread-safe singleton accessor using C++11 magic statics
58+
static StatsData& get_stats_data() {
59+
static StatsData instance;
60+
return instance;
61+
}
62+
63+
// Accessor functions for execute timing statistics
64+
double get_metal_backend_execute_total_ms() {
65+
auto& stats = get_stats_data();
66+
std::lock_guard<std::mutex> lock(stats.mutex);
67+
return stats.execute_total_ms;
68+
}
69+
70+
int64_t get_metal_backend_execute_call_count() {
71+
auto& stats = get_stats_data();
72+
std::lock_guard<std::mutex> lock(stats.mutex);
73+
return stats.execute_call_count;
74+
}
75+
76+
// Accessor functions for init timing statistics
77+
double get_metal_backend_init_total_ms() {
78+
auto& stats = get_stats_data();
79+
std::lock_guard<std::mutex> lock(stats.mutex);
80+
return stats.init_total_ms;
81+
}
82+
83+
int64_t get_metal_backend_init_call_count() {
84+
auto& stats = get_stats_data();
85+
std::lock_guard<std::mutex> lock(stats.mutex);
86+
return stats.init_call_count;
87+
}
88+
89+
void reset_metal_backend_stats() {
90+
auto& stats = get_stats_data();
91+
std::lock_guard<std::mutex> lock(stats.mutex);
92+
stats.execute_total_ms = 0.0;
93+
stats.execute_call_count = 0;
94+
stats.init_total_ms = 0.0;
95+
stats.init_call_count = 0;
96+
stats.method_stats.clear();
97+
stats.init_method_stats.clear();
98+
}
99+
100+
std::unordered_map<std::string, std::pair<double, int64_t>>
101+
get_metal_backend_per_method_stats() {
102+
auto& stats = get_stats_data();
103+
std::lock_guard<std::mutex> lock(stats.mutex);
104+
std::unordered_map<std::string, std::pair<double, int64_t>> result;
105+
for (const auto& entry : stats.method_stats) {
106+
result[entry.first] = {entry.second.total_ms, entry.second.call_count};
107+
}
108+
return result;
109+
}
110+
111+
std::unordered_map<std::string, std::pair<double, int64_t>>
112+
get_metal_backend_init_per_method_stats() {
113+
auto& stats = get_stats_data();
114+
std::lock_guard<std::mutex> lock(stats.mutex);
115+
std::unordered_map<std::string, std::pair<double, int64_t>> result;
116+
for (const auto& entry : stats.init_method_stats) {
117+
result[entry.first] = {entry.second.total_ms, entry.second.call_count};
118+
}
119+
return result;
120+
}
121+
122+
#endif // EXECUTORCH_METAL_COLLECT_STATS
123+
35124
#define LOAD_SYMBOL(handle, member, name, so_handle) \
36125
do { \
37126
handle->member = reinterpret_cast<name##Func>(dlsym(so_handle, #name)); \
@@ -137,6 +226,9 @@ class ET_EXPERIMENTAL MetalBackend final
137226
FreeableBuffer* processed, // This will be a empty buffer
138227
ArrayRef<CompileSpec> compile_specs // This will be my empty list
139228
) const override {
229+
#ifdef EXECUTORCH_METAL_COLLECT_STATS
230+
auto init_start = std::chrono::high_resolution_clock::now();
231+
#endif
140232
ET_LOG(Info, "MetalBackend::init - Starting initialization");
141233

142234
std::string method_name;
@@ -261,6 +353,29 @@ class ET_EXPERIMENTAL MetalBackend final
261353
}
262354

263355
ET_LOG(Info, "MetalBackend::init - Initialization completed successfully");
356+
357+
#ifdef EXECUTORCH_METAL_COLLECT_STATS
358+
// Accumulate init timing statistics
359+
auto init_end = std::chrono::high_resolution_clock::now();
360+
double elapsed_ms =
361+
std::chrono::duration<double, std::milli>(init_end - init_start)
362+
.count();
363+
364+
{
365+
auto& stats_data = get_stats_data();
366+
std::lock_guard<std::mutex> lock(stats_data.mutex);
367+
stats_data.init_total_ms += elapsed_ms;
368+
stats_data.init_call_count++;
369+
370+
// Track per-method init timing
371+
if (!method_name.empty()) {
372+
auto& method_stats = stats_data.init_method_stats[method_name];
373+
method_stats.total_ms += elapsed_ms;
374+
method_stats.call_count++;
375+
}
376+
}
377+
#endif
378+
264379
return (DelegateHandle*)handle; // Return the handle post-processing
265380
}
266381

@@ -269,6 +384,9 @@ class ET_EXPERIMENTAL MetalBackend final
269384
BackendExecutionContext& context,
270385
DelegateHandle* handle_,
271386
Span<EValue*> args) const override {
387+
#ifdef EXECUTORCH_METAL_COLLECT_STATS
388+
auto execute_start = std::chrono::high_resolution_clock::now();
389+
#endif
272390
ET_LOG(Debug, "MetalBackend execute");
273391

274392
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
@@ -514,6 +632,29 @@ class ET_EXPERIMENTAL MetalBackend final
514632

515633
ET_LOG(Debug, "MetalBackend execution completed successfully");
516634

635+
#ifdef EXECUTORCH_METAL_COLLECT_STATS
636+
// Accumulate timing statistics
637+
auto execute_end = std::chrono::high_resolution_clock::now();
638+
double elapsed_ms =
639+
std::chrono::duration<double, std::milli>(execute_end - execute_start)
640+
.count();
641+
642+
{
643+
auto& stats_data = get_stats_data();
644+
std::lock_guard<std::mutex> lock(stats_data.mutex);
645+
stats_data.execute_total_ms += elapsed_ms;
646+
stats_data.execute_call_count++;
647+
648+
// Track per-method timing
649+
const char* method_name = context.get_method_name();
650+
if (method_name != nullptr) {
651+
auto& method_stats = stats_data.method_stats[method_name];
652+
method_stats.total_ms += elapsed_ms;
653+
method_stats.call_count++;
654+
}
655+
}
656+
#endif
657+
517658
return Error::Ok;
518659
}
519660

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/apple/metal/runtime/stats.h>
10+
11+
#ifdef EXECUTORCH_METAL_COLLECT_STATS
12+
13+
#include <iostream>
14+
15+
namespace executorch {
16+
namespace backends {
17+
namespace metal {
18+
19+
void print_metal_backend_stats() {
20+
std::cout << "\n--- Metal Backend Performance Statistics ---" << std::endl;
21+
22+
// Init stats
23+
double metal_init_total_ms = get_metal_backend_init_total_ms();
24+
int64_t metal_init_call_count = get_metal_backend_init_call_count();
25+
std::cout << "Metal init() total: " << metal_init_total_ms << " ms ("
26+
<< metal_init_call_count << " calls)";
27+
if (metal_init_call_count > 0) {
28+
std::cout << " (avg: " << metal_init_total_ms / metal_init_call_count
29+
<< " ms/call)";
30+
}
31+
std::cout << std::endl;
32+
33+
// Per-method init breakdown
34+
auto init_per_method_stats = get_metal_backend_init_per_method_stats();
35+
if (!init_per_method_stats.empty()) {
36+
std::cout << " Per-method init breakdown:" << std::endl;
37+
for (const auto& entry : init_per_method_stats) {
38+
const std::string& method_name = entry.first;
39+
double method_total_ms = entry.second.first;
40+
int64_t method_call_count = entry.second.second;
41+
std::cout << " " << method_name << ": " << method_total_ms << " ms ("
42+
<< method_call_count << " calls)";
43+
if (method_call_count > 0) {
44+
std::cout << " (avg: " << method_total_ms / method_call_count
45+
<< " ms/call)";
46+
}
47+
std::cout << std::endl;
48+
}
49+
}
50+
51+
// Execute stats
52+
double metal_total_ms = get_metal_backend_execute_total_ms();
53+
int64_t metal_call_count = get_metal_backend_execute_call_count();
54+
std::cout << "\nMetal execute() total: " << metal_total_ms << " ms ("
55+
<< metal_call_count << " calls)";
56+
if (metal_call_count > 0) {
57+
std::cout << " (avg: " << metal_total_ms / metal_call_count << " ms/call)";
58+
}
59+
std::cout << std::endl;
60+
61+
// Per-method execute breakdown
62+
auto per_method_stats = get_metal_backend_per_method_stats();
63+
if (!per_method_stats.empty()) {
64+
std::cout << " Per-method execute breakdown:" << std::endl;
65+
for (const auto& entry : per_method_stats) {
66+
const std::string& method_name = entry.first;
67+
double method_total_ms = entry.second.first;
68+
int64_t method_call_count = entry.second.second;
69+
std::cout << " " << method_name << ": " << method_total_ms << " ms ("
70+
<< method_call_count << " calls)";
71+
if (method_call_count > 0) {
72+
std::cout << " (avg: " << method_total_ms / method_call_count
73+
<< " ms/call)";
74+
}
75+
std::cout << std::endl;
76+
}
77+
}
78+
79+
std::cout << "--------------------------------------------\n" << std::endl;
80+
}
81+
82+
} // namespace metal
83+
} // namespace backends
84+
} // namespace executorch
85+
86+
#endif // EXECUTORCH_METAL_COLLECT_STATS

0 commit comments

Comments
 (0)