diff --git a/devtools/scripts/benchmark_android.sh b/devtools/scripts/benchmark_android.sh
new file mode 100755
index 00000000000..08bde91fd10
--- /dev/null
+++ b/devtools/scripts/benchmark_android.sh
@@ -0,0 +1,478 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Android Benchmark Script
+#
+# Cross-compiles executor_runner for Android (arm64-v8a), pushes it and a model
+# to an Android device via adb, runs the benchmark, and summarizes results.
+#
+# Usage:
+#   ./devtools/scripts/benchmark_android.sh <model.pte> [options]
+#
+# Options:
+#   --warmup <N>        Number of warmup executions (default: 1)
+#   --iterations <N>    Number of timed executions (default: 10)
+#   --num-threads <N>   CPU threads for inference (default: -1, auto-detect)
+#   --method <name>     Method to run (default: first method in the program)
+#   --backends <list>   Comma-separated backends (default: xnnpack)
+#                       Supported: xnnpack, vulkan, qnn
+#   --device <serial>   ADB device serial (for multiple devices)
+#   --etdump            Enable event tracer and pull etdump back to host
+#   --no-cleanup        Leave model file on device after benchmarking
+#   --rebuild           Force cmake reconfigure and rebuild
+#   --build-dir <path>  Reuse existing build directory (skip build step)
+
+set -euo pipefail
+
+# --- Locate ExecuTorch root from script path ---
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+if [[ ! -f "$EXECUTORCH_ROOT/CMakeLists.txt" ]] || [[ ! -d "$EXECUTORCH_ROOT/devtools" ]]; then
+  echo "Error: Could not locate ExecuTorch root from script path: $SCRIPT_DIR" >&2
+  exit 1
+fi
+
+# Remember where the script was invoked from so that relative paths given on
+# the command line can still be resolved after changing into the repo root.
+INVOKE_DIR="$PWD"
+cd "$EXECUTORCH_ROOT"
+
+# --- Defaults ---
+MODEL_PATH=""
+WARMUP=1
+ITERATIONS=10
+NUM_THREADS=-1
+METHOD=""
+BACKENDS="xnnpack"
+DEVICE=""
+ETDUMP=false
+NO_CLEANUP=false
+REBUILD=false
+BUILD_DIR=""
+
+DEVICE_DIR="/data/local/tmp/et_benchmark"
+
+# --- Argument parsing ---
+# Print the command-line help to stdout and exit with status 1.
+usage() {
+  cat <<EOF
+Usage: $0 <model.pte> [options]
+
+Options:
+  --warmup <N>        Number of warmup executions (default: 1)
+  --iterations <N>    Number of timed executions (default: 10)
+  --num-threads <N>   CPU threads for inference (default: -1, auto-detect)
+  --method <name>     Method to run (default: first method in the program)
+  --backends <list>   Comma-separated backends to build (default: xnnpack)
+                      Supported: xnnpack, vulkan, qnn
+  --device <serial>   ADB device serial (for multiple devices)
+  --etdump            Enable event tracer and pull etdump back to host
+  --no-cleanup        Leave model file on device after benchmarking
+  --rebuild           Force cmake reconfigure and rebuild
+  --build-dir <path>  Reuse existing build directory (skip build step)
+EOF
+  exit 1
+}
+
+# Fail with a readable message when a value-taking option is the last word on
+# the command line; otherwise `set -u` aborts on "$2" with "unbound variable".
+# Arguments: the remaining command line, i.e. "$@" with the option name first.
+need_value() {
+  if [[ $# -lt 2 ]]; then
+    echo "Error: $1 requires a value." >&2
+    exit 1
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --warmup) need_value "$@"; WARMUP="$2"; shift 2 ;;
+    --iterations) need_value "$@"; ITERATIONS="$2"; shift 2 ;;
+    --num-threads) need_value "$@"; NUM_THREADS="$2"; shift 2 ;;
+    --method) need_value "$@"; METHOD="$2"; shift 2 ;;
+    --backends) need_value "$@"; BACKENDS="$2"; shift 2 ;;
+    --device) need_value "$@"; DEVICE="$2"; shift 2 ;;
+    --etdump) ETDUMP=true; shift ;;
+    --no-cleanup) NO_CLEANUP=true; shift ;;
+    --rebuild) REBUILD=true; shift ;;
+    --build-dir) need_value "$@"; BUILD_DIR="$2"; shift 2 ;;
+    -h|--help) usage ;;
+    *)
+      if [[ -z "$MODEL_PATH" && "$1" != -* ]]; then
+        MODEL_PATH="$1"; shift
+      else
+        echo "Unknown option: $1" >&2; usage
+      fi
+      ;;
+  esac
+done
+
+if [[ -z "$MODEL_PATH" ]]; then
+  echo "Error: model path is required." >&2
+  usage
+fi
+
+# Prefer paths relative to the invocation directory. When nothing exists
+# there, keep the value untouched so it is still interpreted relative to the
+# ExecuTorch root, as before.
+if [[ "$MODEL_PATH" != /* && -f "$INVOKE_DIR/$MODEL_PATH" ]]; then
+  MODEL_PATH="$INVOKE_DIR/$MODEL_PATH"
+fi
+if [[ -n "$BUILD_DIR" && "$BUILD_DIR" != /* && -d "$INVOKE_DIR/$BUILD_DIR" ]]; then
+  BUILD_DIR="$INVOKE_DIR/$BUILD_DIR"
+fi
+
+if [[ ! -f "$MODEL_PATH" ]]; then
+  echo "Error: Model file not found: $MODEL_PATH" >&2
+  exit 1
+fi
+
+# WARMUP, ITERATIONS and NUM_THREADS are used in arithmetic tests and passed to
+# the runner (ITERATIONS also ends up inside a sed pattern), so only accept
+# plain decimal integers without leading zeros.
+UINT_RE='^(0|[1-9][0-9]*)$'
+INT_RE='^-?(0|[1-9][0-9]*)$'
+if [[ ! "$WARMUP" =~ $UINT_RE ]]; then
+  echo "Error: --warmup must be a non-negative integer, got '$WARMUP'." >&2
+  exit 1
+fi
+if [[ ! "$ITERATIONS" =~ ^[1-9][0-9]*$ ]]; then
+  echo "Error: --iterations must be a positive integer, got '$ITERATIONS'." >&2
+  exit 1
+fi
+if [[ ! "$NUM_THREADS" =~ $INT_RE ]]; then
+  echo "Error: --num-threads must be an integer, got '$NUM_THREADS'." >&2
+  exit 1
+fi
+
+MODEL_NAME=$(basename "$MODEL_PATH")
+
+# --- ADB helper ---
+# Run adb with the given arguments, adding "-s $DEVICE" when a serial was given.
+adb_cmd() {
+  if [[ -n "$DEVICE" ]]; then
+    adb -s "$DEVICE" "$@"
+  else
+    adb "$@"
+  fi
+}
+
+# --- Locate NDK ---
+# Print the first candidate directory that contains the NDK's CMake toolchain
+# file. Candidates, in order: $ANDROID_NDK, $ANDROID_NDK_HOME, the highest
+# version under <sdk>/ndk for each known SDK root, then /opt/ndk.
+# Returns 1 and lists the searched directories on stderr when none qualifies.
+find_ndk() {
+  # Try directories in priority order.
+  local candidates=()
+
+  if [[ -n "${ANDROID_NDK:-}" ]]; then
+    candidates+=("$ANDROID_NDK")
+  fi
+  if [[ -n "${ANDROID_NDK_HOME:-}" ]]; then
+    candidates+=("$ANDROID_NDK_HOME")
+  fi
+
+  # SDK-relative ndk/ directories (pick latest version).
+  local sdk_roots=()
+  [[ -n "${ANDROID_HOME:-}" ]] && sdk_roots+=("$ANDROID_HOME")
+  [[ -n "${ANDROID_SDK_ROOT:-}" ]] && sdk_roots+=("$ANDROID_SDK_ROOT")
+  sdk_roots+=("$HOME/Library/Android/sdk")  # Android Studio default on macOS
+  sdk_roots+=("$HOME/Android/Sdk")          # Android Studio default on Linux
+
+  for sdk in "${sdk_roots[@]}"; do
+    if [[ -d "$sdk/ndk" ]]; then
+      local latest
+      latest=$(ls -d "$sdk/ndk"/*/ 2>/dev/null | sort -V | tail -1)
+      [[ -n "$latest" ]] && candidates+=("$latest")
+    fi
+  done
+
+  candidates+=("/opt/ndk")
+
+  local toolchain="build/cmake/android.toolchain.cmake"
+  for ndk in "${candidates[@]}"; do
+    # Strip trailing slash.
+    ndk="${ndk%/}"
+    if [[ -f "$ndk/$toolchain" ]]; then
+      echo "$ndk"
+      return 0
+    fi
+  done
+
+  echo "Error: Could not find Android NDK. Searched:" >&2
+  for c in "${candidates[@]}"; do
+    echo " $c" >&2
+  done
+  echo "" >&2
+  echo "Set ANDROID_NDK or install the NDK via Android Studio." >&2
+  return 1
+}
+
+# --- Build executor_runner ---
+if [[ -z "$BUILD_DIR" ]]; then
+  BUILD_DIR="cmake-out-android-benchmark"
+
+  ANDROID_NDK=$(find_ndk)
+
+  # Resolve backend names to CMake flags.
+  backends_lower=$(echo "$BACKENDS" | tr '[:upper:]' '[:lower:]')
+  BACKEND_FLAGS=()
+  IFS=',' read -ra backend_list <<< "$backends_lower"
+  for b in "${backend_list[@]}"; do
+    b=$(echo "$b" | tr -d ' ')
+    case "$b" in
+      xnnpack) BACKEND_FLAGS+=("-DEXECUTORCH_BUILD_XNNPACK=ON") ;;
+      vulkan) BACKEND_FLAGS+=("-DEXECUTORCH_BUILD_VULKAN=ON") ;;
+      qnn) BACKEND_FLAGS+=("-DEXECUTORCH_BUILD_QNN=ON") ;;
+      *) echo "Error: Unknown backend '$b'. Supported: xnnpack, vulkan, qnn" >&2; exit 1 ;;
+    esac
+  done
+
+  cmake_args=(
+    -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake"
+    --preset android-arm64-v8a
+    -DANDROID_PLATFORM=android-26
+    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON
+    -DEXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL=ON
+    -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON
+    -DEXECUTORCH_ENABLE_LOGGING=ON
+    -DCMAKE_BUILD_TYPE=Release
+    "${BACKEND_FLAGS[@]}"
+  )
+
+  if [[ "$ETDUMP" == true ]]; then
+    cmake_args+=(
+      -DEXECUTORCH_ENABLE_EVENT_TRACER=ON
+      -DEXECUTORCH_BUILD_DEVTOOLS=ON
+    )
+  else
+    cmake_args+=(-DEXECUTORCH_ENABLE_EVENT_TRACER=OFF)
+  fi
+
+  # Check if we can skip the build entirely.
+  ARGS_STAMP="$BUILD_DIR/.benchmark_cmake_args"
+  CURRENT_ARGS=$(printf '%s\n' "${cmake_args[@]}" | sort)
+  PREV_ARGS=""
+  [[ -f "$ARGS_STAMP" ]] && PREV_ARGS=$(cat "$ARGS_STAMP")
+  ARGS_CHANGED=false
+  [[ "$CURRENT_ARGS" != "$PREV_ARGS" ]] && ARGS_CHANGED=true
+
+  if [[ "$REBUILD" == false && "$ARGS_CHANGED" == false && -f "$BUILD_DIR/executor_runner" ]]; then
+    echo "executor_runner already built, skipping build. Use --rebuild to force."
+  else
+    echo "Using NDK: $ANDROID_NDK"
+    # Re-configure if args changed, --rebuild forced, or no CMakeCache yet.
+    if [[ "$REBUILD" == true || "$ARGS_CHANGED" == true || ! -f "$BUILD_DIR/CMakeCache.txt" ]]; then
+      echo "Configuring build..."
+      cmake . "${cmake_args[@]}" -B "$BUILD_DIR"
+      echo "$CURRENT_ARGS" > "$ARGS_STAMP"
+    else
+      echo "Build configuration unchanged, skipping configure."
+    fi
+
+
+    if [[ "$(uname)" == "Darwin" ]]; then
+      CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 ))
+    else
+      CMAKE_JOBS=$(( $(nproc) - 1 ))
+    fi
+    [[ "$CMAKE_JOBS" -lt 1 ]] && CMAKE_JOBS=1
+
+    echo "Building executor_runner..."
+    cmake --build "$BUILD_DIR" -j "$CMAKE_JOBS" --target executor_runner
+    echo "Build complete."
+  fi
+else
+  echo "Using existing build directory: $BUILD_DIR"
+fi
+
+RUNNER_BIN="$BUILD_DIR/executor_runner"
+if [[ ! -f "$RUNNER_BIN" ]]; then
+  echo "Error: executor_runner not found at $RUNNER_BIN" >&2
+  exit 1
+fi
+
+# --- Push to device ---
+echo "Pushing files to device..."
+adb_cmd shell mkdir -p "$DEVICE_DIR"
+adb_cmd push --sync "$RUNNER_BIN" "$DEVICE_DIR/"
+adb_cmd push --sync "$MODEL_PATH" "$DEVICE_DIR/"
+adb_cmd shell chmod +x "$DEVICE_DIR/executor_runner"
+
+DEVICE_MODEL="$DEVICE_DIR/$MODEL_NAME"
+
+# --- Runner args ---
+runner_args=(
+  "--model_path=$DEVICE_MODEL"
+  "--cpu_threads=$NUM_THREADS"
+  # executor_runner only accepts none, summary or all for --print_output.
+  "--print_output=none"
+)
+
+if [[ -n "$METHOD" ]]; then
+  runner_args+=("--method_name=$METHOD")
+fi
+
+# --- Run helper: capture output, print on failure ---
+RUNNER_OUTPUT=$(mktemp)
+# Remove the capture file on every exit path, including `set -e` aborts.
+trap 'rm -f "$RUNNER_OUTPUT"' EXIT
+# Run executor_runner on the device with the given flags, capturing stdout and
+# stderr in $RUNNER_OUTPUT. On a non-zero exit, print the captured output plus
+# the last 30 ExecuTorch logcat lines and exit the script with status 1.
+run_on_device() {
+  local rc=0
+  adb_cmd shell "$DEVICE_DIR/executor_runner" "$@" > "$RUNNER_OUTPUT" 2>&1 || rc=$?
+  if [[ "$rc" -ne 0 ]]; then
+    echo ""
+    echo "Error: executor_runner exited with code $rc"
+    echo "--- device output ---"
+    cat "$RUNNER_OUTPUT"
+    echo "--- end output ---"
+    # Also dump recent logcat for ET_LOG messages.
+    echo "--- logcat (ExecuTorch) ---"
+    adb_cmd logcat -d -s ExecuTorch:* | tail -30
+    echo "--- end logcat ---"
+    rm -f "$RUNNER_OUTPUT"
+    exit 1
+  fi
+}
+
+# --- Warmup ---
+if [[ "$WARMUP" -gt 0 ]]; then
+  echo "Running $WARMUP warmup iteration(s)..."
+  run_on_device "${runner_args[@]}" "--num_executions=$WARMUP"
+fi
+
+# Clear logcat after warmup so the benchmark progress reader doesn't pick up
+# stale entries. This must happen synchronously before the pipeline starts.
+adb_cmd logcat -c  # drop everything logged so far (e.g. by the warmup run)
+
+# --- Benchmark ---
+# Clear any stale progress output from orphaned readers of previous runs.
+[[ -t 1 ]] && printf "\r\033[K"
+echo "Running $ITERATIONS benchmark iteration(s)..."
+
+bench_args=("${runner_args[@]}" "--num_executions=$ITERATIONS")
+if [[ "$ETDUMP" == true ]]; then
+  bench_args+=("--etdump_path=$DEVICE_DIR/model.etdump")
+fi
+
+# Stream logcat for live progress. A wrapper subshell records adb logcat's PID
+# so we can kill it directly at cleanup -- killing only the reader subshell of a
+# plain pipeline leaves adb logcat alive, blocking subsequent adb calls.
+LOGCAT_PID=""
+READER_PID=""
+
+if [[ -t 1 ]]; then  # live progress only when stdout is a terminal
+  LOGCAT_PID_FILE=$(mktemp)
+  (
+    adb_cmd logcat -s ExecuTorch:I 2>/dev/null &
+    echo $! > "$LOGCAT_PID_FILE"  # PID of the backgrounded adb logcat
+    wait  # keep the subshell, and with it the pipe, open while logcat runs
+  ) 2>/dev/null | \
+    while IFS= read -r line; do
+      if [[ "$line" == *"Iteration "* ]]; then  # e.g. "Iteration 3 of 10: 12.34 ms"
+        progress=$(echo "$line" | sed -n "s/.*Iteration \([0-9]* of $ITERATIONS\): \([0-9.]*\) ms/\1 (\2 ms)/p")
+        [[ -n "$progress" ]] && printf "\r\033[K Progress: %s" "$progress"
+      fi
+    done &
+  READER_PID=$!  # PID of the backgrounded pipeline as reported by bash
+  disown "$READER_PID" 2>/dev/null || true
+  while [[ ! -s "$LOGCAT_PID_FILE" ]]; do sleep 0.1; done  # wait until the PID has been written
+  LOGCAT_PID=$(cat "$LOGCAT_PID_FILE")
+  rm -f "$LOGCAT_PID_FILE"
+fi
+
+run_on_device "${bench_args[@]}"
+
+# Shut down the logcat pipeline: kill adb logcat → pipe closes → reader exits.
+# disown above suppresses bash's "Terminated" job notification.
+if [[ -n "$LOGCAT_PID" ]]; then kill "$LOGCAT_PID" 2>/dev/null || true; fi
+if [[ -n "$READER_PID" ]]; then kill "$READER_PID" 2>/dev/null || true; fi
+sleep 0.5  # NOTE(review): presumably gives the reader time to exit before the line is cleared
+[[ -t 1 ]] && printf "\r\033[K"
+
+# Parse the timing line from logcat.
+LOGCAT_OUTPUT=$(adb_cmd logcat -d -s ExecuTorch:I)  # dump once; everything below is parsed from this text
+TIMING_LINE=$(echo "$LOGCAT_OUTPUT" | grep "Model executed successfully" | tail -1 || true)
+LOAD_LINE=$(echo "$LOGCAT_OUTPUT" | grep "Model loaded in" | tail -1 || true)
+ITER_TIMES=$(echo "$LOGCAT_OUTPUT" | grep "Iteration " | sed -n 's/.*: \([0-9.]*\) ms/\1/p' || true)  # one value per line
+
+# --- Pull etdump ---
+if [[ "$ETDUMP" == true ]]; then
+  ETDUMP_LOCAL="./${MODEL_NAME%.pte}.etdump"  # relative to the ExecuTorch root (the script cd's there)
+  echo ""
+  echo "Pulling etdump from device..."
+  adb_cmd pull "$DEVICE_DIR/model.etdump" "$ETDUMP_LOCAL"
+  adb_cmd shell rm -f "$DEVICE_DIR/model.etdump"
+  echo "ETDump saved to $ETDUMP_LOCAL"
+
+  # Run the inspector CLI to print a tabular summary.
+  echo ""
+  "$EXECUTORCH_ROOT/run_python_script.sh" \
+    "$EXECUTORCH_ROOT/devtools/inspector/inspector_cli.py" \
+    --etdump_path="$ETDUMP_LOCAL"
+fi
+
+# --- Cleanup ---
+if [[ "$NO_CLEANUP" == false ]]; then
+  echo "Cleaning up model on device..."
+  adb_cmd shell rm -f "$DEVICE_MODEL"
+else
+  echo "Skipping cleanup (--no-cleanup)."
+fi
+rm -f "$RUNNER_OUTPUT"
+
+# --- Summarize ---
+echo ""
+echo "========================================="
+echo " Benchmark Results"
+echo "========================================="
+echo "Model: $MODEL_NAME"
+if [[ -n "$DEVICE" ]]; then
+  echo "Device: $DEVICE"
+fi
+echo "Warmup: $WARMUP iteration(s)"
+
+if [[ -n "$LOAD_LINE" ]]; then
+  LOAD_MS=$(echo "$LOAD_LINE" | sed -n 's/.*Model loaded in \([0-9.]*\) ms\..*/\1/p')
+  if [[ -n "$LOAD_MS" ]]; then
+    echo "Load: $(printf '%.3f' "$LOAD_MS") ms"
+  fi
+fi
+
+if [[ -n "$TIMING_LINE" ]]; then
+  TOTAL_MS=$(echo "$TIMING_LINE" | sed -n 's/.*in \([0-9.]*\) ms\..*/\1/p')
+  if [[ -n "$TOTAL_MS" ]]; then
+    AVG_MS=$(awk -v total="$TOTAL_MS" -v n="$ITERATIONS" 'BEGIN { if (n + 0 > 0) { printf "%.3f", total / n } else { printf "n/a" } }')  # awk, not bc: always installed, keeps the leading zero, no division by zero
+    echo "Benchmark: $ITERATIONS iteration(s) in $(printf '%.3f' "$TOTAL_MS") ms"
+    echo "Average: ${AVG_MS} ms/iteration"
+    if [[ -n "$ITER_TIMES" ]]; then
+      MIN_MS=$(echo "$ITER_TIMES" | sort -g | head -1)
+      MAX_MS=$(echo "$ITER_TIMES" | sort -g | tail -1)
+      echo "Min: $(printf '%.3f' "$MIN_MS") ms"
+      echo "Max: $(printf '%.3f' "$MAX_MS") ms"
+    fi
+  else
+    echo "Benchmark: $ITERATIONS iteration(s) (could not parse timing)"
+    echo "Raw output: $TIMING_LINE"
+  fi
+else
+  echo "Benchmark: $ITERATIONS iteration(s) (no timing data captured)"
+  echo "Check logcat for ExecuTorch output."
+fi
+
+if [[ "$NUM_THREADS" -ne -1 ]]; then  # -1 means auto-detect, so there is no fixed count to report
+  echo "Threads: $NUM_THREADS"
+fi
+if [[ "$ETDUMP" == true ]]; then
+  echo "ETDump: $ETDUMP_LOCAL"
+fi
+echo "========================================="
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index c76ab0042ae..210d754ca39 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -69,11 +69,15 @@ DEFINE_string(
     "",
     "Base name of output file. If not empty output will be written to the file(s).");
 
-DEFINE_bool(
-    print_all_output,
-    false,
-    "Prints all output. By default only first and last 100 elements are printed.");
+DEFINE_string(
+    print_output,
+    "summary",
+    "Output printing mode: 'none' to suppress, 'summary' for first/last 100 elements, 'all' for everything.");
 DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
+DEFINE_string(
+    method_name,
+    "",
+    "Name of the method to run. If empty, uses the first method in the program.");
 #ifdef ET_EVENT_TRACER_ENABLED
 DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
 #endif // ET_EVENT_TRACER_ENABLED
@@ -112,6 +116,8 @@ using executorch::runtime::Span;
 using executorch::runtime::Tag;
 using executorch::runtime::TensorInfo;
 
+enum class PrintOutputMode { None, Summary, All };
+
 /// Helper to manage resources for ETDump generation
 class EventTraceManager {
  public:
@@ -194,6 +200,21 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  PrintOutputMode print_output_mode;
+  if (FLAGS_print_output == "none") {
+    print_output_mode = PrintOutputMode::None;
+  } else if (FLAGS_print_output == "summary") {
+    print_output_mode = PrintOutputMode::Summary;
+  } else if (FLAGS_print_output == "all") {
+    print_output_mode = PrintOutputMode::All;
+  } else {
+    ET_LOG(
+        Error,
+        "Unknown --print_output mode '%s'. Expected 'none', 'summary', or 'all'.",
+        FLAGS_print_output.c_str());
+    return 1;
+  }
+
 #if defined(ET_USE_THREADPOOL)
   auto cpu_threads = FLAGS_cpu_threads;
   uint32_t num_performant_cores = cpu_threads == -1
@@ -344,6 +365,7 @@ int main(int argc, char** argv) {
 
   // Parse the program file. This is immutable, and can also be reused between
   // multiple execution invocations across multiple threads.
+  const et_timestamp_t before_load = executorch::runtime::pal_current_ticks();
   Result<Program> program = Program::load(loader.get());
   if (!program.ok()) {
     ET_LOG(Error, "Failed to parse model file %s", FLAGS_model_path.c_str());
@@ -351,9 +373,10 @@ int main(int argc, char** argv) {
   }
   ET_LOG(Info, "Model file %s is loaded.", FLAGS_model_path.c_str());
 
-  // Use the first method in the program.
   const char* method_name = nullptr;
-  {
+  if (!FLAGS_method_name.empty()) {
+    method_name = FLAGS_method_name.c_str();
+  } else {
     const auto method_name_result = program->get_method_name(0);
     ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
     method_name = *method_name_result;
@@ -432,12 +455,21 @@ int main(int argc, char** argv) {
       &memory_manager,
       tracer.get_event_tracer(),
       ptd_data_map.get());
+  const et_timestamp_t after_load = executorch::runtime::pal_current_ticks();
   ET_CHECK_MSG(
       method.ok(),
       "Loading of method %s failed with status 0x%" PRIx32,
       method_name,
       (uint32_t)method.error());
-  ET_LOG(Info, "Method loaded.");
+  {
+    const auto load_tick_ratio = et_pal_ticks_to_ns_multiplier();
+    ET_LOG(
+        Info,
+        "Model loaded in %f ms.",
+        static_cast<double>(after_load - before_load) *
+            load_tick_ratio.numerator / load_tick_ratio.denominator /
+            1000000.0); // ticks -> ns -> ms
+  }
 
   et_timestamp_t time_spent_executing = 0;
   // Run the model.
@@ -481,12 +513,21 @@ int main(int argc, char** argv) {
     Error status = method->execute();
     const et_timestamp_t after_execute =
         executorch::runtime::pal_current_ticks();
-    time_spent_executing += after_execute - before_execute;
+    const et_timestamp_t iter_elapsed = after_execute - before_execute;
+    time_spent_executing += iter_elapsed;
     ET_CHECK_MSG(
         status == Error::Ok,
         "Execution of method %s failed with status 0x%" PRIx32,
         method_name,
         static_cast<uint32_t>(status));
+    const auto iter_tick_ratio = et_pal_ticks_to_ns_multiplier();
+    ET_LOG(
+        Info,
+        "Iteration %" PRIu32 " of %" PRIu32 ": %f ms",
+        i + 1,
+        FLAGS_num_executions,
+        static_cast<double>(iter_elapsed) * iter_tick_ratio.numerator /
+            iter_tick_ratio.denominator / 1000000.0); // ticks -> ns -> ms
   }
   const auto tick_ratio = et_pal_ticks_to_ns_multiplier();
   constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000;
@@ -518,7 +559,7 @@ int main(int argc, char** argv) {
     }
   }
 
-  if (FLAGS_print_all_output) {
+  if (print_output_mode == PrintOutputMode::All) {
     for (int i = 0; i < outputs.size(); ++i) {
       if (outputs[i].isTensor()) {
         Tensor tensor = outputs[i].toTensor();
@@ -555,7 +596,7 @@ int main(int argc, char** argv) {
         printf("Output[%d]: Not Tensor\n", i);
       }
     }
-  } else {
+  } else if (print_output_mode == PrintOutputMode::Summary) {
     // Print the first and last 100 elements of long lists of scalars.
     std::cout << executorch::extension::evalue_edge_items(100);
 