Skip to content

Commit ac4fe82

Browse files
authored
Merge pull request #4 from PolyCam/jgibson/neon-custom-kernels
optimized: add NEON grid_sampler_2d.out and Vectorized<float> sum.IntList_out
2 parents 6e6c2a7 + b2bdc94 commit ac4fe82

7 files changed

Lines changed: 1155 additions & 0 deletions

File tree

kernels/optimized/CMakeLists.txt

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,21 @@ target_link_libraries(
7575
kernels_util_all_deps
7676
)
7777
# Apply the shared compile flags to optimized_kernels; PUBLIC also propagates
# them to every target that links against optimized_kernels.
target_compile_options(
  optimized_kernels
  PUBLIC ${_common_compile_options}
)
78+
79+
# op_grid_sampler_2d.cpp uses ARMv8.2-a+fp16 NEON intrinsics
80+
# (vcvt_f32_f16 / vld1_f16) when compiled for aarch64. Scope the extra
81+
# `-march` flag to just that source so non-arm64 targets (e.g. x86_64 on
82+
# Android) are unaffected — the kernel itself has `#ifdef __aarch64__`
83+
# guards and falls through to the portable kernel otherwise.
84+
# Add the ARMv8.2-A fp16 `-march` flag to op_grid_sampler_2d.cpp only, and only
# when the build targets 64-bit ARM (detected via CMAKE_SYSTEM_PROCESSOR or the
# Android ABI).
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64"
   OR ANDROID_ABI STREQUAL "arm64-v8a"
)
  # APPEND instead of overwriting: set_source_files_properties() would replace
  # any COMPILE_OPTIONS already attached to this source file, silently dropping
  # per-source flags configured elsewhere.
  set_property(
    SOURCE ${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d.cpp
    APPEND
    PROPERTY COMPILE_OPTIONS "-march=armv8.2-a+fp16"
  )
endif()
92+
7893
# Build a library for _optimized_kernels_srcs
7994
#
8095
# optimized_ops_lib: Register optimized ops kernels into Executorch runtime
@@ -83,6 +98,30 @@ gen_operators_lib(
8398
executorch_core
8499
)
85100

101+
# On-device verifier for optimized grid_sampler_2d / sum.IntList_out.
102+
# Opt-in via -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON so it doesn't affect
103+
# default AAR / library builds. Cross-checks both ops against an fp32
104+
# reference derived from the portable kernel; non-zero exit on divergence.
105+
# Optional verifier executable; only configured when
# EXECUTORCH_BUILD_OPTIMIZED_VERIFY is enabled.
if(EXECUTORCH_BUILD_OPTIMIZED_VERIFY)
  # Work out the architecture-specific flags first: on 64-bit ARM targets the
  # verifier is additionally compiled with the ARMv8.2-A fp16 extension;
  # everywhere else the list stays empty.
  set(_verify_arch_options "")
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64"
     OR ANDROID_ABI STREQUAL "arm64-v8a"
  )
    list(APPEND _verify_arch_options -march=armv8.2-a+fp16)
  endif()

  # Single-source executable built from verify.cpp.
  add_executable(
    verify_optimized_kernels ${EXECUTORCH_ROOT}/kernels/optimized/verify.cpp
  )

  # Links both the optimized and the portable kernel libraries plus the core
  # runtime.
  target_link_libraries(
    verify_optimized_kernels
    PRIVATE optimized_kernels portable_kernels executorch_core
  )

  # Common options first, then the (possibly empty) architecture flags, which
  # keeps the same option order as two separate calls would produce.
  target_compile_options(
    verify_optimized_kernels
    PRIVATE ${_common_compile_options} ${_verify_arch_options}
  )

  # Do not leak the helper variable into the rest of this directory scope.
  unset(_verify_arch_options)
endif()
124+
86125
install(
87126
# eigen_blas doesn't export itself, so we have to do our own install to export
88127
# it.

0 commit comments

Comments
 (0)