Skip to content

Commit 929a1c9

Browse files
authored
Merge pull request #7 from PolyCam/jgibson/sync-neon-to-upstream
Sync NEON optimized kernels to current upstream PR (pytorch#19119)
2 parents ac4fe82 + 891b69d commit 929a1c9

7 files changed

Lines changed: 463 additions & 119 deletions

File tree

kernels/optimized/CMakeLists.txt

Lines changed: 27 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,34 @@ target_link_libraries(
7676
)
7777
target_compile_options(optimized_kernels PUBLIC ${_common_compile_options})
7878

79-
# op_grid_sampler_2d.cpp uses ARMv8.2-a+fp16 NEON intrinsics
80-
# (vcvt_f32_f16 / vld1_f16) when compiled for aarch64. Scope the extra
81-
# `-march` flag to just that source so non-arm64 targets (e.g. x86_64 on
82-
# Android) are unaffected — the kernel itself has `#ifdef __aarch64__`
83-
# guards and falls through to the portable kernel otherwise.
84-
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64"
85-
OR ANDROID_ABI STREQUAL "arm64-v8a"
79+
# op_grid_sampler_2d_fp16_hw.cpp uses hardware fp16 NEON intrinsics
80+
# (vcvt_f32_f16 / vld1_f16). Those are part of the ARMv8.2-a+fp16 extension and
81+
# raise SIGILL on chips without it. Build it as a separate OBJECT library so the
82+
# `-march=armv8.2-a+fp16` flag stays strictly scoped to that translation unit
83+
# and never reaches the dispatcher / fallback code in op_grid_sampler_2d.cpp
84+
# (which would otherwise risk auto-vectorizing into fp16 NEON instructions). The
85+
# dispatcher chooses between this entry point and the fp16 software-convert path
86+
# at runtime via cpuinfo_has_arm_neon_fp16(). Mirrors the Buck
87+
# `grid_sampler_2d_fp16_hw_impl` library.
88+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64" OR ANDROID_ABI STREQUAL
89+
"arm64-v8a"
8690
)
87-
set_source_files_properties(
88-
${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d.cpp
89-
PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+fp16"
91+
add_library(
92+
grid_sampler_2d_fp16_hw_impl OBJECT
93+
${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d_fp16_hw.cpp
94+
)
95+
target_compile_options(
96+
grid_sampler_2d_fp16_hw_impl PRIVATE -march=armv8.2-a+fp16
97+
${_common_compile_options}
98+
)
99+
target_link_libraries(grid_sampler_2d_fp16_hw_impl PRIVATE executorch_core)
100+
# BUILD_LOCAL_INTERFACE: object files are baked into optimized_kernels.a at
101+
# archive time, so this OBJECT target stays out of the install EXPORT set and
102+
# downstream consumers of the installed optimized_kernels need no separate
103+
# link entry.
104+
target_link_libraries(
105+
optimized_kernels
106+
PRIVATE $<BUILD_LOCAL_INTERFACE:grid_sampler_2d_fp16_hw_impl>
90107
)
91108
endif()
92109

@@ -98,30 +115,6 @@ gen_operators_lib(
98115
executorch_core
99116
)
100117

101-
# On-device verifier for optimized grid_sampler_2d / sum.IntList_out.
102-
# Opt-in via -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON so it doesn't affect
103-
# default AAR / library builds. Cross-checks both ops against an fp32
104-
# reference derived from the portable kernel; non-zero exit on divergence.
105-
if(EXECUTORCH_BUILD_OPTIMIZED_VERIFY)
106-
add_executable(
107-
verify_optimized_kernels ${EXECUTORCH_ROOT}/kernels/optimized/verify.cpp
108-
)
109-
target_link_libraries(
110-
verify_optimized_kernels
111-
PRIVATE optimized_kernels portable_kernels executorch_core
112-
)
113-
target_compile_options(
114-
verify_optimized_kernels PRIVATE ${_common_compile_options}
115-
)
116-
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64"
117-
OR ANDROID_ABI STREQUAL "arm64-v8a"
118-
)
119-
target_compile_options(
120-
verify_optimized_kernels PRIVATE -march=armv8.2-a+fp16
121-
)
122-
endif()
123-
endif()
124-
125118
install(
126119
# eigen_blas doesn't export itself, so we have to do our own install to export
127120
# it.

0 commit comments

Comments
 (0)