Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions kernels/optimized/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,38 @@ target_link_libraries(
kernels_util_all_deps
)
# Compile optimized_kernels with the shared option set. PUBLIC also forwards
# these options to every target that links against optimized_kernels.
target_compile_options(
  optimized_kernels
  PUBLIC ${_common_compile_options}
)

# op_grid_sampler_2d_fp16_hw.cpp uses hardware fp16 NEON intrinsics
# (vcvt_f32_f16 / vld1_f16). Those are part of the ARMv8.2-a+fp16 extension and
# raise SIGILL on chips without it. Build it as a separate OBJECT library so the
# `-march=armv8.2-a+fp16` flag stays strictly scoped to that translation unit
# and never reaches the dispatcher / fallback code in op_grid_sampler_2d.cpp
# (which would otherwise risk auto-vectorizing into fp16 NEON instructions). The
# dispatcher chooses between this entry point and the fp16 software-convert path
# at runtime via cpuinfo_has_arm_neon_fp16(). Mirrors the buck
# `grid_sampler_2d_fp16_hw_impl` library.
#
# Only configured for 64-bit ARM targets: either the target processor reported
# by CMake matches aarch64/arm64, or the Android toolchain selected arm64-v8a.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64" OR ANDROID_ABI STREQUAL
                                                     "arm64-v8a"
)
  add_library(
    grid_sampler_2d_fp16_hw_impl OBJECT
    ${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d_fp16_hw.cpp
  )
  # Keep -march AFTER the shared options. GCC and Clang honor the last -march
  # on the command line, so placing it last guarantees the fp16 extension stays
  # enabled for this translation unit even if _common_compile_options ever
  # carries its own architecture flag.
  target_compile_options(
    grid_sampler_2d_fp16_hw_impl PRIVATE ${_common_compile_options}
                                         -march=armv8.2-a+fp16
  )
  target_link_libraries(grid_sampler_2d_fp16_hw_impl PRIVATE executorch_core)
  # BUILD_LOCAL_INTERFACE: object files are baked into optimized_kernels.a at
  # archive time, so this OBJECT target stays out of the install EXPORT set and
  # downstream consumers of the installed optimized_kernels need no separate
  # link entry. NOTE: the BUILD_LOCAL_INTERFACE generator expression requires
  # CMake 3.26 or newer.
  target_link_libraries(
    optimized_kernels
    PRIVATE $<BUILD_LOCAL_INTERFACE:grid_sampler_2d_fp16_hw_impl>
  )
endif()

# Build a library for _optimized_kernels_srcs
#
# optimized_ops_lib: Register optimized ops kernels into Executorch runtime
Expand Down
Loading
Loading