@@ -76,17 +76,34 @@ target_link_libraries(
7676)
7777target_compile_options (optimized_kernels PUBLIC ${_common_compile_options} )
7878
79- # op_grid_sampler_2d.cpp uses ARMv8.2-a+fp16 NEON intrinsics
80- # (vcvt_f32_f16 / vld1_f16) when compiled for aarch64. Scope the extra
81- # `-march` flag to just that source so non-arm64 targets (e.g. x86_64 on
82- # Android) are unaffected — the kernel itself has `#ifdef __aarch64__`
83- # guards and falls through to the portable kernel otherwise.
84- if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64"
85- OR ANDROID_ABI STREQUAL "arm64-v8a"
79+ # op_grid_sampler_2d_fp16_hw.cpp uses hardware fp16 NEON intrinsics
80+ # (vcvt_f32_f16 / vld1_f16). Those are part of the ARMv8.2-a+fp16 extension and
81+ # raise SIGILL on chips without it. Build it as a separate OBJECT library so the
82+ # `-march=armv8.2-a+fp16` flag stays strictly scoped to that translation unit
83+ # and never reaches the dispatcher / fallback code in op_grid_sampler_2d.cpp
84+ # (which would otherwise risk auto-vectorizing into fp16 NEON instructions). The
85+ # dispatcher chooses between this entry point and the fp16 software-convert path
86+ # at runtime via cpuinfo_has_arm_neon_fp16(). Mirrors the buck
87+ # `grid_sampler_2d_fp16_hw_impl` library.
88+ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64" OR ANDROID_ABI STREQUAL
89+ "arm64-v8a"
8690)
87- set_source_files_properties (
88- ${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d.cpp
89- PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+fp16"
91+ add_library (
92+ grid_sampler_2d_fp16_hw_impl OBJECT
93+ ${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d_fp16_hw.cpp
94+ )
95+ target_compile_options (
96+ grid_sampler_2d_fp16_hw_impl PRIVATE -march=armv8.2-a+fp16
97+ ${_common_compile_options}
98+ )
99+ target_link_libraries (grid_sampler_2d_fp16_hw_impl PRIVATE executorch_core )
100+ # BUILD_LOCAL_INTERFACE: object files are baked into optimized_kernels.a at
101+ # archive time, so this OBJECT target stays out of the install EXPORT set and
102+ # downstream consumers of the installed optimized_kernels need no separate
103+ # link entry.
104+ target_link_libraries (
105+ optimized_kernels
106+ PRIVATE $<BUILD_LOCAL_INTERFACE:grid_sampler_2d_fp16_hw_impl>
90107 )
91108endif ()
92109
@@ -98,30 +115,6 @@ gen_operators_lib(
98115 executorch_core
99116)
100117
101- # On-device verifier for optimized grid_sampler_2d / sum.IntList_out.
102- # Opt-in via -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON so it doesn't affect
103- # default AAR / library builds. Cross-checks both ops against an fp32
104- # reference derived from the portable kernel; non-zero exit on divergence.
105- if (EXECUTORCH_BUILD_OPTIMIZED_VERIFY)
106- add_executable (
107- verify_optimized_kernels ${EXECUTORCH_ROOT}/kernels/optimized/verify.cpp
108- )
109- target_link_libraries (
110- verify_optimized_kernels
111- PRIVATE optimized_kernels portable_kernels executorch_core
112- )
113- target_compile_options (
114- verify_optimized_kernels PRIVATE ${_common_compile_options}
115- )
116- if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64"
117- OR ANDROID_ABI STREQUAL "arm64-v8a"
118- )
119- target_compile_options (
120- verify_optimized_kernels PRIVATE -march=armv8.2-a+fp16
121- )
122- endif ()
123- endif ()
124-
125118install (
126119 # eigen_blas doesn't export itself, so we have to do our own install to export
127120 # it.
0 commit comments