executorch/kernels/optimized/CMakeLists.txt at b2bdc94c4d2c111e9db800cc65c0830e63bf9a91 · PolyCam/executorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Kernel library for optimized kernels. Please keep this file formatted by
# running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~

# Minimum CMake version required to process this file.
cmake_minimum_required(VERSION 3.19)

# Default to C++17 unless a C++ standard has already been selected.
if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

# Ask the generator to emit a compile_commands.json compilation database.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Source root directory for executorch. When it has not been set already, fall
# back to the directory two levels above this one.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../..")
endif()

# Compiler flags shared by the targets declared in this file:
# - silence deprecated-declaration warnings (/wd4996 on MSVC,
#   -Wno-deprecated-declarations on every other compiler);
# - pass -Wno-psabi when compiling with GCC.
set(_common_compile_options
    "$<IF:$<CXX_COMPILER_ID:MSVC>,/wd4996,-Wno-deprecated-declarations>"
    "$<$<CXX_COMPILER_ID:GNU>:-Wno-psabi>"
)

# Pull in the Eigen BLAS build rules and tell the sources that BLAS is
# available. NOTE: on Apple platforms we could rely on the Accelerate framework
# instead; this is to be revisited.
include("${CMAKE_CURRENT_LIST_DIR}/External/EigenBLAS.cmake")
list(APPEND _common_compile_options "-DET_BUILD_WITH_BLAS")

# TODO: setting CPU_CAPABILITY_AVX2 requires detecting both the architecture
# and the processor. ATen implements this slightly differently. We probably
# need a way to detect the compiler flags that indicate an AVX2 build; this is
# deferred for now.

# Shared ExecuTorch CMake helper modules.
include("${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake")
include("${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake")

# cpublas: static library built from the files listed in
# _optimized_cpublas__srcs. That list is not defined in this file; its entries
# are prefixed with EXECUTORCH_ROOT here before use.
list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_library(cpublas STATIC ${_optimized_cpublas__srcs})

# Torch headers are only needed to compile cpublas itself.
target_include_directories(cpublas PRIVATE ${TORCH_INCLUDE_DIRS})

# PUBLIC: the shared flags and the link dependencies also apply to consumers
# of cpublas.
target_compile_options(cpublas PUBLIC ${_common_compile_options})
target_link_libraries(
  cpublas
  PUBLIC executorch_core
         eigen_blas
         extension_threadpool
)

# Generate C++ bindings to register kernels into both PyTorch (for AOT) and
# Executorch (for runtime). All ops listed in optimized.yaml are selected.
#
# The yaml path is computed once and passed to both codegen calls so the two
# cannot drift apart. CMAKE_CURRENT_LIST_DIR anchors it to the directory that
# contains this file.
set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized.yaml")
gen_selected_ops(LIB_NAME "optimized_ops_lib" OPS_SCHEMA_YAML "${_yaml}")

generate_bindings_for_kernels(
  LIB_NAME "optimized_ops_lib" FUNCTIONS_YAML "${_yaml}"
  ADD_EXCEPTION_BOUNDARY
)

# NOTE(review): gen_command_sources is presumably set by
# generate_bindings_for_kernels -- confirm in tools/cmake/Codegen.cmake.
# Logged at STATUS level because it is purely informational; a mode-less
# message() is emitted as a NOTICE on stderr.
message(STATUS "Generated files ${gen_command_sources}")

# optimized_kernels: library built from the files listed in
# _optimized_kernels__srcs. That list is not defined in this file; its entries
# are prefixed with EXECUTORCH_ROOT here before use. No library type is given,
# so CMake's default (BUILD_SHARED_LIBS) decides between static and shared.
list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_library(optimized_kernels ${_optimized_kernels__srcs})

# Shared flags propagate to consumers (PUBLIC).
target_compile_options(optimized_kernels PUBLIC ${_common_compile_options})

# Build-only definitions: ET_USE_PYTORCH_HEADERS expands to ET_HAS_EXCEPTIONS,
# and POCKETFFT_USE_POSIX_MEMALIGN is defined for the pocketfft headers.
target_compile_definitions(
  optimized_kernels
  PRIVATE "ET_USE_PYTORCH_HEADERS=ET_HAS_EXCEPTIONS"
          POCKETFFT_USE_POSIX_MEMALIGN
)

# Build-only include paths: Torch headers plus the vendored pocketfft.
target_include_directories(
  optimized_kernels
  PRIVATE ${TORCH_INCLUDE_DIRS}
          "${EXECUTORCH_ROOT}/third-party/pocketfft"
)

target_link_libraries(
  optimized_kernels
  PUBLIC executorch_core
         cpublas
         extension_threadpool
         kernels_util_all_deps
)

# op_grid_sampler_2d.cpp uses ARMv8.2-a+fp16 NEON intrinsics (vcvt_f32_f16 /
# vld1_f16) when compiled for aarch64, so it needs an extra `-march` flag
# there. The flag is attached to that single source file only, which leaves
# non-arm64 targets (e.g. x86_64 on Android) unaffected; the kernel itself has
# `#ifdef __aarch64__` guards and falls through to the portable kernel
# otherwise.
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64|arm64"
   OR "${ANDROID_ABI}" STREQUAL "arm64-v8a"
)
  set_property(
    SOURCE "${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d.cpp"
    PROPERTY COMPILE_OPTIONS "-march=armv8.2-a+fp16"
  )
endif()

# optimized_ops_lib: registers the optimized op kernels (provided by the
# optimized_kernels library) into the Executorch runtime.
gen_operators_lib(
  LIB_NAME "optimized_ops_lib"
  KERNEL_LIBS optimized_kernels
  DEPS executorch_core
)

# On-device verifier for the optimized grid_sampler_2d / sum.IntList_out ops.
# It is only built when -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON is passed, so
# default AAR / library builds are not affected. It cross-checks both ops
# against an fp32 reference derived from the portable kernel and exits non-zero
# on divergence.
if(EXECUTORCH_BUILD_OPTIMIZED_VERIFY)
  # Start from the shared flags; on arm64 add the same fp16 -march flag that
  # op_grid_sampler_2d.cpp is compiled with.
  set(_verify_compile_options ${_common_compile_options})
  if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64|arm64"
     OR "${ANDROID_ABI}" STREQUAL "arm64-v8a"
  )
    list(APPEND _verify_compile_options "-march=armv8.2-a+fp16")
  endif()

  add_executable(
    verify_optimized_kernels "${EXECUTORCH_ROOT}/kernels/optimized/verify.cpp"
  )
  target_compile_options(
    verify_optimized_kernels PRIVATE ${_verify_compile_options}
  )
  target_link_libraries(
    verify_optimized_kernels
    PRIVATE optimized_kernels
            portable_kernels
            executorch_core
  )
endif()

# Install the libraries built in this file and add them to the
# ExecuTorchTargets export set. Libraries go to CMAKE_INSTALL_LIBDIR; public
# headers go under CMAKE_INSTALL_INCLUDEDIR/executorch/kernels/optimized/.
install(
  # eigen_blas doesn't export itself, so it is installed and exported here
  # together with the targets defined in this file.
  TARGETS cpublas optimized_kernels optimized_ops_lib eigen_blas
  EXPORT ExecuTorchTargets
  DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/kernels/optimized/
)