diff --git a/.gitignore b/.gitignore index bbcdfd1d6..3d6100c30 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,6 @@ android_build ios_build gpu_build output + +.idea +.vscode diff --git a/.travis.yml b/.travis.yml index bf0ada02b..a1d2785ee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,8 +7,8 @@ os: env: - JOB="-p NVIDIA-GPU -o Centos" - JOB="-p NVIDIA-GPU -o Ubuntu" - #- JOB="-p AMD_GPU -o Centos" - #- JOB="-p AMD_GPU -o Ubuntu" + #- JOB="-p AMD-GPU -o Centos" + #- JOB="-p AMD-GPU -o Ubuntu" #- JOB="-p X86-ONLY -o Centos" #- JOB="-p X86-ONLY -o Ubuntu" #- JOB="-p ARM -o Centos" @@ -31,6 +31,8 @@ branches: only: - master - developing + - AMD + - dev_v2 notifications: email: diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 000000000..bcc5f3ead --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,27 @@ +| Github account | name | +|---|---| +| chenjiaoAngel | Jiao Chen | +| cyj1986 | Yujuan Cheng | +| feifei14119 | Fei Wang | +| jackyh | Chengjie He | +| Jayoprell | Xiaocheng Luo | +| jjsbear | Jingsong Ji | +| LittleMaer | Yi Zhuang | +| mengkai94 | Kai Meng | +| micytw | Michael Wu | +| pangge | Chaowen Cui | +| perchbird | Xiaokun Yu | +| PeterJkPeng | Junyi Peng | +| qq332982511 | Junjie Liu | +| Shixiaowei02 | Xiaowei Shi | +| sogalin | Soga Lin | +| throneclay | Shuai Zhang | +| vin-huang | Vin Huang | +| wgy0804 | Guoya Wang | +| xklnono | Kailu Xu | +| xyoungli | Xiaoyang Li | +| yanan1112 | Yanan Liu | +| yao-matrix | Weifeng Yao | +| zdcocnftcp10 | Dachuan Zhao | +| zhouhuan2009 | Huan Zhou | +| zoooooooyuan | Yuan Zu | \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ccb37468f..189a3414f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,19 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. 
All Rights Reserved -# @file root cmakefile -# @auther cuichaowen -# @date 2017-10-24 -# ---------------------------------------------------------------------------- +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + include(cmake/thirdparty_version.cmake) -cmake_minimum_required(VERSION ${MIN_CMAKE_V} FATAL_ERROR) -project(ANAKIN C CXX) +project(ANAKIN C CXX) include(cmake/msg_color.cmake) include(cmake/utils.cmake) include(cmake/statistic.cmake) @@ -14,10 +21,12 @@ include(cmake/statistic.cmake) # ---------------------------------------------------------------------------- # section: global anakin version and lib name # ---------------------------------------------------------------------------- -# global anakin version 2.0.1 -set(VERSION_MAJOR "2") -set(VERSION_MINOR "0") -set(VERSION_PATCH "1") +cmake_minimum_required(VERSION ${MIN_CMAKE_V} FATAL_ERROR) + +# global anakin version 0.1.0 +set(VERSION_MAJOR "0") +set(VERSION_MINOR "1") +set(VERSION_PATCH "0") set(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") # anakin lib name and global directories @@ -28,12 +37,15 @@ set(ANAKIN_ROOT ${PROJECT_SOURCE_DIR}) include_directories(${ANAKIN_ROOT}) set(ANAKIN_FRAMEWORK ${ANAKIN_ROOT}/framework) -set(ANAKIN_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party) +set(ANAKIN_LITE ${ANAKIN_FRAMEWORK}/lite) set(ANAKIN_UTILS ${ANAKIN_ROOT}/utils) set(ANAKIN_THIRD_PARTY_PATH ${ANAKIN_ROOT}/third-party) 
set(ANAKIN_MODEL_PARSER ${ANAKIN_FRAMEWORK}/model_parser) +set(ANAKIN_SERVICE ${ANAKIN_FRAMEWORK}/service) set(ANAKIN_SABER ${ANAKIN_ROOT}/saber) set(ANAKIN_UNIT_TEST ${ANAKIN_ROOT}/test) +set(ANAKIN_EXAMPLES ${ANAKIN_ROOT}/examples) + # ---------------------------------------------------------------------------- # section: options for anakin @@ -48,12 +60,13 @@ anakin_option(ANAKIN_TYPE_INT8 "define the INT8 for data precision." NO) anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." YES) anakin_option(USE_X86_PLACE "Select the build mode for X86 place." YES) anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO) +anakin_option(USE_BM_PLACE "Select the build mode for BM place." NO) # plantfrom details anakin_option(NVIDIA_GPU "Use NVIDIA GPU place." YES if USE_GPU_PLACE) anakin_option(AMD_GPU "Use AMD GPU place." NO if USE_GPU_PLACE AND NOT NVIDIA_GPU) -anakin_option(TARGET_ANDROID "" NO if USE_ARM_PLACE) -anakin_option(TARGET_IOS "" NO if USE_ARM_PLACE) +anakin_option(TARGET_ANDROID "build for android" YES if USE_ARM_PLACE) +anakin_option(TARGET_IOS "not supported now" YES if USE_ARM_PLACE AND NOT TARGET_ANDROID) # compile options for NVIDIA_GPU place anakin_option(USE_CUDA "Use Cuda libs." YES if NVIDIA_GPU) @@ -64,60 +77,52 @@ anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_CUDA) anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_CUDA) anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM) -# compile options for BM place -#anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU) -#anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM) -#anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM) -#anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM) -#anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM) -#anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." 
YES if USE_BM) - - if(USE_CUDA) # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well. set(SELECTED_SASS_TARGET_ARCH "61") -elseif(USE_BM) - # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well. - #set(SELECTED_SASS_TARGET_ARCH "61") endif() if((NOT BUILD_FAT_BIN) AND (NOT BUILD_CROSS_PLANTFORM) AND USE_CUDA) # Select the only nvidia gpu arch you want to be built on - set(TARGET_GPUARCH 6.1) + set(TARGET_GPUARCH 6.1) endif() # build options for cuda. anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_CUDA) anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_CUDA) -# build options for BM. -anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_BM) -anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_BM) - - # common build options -anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." NO) +anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." YES) anakin_option(ENABLE_VERBOSE_MSG "Enable verbose=1 : compile msg during make." NO) anakin_option(DISABLE_ALL_WARNINGS "Disable all the warning msg during compile." YES) anakin_option(ENABLE_NOISY_WARNINGS "Enable noisy warning msg during compile." NO if DISABLE_ALL_WARNINGS) # using 3rd party libs -anakin_option(USE_GLOG "Build Glog components." NO) +anakin_option(USE_LOGGER "Build native logger components." YES) +anakin_option(USE_GLOG "Build Glog components." NO if NOT USE_LOGGER) anakin_option(USE_PROTOBUF "Build Google protobuf components." YES) anakin_option(USE_OPENCV "Use static opencv libs." NO) anakin_option(USE_BOOST "Use static BOOST libs." NO) -anakin_option(USE_OPENMP "Use Openmp when in andriod environment." YES if TARGET_ANDROID) +anakin_option(USE_OPENMP "Use Openmp when in android environment." 
YES if TARGET_ANDROID) anakin_option(USE_GTEST "Use googletest libs." NO if BUILD_WITH_UNIT_TEST) anakin_option(USE_PYTHON "Generate py wrappers." NO) -anakin_option(USE_OPENCL "Use OpenCL ." NO) +anakin_option(USE_OPENCL "Use OpenCL ." YES if AMD_GPU) anakin_option(USE_GFLAGS "Build Google gflags components." NO) anakin_option(USE_MKL "Use mkl libs." NO if USE_X86_PLACE) anakin_option(USE_MKLML "Use MKLML libs." YES if USE_X86_PLACE) anakin_option(USE_XBYAK "Use XBYAK libs." YES if USE_X86_PLACE) -anakin_option(USE_OPENMP "Use Openmp when in andriod environment." YES if TARGET_ANDROID) +anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID) # build components anakin_option(BUILD_WITH_UNIT_TEST "Build anakin unit test components." YES) +anakin_option(BUILD_WITH_FRAMEWORK "Build anakin framework" YES) + +anakin_option(BUILD_RPC "Build anakin rpc service components." NO if BUILD_WITH_FRAMEWORK) +anakin_option(BUILD_WITH_LITE "Build anakin lite components." YES if USE_GPU_PLACE AND BUILD_WITH_FRAMEWORK) + +# build examples +anakin_option(BUILD_EXAMPLES "build detection and classification examples" NO) + # build target anakin_option(BUILD_SHARED "Build anakin shared lib." YES) anakin_option(BUILD_STATIC "Build anakin static lib." YES if NOT BUILD_SHARED) @@ -127,10 +132,16 @@ anakin_option(ENABLE_OP_TIMER "Enable op timer mode." NO) # ---------------------------------------------------------------------------- # section: anakin compiler and linker options # ---------------------------------------------------------------------------- +set(CMAKE_BUILD_TYPE Debug FORCE) if(ENABLE_DEBUG) - set(CMAKE_BUILD_TYPE Debug FORCE) + set(CMAKE_BUILD_TYPE Debug FORCE) else() - set(CMAKE_BUILD_TYPE Release FORCE) + set(CMAKE_BUILD_TYPE Release FORCE) +endif() + +if(USE_LOGGER) + anakin_option(ENABLE_STACKTRACES "If enable local logger with stacktrace." 
YES if NOT USE_ARM_PLACE) + anakin_option(SUPPORT_PTHREADS "If enable local logger with supporting pthreads. " YES) endif() # ---------------------------------------------------------------------------- @@ -138,8 +149,8 @@ endif() # code # ---------------------------------------------------------------------------- configure_file ( - "${PROJECT_SOURCE_DIR}/cmake/config/anakin_config.h.in" - "${PROJECT_BINARY_DIR}/anakin_config.h" + "${PROJECT_SOURCE_DIR}/cmake/config/anakin_config.h.in" + "${PROJECT_BINARY_DIR}/anakin_config.h" ) # add the binary tree to the search path so that anakin will find ak_config.h include_directories(${PROJECT_BINARY_DIR}) @@ -157,10 +168,6 @@ if(USE_CUDA) include(cmake/cuda.cmake) endif() -if(USE_BM) - #include(cmake/cuda.cmake) -endif() - if(USE_X86_PLACE) set(ANAKIN_TEMP_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party) if(USE_MKLML) @@ -172,6 +179,10 @@ if(USE_X86_PLACE) #include(cmake/external/mkldnn.cmake) endif() +if(AMD_GPU) + include(cmake/amd.cmake) +endif() + # gather all the config options to anakin include(cmake/gather.cmake) @@ -181,14 +192,35 @@ include(cmake/gather.cmake) # ---------------------------------------------------------------------------- # add source sub_directory whick holds the cmake build module # fetch files of model_parser -add_subdirectory(${ANAKIN_MODEL_PARSER}) + + add_subdirectory(${ANAKIN_SABER}) -add_subdirectory(${ANAKIN_FRAMEWORK}) + +if(USE_BM_PLACE) + add_subdirectory(${ANAKIN_SABER}/funcs/impl/bm) +endif() + +if(BUILD_WITH_FRAMEWORK) + add_subdirectory(${ANAKIN_MODEL_PARSER}) + if(BUILD_RPC) + add_subdirectory(${ANAKIN_SERVICE}) + endif() + if(BUILD_WITH_LITE) + add_subdirectory(${ANAKIN_LITE}) + endif() + add_subdirectory(${ANAKIN_FRAMEWORK}) +endif() if(BUILD_WITH_UNIT_TEST) add_subdirectory(${ANAKIN_UNIT_TEST}) endif() +if (BUILD_EXAMPLES) + if(BUILD_WITH_FRAMEWORK) + add_subdirectory(${ANAKIN_EXAMPLES}) + endif() +endif() + anakin_print_statistic() diff --git a/README.md b/README.md index 
4cabf240b..fcbfe9ae6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Anakin +# Anakin12 [![Build Status](https://travis-ci.org/PaddlePaddle/Anakin.svg?branch=developing)](https://travis-ci.org/PaddlePaddle/Anakin) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -7,63 +7,65 @@ Welcome to the Anakin GitHub. -Anakin is an cross-platform, high-performance inference engine, which is originally +Anakin is a cross-platform, high-performance inference engine, which is originally developed by Baidu engineers and is a large-scale application of industrial products. -Please refer to our [release announcement]() to track the latest feature of Anakin. +Please refer to our [release announcement](https://github.com/PaddlePaddle/Anakin/releases) to track the latest feature of Anakin. ## Features - **Flexibility** Anakin supports a wide range of neural network architectures and - diffrent hardware platform. It is easy to run Anakin at GPU/x86/ARM platform. + different hardware platforms. It is easy to run Anakin on GPU / x86 / ARM platform. - **High performance** - In order to giving full play to the performance of hardware, we optimize the - forward prediction at diffrent levels. - - Automatic graph fusion. The goal of all performance optimization under a - given algorithm is to make ALU as busy as possible, Operator fusion - can effectively reduce memory access and keep ALU busy. - - - Memory reuse. Forward prediction is a one-way calculation. We reuse - the memory between the input and output of different operators, thus + In order to give full play to the performance of hardware, we optimized the + forward prediction at different levels. + - Automatic graph fusion. The goal of all performance optimizations under a + given algorithm is to make the ALU as busy as possible. Operator fusion + can effectively reduce memory access and keep the ALU busy. + + - Memory reuse. Forward prediction is a one-way calculation. 
We reuse + the memory between the input and output of different operators, thus reducing the overall memory overhead. - - Assembly level optimization. Saber is Anakin's underlying DNN library, which + - Assembly level optimization. Saber is a underlying DNN library for Anakin, which is deeply optimized at assembly level. Performance comparison between Anakin, TensorRT - and Tensorflow-lite, please refer to the benchmark tests. + and Tensorflow-lite, please refer to the [benchmark tests](benchmark/README.md). ## Installation It is recommended to check out the -[Docker installation guide](docker/README.md). +[docker installation guide](docker/README.md). before looking into the [build from source guide](docs/Manual/INSTALL_en.md). +For ARM, please refer [run on arm](docs/Manual/run_on_arm_en.md). + ## Benchmark -It is recommended to check out the [Benchmark Readme](benchmark/README.md) +It is recommended to check out the [readme of benchmark](benchmark/README.md). ## Documentation -We provide [English](docs/Manual/Tutorial_en.md) and -[Chinese](docs/Manual/Tutorial_ch.md) documentation. +We provide [English](docs/Manual/Tutorial_en.md) and [Chinese](docs/Manual/Tutorial_ch.md) documentation. -- [Anakin developer guide]() +- Developer guide - You might want to know more details of Anakin and make it better. + You might want to know more details of Anakin and make it better. Please refer to [how to add custom devices](docs/Manual/addCustomDevice.md) and [how to add custom device operators](docs/Manual/addCustomOp.md). -- [C++ API]() +- User guide - Python API is under-developing. + You can get the working principle of the project, C++ interface description and code examples from [here](docs/Manual/Tutorial_ch.md). You can also learn about the model converter [here](docs/Manual/Converter_ch.md). -- [How to Contribute]() +- [How to Contribute](docs/Manual/Contribution_ch.md) We appreciate your contributions! 
+ ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Anakin/issues). diff --git a/benchmark/README.md b/benchmark/README.md index 5dcf61d93..94f57930f 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -1,173 +1,42 @@ # Benchmark -## Machine: - -This time, we only provide benchmark on GPU. In the near future, we will add benchmark on ARM and CPU. - -> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz` -> GPU: `Tesla P4` -> cuDNN: `v7` - -## Counterpart of anakin : -The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** , The models which TensorRT 3 doesn't support we use the custom plugins to support. - ## Benchmark Model -The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`. - You can use pretrained caffe model or the model trained by youself. - > Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](#) +### GPU -- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)* -- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)* -- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* -- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* -- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* -- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* -- [RNN](#7) *not support yet* - -We tested them on single-GPU with single-thread. 
- -### VGG16 - -- Latency (`ms`) of different batch - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 8.8690 | 8.2815 - 2 | 15.5344 | 13.9116 - 4 | 26.6000 | 21.8747 - 8 | 49.8279 | 40.4076 - 32 | 188.6270 | 163.7660 - -- GPU Memory Used (`MB`) - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 963 | 997 - 2 | 965 | 1039 - 4 | 991 | 1115 - 8 | 1067 | 1269 - 32 | 1715 | 2193 - - -### Yolo - -- Latency (`ms`) of different batch - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 16.4596| 15.2124 - 2 | 26.6347| 25.0442 - 4 | 43.3695| 43.5017 - 8 | 80.9139 | 80.9880 - 32 | 293.8080| 310.8810 - -- GPU Memory Used (`MB`) - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 1569 | 1775 - 2 | 1649 | 1815 - 4 | 1709 | 1887 - 8 | 1731 | 2031 - 32 | 2253 | 2907 - -### Resnet50 - -- Latency (`ms`) of different batch - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 4.2459 | 4.1061 - 2 | 6.2627 | 6.5159 - 4 | 10.1277 | 11.3327 - 8 | 17.8209 | 20.6680 - 32 | 65.8582 | 77.8858 - -- GPU Memory Used (`MB`) - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 531 | 503 - 2 | 543 | 517 - 4 | 583 | 541 - 8 | 611 | 589 - 32 | 809 | 879 - -### Resnet101 - -- Latency (`ms`) of different batch - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 7.5562 | 7.0837 - 2 | 11.6023 | 11.4079 - 4 | 18.3650 | 20.0493 - 8 | 32.7632 | 36.0648 - 32 | 123.2550 | 135.4880 - -- GPU Memory Used (`MB)` - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 701 | 683 - 2 | 713 | 697 - 4 | 793 | 721 - 8 | 819 | 769 - 32 | 1043 | 1059 - - -### MobileNet V1 - -- Latency (`ms`) of different batch - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 45.5156 | 1.3947 - 2 | 46.5585 | 2.5483 - 4 | 48.4242 | 4.3404 - 8 | 52.7957 | 8.1513 - 32 | 83.2519 | 31.3178 - -- GPU Memory Used (`MB`) - - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 329 | 283 - 2 | 345 | 289 - 4 | 371 | 
299 - 8 | 393 | 319 - 32 | 531 | 433 - -### MobileNet V2 - -- Latency (`ms`) of different batch +The following convolutional neural networks are tested with both `Anakin` and `TenorRT3` on GPU. + You can use pretrained caffe model or the model trained by youself. - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 65.6861 | 2.9842 - 2 | 66.6814 | 4.7472 - 4 | 69.7114 | 7.4163 - 8 | 76.1092 | 12.8779 - 32 | 124.9810 | 47.2142 +- [Vgg16]() *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)* +- [Yolo]() *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)* +- [Resnet50]() *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* +- [Resnet101]() *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* +- [Mobilenet v1]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [Mobilenet v2]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [RNN]() *not support yet* -- GPU Memory Used (`MB`) +### CPU - BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 341 | 293 - 2 | 353 | 301 - 4 | 385 | 319 - 8 | 421 | 351 - 32 | 637 | 551 +The following convolutional neural networks are tested with `Anakin`, 'Tensorflow' and `Tensorflow`. + You can use pretrained model or the model trained by youself. +- [Language model]() *fluid model can be found [here->](https://github.com/PaddlePaddle/models/tree/develop/fluid/language_model)* +- [Chinese_ner]() *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/chinese_ner)* +- [text_classification]() *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/text_classification)* -### RNN +### ARM -The benchmark of rnn network will be added later. +The following convolutional neural networks are tested with `Anakin`, 'Tensorflow' and `Tensorflow`. 
+ You can use pretrained model or the model trained by youself. -## How to run those Benchmark models? +- [Mobilenet v1]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [Mobilenet v2]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [mobilenet-ssd]() *caffe model can be found [here->](https://github.com/chuanqi305/MobileNet-SSD)* -> Please refer to [Instructions](CNN/README.md) +## Test Results +The detailed test results can be seen here. +- [GPU](./README_GPU.md) +- [CPU](./README_CPU.md) +- [ARM](./README_ARM.md) diff --git a/benchmark/README_ARM.md b/benchmark/README_ARM.md new file mode 100644 index 000000000..368706570 --- /dev/null +++ b/benchmark/README_ARM.md @@ -0,0 +1,66 @@ +# BenchMark + +## Machine: + ++ Compile circumstance: Android ndk cross compile,gcc 4.9,enable neon ++ ABI: armveabi-v7a with neon -mfloat-abi=softfp ++ Testing platform + - honor v9(root): Kirin960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz + - nubia z17:Qualcomm835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz + - 360 N5:Qualcomm653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz ++ Time:warmup 10,running 10 times to get average time ++ ncnn :git clone on github master branch and commits ID is 307a77f04be29875f40d337cfff6df747df09de6(msg:convert LogisticRegressionOutput) ++ TFlite:git clone on github master branch and commits ID is 65c05bc2ac19f51f7027e66350bc71652662125c(msg:Removed unneeded file copy that was causing failure in Pi builds) + +## Counterpart of Anakin + +The counterpart of **`Anakin`** are **`ncnn`** and **`TFlite`**. 
+ +## BenchMark model + +> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md) + +- [Mobilenet v1](#11) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [Mobilenet v2](#22) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [mobilenet-ssd](#33) *caffe model can be found [here->](https://github.com/chuanqi305/MobileNet-SSD)* + +We tested them on ARM with multi-thread and single-batchsize. + +### mobilenetv1 + +- Latency (`ms`) of different thread + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |Kirin960|107.7|61.1ms|38.2 |152.8 |85.2 |51.9 |152.6 |nan|nan| + |Qualcomm835|105.7 |63.1 |~~46.8 ~~|152.7 |87.0 |~~92.7 ~~|146.9 |nan|nan| + |Qualcomm653|120.3 |64.2 |46.6 |202.5 |117.6 |84.8 |158.6 |nan|nan| + +### mobilenetv2 + +- Latency (`ms`) of different thread + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |Kirin960|93.1 |53.9 |34.8 |144.4 |84.3 |55.3 |100.6 |nan|nan| + |Qualcomm835|93.0 |55.6 |41.1 |139.1 |88.4 |58.1 |95.2 |nan|nan| + |Qualcomm653|106.6 |64.2 |48.0 |199.9 |125.1 |98.9 |108.5 |nan|nan| + +### mobilenet-ssd + +- Latency (`ms`) of different thread + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |Kirin960|213.9 |120.5 |74.5 |307.9 |166.5 |104.2 |nan|nan|nan| + |Qualcomm835|213.0 |125.7 |~~98.4 ~~|292.9 |177.9 |~~167.8 ~~|nan|nan|nan| + |Qualcomm653|236.0 |129.6 |96.0 |377.7 |228.9 |165.0 |nan|nan|nan + +## How to run those Benchmark 
models? + +1. At first, you should parse the caffe model with [External Converter](../docs/Manual/Converter_en.md) +2. Second, adb push Anakin model and benchmark_arm bin to testing phone +3. Then, switch to /data/local/tmp/ directory on testing phone, run `./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1` command +4. Finally,model latency summary will be displayed on the screen. +5. You can see the detailed parameters meaning by running `/benchmark_arm` + diff --git a/benchmark/README_CPU.md b/benchmark/README_CPU.md new file mode 100644 index 000000000..6113e2e2c --- /dev/null +++ b/benchmark/README_CPU.md @@ -0,0 +1,281 @@ +# Benchmark + +## Machine: + +This time, we only provide benchmark on CPU. In the near future, we will add benchmark on ARM and GPU. + +> System: `CentOS 7 in Docker`, for benchmark between Anakin and Tensorflow +> System: `CentOS 6.3`, for benchmark between Anakin and Paddle + +## Counterpart of anakin : + +The counterpart of **`Anakin`** is `Tensorflow 1.8.0`, which installed by Anaconda 4.5.4, run by Python 3.6 + +## Benchmark Model + + You can use pretrained model or the model trained by youself. + +> Please note that you should transform fluid model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md) + +- [Language model](#1) *fluid model can be found [here->](https://github.com/PaddlePaddle/models/tree/develop/fluid/language_model)* +- [Chinese_ner](#4) *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/chinese_ner)* +- [text_classification](#7) *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/text_classification)* + +We tested them on single-CPU with different thread numbers. + +1. 
**`Anakin`** VS **`Tensorflow`** + +### language model in i7-7700 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 5.64 | 2.44 + 2 | 8.29 | 4.44 + 4 | 14.23 | 9.91 + 6 | 19.83 | 15.51 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 3459 | 8536 + 2 | 4772 | 9399 + 4 | 5498 | 8418 + 6 | 5764 | 8070 + +### language model in E5-2620 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 6.31 | 2.84 + 2 | 7.94 | 2.678 + 4 | 8.66 | 4.32 + 6 | 12.33 | 7.12 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 2890 | 7257 + 2 | 4726 | 15439 + 4 | 8659 | 18351 + 6 | 9414 | 17461 + +### language model in E5-2650 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 3.69 | 2.84 + 2 | 4.62 | 2.85 + 4 | 7.78 | 3.48 + 6 | 13.54 | 4.79 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 4456 | 7300 + 2 | 7522 | 14556 + 4 | 9580 | 22086 + 6 | 8664 | 23938 + +### text_classfication model in i7-7700 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 1.25 | 0.32 + 2 | 1.87 | 0.33 + 4 | 2.01 | 0.35 + 6 | 2.81 | 0.58 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 12797 | 53506 + 2 | 17933 | 95898 + 4 | 31965 | 148427 + 6 | 31784 | 118684 + +### text_classfication in E5-2620 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 3.89 | 0.58 + 2 | 3.77 | 0.61 + 4 | 3.05 | 0.62 + 6 | 3.84 | 0.66 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 4281 | 28192 + 2 | 8804 | 49840 + 4 | 19949 | 89710 + 6 | 24798 | 116975 + +### text_classfication in E5-2650 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | 
Anakin + :---: | :---: | :---: | + 1 | 2.26 | 0.67 + 2 | 2.34 | 0.7 + 4 | 2.25 | 0.72 + 6 | 2.47 | 0.73 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 6337 | 24636 + 2 | 12266 | 45368 + 4 | 24869 | 81952 + 6 | 34872 | 109993 + +### chinese_ner model in i7-7700 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 1.96 | 0.094 + 2 | 2.59 | 0.098 + 4 | 3.74 | 0.1 + 6 | 3.95 | 0.13 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 8747 | 156564 + 2 | 13293 | 208484 + 4 | 18294 | 114348 + 6 | 25338 | 66480 + +### chinese_ner in E5-2620 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 5.44 | 0.13 + 2 | 5.45 | 0.14 + 4 | 4.84 | 0.15 + 6 | 5.18 | 0.16 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 4281 | 93527 + 2 | 8804 | 127232 + 4 | 19949 | 118649 + 6 | 24798 | 99553 + +### chinese_ner in E5-2650 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 3.61 | 0.16 + 2 | 3.78 | 0.16 + 4 | 3.74 | 0.17 + 6 | 3.78 | 0.16 + +- Throughput (`words/s`) + + ThreadNum | Tensorflow | Anakin + :---: | :---: | :---: | + 1 | 4669 | 79225 + 2 | 8953 | 115761 + 4 | 18074 | 118696 + 6 | 26607 | 102044 + +2. **`Anakin`** VS **`PaddlePaddle/Fluid`** +We use private dataset and different QPS index in this benchmark. 
+### language model in E5-2650 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Fluid | Anakin + :---: | :---: | :---: | + 1 | 42.7418 | 1.93589 + 2 | 42.7418 | 2.49537 + 6 | 42.7734 | 3.14332 + 10 | 43.0721 | 4.55329 + 12 | 42.8501 | 5.09893 + +- Throughput (`sentence/s`) + + ThreadNum | Fluid | Anakin + :---: | :---: | :---: | + 1 | 23 | 504 + 2 | 46 | 762 + 6 | 134 | 1393 + 10 | 218 | 1556 + 12 | 260 | 1541 + +### Chinese_ner model in E5-2650 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Fluid | Anakin + :---: | :---: | :---: | + 1 | 0.380475 | 0.17034 + 4 | 0.380475 | 0.171143 + 6 | 0.380475 | 0.172688 + 10 | 0.380475 | 0.173269 + 12 | 0.380475 | 0.17668 + +- Throughput (`sentence/s`) + + ThreadNum | Fluid | Anakin + :---: | :---: | :---: | + 1 | 7844 | 5822 + 4 | 7844 | 11377 + 6 | 7844 | 29725 + 10 | 7844 | 41238 + 12 | 7844 | 42790 + +### text_classfication model in E5-2650 v4 + +- Latency (`ms`) of one batch + + ThreadNum | Fluid | Anakin + :---: | :---: | :---: | + 1 | 1.48578 | 1.10088 + 4 | 1.54025 | 1.11258 + 6 | 1.68529 | 1.1257 + 10 | 1.9817 | 1.13267 + 12 | 2.21864 | 1.1429 + +- Throughput (`sentence/s`) + + ThreadNum | Fluid | Anakin + :---: | :---: | :---: | + 1 | 673 | 901 + 4 | 1289 | 1665 + 6 | 3458 | 4449 + 10 | 4875 | 6183 + 12 | 5265 | 6188 + +## How to run those Benchmark models? + +> 1. You can just run `sh benchmark_tensorflow.sh` and `sh benchmark_anakin.sh` +> 2. Get the model of caffe or fluid, convert model to anakin model, use net_test_*** to test your model. 
+ + diff --git a/benchmark/README_GPU.md b/benchmark/README_GPU.md new file mode 100644 index 000000000..04326535a --- /dev/null +++ b/benchmark/README_GPU.md @@ -0,0 +1,176 @@ +# Benchmark + +## Machine: + +> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz` +> GPU: `Tesla P4` +> cuDNN: `v7` + + +## Counterpart of anakin : + +The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** , The models which TensorRT 3 doesn't support we use the custom plugins to support. + +## Benchmark Model + +The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`. + You can use pretrained caffe model or the model trained by youself. + +> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md) + + +- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)* +- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)* +- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* +- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* +- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* +- [RNN](#7) *not support yet* + +We tested them on single-GPU with single-thread. 
+ +### VGG16 + +- Latency (`ms`) of different batch + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 8.85176 | 8.15362 + 2 | 15.6517 | 13.8716 + 4 | 26.5303 | 21.8478 + 8 | 48.2286 | 40.496 + 32 | 183.994 | 163.035 + +- GPU Memory Used (`MB`) + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 887 | 648 + 2 | 965 | 733 + 4 | 991 | 810 + 8 | 1067 | 911 + 32 | 1715 | 1325 + + +### Yolo + +- Latency (`ms`) of different batch + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 16.4623| 15.3214 + 2 | 26.7082| 25.0305 + 4 | 43.2129| 43.4758 + 8 | 80.0053 | 80.7645 + 32 | 283.352| 311.152 + +- GPU Memory Used (`MB`) + + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 1226 | 1192 + 2 | 1326 | 1269 + 4 | 1435 | 1356 + 8 | 1563 | 1434 + 32 | 2150 | 1633 + +### Resnet50 + +- Latency (`ms`) of different batch + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 4.26834 | 3.25853 + 2 | 6.2811 | 6.12156 + 4 | 10.1183 | 10.9219 + 8 | 18.1395 | 20.323 + 32 | 66.4728 | 83.9934 + +- GPU Memory Used (`MB`) + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 932 | 272 + 2 | 936 | 318 + 4 | 720 | 376 + 8 | 697 | 480 + 32 | 842 | 835 + +### Resnet101 + +- Latency (`ms`) of different batch + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 7.58234 | 5.66457 + 2 | 11.6014 | 10.9213 + 4 | 18.3298 | 19.3987 + 8 | 32.6523 | 37.5575 + 32 | 123.114 | 149.089 + +- GPU Memory Used (`MB)` + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 1020 | 420 + 2 | 961 | 467 + 4 | 943 | 503 + 8 | 885 | 606 + 32 | 1048 | 1077 + +### MobileNet V1 + +- Latency (`ms`) of different batch + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 45.2189 | 1.39566 + 2 | 46.4538 | 2.50698 + 4 | 47.8918 | 4.38727 + 8 | 52.3636 | 8.21416 + 32 | 83.0503 | 31.33 + +- GPU Memory Used (`MB`) + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 516 | 176 + 2 | 524 | 166 + 4 | 497 | 
165 + 8 | 508 | 239 + 32 | 628 | 388 + +### MobileNet V2 + +- Latency (`ms`) of different batch + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 65.4277 | 1.80542 + 2 | 66.2048 | 3.85568 + 4 | 68.8045 | 6.80921 + 8 | 75.64 | 12.6038 + 32 | 124.09 | 47.6079 + +- GPU Memory Used (`MB`) + + BatchSize | TensorRT | Anakin + :---: | :---: | :---: | + 1 | 341 | 293 + 2 | 353 | 301 + 4 | 385 | 319 + 8 | 421 | 351 + 32 | 637 | 551 + +## How to run those Benchmark models? + +> 1. At first, you should parse the caffe model with [`external converter ->`](../docs/Manual/Converter_en.md). +> 2. Switch to *source_root/benchmark/CNN* directory. Use 'mkdir ./models' to create ./models and put anakin models into this file. +> 3. Use command 'sh run.sh', we will create files in logs to save model log with different batch size. Finally, model latency summary will be displayed on the screen. +> 4. If you want to get more detailed information with op time, you can modify CMakeLists.txt with setting `ENABLE_OP_TIMER` to `YES`, then recompile and run. You will find detailed information in model log file. + + + + + diff --git a/benchmark/RNN/README.md b/benchmark/RNN/README.md new file mode 100644 index 000000000..0232d7d22 --- /dev/null +++ b/benchmark/RNN/README.md @@ -0,0 +1,10 @@ +# RNN BenchMark + + +## 1. How to run + +Two way to run anakin + +> 1.You can just run `sh benchmark_tensorflow.sh` `sh benchmark_anakin.sh` +> 2.Get the model of caffe or fluid, convert model to anakin model, use net_test_*** to test your model + diff --git a/benchmark/RNN/Tokenizer.py b/benchmark/RNN/Tokenizer.py new file mode 100644 index 000000000..cceac5310 --- /dev/null +++ b/benchmark/RNN/Tokenizer.py @@ -0,0 +1,384 @@ +# -*- coding: utf-8 -*- +"""Utilities for text input preprocessing. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import string +import sys +import warnings +from collections import OrderedDict +from hashlib import md5 + +import numpy as np +from six.moves import range +from six.moves import zip + +if sys.version_info < (3,): + maketrans = string.maketrans +else: + maketrans = str.maketrans + + +def text_to_word_sequence(text, + filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', + lower=True, split=" "): + """Converts a text to a sequence of words (or tokens). + + # Arguments + text: Input text (string). + filters: list (or concatenation) of characters to filter out, such as + punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`, + includes basic punctuation, tabs, and newlines. + lower: boolean. Whether to convert the input to lowercase. + split: str. Separator for word splitting. + + # Returns + A list of words (or tokens). + """ + if lower: + text = text.lower() + + if sys.version_info < (3,): + if isinstance(text, unicode): + translate_map = dict((ord(c), unicode(split)) for c in filters) + text = text.translate(translate_map) + elif len(split) == 1: + translate_map = maketrans(filters, split * len(filters)) + text = text.translate(translate_map) + else: + for c in filters: + text = text.replace(c, split) + else: + translate_dict = dict((c, split) for c in filters) + translate_map = maketrans(translate_dict) + text = text.translate(translate_map) + + seq = text.split(split) + return [i for i in seq if i] + + +def one_hot(text, n, + filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', + lower=True, + split=' '): + """One-hot encodes a text into a list of word indexes of size n. + + This is a wrapper to the `hashing_trick` function using `hash` as the + hashing function; unicity of word to index mapping non-guaranteed. + + # Arguments + text: Input text (string). + n: int. Size of vocabulary. 
+ filters: list (or concatenation) of characters to filter out, such as + punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`, + includes basic punctuation, tabs, and newlines. + lower: boolean. Whether to set the text to lowercase. + split: str. Separator for word splitting. + + # Returns + List of integers in [1, n]. Each integer encodes a word + (unicity non-guaranteed). + """ + return hashing_trick(text, n, + hash_function=hash, + filters=filters, + lower=lower, + split=split) + + +def hashing_trick(text, n, + hash_function=None, + filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', + lower=True, + split=' '): + """Converts a text to a sequence of indexes in a fixed-size hashing space. + + # Arguments + text: Input text (string). + n: Dimension of the hashing space. + hash_function: defaults to python `hash` function, can be 'md5' or + any function that takes in input a string and returns a int. + Note that 'hash' is not a stable hashing function, so + it is not consistent across different runs, while 'md5' + is a stable hashing function. + filters: list (or concatenation) of characters to filter out, such as + punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`, + includes basic punctuation, tabs, and newlines. + lower: boolean. Whether to set the text to lowercase. + split: str. Separator for word splitting. + + # Returns + A list of integer word indices (unicity non-guaranteed). + + `0` is a reserved index that won't be assigned to any word. + + Two or more words may be assigned to the same index, due to possible + collisions by the hashing function. + The [probability]( + https://en.wikipedia.org/wiki/Birthday_problem#Probability_table) + of a collision is in relation to the dimension of the hashing space and + the number of distinct objects. 
+ """ + if hash_function is None: + hash_function = hash + elif hash_function == 'md5': + hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16) + + seq = text_to_word_sequence(text, + filters=filters, + lower=lower, + split=split) + return [(hash_function(w) % (n - 1) + 1) for w in seq] + + +class Tokenizer(object): + """Text tokenization utility class. + + This class allows to vectorize a text corpus, by turning each + text into either a sequence of integers (each integer being the index + of a token in a dictionary) or into a vector where the coefficient + for each token could be binary, based on word count, based on tf-idf... + + # Arguments + num_words: the maximum number of words to keep, based + on word frequency. Only the most common `num_words` words will + be kept. + filters: a string where each element is a character that will be + filtered from the texts. The default is all punctuation, plus + tabs and line breaks, minus the `'` character. + lower: boolean. Whether to convert the texts to lowercase. + split: str. Separator for word splitting. + char_level: if True, every character will be treated as a token. + oov_token: if given, it will be added to word_index and used to + replace out-of-vocabulary words during text_to_sequence calls + + By default, all punctuation is removed, turning the texts into + space-separated sequences of words + (words maybe include the `'` character). These sequences are then + split into lists of tokens. They will then be indexed or vectorized. + + `0` is a reserved index that won't be assigned to any word. 
+ """ + + def __init__(self, num_words=None, + filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', + lower=True, + split=' ', + char_level=False, + oov_token=None, + **kwargs): + # Legacy support + if 'nb_words' in kwargs: + warnings.warn('The `nb_words` argument in `Tokenizer` ' + 'has been renamed `num_words`.') + num_words = kwargs.pop('nb_words') + if kwargs: + raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) + + self.word_counts = OrderedDict() + self.word_docs = {} + self.filters = filters + self.split = split + self.lower = lower + self.num_words = num_words + self.document_count = 0 + self.char_level = char_level + self.oov_token = oov_token + self.index_docs = {} + + def fit_on_texts(self, texts): + """Updates internal vocabulary based on a list of texts. + + In the case where texts contains lists, + we assume each entry of the lists to be a token. + + Required before using `texts_to_sequences` or `texts_to_matrix`. + + # Arguments + texts: can be a list of strings, + a generator of strings (for memory-efficiency), + or a list of list of strings. 
+ """ + for text in texts: + self.document_count += 1 + if self.char_level or isinstance(text, list): + seq = text + else: + seq = text_to_word_sequence(text, + self.filters, + self.lower, + self.split) + for w in seq: + if w in self.word_counts: + self.word_counts[w] += 1 + else: + self.word_counts[w] = 1 + for w in set(seq): + if w in self.word_docs: + self.word_docs[w] += 1 + else: + self.word_docs[w] = 1 + + wcounts = list(self.word_counts.items()) + wcounts.sort(key=lambda x: x[1], reverse=True) + sorted_voc = [wc[0] for wc in wcounts] + # note that index 0 is reserved, never assigned to an existing word + self.word_index = dict( + list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1))))) + + if self.oov_token is not None: + i = self.word_index.get(self.oov_token) + if i is None: + self.word_index[self.oov_token] = len(self.word_index) + 1 + + for w, c in list(self.word_docs.items()): + self.index_docs[self.word_index[w]] = c + # print(self.word_index) + # print(self.index_docs) + + def fit_on_sequences(self, sequences): + """Updates internal vocabulary based on a list of sequences. + + Required before using `sequences_to_matrix` + (if `fit_on_texts` was never called). + + # Arguments + sequences: A list of sequence. + A "sequence" is a list of integer word indices. + """ + self.document_count += len(sequences) + for seq in sequences: + seq = set(seq) + for i in seq: + if i not in self.index_docs: + self.index_docs[i] = 1 + else: + self.index_docs[i] += 1 + + def texts_to_sequences(self, texts): + """Transforms each text in texts in a sequence of integers. + + Only top "num_words" most frequent words will be taken into account. + Only words known by the tokenizer will be taken into account. + + # Arguments + texts: A list of texts (strings). + + # Returns + A list of sequences. 
+ """ + res = [] + for vect in self.texts_to_sequences_generator(texts): + res.append(vect) + return res + + def texts_to_sequences_generator(self, texts): + """Transforms each text in `texts` in a sequence of integers. + + Each item in texts can also be a list, + in which case we assume each item of that list to be a token. + + Only top "num_words" most frequent words will be taken into account. + Only words known by the tokenizer will be taken into account. + + # Arguments + texts: A list of texts (strings). + + # Yields + Yields individual sequences. + """ + num_words = self.num_words + for text in texts: + if self.char_level or isinstance(text, list): + seq = text + else: + seq = text_to_word_sequence(text, + self.filters, + self.lower, + self.split) + vect = [] + # print(self.word_index) + for w in seq: + i = self.word_index.get(w) + + if num_words and i >= num_words: + if self.oov_token==None: + continue + else: + vect.append(num_words) + else: + vect.append(i) + yield vect + + def texts_to_matrix(self, texts, mode='binary'): + """Convert a list of texts to a Numpy matrix. + + # Arguments + texts: list of strings. + mode: one of "binary", "count", "tfidf", "freq". + + # Returns + A Numpy matrix. + """ + sequences = self.texts_to_sequences(texts) + return self.sequences_to_matrix(sequences, mode=mode) + + def sequences_to_matrix(self, sequences, mode='binary'): + """Converts a list of sequences into a Numpy matrix. + + # Arguments + sequences: list of sequences + (a sequence is a list of integer word indices). + mode: one of "binary", "count", "tfidf", "freq" + + # Returns + A Numpy matrix. + + # Raises + ValueError: In case of invalid `mode` argument, + or if the Tokenizer requires to be fit to sample data. 
+ """ + if not self.num_words: + if self.word_index: + num_words = len(self.word_index) + 1 + else: + raise ValueError('Specify a dimension (num_words argument), ' + 'or fit on some text data first.') + else: + num_words = self.num_words + + if mode == 'tfidf' and not self.document_count: + raise ValueError('Fit the Tokenizer on some data ' + 'before using tfidf mode.') + + x = np.zeros((len(sequences), num_words)) + for i, seq in enumerate(sequences): + if not seq: + continue + counts = {} + for j in seq: + if j >= num_words: + continue + if j not in counts: + counts[j] = 1. + else: + counts[j] += 1 + for j, c in list(counts.items()): + if mode == 'count': + x[i][j] = c + elif mode == 'freq': + x[i][j] = c / len(seq) + elif mode == 'binary': + x[i][j] = 1 + elif mode == 'tfidf': + # Use weighting scheme 2 in + # https://en.wikipedia.org/wiki/Tf%E2%80%93idf + tf = 1 + np.log(c) + idf = np.log(1 + self.document_count / + (1 + self.index_docs.get(j, 0))) + x[i][j] = tf * idf + else: + raise ValueError('Unknown vectorization mode:', mode) + return x diff --git a/benchmark/RNN/benchmark_anakin.sh b/benchmark/RNN/benchmark_anakin.sh new file mode 100755 index 000000000..51a8bc107 --- /dev/null +++ b/benchmark/RNN/benchmark_anakin.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -e +set -x +sdir=$(cd `dirname $0`; pwd) + +sh $sdir/prepare.sh + +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 1 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt 1 +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 2 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt 2 +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 4 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt 4 +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 6 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ 
$sdir/data/ptb.valid_tokenlize.txt 6 + +for i in {1,2,4,6} ;do +$sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt $i +done + +for i in {1,2,4,6} ;do +$sdir/../../output/unit_test/net_exec_test_chinese_ner $sdir/model/chinese_ner_model/ $sdir/data/ner_data.txt $i 1 +done + +for i in {1,2,4,6} ;do +$sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/text_classfication/ $sdir/data/ptb.valid_tokenlize.txt $i +done \ No newline at end of file diff --git a/benchmark/RNN/benchmark_tensorflow.sh b/benchmark/RNN/benchmark_tensorflow.sh new file mode 100755 index 000000000..0d874dcf0 --- /dev/null +++ b/benchmark/RNN/benchmark_tensorflow.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e +set -x + +sdir=$(cd `dirname $0`; pwd) + +sh $sdir/prepare.sh + +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 1 python $sdir/tensorflow_language_model.py 1 +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 2 python $sdir/tensorflow_language_model.py 2 +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 4 python $sdir/tensorflow_language_model.py 4 +#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 6 python $sdir/tensorflow_language_model.py 6 + +for i in {1,2,4,6};do +python $sdir/tensorflow_language_model.py --process_num=$i +done + +for i in {1,2,4,6};do +python $sdir/tensorflow_chinese_ner.py --process_num=$i +done + +for i in {1,2,4,6};do +python $sdir/tensorflow_text_classfication.py --process_num=$i +done \ No newline at end of file diff --git a/benchmark/RNN/prepare.sh b/benchmark/RNN/prepare.sh new file mode 100755 index 000000000..7762fff96 --- /dev/null +++ b/benchmark/RNN/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +sdir=$(cd `dirname $0`; pwd) + +if [ ! -e $sdir/data/ptb.valid.txt ]; then +echo "can not find language_data download now" +wget -P $sdir/data/ http://ojf1xbmzo.bkt.clouddn.com/ptb.valid.txt +fi + +if [ ! 
-e $sdir/data/ner_data.txt ]; then +echo "can not find language_data download now" +wget -P $sdir/data/ https://raw.githubusercontent.com/PaddlePaddle/models/develop/fluid/chinese_ner/data/test_files/test_part_1 +for n in $(seq 30); do cat $sdir/data/test_part_1 >> $sdir/data/ner_data.txt; done +rm $sdir/data/test_part_1 +fi + +if [ ! -e $sdir/data/ptb.valid_tokenlize.txt ]; then +python $sdir/read_ptb_data.py +fi + + diff --git a/benchmark/RNN/read_ptb_data.py b/benchmark/RNN/read_ptb_data.py new file mode 100644 index 000000000..f2c4ad92f --- /dev/null +++ b/benchmark/RNN/read_ptb_data.py @@ -0,0 +1,36 @@ +from Tokenizer import Tokenizer +# from keras.preprocessing.text import Tokenizer +import os +import sys +class PTB_Data_Reader(): + + def read(self): + # print('!',sys.argv[0]) + # print(os.path.dirname(__file__)+'/data/ptb.valid.txt') + file=open(os.path.dirname(__file__)+'/data/ptb.valid.txt') + lines=file.readlines() + tokenizer=Tokenizer(9999,oov_token=1) + tokenizer.fit_on_texts(lines) + self.seqs=tokenizer.texts_to_sequences(lines) + return self.seqs + + def save_to(self): + save_file=open(os.path.dirname(__file__)+'/data/ptb.valid_tokenlize.txt','w') + for line in self.seqs: + line_str=''.join(str(i)+' ' for i in line) + line_str=line_str[:-1] + save_file.write(line_str+'\n') + +class NER_Data_Reader(): + def read(self): + # print(os.path.dirname(__file__)+'/data/ptb.valid.txt') + file=open(os.path.dirname(__file__)+'/data/ner_data.txt') + self.seqs=[[[int(i) for i in line.split(';')[1].split(' ')],[int(i) for i in line.split(';')[3].split(' ')]] for line in file.readlines()] + + return self.seqs + +if __name__ == '__main__': + read=PTB_Data_Reader() + read.read() + read.save_to() + diff --git a/benchmark/RNN/sh_base/cpu_benchmark_base_one_socket.sh b/benchmark/RNN/sh_base/cpu_benchmark_base_one_socket.sh new file mode 100755 index 000000000..49beef068 --- /dev/null +++ b/benchmark/RNN/sh_base/cpu_benchmark_base_one_socket.sh @@ -0,0 +1,19 @@ 
+#!/bin/bash + +set -e +core_per_socker=`lscpu | grep "Core(s) per socket" | awk -F ':' '{print $2}' | sed 's/^ *\| *$//g'` +core_num=$core_per_socker + +echo $core_num +core_idx=$[$core_num-1] +echo $core_idx +core_range='0-'${core_idx} + +echo ${core_range} + +unset OMP_NUM_THREADS +export OMP_NUM_THREADS=${core_num} +unset MKL_NUM_THREADS +export MKL_NUM_THREADS=${core_num} + +taskset -c ${core_range} numactl -l $* \ No newline at end of file diff --git a/benchmark/RNN/sh_base/cpu_benchmark_base_some_thread.sh b/benchmark/RNN/sh_base/cpu_benchmark_base_some_thread.sh new file mode 100755 index 000000000..9b6d75910 --- /dev/null +++ b/benchmark/RNN/sh_base/cpu_benchmark_base_some_thread.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e +set -x +core_num=$1 +shift + + + +core_range='1-'$core_num + + +echo ${core_range} + +unset OMP_NUM_THREADS +export OMP_NUM_THREADS=${core_num} +unset MKL_NUM_THREADS +export MKL_NUM_THREADS=${core_num} + +#taskset -c ${core_range} numactl -l $* +taskset -c ${core_range} $* diff --git a/benchmark/RNN/tensorflow_c_benchmark/benchmark_tensorflow.sh b/benchmark/RNN/tensorflow_c_benchmark/benchmark_tensorflow.sh new file mode 100755 index 000000000..1631f9fd7 --- /dev/null +++ b/benchmark/RNN/tensorflow_c_benchmark/benchmark_tensorflow.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e +set -x + +sdir=$(cd `dirname $0`; pwd) + + +for i in {1,2,4,6};do +bazel run //tensorflow/cc:example_model /root/tf_mount/RNN/model/language_model_tf/all.pb /root/tf_mount/RNN/data/ptb.valid_tokenlize.txt $i +done + +for i in {1,2,4,6};do +bazel run //tensorflow/cc:example_model /root/tf_mount/RNN/model/text_classfi_model_tf/all.pb /root/tf_mount/RNN/data/ptb.valid_tokenlize.txt $i +done diff --git a/benchmark/RNN/tensorflow_c_benchmark/example_model.cc b/benchmark/RNN/tensorflow_c_benchmark/example_model.cc new file mode 100644 index 000000000..291f89e33 --- /dev/null +++ b/benchmark/RNN/tensorflow_c_benchmark/example_model.cc @@ -0,0 +1,295 @@ +#include 
"tensorflow/core/public/session.h" +#include "tensorflow/core/platform/env.h" +#include "vector" +#include +#include +#include "sys/time.h" +#define DEFINE_GLOBAL(type, var, value) \ + type (GLB_##var) = (value) +DEFINE_GLOBAL(int, run_threads, 1); +volatile DEFINE_GLOBAL(int, batch_size, 1); +volatile DEFINE_GLOBAL(int, max_word_len, 0); +volatile DEFINE_GLOBAL(int, word_count, 0); +DEFINE_GLOBAL(std::string, model_dir, ""); +DEFINE_GLOBAL(std::string, input_file, ""); +DEFINE_GLOBAL(std::string, split_word, "\t"); +DEFINE_GLOBAL(std::string, output_name, ""); +DEFINE_GLOBAL(std::string, run_mode, "instance"); +DEFINE_GLOBAL(int, split_index, 0); + +using namespace tensorflow; +int read_file(std::vector& results, const char* file_name) { + + std::ifstream infile(file_name); + + if (!infile.good()) { + std::cout << "Cannot open " << std::endl; + return false; + } + + LOG(INFO) << "found filename: " << file_name; + std::string line; + + while (std::getline(infile, line)) { + results.push_back((float)atof(line.c_str())); + } + + return 0; +} +void SplitString(const std::string& s, + std::vector& v, const std::string& c) { + std::string::size_type pos1, pos2; + pos2 = s.find(c); + pos1 = 0; + + while (std::string::npos != pos2) { + v.push_back(s.substr(pos1, pos2 - pos1)); + + pos1 = pos2 + c.size(); + pos2 = s.find(c, pos1); + } + + if (pos1 != s.length()) { + v.push_back(s.substr(pos1)); + } +} + +int split_word_from_file( + std::vector >& word_idx, + const std::string input_file_path, + const std::string split_token, + const std::string inner_split_token, + const int col_select) { + + std::ifstream infile(input_file_path.c_str()); + + if (!infile.good()) { + std::cout << "Cannot open " << std::endl; + return 1; + } + + LOG(INFO) << "found filename: " << input_file_path; + std::string line; + std::vector split_v; + std::vector split_w; + int word_count = 0; + + while (std::getline(infile, line)) { + split_v.clear(); + SplitString(line, split_v, split_token); + 
CHECK_GE(split_v.size(), col_select + 1) << " file need ; split"; + std::vector word; + std::vector mention; + split_w.clear(); + SplitString(split_v[col_select], split_w, inner_split_token); + + for (auto w : split_w) { + word.push_back(atof(w.c_str())); + word_count++; + // printf("%d,",atoi(w.c_str())); + } + + // printf("\n"); + // exit(0); + word_idx.push_back(word); + } + + GLB_word_count = word_count; + return 0; +} + +int get_batch_data_offset( + std::vector& out_data, + const std::vector >& seq_data, + std::vector& seq_offset, + const int start_idx, + const int batch_num) { + + seq_offset.clear(); + out_data.clear(); + seq_offset.push_back(0); + int len = 0; + + for (int i = 0; i < batch_num; ++i) { + for (auto d : seq_data[i + start_idx]) { + len += 1; + out_data.push_back(d); + // printf("%.0f, ",d); + } + + // printf("\n"); + seq_offset.push_back(len); + } + + return len; +} +std::vector > get_input_data() { + std::vector > word_idx; + + if (split_word_from_file(word_idx, GLB_input_file, GLB_split_word, " ", GLB_split_index)) { + LOG(ERROR) << " NOT FOUND " << GLB_input_file; + exit(-1); + } + + return word_idx; +}; +void sess_thread(std::vector* tensor_vec) { + SessionOptions opts; + opts.config.set_intra_op_parallelism_threads(1); + opts.config.set_inter_op_parallelism_threads(1); + opts.config.set_use_per_session_threads(true); + Session* session; + Status status = NewSession(opts, &session); + + if (!status.ok()) { + std::cerr << status.ToString() << std::endl; + return ; + } else { + std::cout << "Session created successfully" << std::endl; + } + + // Load the protobuf graph + GraphDef graph_def; + std::string graph_path = GLB_model_dir;//argv[1]; + status = ReadBinaryProto(Env::Default(), graph_path, &graph_def); + + if (!status.ok()) { + std::cerr << status.ToString() << std::endl; + return ; + } else { + std::cout << "Load graph protobuf successfully" << std::endl; + } + + // Add the graph to the session + status = session->Create(graph_def); + 
+ if (!status.ok()) { + std::cerr << status.ToString() << std::endl; + return ; + } else { + std::cout << "Add graph to session successfully" << std::endl; + } + + { + //warm up + std::vector> inputs = { + { "x_input", *(*tensor_vec)[0] }, + }; + std::vector outputs; + session->Run(inputs, {"Softmax"}, {}, &outputs); + + if (!status.ok()) { + std::cerr << status.ToString() << std::endl; + return ; + } else { + // std::cout << "Run session successfully i" << std::endl; + } + + + } + + std::cout << "thread ready to run " << std::endl; + struct timeval time_start, time_end; + + gettimeofday(&time_start, nullptr); + { + for (int i = 0; i < tensor_vec->size(); i++) { + std::vector> inputs = { + { "x_input", *(*tensor_vec)[i] }, + }; + std::vector outputs; + session->Run(inputs, {"Softmax"}, {}, &outputs); + + if (!status.ok()) { + std::cerr << status.ToString() << std::endl; + return ; + } else { + // std::cout << "Run session successfully i" << std::endl; + } + } + + + } + gettimeofday(&time_end, nullptr); + + float use_ms = (time_end.tv_sec - time_start.tv_sec) * 1000.f + (time_end.tv_usec - + time_start.tv_usec) / 1000.f; + std::cout << "thread summary : " << "usetime = " << use_ms << " ms," << "word_sum = " << + GLB_word_count << ",delay = " << (use_ms / tensor_vec->size()) << ", QPS = " << + (GLB_word_count / use_ms * 1000) << std::endl; + + session->Close(); +} +/** + * @brief deep model for click through rate prediction + * @details [long description] + * + * @param argv[1] graph protobuf + * + * @return [description] + */ +int main(int argc, char* argv[]) { + if (argc < 3) { + LOG(INFO) << "Example of Usage:\n \ + ./output/unit_test/model_test\n \ + anakin_models\n input file\n"; + exit(0); + } else if (argc >= 3) { + GLB_model_dir = std::string(argv[1]); + GLB_input_file = std::string(argv[2]); + } + + if (argc >= 4) { + GLB_run_threads = atoi(argv[3]); + } + + // Initialize a tensorflow session + + std::vector > word_idx; + word_idx = get_input_data(); + 
std::vector tensor_vec; + + for (int i = 0; i < word_idx.size(); i++) { + tensorflow::Tensor* t_tensor_p = new Tensor(DT_INT32, TensorShape({1, word_idx[i].size()})); + auto input_tensor_mapped = t_tensor_p->tensor(); + + for (int j = 0; j < word_idx[i].size(); j++) { + input_tensor_mapped(0, j) = word_idx[i][j]; + + } + + tensor_vec.push_back(t_tensor_p); + } + + std::cout << "get word success!" << std::endl; + std::cout << "first data = " << tensor_vec[0]->tensor()(0, 0) << std::endl; + // Setup inputs and outputs: + // Our graph doesn't require any inputs, since it specifies default values, + // but we'll change an input to demonstrate. + std::vector> threads; + int thread_num = GLB_run_threads; + + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back( + new std::thread(&sess_thread, &tensor_vec)); + } + + for (int i = 0; i < thread_num; ++i) { + threads[i]->join(); + } + + // Grab the first output (we only evaluated one graph node: "c") + // and convert the node to a scalar representation. 
+ //auto output_c = outputs[0].scalar(); + + // (There are similar methods for vectors and matrices here: + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/tensor.h) + + // Print the results + //std::cout << outputs[0].DebugString() << std::endl; // Tensor + //std::cout << "output value: " << output_c() << std::endl; // 30 + + // Free any resources used by the session + + return 0; +} diff --git a/benchmark/RNN/tensorflow_chinese_ner.py b/benchmark/RNN/tensorflow_chinese_ner.py new file mode 100644 index 000000000..e939290df --- /dev/null +++ b/benchmark/RNN/tensorflow_chinese_ner.py @@ -0,0 +1,167 @@ + +# coding: utf-8 + +# In[1]: + + +import tensorflow as tf +import numpy as np +import time +import timeit + +# In[2]: + +def language_run(data_set): + word_voc_size=1942562 + mention_voc_size=57 + word_hidden_size=32 + mention_hidden_size=20 + gru_hidden_size=36 + + fc1_hidden_size=49 + + + batch_size=1 + tf.device('/cpu:0') + + + # In[3]: + + + x_input = tf.placeholder( + tf.int32, [1,None], name="x_input") + x_input_len = tf.placeholder( + tf.int32, [None],name="x_input_len") + mention_input = tf.placeholder( + tf.int32, [1,None], name="mention_input") + + # In[4]: + + + embedding_table_word_r = tf.get_variable('emb_w_r', [word_voc_size, word_hidden_size], dtype=tf.float32) + embedding_out_r=tf.nn.embedding_lookup(embedding_table_word_r, x_input) + + embedding_table_mention_r = tf.get_variable('emb_m_r', [mention_voc_size, mention_hidden_size], dtype=tf.float32) + embedding_mention_out_r=tf.nn.embedding_lookup(embedding_table_mention_r, mention_input) + ## + embedding_table_word_l = tf.get_variable('emb_w_l', [word_voc_size, word_hidden_size], dtype=tf.float32) + embedding_out_l=tf.nn.embedding_lookup(embedding_table_word_l, x_input) + + embedding_table_mention_l = tf.get_variable('emb_m_l', [mention_voc_size, mention_hidden_size], dtype=tf.float32) + embedding_mention_out_l=tf.nn.embedding_lookup(embedding_table_mention_l, 
mention_input) + + emb_r=tf.concat([embedding_out_r,embedding_mention_out_r],axis=-1) + emb_l=tf.concat([embedding_out_l,embedding_mention_out_l],axis=-1) + # In[5]: + with tf.variable_scope('forward'): + gru_cell_r = tf.contrib.rnn.GRUCell(gru_hidden_size) + gru_init_state_r = gru_cell_r.zero_state(batch_size, dtype=tf.float32) + gru_out_r, _ = tf.nn.dynamic_rnn(gru_cell_r, emb_r, initial_state=gru_init_state_r) + + with tf.variable_scope('backward'): + gru_cell_l = tf.contrib.rnn.GRUCell(gru_hidden_size) + gru_init_state_l = gru_cell_l.zero_state(batch_size, dtype=tf.float32) + gru_out_l, _ = tf.nn.dynamic_rnn(gru_cell_l, emb_l, initial_state=gru_init_state_l) + + bi_gru_out=tf.concat([gru_out_l,gru_out_r],axis=-1) + + # In[6]: + + + fc_weights = tf.get_variable( + 'fc_weights', [ gru_hidden_size*2,fc1_hidden_size], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + fc_bias = tf.get_variable( + 'fc_bias', [fc1_hidden_size], + initializer=tf.truncated_normal_initializer( + stddev=0.0, dtype=tf.float32), + dtype=tf.float32) + bi_gru_out=tf.squeeze(bi_gru_out,[0]) + fc1_out=tf.matmul(bi_gru_out,fc_weights) + fc_bias + + + # In[7]: + crf_weights = tf.get_variable( + 'crf_weights', [ fc1_hidden_size,fc1_hidden_size], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + + fc1_out=tf.reshape(fc1_out,[batch_size,-1,fc1_hidden_size]) + crf_out,_=tf.contrib.crf.crf_decode(fc1_out,crf_weights,x_input_len) + + + + + + # In[8]: + + init = tf.global_variables_initializer() + sess = tf.Session() + sess.run(init) + + # In[9]: + + + def clock(func): + def clocked(*args): + t0 = timeit.default_timer() + result = func(*args) + elapsed = timeit.default_timer() - t0 + name = func.__name__ + arg_str = ', '.join(repr(arg) for arg in args) + print('[%0.8fs] %s(%s) -> %r' % (elapsed, name, 'arg_str', result)) + lines=len(args[0]) + counter=sum(len(line) for line in args[0]) + 
print('Delay = '+str(elapsed*1000/lines)+'ms') + return result + return clocked + + + # In[10]: + + + @clock + def benchmark(data_set): + for one_batch in data_set: + word_vec,mention_vec=one_batch[0],one_batch[1] + sess.run([crf_out],{x_input:np.array(word_vec).reshape(1,len(word_vec)),mention_input:np.array(mention_vec).reshape(1,len(mention_vec)),x_input_len:[len(word_vec)]}) + + # tf.train.write_graph(sess.graph.as_graph_def(), 'model/language_model_tf/', 'graph.pb', as_text=False) + # saver=tf.train.Saver() + # saver.save(sess, "model/chinese_ner_model_tf/") + # exit() + + benchmark(data_set) +if __name__=='__main__': + import getopt + import sys + proc_num=1 + try: + opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "process_num="]) + for key,arg in opts: + if key in ('-h','--help'): + print('usage --process_num=k ,default=1') + if key in ('--process_num'): + proc_num=int(arg) + print(opts) + except getopt.GetoptError: + pass + + from read_ptb_data import NER_Data_Reader + data_set=NER_Data_Reader().read() + word_sum=sum(len(i[0]) for i in data_set) + from multiprocessing import Process + threads=[] + t0 = timeit.default_timer() + for i in range(proc_num): + t =Process(target=language_run,args=(data_set,)) + t.start() + threads.append(t) + + for t in threads: + t.join() + elapsed = timeit.default_timer() - t0 + print(__file__,'process = ',proc_num,',QPS = ',len(data_set)/elapsed*proc_num,' line / second ,',word_sum/elapsed*proc_num,'words/second') \ No newline at end of file diff --git a/benchmark/RNN/tensorflow_language_model.py b/benchmark/RNN/tensorflow_language_model.py new file mode 100644 index 000000000..31b4997fb --- /dev/null +++ b/benchmark/RNN/tensorflow_language_model.py @@ -0,0 +1,136 @@ + +# coding: utf-8 + +# In[1]: + + +import tensorflow as tf +import numpy as np +import time +import timeit + +# In[2]: + +def language_run(data_set): + voc_size=10001 + hidden_size=200 + batch_size=1 + tf.device('/cpu:0') + + + # In[3]: + + + x_input = 
tf.placeholder( + tf.int32, [1,None], name="x_input") + # x_input_len = tf.placeholder( + # tf.int32, name="x_input_len") + + + # In[4]: + + + embedding_table = tf.get_variable('emb', [voc_size, hidden_size], dtype=tf.float32) + embedding_out=tf.nn.embedding_lookup(embedding_table, x_input) + + + # In[5]: + + + gru_cell = tf.contrib.rnn.GRUCell(hidden_size) + gru_init_state=gru_cell.zero_state(batch_size, dtype=tf.float32) + gru_out,_=tf.nn.dynamic_rnn(gru_cell,embedding_out,initial_state=gru_init_state) + + + # In[6]: + + + fc_weights = tf.get_variable( + 'fc_weights', [ hidden_size,voc_size], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + fc_bias = tf.get_variable( + 'fc_bias', [voc_size], + initializer=tf.truncated_normal_initializer( + stddev=0.0, dtype=tf.float32), + dtype=tf.float32) + gru_out=tf.squeeze(gru_out,[0]) + fc_out=tf.matmul(gru_out,fc_weights) + fc_bias + + + # In[7]: + + + softmax=tf.nn.softmax(fc_out) + + + # In[8]: + + init = tf.global_variables_initializer() + sess = tf.Session() + sess.run(init) + + # In[9]: + + + def clock(func): + def clocked(*args): + t0 = timeit.default_timer() + result = func(*args) + elapsed = timeit.default_timer() - t0 + name = func.__name__ + arg_str = ', '.join(repr(arg) for arg in args) + print('[%0.8fs] %s(%s) -> %r' % (elapsed, name, 'arg_str', result)) + lines=len(args[0]) + counter=sum(len(line) for line in args[0]) + print('Delay = '+str(elapsed*1000/lines)+'ms') + return result + return clocked + + + # In[10]: + + + @clock + def benchmark(data_set): + for one_batch in data_set: + sess.run([softmax],{x_input:np.array(one_batch).reshape(1,len(one_batch))}) + + # tf.train.write_graph(sess.graph.as_graph_def(), 'model/language_model_tf/', 'graph.pb', as_text=False) + # saver=tf.train.Saver() + # saver.save(sess, "model/language_model_tf/model.cpkt") + # exit() + + + benchmark(data_set) +if __name__=='__main__': + import getopt + import sys + proc_num=1 + 
try: + opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "process_num="]) + for key,arg in opts: + if key in ('-h','--help'): + print('usage --process_num=k ,default=1') + if key in ('--process_num'): + proc_num=int(arg) + print(opts) + except getopt.GetoptError: + pass + + from read_ptb_data import PTB_Data_Reader + data_set=PTB_Data_Reader().read() + word_sum=sum(len(i) for i in data_set) + from multiprocessing import Process + threads=[] + t0 = timeit.default_timer() + for i in range(proc_num): + t =Process(target=language_run,args=(data_set,)) + t.start() + threads.append(t) + + for t in threads: + t.join() + elapsed = timeit.default_timer() - t0 + print(__file__,'process = ',proc_num,',QPS = ',len(data_set)/elapsed*proc_num,' line / second ,',word_sum/elapsed*proc_num,'words/second') \ No newline at end of file diff --git a/benchmark/RNN/tensorflow_text_classfication.py b/benchmark/RNN/tensorflow_text_classfication.py new file mode 100644 index 000000000..c690085e0 --- /dev/null +++ b/benchmark/RNN/tensorflow_text_classfication.py @@ -0,0 +1,149 @@ + +# coding: utf-8 + +# In[1]: + + +import tensorflow as tf +import numpy as np +import time +import timeit + +# In[2]: + +def language_run(data_set): + voc_size=566227 + hidden_size=128 + hidden_size_after_lstm=96 + hidden_size_after_fc=2 + batch_size=1 + tf.device('/cpu:0') + + + # In[3]: + + + x_input = tf.placeholder( + tf.int32, [1,None], name="x_input") + + + # In[4]: + + + embedding_table = tf.get_variable('emb', [voc_size, hidden_size], dtype=tf.float32) + embedding_out=tf.nn.embedding_lookup(embedding_table, x_input) + + + # In[5]: + + + lstm_cell = tf.contrib.rnn.LSTMCell(hidden_size) + # lstm_init_state=lstm_cell.zero_state(batch_size, dtype=tf.float32) + # lstm_out,_=tf.nn.dynamic_rnn(lstm_cell,embedding_out,initial_state=lstm_init_state) + (output_fw, output_bw), _=tf.nn.bidirectional_dynamic_rnn(lstm_cell, + lstm_cell, embedding_out, + dtype=tf.float32) + + bi_lstm_out = tf.concat([output_fw, 
output_bw], axis=-1) + + # In[6]: + + + fc_weights = tf.get_variable( + 'fc_weights', [ hidden_size*2,hidden_size_after_lstm], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + fc_bias = tf.get_variable( + 'fc_bias', [hidden_size_after_lstm], + initializer=tf.truncated_normal_initializer( + stddev=0.0, dtype=tf.float32), + dtype=tf.float32) + bi_lstm_out=tf.squeeze(bi_lstm_out,[0]) + fc1_out=tf.tanh(tf.matmul(bi_lstm_out,fc_weights) + fc_bias) + + # In[7]: + fc2_weights = tf.get_variable( + 'fc2_weights', [ hidden_size_after_lstm,hidden_size_after_fc], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) + fc2_bias = tf.get_variable( + 'fc2_bias', [hidden_size_after_fc], + initializer=tf.truncated_normal_initializer( + stddev=0.0, dtype=tf.float32), + dtype=tf.float32) + fc2_out=tf.matmul(fc1_out,fc2_weights) + fc2_bias + + softmax=tf.nn.softmax(fc2_out) + + + # In[8]: + + init = tf.global_variables_initializer() + sess = tf.Session() + sess.run(init) + + # In[9]: + + + def clock(func): + def clocked(*args): + t0 = timeit.default_timer() + result = func(*args) + elapsed = timeit.default_timer() - t0 + name = func.__name__ + arg_str = ', '.join(repr(arg) for arg in args) + print('[%0.8fs] %s(%s) -> %r' % (elapsed, name, 'arg_str', result)) + lines=len(args[0]) + counter=sum(len(line) for line in args[0]) + print('Delay = '+str(elapsed*1000/lines)+'ms') + return result + return clocked + + + # In[10]: + + + @clock + def benchmark(data_set): + for one_batch in data_set: + sess.run([softmax],{x_input:np.array(one_batch).reshape(1,len(one_batch))}) + + # tf.train.write_graph(sess.graph.as_graph_def(), 'model/text_classfi_model_tf/', 'graph.pb', as_text=False) + # saver=tf.train.Saver() + # saver.save(sess, "model/text_classfi_model_tf/model.cpkt") + # exit() + + + benchmark(data_set) +if __name__=='__main__': + import getopt + import sys + proc_num=1 + try: + opts, args 
= getopt.getopt(sys.argv[1:], "ho:", ["help", "process_num="]) + for key,arg in opts: + if key in ('-h','--help'): + print('usage --process_num=k ,default=1') + if key in ('--process_num'): + proc_num=int(arg) + print(opts) + except getopt.GetoptError: + pass + + from read_ptb_data import PTB_Data_Reader + data_set=PTB_Data_Reader().read() + word_sum=sum(len(i) for i in data_set) + from multiprocessing import Process + threads=[] + t0 = timeit.default_timer() + for i in range(proc_num): + t =Process(target=language_run,args=(data_set,)) + t.start() + threads.append(t) + + for t in threads: + t.join() + elapsed = timeit.default_timer() - t0 + print(__file__,'process = ',proc_num,',QPS = ',len(data_set)/elapsed*proc_num,' line / second ,',word_sum/elapsed*proc_num,'words/second') \ No newline at end of file diff --git a/benchmark/arm_benchmark.md b/benchmark/arm_benchmark.md new file mode 100644 index 000000000..3ab4feb48 --- /dev/null +++ b/benchmark/arm_benchmark.md @@ -0,0 +1,57 @@ +# 测试环境和参数: ++ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd ++ 采用android ndk交叉编译,gcc 4.9,enable neon, ABI: armveabi-v7a with neon -mfloat-abi=softfp ++ 测试平台 + - 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz + - nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz + - 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz ++ 多线程:openmp ++ 时间:warmup10次,运行10次取均值 ++ ncnn版本:来源于github的master branch中commits ID:307a77f04be29875f40d337cfff6df747df09de6(msg:convert LogisticRegressionOutput)版本 ++ TFlite版本:来源于github的master branch中commits ID:65c05bc2ac19f51f7027e66350bc71652662125c(msg:Removed unneeded file copy that was causing failure in Pi builds)版本 + +## Anakin + +在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析 + +## BenchMark model + +> 注意在性能测试之前,请先将测试model通过[External Converter](#10003)转换为Anakin model +> 对这些model,本文在ARM上进行多线程的单batch size测试。 + +- [Mobilenet v1](#11) *caffe model 
可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [Mobilenet v2](#22) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [mobilenet-ssd](#33) *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载*
+
+### mobilenetv1
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan|
+ |高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan|
+ |高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan|
+
+### mobilenetv2
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan|
+ |高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan|
+ |高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan|
+
+### mobilenet-ssd
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan|
+ |高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan|
+ |高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan|
+
+## How to run those Benchmark models?
+
+1. 首先, 使用[External Converter](../docs/Manual/Converter_en.md)对caffe model 进行转换
+2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机
+3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令
+4. 最后,终端显示器上将会打印该模型的运行时间
+5. 
其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到 + diff --git a/cmake/amd.cmake b/cmake/amd.cmake new file mode 100644 index 000000000..1ebc7bf56 --- /dev/null +++ b/cmake/amd.cmake @@ -0,0 +1,53 @@ +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################################ +macro(amd_set_opencl_path) + if(NOT DEFINED OpenCL_INCLUDE_DIR) + set(OpenCL_INCLUDE_DIR "/opt/rocm/opencl/include") + endif() + if(NOT DEFINED OpenCL_LIBRARY) + set(OpenCL_LIBRARY "/opt/rocm/opencl/lib/x86_64/libOpenCL.so") + endif() + + #FIND_PACKAGE(OpenCL REQUIRED) + #if(OpenCL_FOUND) + # message(STATUS "Found OpenCL in ${OpenCL_INCLUDE_DIRS}") + # message(STATUS "Found OpenCL lib in ${OpenCL_LIBRARIES}") + # include_directories(${OpenCL_INCLUDE_DIRS}) + # LINK_LIBRARIES(${OpenCL_LIBRARIES}) + #endif() +endmacro() + +macro(amd_build_cl_file file_path dest_path) + FILE(GLOB CL_FILES ${file_path}/*.cl) + message(STATUS "found cl files: ${CL_FILES}") + foreach(src_file ${CL_FILES}) + get_filename_component(src_file_name ${src_file} NAME) + message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}") + configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY) + endforeach() +endmacro() + + +macro(amd_build_cl_binary_file file_path dest_path) + FILE(GLOB CL_FILES ${file_path}/*.so) + message(STATUS "found cl files: ${CL_FILES}") + foreach(src_file ${CL_FILES}) + 
get_filename_component(src_file_name ${src_file} NAME) + message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}") + configure_file( ${absdir}/${src_file} ${dest_path}/${src_file_name} COPYONLY) + endforeach() +endmacro() + diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake index 49d133c7f..ef4a0dbcf 100644 --- a/cmake/compiler_options.cmake +++ b/cmake/compiler_options.cmake @@ -1,10 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file compiler_options.cmake -# @auther cuichaowen -# @date 2017-3-2 -# ---------------------------------------------------------------------------- - +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ---------------------------------------------------------------------------- # section: set the compiler and linker options @@ -15,7 +21,6 @@ set(ANAKIN_NVCC_FLAG "") anakin_add_compile_option(-std=c++11) anakin_add_compile_option(-fPIC) anakin_add_compile_option(-ldl) -anakin_add_compile_option(-mavx2) if(NOT USE_ARM_PLACE) anakin_add_compile_option(-lrt) endif() @@ -34,6 +39,9 @@ anakin_add_compile_option(-Wshadow) anakin_add_compile_option(-fpermissive) anakin_add_compile_option(-Wsign-promo) anakin_add_compile_option(-fdiagnostics-show-option) +if(USE_BM_PLACE) + anakin_add_compile_option(-lbmlib-asic) +endif() if(ENABLE_NOISY_WARNINGS) anakin_add_compile_option(-Wcast-align) @@ -47,8 +55,10 @@ else() anakin_add_compile_option(-Wno-delete-non-virtual-dtor) anakin_add_compile_option(-Wno-comment) anakin_add_compile_option(-Wno-sign-compare) - anakin_add_compile_option(-Wno-ignored-qualifiers) - anakin_add_compile_option(-Wno-enum-compare) + anakin_add_compile_option(-Wno-write-strings) + anakin_add_compile_option(-Wno-ignored-qualifiers) + anakin_add_compile_option(-Wno-enum-compare) + anakin_add_compile_option(-Wno-missing-field-initializers) endif() if(CMAKE_BUILD_TYPE MATCHES Debug) @@ -57,6 +67,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) anakin_add_compile_option(-gdwarf-2) # for old version gcc and gdb. 
see: http://stackoverflow.com/a/15051109/673852 else() anakin_add_compile_option(-O3) +# anakin_add_compile_option(-g) anakin_add_compile_option(-DNDEBUG) endif() @@ -74,6 +85,10 @@ if(TARGET_IOS) endif() if(USE_X86_PLACE) +# anakin_add_compile_option(-mavx2) +# anakin_add_compile_option(-fopenmp) + anakin_add_compile_option(-fabi-version=6) + anakin_add_compile_option(-march=native) anakin_add_compile_option(-Ofast) anakin_add_compile_option(-ffast-math) anakin_add_compile_option(-Wall) @@ -101,6 +116,7 @@ if(USE_CUDA) anakin_add_compile_option(-G NVCC) anakin_add_compile_option(-g NVCC) anakin_add_compile_option(-std=c++11 NVCC) + anakin_add_compile_option("--default-stream per-thread" NVCC) anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1) else() anakin_add_compile_option("-Xcompiler -fPIC" NVCC) @@ -112,21 +128,3 @@ if(USE_CUDA) # set default nvidia gpu arch set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1") endif() - -if(USE_BM) - if(CMAKE_BUILD_TYPE MATCHES Debug) - anakin_add_compile_option("-Xcompiler -fPIC" NVCC) - anakin_add_compile_option(-G NVCC) - anakin_add_compile_option(-g NVCC) - anakin_add_compile_option(-std=c++11 NVCC) - anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1) - else() - anakin_add_compile_option("-Xcompiler -fPIC" NVCC) - anakin_add_compile_option(-O3 NVCC) - anakin_add_compile_option(-std=c++11 NVCC) - anakin_add_compile_option("--default-stream per-thread" NVCC) - anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) - endif() - # set default nvidia gpu arch - set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1") -endif() diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in index 0a8560593..860e77f58 100644 --- a/cmake/config/anakin_config.h.in +++ b/cmake/config/anakin_config.h.in @@ -1,16 +1,17 @@ -/********************************************************** - * Copyright (c) 2016 
Baidu.com, Inc. All Rights Reserved - * - * @file anakin_config.h.in - * @brief file ak_config.h is autogenerated from config.h.in - * during the cmake configuration of anakin. - * - * @auther cuichaowen - * @version ANAKIN V @VERSION@ - * @date 2017-10-23 - * - **********************************************************/ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ #ifndef _ANAKIN_CONFIGURATION_HEADER_GUARD_H_ #define _ANAKIN_CONFIGURATION_HEADER_GUARD_H_ @@ -26,7 +27,6 @@ // build options #cmakedefine ENABLE_DEBUG -// boost #cmakedefine USE_BOOST #cmakedefine USE_CUBLAS @@ -35,8 +35,6 @@ #cmakedefine USE_CUDA -#cmakedefine USE_BM - #cmakedefine USE_CUDNN #cmakedefine USE_PYTHON @@ -49,8 +47,11 @@ #cmakedefine USE_OPENMP +#cmakedefine USE_LOGGER + #cmakedefine USE_GFLAGS + // plantform to use #cmakedefine USE_GPU_PLACE @@ -58,7 +59,9 @@ #cmakedefine USE_ARM_PLACE -#cmakedefine TARGET_ANDRIOD +#cmakedefine USE_BM_PLACE + +#cmakedefine TARGET_ANDROID #cmakedefine TARGET_IOS @@ -66,6 +69,15 @@ #cmakedefine NVIDIA_GPU +#cmakedefine AMD_GPU + +#cmakedefine ENABLE_STACKTRACES + +#cmakedefine SUPPORT_PTHREADS + +// build AOT lite for device +#cmakedefine BUILD_WITH_LITE + #if defined(ANDROID) || defined(__ANDROID__) #define PLATFORM_ANDROID diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index da5540a91..4e2cd4815 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -1,10 +1,16 @@ -# 
---------------------------------------------------------------------------- -# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved -# @file cuda.cmake -# @auther cuichaowen -# @date 2017-10-23 -# ---------------------------------------------------------------------------- - +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # ---------------------------------------------------------------------------- # section: Set nvcc arch info. # ---------------------------------------------------------------------------- @@ -137,6 +143,9 @@ macro(anakin_find_cuda) if(USE_CURAND) list(APPEND ANAKIN_LINKER_LIBS ${CUDA_curand_LIBRARY}) endif() + if(BUILD_RPC) + list(APPEND ANAKIN_LINKER_LIBS ${CUDA_INCLUDE_DIRS}/../lib64/stubs/libnvidia-ml.so) + endif() list(APPEND ANAKIN_LINKER_LIBS ${CUDA_CUDART_LIBRARY}) else() message(FATAL_ERROR "Cuda SHARED lib Could not found !") diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index de9b90531..b8a1358bb 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -14,6 +14,11 @@ # limitations under the License. 
#=============================================================================== +anakin_find_mklml() +if(MKLML_FOUND) + return() +endif() + # download mklml package is only for iomp so far include(ExternalProject) @@ -59,5 +64,12 @@ list(APPEND ANAKIN_SABER_DEPENDENCIES mklml) list(APPEND ANAKIN_LINKER_LIBS ${MKLML_LIB};${MKLML_IOMP_LIB}) +#set(OPENMP_FLAGS "-fopenmp") +##set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) +#set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) +#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") + + # iomp5 must be installed -install(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) +#install(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake index 796b33c94..8d1bd276b 100644 --- a/cmake/find_modules.cmake +++ b/cmake/find_modules.cmake @@ -1,9 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved -# @file find_modules.cmake -# @auther cuichaowen -# @date 2016-11-9 -# ---------------------------------------------------------------------------- +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#anakin cmake module set(CMAKE_MODULE_PATH "${ANAKIN_ROOT}/cmake") @@ -11,12 +18,16 @@ set(CMAKE_MODULE_PATH "${ANAKIN_ROOT}/cmake") set(ANAKIN_LINKER_LIBS "") if(UNIX) - find_library(RTLIB rt) - if(RTLIB) - list(APPEND ANAKIN_LINKER_LIBS ${RTLIB}) - else() - message(SEND_ERROR "Could not found -lrt !") - endif() + if(USE_ARM_PLACE ) + elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + else() + find_library(RTLIB rt) + if(RTLIB) + list(APPEND ANAKIN_LINKER_LIBS ${RTLIB}) + else() + message(SEND_ERROR "Could not found -lrt !") + endif() + endif() find_library(DLLIB dl) if(DLLIB) @@ -28,30 +39,38 @@ endif() #find opencv version >= 2.4.3 macro(anakin_find_opencv) - if(BUILD_SHARED OR TRUE) # temporary not support static link opencv. - #set(CMAKE_FIND_ROOT_PATH ${ANAKIN_ROOT}/third-party/opencv243/lib) - find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) - if(NOT OpenCV_FOUND) - find_package(OpenCV QUIET COMPONENTS core highgui imgproc) - endif() - if(OpenCV_FOUND) - message(STATUS "Found opencv: ${OpenCV_INCLUDE_DIRS}") - include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) - list(APPEND ANAKIN_LINKER_LIBS ${OpenCV_LIBS}) - else() - message(SEND_ERROR "Could not found opencv !") - endif() - else() # BUILD_STATIC - list(APPEND OPENCV_STATIC_LIBS libopencv_core.a - libopencv_highgui.a - libopencv_imgproc.a - libopencv_contrib.a) - foreach(CV_LIB ${OPENCV_STATIC_LIBS}) - set(__CV_LIB_FULL_PATH "${ANAKIN_ROOT}/third-party/opencv243/lib/${CV_LIB}") - #message(STATUS ${__CV_LIB_FULL_PATH}) - list(APPEND ANAKIN_LINKER_LIBS ${__CV_LIB_FULL_PATH}) - endforeach() - unset(__CV_LIB_FULL_PATH) + + if(USE_ARM_PLACE AND TARGET_ANDROID) + include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) + LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) + + else() + + if(BUILD_SHARED) # temporary not support static link opencv. 
+ find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) + if(NOT OpenCV_FOUND) + find_package(OpenCV QUIET COMPONENTS core highgui imgproc) + endif() + if(OpenCV_FOUND) + message(STATUS "Found opencv: ${OpenCV_INCLUDE_DIRS}") + include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) + list(APPEND ANAKIN_LINKER_LIBS ${OpenCV_LIBS}) + + else() + message(SEND_ERROR "Could not found opencv !") + endif() + else() # BUILD_STATIC + set(OPENCV_LIB_PATH "" CACHE "Path to oopen cv library") + list(APPEND OPENCV_STATIC_LIBS ${OPENCV_LIB_PATH}/libopencv_core.a + ${OPENCV_LIB_PATH}libopencv_highgui.a + ${OPENCV_LIB_PATH}libopencv_imgproc.a + ${OPENCV_LIB_PATH}libopencv_contrib.a) + foreach(CV_LIB ${OPENCV_STATIC_LIBS}) + list(APPEND ANAKIN_LINKER_LIBS ${CV_LIB}) + endforeach() + unset(__CV_LIB_FULL_PATH) + endif() + endif() endmacro() @@ -60,8 +79,8 @@ macro(anakin_find_opencl) set(OCL_ROOT "" CACHE PATH "openCL root dir.") find_path(OCL_INCLUDE_DIR NAMES CL/cl.h PATHS ${OCL_ROOT}/include $ENV{OCL_ROOT}/include) + find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS ${OCL_ROOT} ${OCL_ROOT}/lib/x86_64 $ENV{OCL_ROOT}/lib $ENV{OCL_ROOT}/lib/x86_64) - find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS ${OCL_ROOT}) if(OCL_INCLUDE_DIR AND OCL_LIBRARIES) set(OCL_FOUND TRUE) message(STATUS "Found opencl: ${OCL_INCLUDE_DIR}") @@ -259,32 +278,100 @@ macro(anakin_find_mklml) list(APPEND MKLML_LIBRARIES ${MKLML_ROOT}/lib/libiomp5.so) list(APPEND MKLML_LIBRARIES ${MKLML_ROOT}/lib/libmklml_intel.so) list(APPEND ANAKIN_LINKER_LIBS ${MKLML_LIBRARIES}) - else() - message(FATAL_ERROR "NOT FOUND MKLML") +# else() +# message(FATAL_ERROR "NOT FOUND MKLML") endif() endmacro() macro(anakin_find_protobuf) - list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES}) - find_package(Protobuf REQUIRED) - if(PROTOBUF_FOUND) - message(STATUS "Found protobuf in ${PROTOBUF_INCLUDE_DIR}") - include_directories(${PROTOBUF_INCLUDE_DIR}) - list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES}) - endif() + 
if(USE_ARM_PLACE) + set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") + include_directories(${ARM_RPOTO_ROOT}/include) + set(PROTOBUF_LIBRARIES "") + #if(BUILD_SHARED) + # list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/lib/libprotobuf.so) + #else() + list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/lib/libprotobuf.a) + #endif() + find_library( # Sets the name of the path variable. + log-lib + + # Specifies the name of the NDK library that + # you want CMake to locate. + log ) + list(APPEND ANAKIN_LINKER_LIBS ${log-lib}) + else() + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc) + if(PROTOBUF_PROTOC_EXECUTABLE) + find_package(Protobuf REQUIRED) + message(STATUS "Found protobuf in ${PROTOBUF_INCLUDE_DIR}") + include_directories(${PROTOBUF_INCLUDE_DIR}) + list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES}) + else() + set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") + if (NOT "${PROTOBUF_ROOT}" STREQUAL "") + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) + if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) + message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") + set(PROTOBUF_LIBRARIES ${PROTOBUF_LIBRARY} ${PROTOBUF_LITE_LIBRARY} ${PROTOBUF_PROTOC_LIBRARY}) + list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES}) + include_directories(${PROTOBUF_INCLUDE_DIR}) + else() + message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.") + endif() + endif() + endif() + endif() endmacro() +macro(anakin_find_baidu_rpc) + 
set(BAIDU_RPC_ROOT "/usr/local/" CACHE PATH "baidu rpc root dir") + find_path(RPC_INCLUDE_DIR server.h PATHS ${BAIDU_RPC_ROOT}/include/brpc/ $ENV{BAIDU_RPC_ROOT}/include/brpc/) + find_library(RPC_LIBRARY NAMES libbrpc.so + PATHS ${BAIDU_RPC_ROOT}/lib $ENV{BAIDU_RPC_ROOT}/include/brpc/ + DOC "library path for baidu rpc.") + if(RPC_INCLUDE_DIR AND RPC_LIBRARY) + include_directories(${BAIDU_RPC_ROOT}/include) + list(APPEND ANAKIN_LINKER_LIBS ${RPC_LIBRARY}) + else() + message(SEND_ERROR "Could not found baidu-rpc !") + endif() +endmacro() macro(anakin_find_openmp) find_package(OpenMP REQUIRED) if(OPENMP_FOUND OR OpenMP_CXX_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") message(STATUS "Found openmp in ${OPENMP_INCLUDE_DIR}") - message(STATUS " |-- openmp c flags: ${OpenMP_C_FLAGS}") - message(STATUS " |-- openmp cxx flags: ${OpenMP_CXX_FLAGS}") - message(STATUS " `-- openmp link flags: ${OpenMP_EXE_LINKER_FLAGS}") - include_directories(${OPENMP_INCLUDE_DIR}) - list(APPEND ANAKIN_LINKER_LIBS ${OPENMP_LIBRARIES}) + message(STATUS " |--openmp cflags: ${OpenMP_C_FLAGS}") + message(STATUS " |--openmp cxxflags: ${OpenMP_CXX_FLAGS}") + message(STATUS " |--openmp cflags: ${OpenMP_EXE_LINKER_FLAGS}") else() message(FATAL_ERROR "Could not found openmp !") endif() endmacro() + +macro(anakin_find_bmlib) + find_path(BM_ROOT include/bmlib/bmlib_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/) + if(BM_ROOT) + set(BM_FOUND TRUE) + endif() + if(BM_FOUND) + message(STATUS " Found bm_lib in ${BM_ROOT}") + anakin_fetch_include_recursively(${BM_ROOT}/include) + set(BM_LIBRARIES "") +# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/bmlib.a) +# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/cmodel.a) +# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/common.a) +# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/fwcore.a) + list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmlib-asic.so) + list(APPEND 
ANAKIN_LINKER_LIBS ${BM_LIBRARIES}) + else() + message(FATAL_ERROR "Could not found bm_lib") + endif() +endmacro() diff --git a/cmake/gather.cmake b/cmake/gather.cmake index 5017efff7..e6aafc9f3 100644 --- a/cmake/gather.cmake +++ b/cmake/gather.cmake @@ -1,9 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved -# @file gather_libs.cmake -# @auther cuichaowen -# @date 2017-10-24 -# ---------------------------------------------------------------------------- +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# find cudnn default cudnn 5 if(USE_CUDNN) @@ -17,16 +24,22 @@ if(USE_CUDA) anakin_find_cuda() endif() -if(USE_BM) - #set other cuda path - #set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_PATH}) - #anakin_find_cuda() +if(USE_BM_PLACE) + anakin_find_bmlib() endif() +# set amd opencl path +if(AMD_GPU) + amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${CMAKE_BINARY_DIR}/cl/amd") + amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${CMAKE_BINARY_DIR}/cl/amd") + amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${PROJECT_SOURCE_DIR}/output/unit_test") + amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${PROJECT_SOURCE_DIR}/output/unit_test") + amd_build_cl_file("${CMAKE_SOURCE_DIR}/test/saber/amd" "${PROJECT_SOURCE_DIR}/output/unit_test") +endif() # find opencl if(USE_OPENCL) - anakin_generate_kernel(${ANAKIN_ROOT}) + #anakin_generate_kernel(${ANAKIN_ROOT}) anakin_find_opencl() endif() @@ -49,6 +62,10 @@ if(USE_PROTOBUF) anakin_protos_processing() endif() +if(BUILD_RPC) + anakin_find_baidu_rpc() +endif() + if (USE_GFLAGS) anakin_find_gflags() endif() @@ -71,9 +88,11 @@ endif() if(DISABLE_ALL_WARNINGS) anakin_disable_warnings(CMAKE_CXX_FLAGS) endif() - +if(USE_OPENMP) + anakin_find_openmp() +endif() if(USE_ARM_PLACE) - if(TARGET_ANDRIOD) + if(TARGET_ANDROID) if(USE_OPENMP) anakin_find_openmp() endif() diff --git a/cmake/ios/iosxc.toolchain.cmake b/cmake/ios/iosxc.toolchain.cmake new file mode 100644 index 000000000..bcfd76937 --- /dev/null +++ b/cmake/ios/iosxc.toolchain.cmake @@ -0,0 +1,39 @@ +# Standard settings +# set(UNIX True) +# set(Darwin True) +# set(IOS True) +set (CMAKE_SYSTEM_NAME Darwin) +set (CMAKE_SYSTEM_VERSION 1) +set (UNIX True) +set (APPLE True) +set (IOS True) + +# suppress -rdynamic +# set(CMAKE_SYSTEM_NAME Generic) + +set(CMAKE_C_COMPILER arm-apple-darwin11-clang) +set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++) + +set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-) + 
+set(CMAKE_IOS_SDK_ROOT ${IOS_SDK_PATH}) + +# Set the sysroot default to the most recent SDK +set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") + +# set the architecture for iOS +# set(IOS_ARCH arm64) +set(IOS_ARCH armv7;arm64) + +set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") + +# Set the find root to the iOS developer roots and to user defined paths +set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root") + +# searching for frameworks only +set(CMAKE_FIND_FRAMEWORK FIRST) + +# set up the default search directories for frameworks +set(CMAKE_SYSTEM_FRAMEWORK_PATH + ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks +) diff --git a/cmake/msg_color.cmake b/cmake/msg_color.cmake index 3bf6da6b9..18fc4cf5e 100644 --- a/cmake/msg_color.cmake +++ b/cmake/msg_color.cmake @@ -1,9 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file msg_color.cmake -# @auther cuichaowen -# @date 2016-11-8 -# ---------------------------------------------------------------------------- +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # ---------------------------------------------------------------------------- # section: help to get colorful cmake message. 
diff --git a/cmake/statistic.cmake b/cmake/statistic.cmake index 65a9f7964..86138122a 100644 --- a/cmake/statistic.cmake +++ b/cmake/statistic.cmake @@ -1,10 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file statistic.cmake -# @auther cuichaowen -# @date 2017-4-20 -# ---------------------------------------------------------------------------- - +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # ---------------------------------------------------------------------------- # section: prints the statistic of configuration of anakin. @@ -113,7 +119,7 @@ function(anakin_print_statistic) elseif(USE_ARM_PLACE) message(STATUS " USE_ARM_PLACE : ${USE_ARM_PLACE}") if(TARGET_ANDROID) - message(STATUS " `--Target Andriod : ${TARGET_ANDROID}") + message(STATUS " `--Target Android : ${TARGET_ANDROID}") else() message(STATUS " `--Target IOS : ${TARGET_IOS}") endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 5daa82913..1804343d7 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -1,9 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. 
All Rights Reserved -# @file utils.cmake -# @auther cuichaowen -# @date 2016-11-8 -# ---------------------------------------------------------------------------- +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # ---------------------------------------------------------------------------- # section: help to search src and include files @@ -24,9 +31,9 @@ function(anakin_fetch_files_with_suffix search_dir suffix outputs) endforeach() set(${outputs} ${${outputs}} ${abs_dir} PARENT_SCOPE) else() - #message(WARNING "anakin_fetch_files_recursively ${BoldRed}failed${ColourReset}:\n" - # "real_dir:${BoldYellow}${search_dir}${ColourReset}\n" - # "suffix:*.${BoldYellow}${suffix}${ColourReset} \n") + #message(WARNING "anakin_fetch_files_recursively ${BoldRed}failed${ColourReset}:\n" + # "real_dir:${BoldYellow}${search_dir}${ColourReset}\n" + # "suffix:*.${BoldYellow}${suffix}${ColourReset} \n") endif() endfunction() @@ -39,7 +46,7 @@ endfunction() # recursively fetch include dir function(anakin_fetch_include_recursively root_dir) if (IS_DIRECTORY ${root_dir}) - #message(STATUS "include dir: " ${Magenta}${root_dir}${ColourReset}) + #message(STATUS "include dir: " ${Magenta}${root_dir}${ColourReset}) include_directories(${root_dir}) endif() @@ -51,6 +58,14 @@ function(anakin_fetch_include_recursively root_dir) endforeach() endfunction() +# judge fetch files +function(anakin_judge_avx outputs) + exec_program(cat 
/proc/cpuinfo|greps flag|uniq + OUTPUT_VARIABLE OUTPUT + RETURN_VALUE VALUE) + message("it is anakin_judge_avx " OUTPUT) + set(${outputs} ${${outputs}} PARENT_SCOPE) +endfunction() # ---------------------------------------------------------------------------- # section: help to detect the compiler options # ---------------------------------------------------------------------------- @@ -129,19 +144,19 @@ macro(anakin_check_compiler_flag LANG FLAG RESULT) endmacro() macro(anakin_check_flag_support lang flag varname) - if("_${lang}_" MATCHES "_CXX_") - set(_lang CXX) + if("_${lang}_" MATCHES "_CXX_") + set(_lang CXX) elseif("_${lang}_" MATCHES "_CU_") set(_lang NVCC) - else() - set(_lang ${lang}) - endif() + else() + set(_lang ${lang}) + endif() - string(TOUPPER "${flag}" ${varname}) - string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}") - string(REGEX REPLACE " --|-|=| |\\." "_" ${varname} "${${varname}}") + string(TOUPPER "${flag}" ${varname}) + string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}") + string(REGEX REPLACE " --|-|=| |\\." "_" ${varname} "${${varname}}") - anakin_check_compiler_flag("${_lang}" "${ARGN} ${flag}" ${${varname}}) + anakin_check_compiler_flag("${_lang}" "${ARGN} ${flag}" ${${varname}}) endmacro() macro(anakin_add_compile_option option) @@ -224,31 +239,26 @@ endfunction() # ---------------------------------------------------------------------------- # section: generate the protobuf .h and .cpp files. # ---------------------------------------------------------------------------- -function(anakin_protos_processing) - set(PROTO_SRC_PATH ${ANAKIN_MODEL_PARSER}/proto) - set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/) - - anakin_fetch_files_with_suffix(${PROTO_SRC_PATH} "proto" PROTO_SRC_FILES) - foreach(__file ${PROTO_SRC_FILES}) - exec_program(protoc ${__working_dir} ARGS " -I=${PROTO_SRC_PATH} --cpp_out=. 
${__file}" - OUTPUT_VARIABLE OUTPUT - RETURN_VALUE VALUE) - if(NOT VALUE) - anakin_fetch_files_with_suffix(${__working_dir} "h" PROTO_GENERATE_H) - # get *.cpp or *.cc - anakin_fetch_files_with_suffix(${__working_dir} "c*" PROTO_GENERATE_C) - foreach(__include_file ${PROTO_GENERATE_H}) - exec_program(mv ARGS ${__include_file} ${PROTO_SRC_PATH} - OUTPUT_VARIABLE __out - RETURN_VALUE __value) - endforeach() - foreach(__src_file ${PROTO_GENERATE_C}) - if(POLICY CMP0007) - cmake_policy(PUSH) - cmake_policy(SET CMP0007 NEW) - endif() - string(REPLACE "." ";" SRC_LIST ${__src_file}) - list(GET SRC_LIST -1 __src_file_name_suffix) +function(anakin_gen_pb proto_src_path) + set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/) + foreach(__proto_file ${ARGN}) + exec_program(${PROTOBUF_PROTOC_EXECUTABLE} ${__working_dir} ARGS " -I=${proto_src_path} --cpp_out=. ${__proto_file}" + OUTPUT_VARIABLE OUTPUT RETURN_VALUE VALUE) + if(NOT VALUE) + anakin_fetch_files_with_suffix(${__working_dir} "h" PROTO_GENERATE_H) + # get *.cpp or *.cc + anakin_fetch_files_with_suffix(${__working_dir} "c*" PROTO_GENERATE_C) + foreach(__include_file ${PROTO_GENERATE_H}) + exec_program(mv ARGS ${__include_file} ${proto_src_path} + OUTPUT_VARIABLE __out RETURN_VALUE __value) + endforeach() + foreach(__src_file ${PROTO_GENERATE_C}) + if(POLICY CMP0007) + cmake_policy(PUSH) + cmake_policy(SET CMP0007 NEW) + endif() + string(REPLACE "." 
";" SRC_LIST ${__src_file}) + list(GET SRC_LIST -1 __src_file_name_suffix) list(GET SRC_LIST -3 __src_file_name) string(REPLACE "/" ";" SRC_LIST_PATH ${__src_file_name}) @@ -259,18 +269,31 @@ function(anakin_protos_processing) else() set(__full_src_filename "${__pure_src_file_name}.pb.cc") endif() - #message(STATUS " first ---> ${__working_dir}${__full_src_filename} ${ANAKIN_ROOT}/src/${__pure_src_file_name}.pb.cpp") - exec_program(mv ARGS " ${__working_dir}${__full_src_filename} ${PROTO_SRC_PATH}/${__pure_src_file_name}.pb.cpp" + exec_program(mv ARGS " ${__working_dir}${__full_src_filename} ${proto_src_path}/${__pure_src_file_name}.pb.cpp" OUTPUT_VARIABLE __out RETURN_VALUE __value) if(POLICY CMP0007) cmake_policy(POP) endif() - endforeach() - else() - message(FATAL_ERROR "anakin_protos_processing : ${__file} \n error msg: ${OUTPUT}") - endif() - endforeach() + endforeach() + else() + message(FATAL_ERROR "anakin_gen_bp: ${__file} \n error msg: ${OUTPUT}") + endif() + endforeach() +endfunction() + +function(anakin_protos_processing) + set(PROTO_SRC_PATH ${ANAKIN_MODEL_PARSER}/proto) + set(SERVICE_API_SRC_PATH ${ANAKIN_SERVICE}/api) + + set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/) + + anakin_fetch_files_with_suffix(${PROTO_SRC_PATH} "proto" PROTO_SRC_FILES) + anakin_fetch_files_with_suffix(${SERVICE_API_SRC_PATH} "proto" SERVICE_API_PROTO_SRC_FILES) + anakin_gen_pb(${PROTO_SRC_PATH} ${PROTO_SRC_FILES}) + if(BUILD_RPC) + anakin_gen_pb(${SERVICE_API_SRC_PATH} ${SERVICE_API_PROTO_SRC_FILES}) + endif() endfunction() # ---------------------------------------------------------------------------- diff --git a/docker/AMD/centos/centos7-rocm-opencl/Dockerfile b/docker/AMD/centos/centos7-rocm-opencl/Dockerfile new file mode 100755 index 000000000..f0c65b7cc --- /dev/null +++ b/docker/AMD/centos/centos7-rocm-opencl/Dockerfile @@ -0,0 +1,42 @@ + +FROM centos:7.4.1708 + +# anakin install ubuntu GPU env +RUN yum -y install vim wget git make 
glibc-devel libstdc++-deve epel-release gcc gcc-c++ libstdc++ && rm -rf /var/cache/yum/* + +RUN yum -y install python-pip && rm -rf /var/cache/yum/* + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + flask numpy pyyaml scipy pandas + +# set env +ENV LIBRARY_PATH /usr/lib64:$LIBRARY_PATH + +# install cmake +RUN wget https://cmake.org/files/v3.2/cmake-3.2.0.tar.gz && tar xzf cmake-3.2.0.tar.gz && \ + cd cmake-3.2.0 && ./bootstrap && \ + make -j4 && make install && cd .. && rm -f cmake-3.2.0.tar.gz + +# install protobuf +RUN wget --no-check-certificate https://mirror.sobukus.de/files/src/protobuf/protobuf-cpp-3.4.0.tar.gz \ + && tar -xvf protobuf-cpp-3.4.0.tar.gz \ + && cd protobuf-3.4.0 && ./configure \ + && make -j4 && make install && cd .. \ + && rm -f protobuf-cpp-3.4.0.tar.gz + +RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo \ + && echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo \ + && echo "baseurl=http://repo.radeon.com/rocm/yum/rpm" >> /etc/yum.repos.d/rocm.repo \ + && echo "enabled=1" >> /etc/yum.repos.d/rocm.repo \ + && echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo + +RUN yum -y install rocm-opencl rocm-opencl-devel && rm -rf /var/cache/yum/* + +# set env +ENV LIBRARY_PATH /opt/rocm/lib:/opt/rocm/opencl/lib/x86_64:$LIBRARY_PATH +ENV OCL_ROOT /opt/rocm/opencl/lib/x86_64 +ENV PATH /opt/rocm/bin:/opt/rocm/opencl/bin/x86_64:$PATH + +RUN git clone --branch AMD --recursive "https://github.com/PaddlePaddle/Anakin.git" /root/Anakin && cd /root/Anakin/tools/ && ./amd_gpu_build.sh && cd - + diff --git a/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile b/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile index ab81b4ac8..3bb0ffb8c 100644 --- a/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile +++ b/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile @@ -43,4 +43,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ENV CUDNN_ROOT=/usr/local/cuda/include # build and install anakin -RUN git clone --branch 
developing --recursive https://github.com/PaddlePaddle/Anakin.git +#RUN git clone --branch developing --recursive https://github.com/PaddlePaddle/Anakin.git diff --git a/docker/README.md b/docker/README.md index fcc1511f9..5ea351395 100644 --- a/docker/README.md +++ b/docker/README.md @@ -37,6 +37,22 @@ $chmod +x ./anakin_docker_build_and_run.sh $./anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Run ``` +### AMD Docker +#### Build Image +```bash +$/usr/bash anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Build +or +$chmod +x ./anakin_docker_build_and_run.sh +$./anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Build +``` + +#### Run docker +```bash +$/usr/bash anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Run +or +$chmod +x ./anakin_docker_build_and_run.sh +$./anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Run +``` ### X86 Docker > Not support yet diff --git a/docker/README_cn.md b/docker/README_cn.md new file mode 100644 index 000000000..6d5ed994a --- /dev/null +++ b/docker/README_cn.md @@ -0,0 +1,46 @@ +# Anakin 2.0 And Docker +--- + +## 依赖软件 + ++ 你的操作系统上应该已经安装了docker. 
++ 如果你要在docker中使用`NVIDIA GPU` 还需要安装[nvidia-docker2](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-(version-2.0)) + +## 使用方法 + +推荐使用 `anakin_docker_build_and_run.sh` 脚本来构建和运行docker镜像,脚本的使用方法如下 + +```bash +Usage: anakin_docker_build_and_run.sh -p -o -m + +选项: + + -p 硬件的运行环境 [ NVIDIA-GPU / AMD_GPU / X86-ONLY / ARM ] + -o 主机的操作系统类型 [ Centos / Ubuntu ] + -m 脚本的执行模式[ Build / Run / All] 默认模式是 build and run +``` + +### GPU Docker +#### 构建镜像 +```bash +/usr/bash anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Build +或者 +chmod +x ./anakin_docker_build_and_run.sh +./anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Build +``` + +#### 运行 docker容器 +```bash +/usr/bash anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Run +或者 +chmod +x ./anakin_docker_build_and_run.sh +./anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Run +``` + +### X86 Docker + +> Not support yet + +### ARM Docer + +> Not support yet diff --git a/docker/anakin_docker_build_and_run.sh b/docker/anakin_docker_build_and_run.sh index 97802a4e8..1eb989ceb 100755 --- a/docker/anakin_docker_build_and_run.sh +++ b/docker/anakin_docker_build_and_run.sh @@ -14,7 +14,7 @@ help_anakin_docker_run() { echo "" echo "Options:" echo "" - echo " -p Hardware Place where docker will running [ NVIDIA-GPU / AMD_GPU / X86-ONLY / ARM ] " + echo " -p Hardware Place where docker will running [ NVIDIA-GPU / AMD-GPU / X86-ONLY / ARM ] " echo " -o Operating system docker will reside on [ Centos / Ubuntu ] " echo " -m Script exe mode [ Build / Run ] default mode is build and run" exit 1 @@ -56,7 +56,7 @@ building_and_run_nvidia_gpu_docker() { if [ ! $MODE = "Run" ]; then echo "Building nvidia docker ... [ docker_image_name: anakin image_tag: $tag ]" sudo docker build --network=host -t anakin:$tag"-base" . 
-f $DockerfilePath - sudo docker run --network=host -it anakin:$tag"-base" Anakin/tools/gpu_build.sh + sudo docker run --network=host -it anakin:$tag"-base" Anakin/tools/nv_gpu_build.sh container_id=$(sudo docker ps -l | sed -n 2p | awk '{print $1}') sudo docker commit $container_id anakin:$tag else @@ -67,9 +67,19 @@ building_and_run_nvidia_gpu_docker() { # buiding and running docker for amd gpu building_and_run_amd_gpu_docker() { - echo "not support yet" - read - exit 1 + if [ ! $# -eq 2 ]; then + exit 1 + fi + DockerfilePath=$1 + MODE=$2 + tag="$(echo $DockerfilePath | awk -F/ '{print tolower($(NF-3) "_" $(NF-1))}')" + if [ ! $MODE = "Run" ]; then + echo "Building amd docker ... [ docker_image_name: anakin image_tag: $tag ]" + sudo docker build --network=host -t anakin:$tag . -f $DockerfilePath + else + echo "Running amd docker ... [ docker_image_name: anakin image_tag: $tag ]" + sudo docker run -it --device=/dev/kfd --device=/dev/dri --group-add video anakin:$tag /bin/bash + fi } # building and running docker for x86 @@ -91,7 +101,7 @@ dispatch_docker_path() { # declare associative map from place to relative path declare -A PLACE2PATH PLACE2PATH["NVIDIA-GPU"]=NVIDIA - PLACE2PATH["AMD_GPU"]=AMD + PLACE2PATH["AMD-GPU"]=AMD PLACE2PATH["X86-ONLY"]=X86 PLACE2PATH["ARM"]=ARM # declare associative map from os to relative path @@ -155,7 +165,7 @@ dispatch_docker_path $place $os if [ $place = "NVIDIA-GPU" ]; then building_and_run_nvidia_gpu_docker $SupportDockerFilePath $mode -elif [ $place = "AMD_GPU" ]; then +elif [ $place = "AMD-GPU" ]; then building_and_run_amd_gpu_docker $SupportDockerFilePath $mode elif [ $place = "X86-ONLY" ]; then building_and_run_x86_docker $SupportDockerFilePath $mode diff --git a/docs/Manual/C++APIs_ch.md b/docs/Manual/C++APIs_ch.md new file mode 100644 index 000000000..e0dc81d71 --- /dev/null +++ b/docs/Manual/C++APIs_ch.md @@ -0,0 +1,624 @@ +# C++ APIs ## + +本教程将会介绍Anakin的一些基本的API及如何调用这些API。 + +主要内容如下: + +- [Anakin APIs](#api) +- 
[示例代码](#example) + +## Anakin APIs ### +### Tensor #### + +`Tensor`提供基础的数据操作和管理,为ops提供统一的数据接口。`Tensor`包含以下几个属性: + +- Buffer + 数据存储区 +- Shape + 数据的维度信息 +- Event + 用于异步计算的同步 + + `Tensor` 类包含三个`Shape`对象, 分别是`_shape`, `_valid_shape`和 `offset`。 `_shape`为`tensor`真正空间信息,`_valid_shape`表示当前`tensor`使用的空间信息, `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`不同维度与分别与数学中的向量、矩阵等相对应如下表所示。 + + +Dimentions | Math entity | + :----: | :----: +1 | vector +2 | matrix +3 | 3-tensor +n | n-tensor + +#### 声明tensor对象 + +`Tensor`接受三个模板参数: + + +```c++ + template + class Tensor .../* Inherit other class */{ + //some implements + ... + }; +``` + +TargetType是平台类型,如X86,GPU等等,在Anakin内部有相应的标识与之对应;datatype是普通的数据类型,在Anakin内部也有相应的标志与之对应;[LayOutType](#layout)是数据分布类型,如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下: + +1. TargetType + + Anakin TargetType | platform + :----: | :----:| + NV | NVIDIA GPU + ARM | ARM + AMD | AMD GPU + X86 | X86 + NVHX86 | NVIDIA GPU with Pinned Memory + +2. DataType + +Anakin DataType | C++ | Description +:---: | :---: | :---: | +AK_HALF | short | fp16 +AK_FLOAT | float | fp32 +AK_DOUBLE | double | fp64 +AK_INT8 | char | int8 +AK_INT16 | short | int16 +AK_INT32 | int | int32 +AK_INT64 | long | int64 +AK_UINT8 | unsigned char | uint8 +AK_UINT16 | unsigned short | uint8 +AK_UINT32 | unsigned int | uint32 +AK_STRING | std::string | / +AK_BOOL | bool | / +AK_SHAPE | / | Anakin Shape +AK_TENSOR | / | Anakin Tensor + + +3. 
LayOutType + +Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support +:---: | :---: | :---: | :---: | +W | 1-D | YES | NO +HW | 2-D | YES | NO +WH | 2-D | YES | NO +NW | 2-D | YES | YES +NHW | 3-D | YES |YES +NCHW ( default ) | 4-D | YES | YES +NHWC | 4-D | YES | NO +NCHW_C4 | 5-D | YES | YES + + +理论上,Anakin支持申明1维以上的tensor,但是对于Anakin中的Op来说,只支持NW、NHW、NCHW、NCHW_C4这四种LayOut,其中NCHW是默认的LayOutType,NCHW_C4是专门针对于int8这种数据类型的。 + + +例子 + +> 下面的代码将展示如何使用tensor, 我们建议先看看这些示例。 + +> 要想获得更多关于tensor的信息, 请参考 *soure_path/core/tensor.h* + +> 1. 使用shape对象初始化tensor +``` c++ + //create a null tensor. A null tensor holds for nothing. + //tensor's buffer is resident at CPU and its datatype is AK_FLOAT. + //tensor's Layout is NCHW(default) + Tensor mytensor; + + //1. using shape object to create a tensor. + Shape shape1(NUM); //1-D shape. NUM is the number of dimention. + Tensor mytensor1(shape1); //1-D tensor. + + // A 4-D shape + Shape shape2(N, C, H, W); // batch x channel x height x width +``` + +>`注意:Shape的维度必须和tensor的`[LayoutType](#layout)`相同,比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW,否则会出错。如下列代码所示` + + +```c++ + // A 4-D tensor. + Tensor mytensor2(shape2); //right + + //A 4-D tensor which is resident at GPU and its datatype is AK_INT8 + Tensor mytensor3(shape2); //right + + Tensor mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout. + Tensor mytensor5(shape2); //wrong!!!! + +``` + +> 2. 使用现有的数据和shape初始化tensor + +```c++ + + /** + * A construtor of Tensor. + * data_ptr is a pointer to any data type of data + * TargetType is type of a platform [Anakin TargetType] + * id : device id + * shape: a Anakin shape + */ + Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape); + + //using existing data feed to a tensor + Tensor mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W). + +``` + +> 3. 
使用tensor初始化tensor + +```c++ + Tensor tensor(exist_tensor); +``` + + +> 提示: 你可以用` typedef Tensor Tensor4d_X86 `方便定义tensor + + +#### 填充tensor数据区 + + +填充数据区得看你申明tensor的方式, 下面展示了如何填充tensor的数据区。 + +```c++ +首先来看看tensor的四种声明方式: + +1. Tensor mytensor; +2. Tensor mytensor1(shape1); +3. Tensor mytensor(data_ptr, TargetType, device_id, shape); +4. Tensor tensor(exist_tensor); + + +相关的声明方式的数据填充方法如下: + +1:声明一个空的tensor,此时没有为其分配内存,所以,我们需要手动的为其分配内存。 + + //parama shape + mytensor.re_alloc(Shape shape); + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... + +2: 这种声明方式会自动分配内存 + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor1.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... + + +3:在该种声明方式中,我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存,得依情况而定。如果data_ptr和申明的 +tensor都在都一个目标平台上,那么该tensor就会与data_ptr共享内存空间,相反,如果他们不在同一个平台上(如data_ptr在X86上,而 +tensor在GPU上),那么此时tensor就会开辟一个新的内存空间,并将data_ptr所指向的数据拷贝到tensor的buffer中。 + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... + +4:该种方式仍不需要手动分配内存 + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... 
+ + +另外,你还可以获取一个tensor的可读指针,示例如下: + //Get read-only pointer to mytensor. + //parama index (int): where you start to read. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.data(index/*=0*/); + //do something ... +``` + +如果想更详细的了解tensor,请查阅*soure_path/saber/core/tensor.h* + +#### 获取tensor的shape + +```c++ +//some declarations +// ... +Shape shape = mytensor.shape(); + +//Get a first dimetion size of tesor, if it has. +int d1 = shape[0]; + +//Get a second dimention size of tensor, if it has. +int d2 = shape[1]; + +... + +//Get a n-th dimention size of tensor, if it has. +int dn = shape[n-1]; + + +//Get a tensor's dimention +int dims = mytensor.dims(); + +//Get the size of tensor. +//size = d1 x d2 x ... x dn. +int size = mytensor.size(); + +//Get the size of tensor at interval [Di, Dj) +// form i-th dimention to j-th dimention, but not including the j-th dimention. +// which means di x (di+1) x ... x (dj -1) +int size = mytensor.count(start, end); +``` + +#### 设置tensor的shape + +我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义 + + +```c++ +/** + * \brief set a tensor's shape + * \param valid_shape [a Shape object] + * \param shape [a Shape object] + * \param offset [a Shape object] + * \return the status of this operation, that means whether it success * or not. + */ +SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value)); +``` + +这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同,如果不同就会出错,返回SaberInvalidValue。 如果相同,那么将成功设置tensor的shape。 + +```c++ + +// some declarations +// ... +//valid_shape, shape , offset are Shape object; +//All these Shape object's LayOutType must be equal to mytensor's. 
+mytensor.set_shape(valid_shape, shape, offset); + +``` + +#### 重置 tensor的shape + +```c++ +//some declarations +Shape shape, valid_shape, offset; + +//do some initializations +... +mytensor.reshape(valid_shape, shape, offset); +``` + +注意: Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同 + + +### Graph ### + +`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。 + +#### 图的声明 + +与`Tensor`一样,graph也接受三个模板参数。 + +```c++ + +template +class Graph ... /* inherit other class*/{ + + //some implements + ... + +}; +``` + +前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们在介绍它。 + + +```c++ + +//Create a empty graph object. +Graph graph = Graph tmp(); + +//Create a pointer to a empty graph. +Graph *graph = new Graph(); + +//Create a pointer to a empty graph. +auto graph = new Graph(); + +``` + +#### 加载 Anakin 模型 + +```c++ +//some declarations +... +auto graph = new Graph(); +std::string model_path = "the/path/to/where/your/models/are"; +const char *model_path1 = "the/path/to/where/your/models/are"; + +//Loading Anakin model to generate a compute graph. +auto status = graph->load(model_path); + +//Or this way. +auto status = graph->load(model_path1); +//Check whether load operation success. +if(!status){ + std::cout << "error" << endl; + //do something... +} + +``` + +#### 优化计算图 + +```c++ +//some declarations +... +//Load graph. +... +//According to the ops of loaded graph, optimize compute graph. +graph->Optimize(); + +``` + +> 注意: 第一次加载原始图,必须要优化。 + +#### 保存模型 + +你可以在任何时候保存模型, 特别的, 你可以保存一个优化的模型,这样,下次再加载模型时,就不必进行优化操作。 + + +```c++ +//some declarations +... +//Load graph. +... +// save a model +//save_model_path: the path to where your model is. +auto status = graph->save(save_model_path); + +//Checking +if(!status){ + cout << "error" << endl; + //do somethin... +} +``` + +#### 重新设置计算图里的tensor的shape + +```c++ +//some declarations +... 
+//Load graph. +... +vector shape{10, 256, 256, 10}; +//input_name : std::string. +//Reshape a tensor named input_name. +graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object. +``` + +#### 设置 batch size + +`Graph` 支持重新设置batch size的大小。 + +```c++ +//some declarations +... +//Load graph. +... +//input_name : std::string. +//Reset a tensor named input_name. +int new_batch_size = 4; +graph->ResetBatchSize(input_name, new_batch_size); +``` + +### Net ### + + +`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出 +#### Creating a graph executor + +`Net`接受四个模板参数。 + + +```c++ +template +class Net{ + //some implements + ... + +}; +``` +由于有些Op可能支持多种精度,我们可以通过Precision来指定。OpRunType表示同步或异步类型,异步是默认类型。OpRunType::SYNC表示同步,在GPU上只有单个流;OpRunType::ASYNC表示异步,在GPU上有多个流并以异步方式执行。实际上,Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*. + + +1. Precision + +Precision | Op support +:---: | :---: +Precision::INT4 | NO +Precision::INT8 | NO +Precision::FP16 | NO +Precision::FP32 | YES +Precision::FP64 | NO + +现在Op的精度只支持FP32, 但在将来我们会支持剩下的Precision. + + + +2. OpRunType + +OpRunType | Sync/Aync |Description +:---: | :---: | :---: +OpRunType::SYNC | Synchronization | single-stream on GPU +OpRunType::ASYNC | Asynchronization | multi-stream on GPU + +用graph对象创建一个执行器。 +```c++ +//some declarations +... +//Create a pointer to a graph. +auto graph = new Graph(); +//do something... +... + +//create a executor +Net executor(*graph); + +``` + +#### 获取输入输出tensor + + +获取输入输出tensor,并填充输入tensor的buffer。如果想要获取输入和输出tensor,那么必须指定输入的名字,如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外,如果想知道input_i对应哪个输入,你需要去dash board查看,如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码 + +```c++ +//some declaratinos +... + +//create a executor +//TargetType is NV [NVIDIA GPU] +Net executor(*graph); + +//Get the first input tensor. +//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU. +//Note: Member function get_in returns an pointer to tensor. 
+Tensor* tensor_in0 = executor.get_in("input_0"); + +//If you have multiple input tensors +//You just type this code below. +Tensor* tensor_in1 = executor.get_in("input_1"); +... +auto tensor_inn = executor.get_in("input_n"); +``` + +当得到输入tensor之后,就可以填充它的数据区了。 + +```c++ +//This tensor is resident at GPU. +auto tensor_d_in = executor.get_in("input_0"); + +//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one. + +//using Tensor4d = Tensor; +Tensor4d tensor_h_in; //host tensor; +//Tensor tensor_h_in; + +//Allocate memory for host tensor. +tensor_h_in.re_alloc(tensor_d_in->valid_shape()); +//Get a writable pointer to tensor. +float *h_data = tensor_h_in.mutable_data(); + +//Feed your tensor. +/** example +for(int i = 0; i < tensor_h_in.size(); i++){ + h_data[i] = 1.0f; +} +*/ +//Copy host tensor's data to device tensor. +tensor_d_in->copy_from(tensor_h_in); + +// And then +``` + + +类似的,我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是, 我们需要指定输入tensor结点的名字,这个可以从dash board中看到,请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor: +```c++ +//Note: this tensor are resident at GPU. +Tensor* tensor_out_d = executor.get_out("pred_out"); + +``` + + +#### Executing graph + + +当一切准备就绪后,我们就可以执行真正的计算了! +```c++ +executor.prediction(); +``` + +## 示例代码 ## + +下面的例子展示了如何调用Anakin。 + +在这儿之前, 请确保你已经有了Anakin模型。如果还没有,那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。 + +### Single-thread + +单线程例子在 *`source_root/test/framework/net/net_exec_test.cpp`* + +```c++ + +std::string model_path = "your_Anakin_models/xxxxx.anakin.bin"; +// Create an empty graph object. +auto graph = new Graph(); +// Load Anakin model. +auto status = graph->load(model_path); +if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); +} +// Reshape +graph->Reshape("input_0", {10, 384, 960, 10}); +// You must optimize graph for the first time. +graph->Optimize(); +// Create a executer. 
+Net net_executer(*graph); + +//Get your input tensors through some specific string such as "input_0", "input_1", and +//so on. +//And then, feed the input tensor. +//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out. +auto d_tensor_in_p = net_executer.get_in("input_0"); +Tensor4d h_tensor_in; +auto valid_shape_in = d_tensor_in_p->valid_shape(); +for (int i=0; icopy_from(h_tensor_in); + +//Do inference. +net_executer.prediction(); + +//Get result tensor through the name of output node. +//And also, you need to see the dash board again to find out how many output nodes are and remember their name. + +//For example, you've got a output node named obj_pre_out +//Then, you can get an output tensor. +auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor. +auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor. +//...... +// do something else ... +//... +//save model. +//You might not optimize the graph when you load the saved model again. 
+std::string save_model_path = model_path + std::string(".saved");
+status = graph->save(save_model_path);
+if (!status ) {
+    LOG(FATAL) << " [ERROR] " << status.info();
+}
+
+```
diff --git a/docs/Manual/Contribution_ch.md b/docs/Manual/Contribution_ch.md
new file mode 100644
index 000000000..438d207b8
--- /dev/null
+++ b/docs/Manual/Contribution_ch.md
@@ -0,0 +1,178 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+
+## Contributor License Agreements
+
+在您的代码合入之前请签署个人或者公司的Contributor License Agreement(CLA)。
+
+- 如果您个人是原始代码的拥有者,并拥有代码的知识产权,您需要签署[个人CLA](https://gist.github.com/tanzhongyibidu/6605bdef5f7bb03b9084dd8fed027037)
+- 如果原始代码属于公司,并且公司同意提交代码到我们的仓库,那您需要签署[公司CLA](https://gist.github.com/tanzhongyibidu/709c675c1e79804e3e871f8c1e62292d)
+
+请您选择合适的CLA并仔细阅读,在您签署CLA后方可将代码合入。
+
+## 添加License
+
+在新提交的代码中包含license:
+
+- c++代码头文件
+
+```c++
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+```
+
+- python代码
+
+```python
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +``` + +## 代码要求 + +- 代码注释请遵守[Doxygen](http://www.stack.nl/~dimitri/doxygen/)的样式 +- 所有代码必须具有单元测试 +- 通过所有单元测试 +- 请遵守提交代码的一些约定 + +以下教程将指导您提交代码 + +## Fork +首先跳转到[Anakin](https://github.com/PaddlePaddle/Anakin)的github首页,然后点击`Fork`, 生成自己目录下的仓库 + +## 克隆(clone) + +将远程仓库clone到本地: + +```bash +git clone YOUR_REPOSITORY_URL +cd Anakin +``` + +## 创建本地分支 +Anakin目前使用[Git流分支模型](https://nvie.com/posts/a-successful-git-branching-model/)进行开发, 测试和维护。 +所有的feature和bug fix的开发工作都应该在一个新的分支上完成,根据需要从现有分支上创建新分支。 +使用`git checkout -b`创建并切换到新分支 +```bash +git checkout -b YOUR_NEW_BRANCH +``` + +## 开始开发 + +编写代码 + + +## 构建和测试 + +详细请参考[Docker installation guide](docker/README.md) 和 [build from source guide](docs/Manual/INSTALL_en.md)。 + + +## 提交(commit) + +提交代码时,请认真写好提交说明,这样其他人就可以清楚的知道这次提交做了哪些改变: +```bash +git commit -m 'description' +``` + +## 保持本地仓库最新 + +在发起Pull Request之前,需要与原始仓库同步。 + +如果还没添加原仓库,请先添加源,可通过`git remote -v`查看是否添加源: +```bash +git remote -v +origin .... (fetch) +origin .... 
(push) +``` +如果只出现origin,说明还未添加源,可通过如下命令添加源: +```bash +git remote add upstream ORIGIN_REPOSITORY_URL +``` +获取 upstream 的最新代码并更新当前分支 +```bash +git fetch upstream +git pull upstream BRANCH_NAME +``` +## Push到远程仓库 + +将本地的修改push到远程仓库上 +```bash +git push origin BRANCH_NAME +``` + +## 提交Pull Request + +切换到所建分支,然后点击`New pull request`。 +![](./contri1.JPG) + +选择目标分支: +![](./contri2.JPG) + +接下来等待review。 + +## 删除远程分支 +在PR被merge进主仓库后,可以在PR的界面删除远程仓库的分支。 +也可以通过以下命令删除远程分支: +```bash +git push origin :YOUR_NEW_BRANCH +``` + +## 删除本地分支 + +最后,删除本地分支。 +```bash +#切换到其他分支 +git checkout OTHER_BRANCH + +#删除YOUR_NEW_BRANCH分支 +git branch -D YOUR_NEW_BRANCH +``` + +至此,我们就完成了一次代码贡献的过程。 + +## 提交代码的一些约定 + +为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: + +1. 提交Pull Request前: +- 注意commit的数量 + + - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 + + - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](https://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed) + +- 注意每个commit的名称:应能反映当前commit的内容,不能太随意。 + +2. 如果解决了某个Issue的问题,请在该Pull Request的第一个评论框中加上:`fix #issue_number`,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 + +在回复评审人意见时,请您遵守以下约定: +1. 评审人的每个意见都必须回复 + - 对评审意见同意且按其修改完的,给个简单的Done即可 + - 对评审意见不同意的,请给出您自己的反驳理由。 +2. 如果评审意见比较多 + - 请给出总体的修改情况。 + - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。 diff --git a/docs/Manual/Converter_ch.md b/docs/Manual/Converter_ch.md index d137ba24a..56ca582b2 100644 --- a/docs/Manual/Converter_ch.md +++ b/docs/Manual/Converter_ch.md @@ -1,77 +1,73 @@ -# External Converter +# 模型转换指南 -This guide will show you how to convert your models to Anakin models. 
+Anakin 支持不同框架的模型预测。但由于格式的差别,Anakin 需要您预先转换模型。本文档介绍如何转换模型。 -## Introduction +## 简介 -Before using Anakin, you must convert your models to Anakin ones. If you don't, Anakin won't work properly. +Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型,模型包含网络结构(model 或 prototxt)和权重参数(param 或 caffemodel)。 -## Requirements +模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。 + +您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。 + + +## 系统要求 - python 2.7+ - pyyaml - flask +- protobuf 3.5+ -## Downloading Converter Source -```bash -git clone https://xxxxxxxxx -``` +## 用法 + +### 1、环境 +转换器所需的依赖标注于 *系统要求* 一节。 -## Usage +### 2、配置 +您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。 -### 1. Configuration -Configure your *config.yaml* file. Find example *config.yaml* file in the `converter source` directory. The example below explains how to configure your config.yaml file. -#### Caffe Case +#### config.yaml ```bash OPTIONS: - Framework: CAFFE # select a target dl-framework you want parsing - SavePath: ./output - ResultName: googlenet # the name you want when saving the parsed model + Framework: CAFFE # 依框架类型填写 CAFFE 或 FLUID + SavePath: ./output # 转换结束后模型的保存位置 + ResultName: googlenet # 输出模型的名字 Config: - LaunchBoard: ON # should be on if you want to launch graph board + LaunchBoard: ON # 是否生成网络结构预览页面 Server: ip: 0.0.0.0 - port: 8888 - OptimizedGraph: # only enable(set enable(ON) and path) when you have optimized graph model. 
- enable: ON + port: 8888 # 从一个可用端口访问预览页面 + OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项 + enable: OFF path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved LOGGER: - LogToPath: ./log/ # the path where log - WithColor: ON # colorful log message + LogToPath: ./log/ # 生成日志的路径 + WithColor: ON TARGET: CAFFE: - # path to proto files + # 当 Framework 为 CAFFE 时需填写 ProtoPaths: - /path/to/caffe/src/caffe/proto/caffe.proto PrototxtPath: /path/to/your/googlenet.prototxt ModelPath: /path/to/your/googlenet.caffemodel - - # not support yet - PADDLE: - # path to proto files - ProtoPath: - - /path/to/proto_0 - - /path/to/proto_1 - - /path/to/proto_n - PrototxtPath: /path/to/prototxt - ModelPath: /path/to/model - # ... -``` - -### 2. Converting -After finishing configuration , you just need to call python script ```python converter.py``` to complete transfromation. - -### 3. Launching dash board -Anakin external converter will be launched on site http://0.0.0.0:8888 (configurable). -Then open you browser and search http://0.0.0.0:8888, amazing things will happen! - -> if you set ip to 0.0.0.0 in remote server, you need to open local browser and search the server real ip:port, not the 0.0.0.0. + FLUID: + # 当 Framework 为 FLUID 时需填写 + Debug: NULL + ProtoPaths: + - / + PrototxtPath: /path/to/fluid/inference_model + ModelPath: /path/to/fluid/inference_model + # ... +``` -### 4. Note +### 3、转换 +在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。 -> 1.We support caffe so far +### 4、预览 +最后一步,就是在浏览器中查看令人振奋的转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。 +> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。 diff --git a/docs/Manual/Converter_en.md b/docs/Manual/Converter_en.md index d137ba24a..4262726ba 100644 --- a/docs/Manual/Converter_en.md +++ b/docs/Manual/Converter_en.md @@ -16,7 +16,7 @@ Before using Anakin, you must convert your models to Anakin ones. 
If you don't, ```bash git clone https://xxxxxxxxx -``` +``` ## Usage @@ -47,9 +47,8 @@ TARGET: - /path/to/caffe/src/caffe/proto/caffe.proto PrototxtPath: /path/to/your/googlenet.prototxt ModelPath: /path/to/your/googlenet.caffemodel - - # not support yet - PADDLE: + + FLUID: # path to proto files ProtoPath: - /path/to/proto_0 @@ -57,10 +56,10 @@ TARGET: - /path/to/proto_n PrototxtPath: /path/to/prototxt ModelPath: /path/to/model - # ... + # ... ``` -### 2. Converting +### 2. Converting After finishing configuration , you just need to call python script ```python converter.py``` to complete transfromation. ### 3. Launching dash board @@ -73,5 +72,3 @@ Then open you browser and search http://0.0.0.0:8888, amazing things will happen ### 4. Note > 1.We support caffe so far - - diff --git a/docs/Manual/INSTALL_ch.md b/docs/Manual/INSTALL_ch.md index 833976936..212e07c7e 100644 --- a/docs/Manual/INSTALL_ch.md +++ b/docs/Manual/INSTALL_ch.md @@ -6,7 +6,7 @@ * [在CentOS上安装 Anakin]() * [在Ubuntu上安装 Anakin]() -* [在ARM上安装 Anakin]() +* [在ARM上安装 Anakin](run_on_arm_ch.md) * [验证安装]() @@ -63,10 +63,11 @@ ### 在ARM上安装 Anakin ### -暂时还不支持 +请参考[ARM安装文档](run_on_arm_ch.md) ### 验证安装 ### -we are coming soon... + +安装完成后,如果没有报错信息,你可以通过运行 `output/unit_test`路径下的单测示例验证是否编译成功。 diff --git a/docs/Manual/INSTALL_en.md b/docs/Manual/INSTALL_en.md index 506e80b80..c02401473 100644 --- a/docs/Manual/INSTALL_en.md +++ b/docs/Manual/INSTALL_en.md @@ -66,8 +66,93 @@ Not support yet. #### 4. Building Anakin with AMD GPU Support #### -Coming soon.. + For more detials of ROCm please see [RadeonOpenCompute/ROCm](https://github.com/RadeonOpenCompute/ROCm) + +- 4.1. 
Setup Environment
+
+  - 4.1.1 Update OS (Option, if your OS is able to be updated)
+    >$sudo yum update
+
+  - 4.1.2 Add ROCM repo
+    Create a /etc/yum.repos.d/rocm.repo file with the following contents:
+    ```bash
+    [ROCm]
+    name=ROCm
+    baseurl=http://repo.radeon.com/rocm/yum/rpm
+    enabled=1
+    gpgcheck=0
+    ```
+
+  - 4.1.3 Install ROCK-DKMS
+    Please check your kernel version before installing ROCk-DKMS and make sure the result is same as your installed kernel related packages, such as kernel-headers and kernel-devel
+    >$ uname -r
+
+    - 4.1.3.1 For kernel ver 3.10.0-`693` (Option 1)
+      Download kernel-devel-3.10.0-693.el7.x86_64.rpm and kernel-headers-3.10.0-693.el7.x86_64.rpm
+      >$sudo yum install kernel-devel-3.10.0-693.el7.x86_64.rpm kernel-headers-3.10.0-693.el7.x86_64.rpm
+
+    - 4.1.3.2 For kernel ver 3.10.0-`862` (Option 2)
+      >$ sudo yum install kernel-devel kernel-headers
+
+    - 4.1.3.3 Install ROCk-DKMS
+      >$ sudo yum install epel-release
+      >$ sudo yum install dkms
+      >$ sudo yum install rock-dkms
+
+      Use the command below to check whether amdgpu is installed successfully.
+      >$ sudo dkms status
+      >$ 'amdgpu, 1.8-151.el7, ..., x86_64: installed (original_module exists)'
+
+    - 4.1.3.4
+      Reboot your device.
+ + ** If you are using docker than step 4.1.4 to 4.1.8 are not required ** + + - 4.1.4 Install ROCm-OpenCL + >$sudo yum install rocm-opencl rocm-opencl-devel rocm-smi rocminfo + + - 4.1.5 Add user to the video (or wheel) group + >$sudo usermod -a -G video $LOGNAME + + - 4.1.6 Setting Environment variables + ```bash + echo 'export PATH=/opt/rocm/bin:/opt/rocm/opencl/bin/x86_64:$PATH' >> $HOME/.bashrc + echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/lib64' >>$HOME/.bashrc + source ~/.bashrc + ``` + Check + >$ clinfo + + - 4.1.7 protobuf 3.4.0 + Download source from https://github.com/google/protobuf/releases/tag/v3.4.0 + >tar -zxvf protobuf-cpp-3.4.0.tar.gz + >$ cd protobuf-3.4.0 + >$ ./configure + >$ make + >$ make install + + Check + >$ protoc --version + Any problems for protobuf installation, Please see [here](https://github.com/google/protobuf/blob/master/src/README.md) + + - 4.1.8 cmake 3.2.0 + Download source from https://cmake.org/files/v3.2/cmake-3.2.0.tar.gz + >tar -zxvf cmake-3.2.0.tar.gz + >$ cd cmake-3.2.0 + >$ ./bootstrap + >$ make -j4 + >$ make install + +- 4.2. Compile Anakin + >$ git clone xxx + >$ cd anakin + >$ ./tools/amd_gpu_build.sh + +- 4.3. Run Benchmark + >$ cd output/unit_test + >$ benchmark ../../benchmark/CNN/models/ vgg16.anakin.bin 1 2 100 ### Installing on Ubuntu ### @@ -76,8 +161,10 @@ Coming soon.. ### Installing on ARM ### -Coming soon.. +Please refer to [run on arm](run_on_arm_en.md) ### Verifying installation ### +If build successfully, the libs will be in the directory `output/`, and you can run unit test in `output/unit_test` to verify your installation. 
+ diff --git a/docs/Manual/addCustomDevice.md b/docs/Manual/addCustomDevice.md new file mode 100644 index 000000000..0c8c7fd6f --- /dev/null +++ b/docs/Manual/addCustomDevice.md @@ -0,0 +1,459 @@ +# 如何支持一个新的设备 + +## 概览 + +添加一个新的设备需要以下3个步骤: + +* [在`CMakeList`中添加设备的支持](#0001) +* [在`saber`中添加设备的实现](#0002) +* [在`framework`中添加设备的具体化或实例化](#0003) + +假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。 + +## 在`CMakeList`中添加设备的支持 ## + +* 修改根目录`CMakeList.txt` +```cmake +#select the plantform to build +anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO) +anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO) +anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO) +anakin_option(USE_TNEW_PLACE "Select the build mode for ARM place." YES) +``` + +* 修改`saber/CMakeList.txt` + +根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。 +```cmake +if(USE_TNEW_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) +endif() +``` + +* 修改`test/CMakeList.txt` + +新增设备的单测文件放在`test/saber/tnew`目录下,修改`test`目录下的`CMakeList.txt`。 +```cmake +if(USE_TNEW_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC) +endif() +``` + +* 修改`cmake/anakin_config.h.in` +```c++ +// plantform to use +#cmakedefine USE_GPU_PLACE + +#cmakedefine USE_X86_PLACE + +#cmakedefine USE_ARM_PLACE + +#cmakedefine USE_TNEW_PLACE +``` + +* 其他依赖和编译选项 +修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake` + + +## 在`saber`中添加设备的实现 ## +`saber`是`Anakin`的基础计算库,对外提供设备无关的统一的API,设备相关的实现都会封装到`TargetWrapper`中。 + +### 在`saber/saber_types.h`中添加设备 + +```c++ +enum TargetTypeEnum { + eINVALID = -1, + eNV = 1, + eAMD = 2, + eARM = 3, + eX86 = 4, + eNVHX86 = 5, + eTNEW = 6 +}; + +typedef TargetType NV; +typedef TargetType ARM; +typedef TargetType AMD; +typedef TargetType X86; +typedef TargetType TNEW; + +``` + +### 在`saber/core`中添加设备的实现 + +1. 
在`target_traits.h`中添加新设备 + +* 增加设备类型 +```c++ +struct __cuda_device{}; +struct __arm_device{}; +struct __amd_device{}; +struct __x86_device{}; +struct __tnew_device{}; +``` + +* `TargetTypeTraits`模板具体化 +```c++ +template <> +struct TargetTypeTraits { + typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择 + typedef __tnew_device target_type; +}; +``` + +2. 在`data_traits.h`中特化`DataTrait`模板类 + +如果设备需要特殊的数据类型,则特化出设备的`DataTrait`类的实现,例如opencl数据类型的实现如下: +```c++ +#ifdef USE_OPENCL +struct ClMem{ + ClMem(){ + dmem = nullptr; + offset = 0; + } + + ClMem(cl_mem* mem_in, int offset_in = 0) { + dmem = mem_in; + offset = offset_in; + } + + ClMem(ClMem& right) { + dmem = right.dmem; + offset = right.offset; + } + + ClMem& operator=(ClMem& right) { + this->dmem = right.dmem; + this->offset = right.offset; + return *this; + } + + ClMem& operator+(int offset_in) { + this->offset += offset_in; + return *this; + } + + int offset{0}; + cl_mem* dmem; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef float dtype; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef double dtype; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef char dtype; +}; +#endif //use_opencl +``` + +3. 
在`target_wrapper.h`中特化`TargetWrapper`模板类 + +特化`TargetWrapper`模板类,在`target_wrapper.h`中声明函数,具体如下: +```c++ +template <> +struct TargetWrapper { //根据TNEW的具体类型修改__xxx_target,__host_target或者__device_target + + typedef xxx_event event_t; //根据设备实现xxx_event + typedef xxx_stream stream_t; //根据设备实现xxx_stream + + static void get_device_count(int& count); + + static void set_device(int id); + + //We should add strategy to avoid malloc directly + static void mem_alloc(void** ptr, size_t n); + + static void mem_free(void* ptr); + + static void mem_set(void* ptr, int value, size_t n); + + static void create_event(event_t& event, bool flag = false); + + static void create_stream(stream_t& stream); + + static void create_stream_with_flag(stream_t& stream, unsigned int flag); + + static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority); + + static void destroy_stream(stream_t& stream); + + static void destroy_event(event_t& event); + + static void record_event(event_t& event, stream_t stream); + + static void query_event(event_t& event); + + static void sync_event(event_t& event); + + static void sync_stream(event_t& event, stream_t& stream); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoD); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __DtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __HtoD); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __HtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoH); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __DtoH); + + static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count); + + static void 
async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count, stream_t& stream); + + static int get_device_id(); +}; + +``` + +4. 在`impl/`目录下添加设备目录和实现 + +在`saber/core/impl`目录下添加设备目录`tnew`。 +* 实现`TargetWrapper`结构体中各函数的定义。 +如果`TargetWrapper`的实现与默认的模板类一致,则不用特化出该类。 + +```c++ +typedef TargetWrapper TNEW_API; +void TNEW_API::get_device_count(int &count) { + // add implementation +} + +void TNEW_API::set_device(int id){ + // add implementation +} + +void TNEW_API::mem_alloc(void** ptr, size_t n){ + // add implementation +} + +void TNEW_API::mem_free(void* ptr){ + if(ptr != nullptr){ + // add implementation + } +} +... + +``` + +* 特化实现`device.h`中的`Device` + +```c++ +template <> +void Device::create_stream() { + // add implementation +} + +template <> +void Device::get_info() { + + // add implementation +} + +``` + +### 在`saber/funcs`中实现设备相关的op + +参考[如何增加新的Operator](addCustomOp.md) + + +## 在`framework`中添加设备的具体化或实例化 ## + +### `framework/core` + +* `net.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class Net; +template class Net; +#endif +``` + +* `operator_func.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class OperatorFunc; +#endif +``` + +* `worker.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class Worker; +template class Worker; +#endif +``` + +* `operator_attr.cpp`中添加实例化 + +```c++ +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +``` + +* `parameter.h`中添加设备的实现 + +```c++ +#ifdef USE_TNEW_PLACE +template +class PBlock { +public: + typedef Tensor4d::type> type; + + PBlock() { + _inner_tensor = std::make_shared(); + } + ... 
+} +#endif //TNEW +``` + +* `type_traits_extend.h`中添加设备的实现 + +```c++ +template<> +struct target_host { + typedef saber::X86 type; //根据TNEW选择正确的host type +}; +``` + +### `framework/graph` + +* `graph.cpp`中添加实例化 + +```c++ + #ifdef USE_TNEW_PLACE + template class Graph; + template class Graph; + template class Graph; + #endif +``` + +### `framework/model_parser` + +* `parser.cpp`中添加实例化 + +```c++ + #ifdef USE_TNEW_PLACE + template + Status load(graph::Graph* graph, + const char* model_path); + template + Status load(graph::Graph* graph, + const char* model_path); + template + Status load(graph::Graph* graph, + const char* model_path); + + template + Status save(graph::Graph* graph, + std::string& model_path); + template + Status save(graph::Graph* graph, + std::string& model_path); + template + Status save(graph::Graph* graph, + std::string& model_path); + + template + Status load(graph::Graph* graph, + std::string& model_path); + template + Status load(graph::Graph* graph, + std::string& model_path); + template + Status load(graph::Graph* graph, + std::string& model_path); + + template + Status save(graph::Graph* graph, + const char* model_path); + template + Status save(graph::Graph* graph, + const char* model_path); + template + Status save(graph::Graph* graph, + const char* model_path); + #endif +``` + +* `model_io.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class NodeIO; +template class NodeIO; +template class NodeIO; +#endif +``` + +### `framework/operators` + +为`framework/operators`目录下所有op添加实例化或具体化 +以`activation.cpp`为例,实例化如下: + +```c++ +#ifdef USE_TNEW_PLACE +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); +template class ActivationHelper; +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); +#endif +``` + +如果TNEW设备函数的实现与现有模板实现不一致,可以特化实现如下(以init()为例): +```c++ +#ifdef USE_TNEW_PLACE 
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); +template <> +Status ActivationHelper::Init(OpContext &ctx,\ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式 + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); +#endif +``` + +在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册 + +```c++ +#ifdef USE_TNEW_PLACE +.__alias__("activation") +#endif +``` + +## 注意事项 +不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现 \ No newline at end of file diff --git a/docs/Manual/addCustomOp.md b/docs/Manual/addCustomOp.md new file mode 100644 index 000000000..f2783eb9f --- /dev/null +++ b/docs/Manual/addCustomOp.md @@ -0,0 +1,405 @@ +# 如何增加新的Operator + +## 基本概念 + +简单介绍下几个同Operator相关的基本概念,详情请参考设计文档。 + +```framework```: 上层的逻辑代码,负责从parser中获取参数及weights,添加op时主要修改framework/operator目录下的内容。 + +```saber```: 底层的实现代码,Anakin通过saber封装了不同的backends,不同的实现(impl)分别特化出自己的实现,外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中,增加op主要修改saber/funcs下的内容。 + +saber的文件结构: +* saber/funcs下的是各个funcs的外部接口,这一层的op与具体的设备实现无关,只与各op完成的功能有关。由于跟实现(impl)无关,本层文件明均不带impl。 +* saber/funcs/impl下是各个op的impl声明,特定设备需要完成该层声明的特化版本,如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本,saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关,均带有```impl_```前缀。 +* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件,添加cuda的kernel需要在该文件目录下添加。 +* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。 + +### 涉及到的基类及各个类之前的关系 + +简单介绍相关的基类 + +* ```anakin::Operator```: framework的operator基类,位于framework/core/operator/operator.h + +* ```anakin::saber::BaseFunc```: 
saber对外的op接口基类,提供统一的对外接口,位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape,并通过```tensor```的```set_shape```接口(只设置shape,不分配空间)设置到output中。```operator()```接口为各个op的计算接口。 + +* ```ankain::saber::ImplBase```: saber设备实现的op的接口,所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类,一类以```vender_```为前缀,带有```vender_```代码意为使用第三方库来实现该op,如cudnn的conv,或mkl的conv等等,这类op的性能我们难以调优,因此单独列为一类。另一类是带有源码的saber实现,这些实现都带有```saber_```为前缀,此类实现带有源码,能够通过后续优化不断提升性能,实现起名时需要注意这一点。 + +## 添加operator + +添加一个新的op需要以下几步: + +1. 添加saber的param +2. 定义saber的Operator类 +3. 定义新的impl声明 +3. 完成新的impl实现 +4. 增加framework的实现或特化 + +接下来就针对这几步,以一个简单例子为例介绍实现。 + +例如我们要添加新的Mul op。给出计算公式如下:$$Out = alpha \dot X * Y$$ + +### 为operator增加param + +涉及到的文件:```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param,这一步可以跳过。 +这里```XXXParam```是一个```struct```。包含一个无参数的构造函数,含参数的构造函数,复制构造函数,```operator=()```及```operator==()```。 +``` +template // 能够获得target, datatype, layout +struct MulParam{ + MulParam() + : alpha(0) + {} + MulParam(float alpha_in) + : alpha(alpha_in) + {} + MulParam(const MulParam& right) + : alpha(right.alpha) + {} + MulParam &operator=(const MulParam &right) { + alpha = right.alpha; + } + bool operator==(const MulParam &right) { + return alpha == right.alpha; + } + float alpha; +}; +``` + +### 定义Operator类 +涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类,这里需要修改输入的impl定义头文件。 +下面给出一个相对完整的定义结构供参考。 +``` +//不同的设备需要包含对应的operator实现.[详见](#impl) +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_mul.h" +#include "saber/funcs/impl/cuda/vender_mul.h" +#endif +//如果一个设备现在还没有对应的operator实现,需要包含声明。[详见](#declare) +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/impl_mul.h" +#endif +namespace anakin { +namespace saber { +template +class Mul : public BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, MulParam> { +public: + using BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, MulParam>::BaseFunc; + Mul() = default; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor 
OpTensor; + typedef MulParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + //计算输出的shape, + Shape output_shape = (input[0]->valid_shape()); + /* code */ + return output[0]->set_shape(output_shape); + } + virtual SaberStatus init_impl(ImplEnum implenum) override { + // 不同设备均使用此init_impl, 此接口创建对应impl的实现。 + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderMul ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberMul ); + return SaberSuccess; + default: + return SaberUnImplError; + } + } +private: + virtual void pick_best_static() override { + if (true) // some condition? + this->_best_impl = this->_impl[0]; + } + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } +}; +} // namespace saber +} // namespace anakin +``` + +### 为operator增加新的impl声明 + +涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明,特化版本放在对应的文件夹下,这里的声明就是给出所有设备的统一声明。下面给出一个参考。 +``` +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ +namespace saber{ +DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字,第二个是对应param的名字 +} +} +``` + +### 完成新的operator特定后端实现 + +涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h``` +这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op,```saber```指的源码实现的op。这里以cuda的vender实现为例,简单介绍一下特化出的函数的几个基本接口。 + +``` +// include 对应的声明 +#include "saber/funcs/impl/impl_mul.h" + +namespace anakin{ +namespace saber{ +template +class VenderMul : + public ImplBase< + Tensor, + Tensor, + Tensor, + MulParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + VenderMul(){} + ~VenderMul() {} + + virtual 
SaberStatus init(const std::vector& inputs, + std::vector& outputs, + MulParam& param, Context& ctx) { + this->_ctx = ctx; + create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + MulParam& param, Context& ctx) { + // set内部参数 + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + MulParam& param) { + // dispatch kernel. + } + +private: +}; +} +} +``` +```init```和```create```的区别:```init```接口是第一次初始化op的时候进入的接口,此函数只在第一次初始化op时调用,这个接口一般放一些只需要执行一次的代码,如malloc或者create之类的函数。```create```函数除了第一次init执行外,在输入发生变化或者param发生变化时会再次触发,create一般放置set函数,设置内部变量,当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内,如果```create```函数执行了一些严重耗时的操作,这里会拖慢整个op的执行时间,需要慎重选择操作放置的位置。 +### 添加framework的特化 + +涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。 +这里简单介绍下如果添加或修改framework内的operator + +``` +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/mul.h" // 需要包对应的saber头文件 +namespace anakin { +namespace ops { +template +class MulHelper; + +template +class Mul : public Operator { +public: + Mul() {} + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + } + friend class MulHelper; +}; +template +class MulHelper : public OperatorHelper { +public: + MulHelper() = default; + ~MulHelper(); + Status InitParam() override; + + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + saber::MulParam> _param_mul; + saber::Mul _funcs_mul; +}; +} +} /* namespace anakin */ +``` +对应的```.cpp```文件如下: +``` +#include "framework/operators/mul.h" + +namespace anakin { +namespace ops { + +#ifdef USE_CUDA +template<> 
+void Mul::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = + static_cast*>(this->_helper); + auto& param = + static_cast*>(this->_helper)->_param_mul; + impl->_funcs_mul(ins, outs, param, ctx); +} +#endif + +template +Status MulHelper::InitParam() { + auto alpha = GET_PARAMETER(float, alpha); + MulParam> param_mul(alpha); + _param_mul = param_mul; + return Status::OK(); +} + +template +Status MulHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx)); + return Status::OK(); +} + +template +Status MulHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class MulHelper; +#endif +#ifdef USE_ARM_PLACE +template class MulHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Mul) +.Doc("Mul operator") +#ifdef USE_CUDA +.__alias__("mul") +#endif +#ifdef USE_ARM_PLACE +.__alias__("mul") +#endif +.num_in(1) +.num_out(1) +.Args("alpha", " alpha of Mul "); //注册 + +} /* namespace ops */ + +} /* namespace anakin */ +``` + +## 实现单元测试 +涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp``` +在对应的test下需要添加新的单元测试 + +``` +TEST(TestSaberFuncNV, test_depthwise_conv) { + + // init tensors and some param. 
+ + // start Reshape & doInfer + Context ctx1(0, 1, 1); + + // create param + MulParam > param(alpha); + + std::vector*> input; + std::vector*> output; + + // create saber op + Mul mul; + + // compute output shape + mul.compute_output_shape(input, output, param); + + // re_alloc output tensors memory based on output shape + output[0]->re_alloc(output[0]->shape()); + + // init saber op(calling init and create) + mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + + // call operator() + mul(input, output, param, ctx1); + + // cuda specified, record events + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output[0]->record_event(cuda_stream); + output_dev.sync(); + + // param changed + param.alpha = 2.0; + // auto calling saber op(create and dispatch) + mul(input, output, param, ctx1); + + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} + +int main(int argc, const char** argv){ + anakin::saber::Env::env_init(); + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + +``` +## 调试及注意事项 + +一个op需要有对外的op接口和内部实现,由于存在saber/funcs/impl的非特化版本声明,当有op在某种设备下没有对应实现时,也能够编译,但此时是没有任何实现的空实现, diff --git a/docs/Manual/int8_design_ch.md b/docs/Manual/int8_design_ch.md new file mode 100644 index 000000000..2444c8735 --- /dev/null +++ b/docs/Manual/int8_design_ch.md @@ -0,0 +1,17 @@ + +# Int8设计文档 + +## 计算流程 + +![Anakin_int8](pics/int8_design.png) + +## saber完成的功能 + +对于支持int8的op,接口需要完成的功能做如下规定: +1、init/create部分完成外部变量的量化和应有的判断,weights和bias计算后,scale存回对应的tensor + +2、dispatch检查input,如果是int8,检查是否符号合适,如果是fp32,需要添加量化部分代码(静态量化在tensor中的scale里,动态量化需要实时计算,并存回原tensor的scale中)。检查output,如果是fp32,按照输出fp32的逻辑反量化回fp32的tensor,如果是int8,根据当前kernel的实现,选择输出s8或u8(对于带relu合并的输出u8,对于单独的conv输出s8)。 + +3、输入是s8还是u8主要取决于kernel是否支持,跟设备相关。 + +对于不支持fp32的op,需要确认输入输出都是fp32,防止误调用。 diff --git a/docs/Manual/pics/contri1.JPG b/docs/Manual/pics/contri1.JPG new file mode 100755 index 000000000..753f7c4e7 Binary files /dev/null and 
b/docs/Manual/pics/contri1.JPG differ diff --git a/docs/Manual/pics/contri2.JPG b/docs/Manual/pics/contri2.JPG new file mode 100755 index 000000000..e7880585e Binary files /dev/null and b/docs/Manual/pics/contri2.JPG differ diff --git a/docs/Manual/pics/int8_design.png b/docs/Manual/pics/int8_design.png new file mode 100644 index 000000000..d6feafbd3 Binary files /dev/null and b/docs/Manual/pics/int8_design.png differ diff --git a/docs/Manual/run_on_arm_ch.md b/docs/Manual/run_on_arm_ch.md new file mode 100644 index 000000000..ebeb38f53 --- /dev/null +++ b/docs/Manual/run_on_arm_ch.md @@ -0,0 +1,151 @@ +## 源码编译 Anakin ## + +目前Anakin支持ARM Android平台,采用Android NDK交叉编译工具链,已在mac os和centos上编译和测试通过。 + +### 安装概览 ### + +* [系统需求](#0001) +* [安装第三方依赖](#0002) +* [Anakin源码编译](#0003) +* [验证安装](#0004) + + +### 1. 系统需求 ### + +* 宿主机: linux, mac +* cmake 3.8.2+ +* Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip) + +### 2. 安装第三方依赖 ### + +- 2.1 protobuf3.4.0 + 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0) + - 2.1.1 为宿主机编译protobuf + ```bash + $ tar -xzf protobuf-3.4.0.tar.gz + $ cd protobuf-3.4.0 + $ ./autogen.sh + $ ./configure + $ make + $ make check + $ make install + ``` + 上述 $make install 执行后,可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下, + 如有问题,请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。 + 然后将已经生成文件清除。 + ```bash + $ make distclean + ``` + - 2.1.1 交叉编译Android`armeabi-v7a`的protobuf,注意设置ANDROID_NDK的路径,以及ARCH_ABI、HOSTOSN的值, + ```bash + + $ export ANDROID_NDK=your_ndk_path + $ ARCH_ABI="arm-linux-androideabi-4.9" + $ HOSTOSN="darwin-x86_64" + $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm + $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI + $ export LDFLAGS="--sysroot=$SYSROOT" + $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS" + $ export LIBS="-llog 
$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a" + $ export CPPFLAGS="" + $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/" + $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT" + $ export CCFLAGS="$CXXFLAGS" + $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS" + $ export CC="$CXX" + $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib" + $ ./autogen.sh + $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD" + $ make + ``` + + 编译生成 *.a 静态库,若希望编译*.so 动态链接库 ,请在./configure参数中改--disable-shared为--disable-static --enable-shared。 + 生成文件在src/.libs/下,将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。 + 在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。 + ```cmake + set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") + ``` + +- 2.2 opencv 2.4.3+(optional) + Anakin只在examples示例中使用opencv + Android系统的opencv从[这里下载](https://opencv.org/releases.html) + 解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a` + 在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`, + 并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。 + ```cmake + include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) + LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) + ``` +### 3. 
Anakin源码编译 ### + +#### 编译Android版本 + + 克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm) +```bash + cd your_dir + git clone https://github.com/PaddlePaddle/Anakin.git + cd Anakin + git fetch origin arm + git checkout arm + ``` + 修改`android_build.sh` +- 修改NDK路径 + ```bash + #modify "your_ndk_path" to your NDK path + export ANDROID_NDK=your_ndk_path + ``` +- 修改ARM 处理器架构 + 对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`, + 对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。 + 目前我们只支持 `armeabi-v7a with NEON`;`arm64-v8a` 还在开发中。 + ```bash + -DANDROID_ABI="armeabi-v7a with NEON" + ``` +- 设置Android API + 根据Android系统的版本设置API level, 例如API Level 21 -> Android 5.0.1 + ```bash + -DANDROID_NATIVE_API_LEVEL=21 + ``` + +- 选择编译静态库或动态库 + 设置`BUILD_SHARED=NO`编译静态库 + 设置`BUILD_SHARED=YES`编译动态库 + ```bash + -DBUILD_SHARED=NO + ``` +- OpenMP多线程支持 + 设置`USE_OPENMP=YES`开启OpenMP多线程 + ```bash + -DUSE_OPENMP=YES + ``` + +- 编译单测文件 + 设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件 + ```bash + -DBUILD_WITH_UNIT_TEST=YES + ``` + +- 编译示例文件 + 设置`BUILD_EXAMPLES=YES`将会编译示例文件 + ```bash + -DBUILD_EXAMPLES=YES + ``` + +- 开启opencv + 如果使用opencv,设置`USE_OPENCV=YES` + ```bash + -DUSE_OPENCV=YES + ``` + +- 开始编译 + 运行脚本 `android_build.sh` 将自动编译Anakin + ```bash + ./android_build.sh + ``` + +### 4. 验证安装 ### + 编译好的库会放在目录`${Anakin_root}/output`下; + 编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下; + 编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。 + + 对于Android系统,打开设备的调试模式,通过ADB可以访问的目录是`data/local/tmp`,通过ADB push将测试文件、模型和数据发送到设备目录, 运行测试文件。 diff --git a/docs/Manual/run_on_arm_en.md b/docs/Manual/run_on_arm_en.md new file mode 100644 index 000000000..a726b7d82 --- /dev/null +++ b/docs/Manual/run_on_arm_en.md @@ -0,0 +1,127 @@ +## Build Anakin for ARM from source ## + +Now, we have successfully build on mac os and centos, using Android NDK + +### Installation overview ### + +* [system requirements](#0001) +* [dependencies](#0002) +* [build from source](#0003) +* [verification](#0004) + + +### 1. 
system requirements ### + +* Host machine: linux, mac +* cmake 3.8.2+ +* Android NDK r14, download linux version from [here](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip) + +### 2. dependencies ### + +- 2.1 protobuf3.4.0 + Download source from https://github.com/google/protobuf/releases/tag/v3.4.0 + - 2.1.1 Build protobuf for host + ```bash + $ tar -xzf protobuf-3.4.0.tar.gz + $ cd protobuf-3.4.0 + $ ./autogen.sh + $ ./configure + $ make + $ make check + $ make install + ``` + for details, please refer [here](https://github.com/google/protobuf/blob/v3.4.0/src/README.md) + + - 2.1.2 Build protobuf for ARM `armeabi-v7a` + ```bash + + ``` + Set your protobuf path [here](../../cmake/find_modules.cmake), search `anakin_find_protobuf`, and set `ARM_RPOTO_ROOT` to your path. + ```cmake + set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") + ``` + +- 2.2 opencv 2.4.3+(optional) + We only use opencv in examples + For Android, visit opencv [release page](https://opencv.org/releases.html), choose Android pack and download, + copy libs in `3rdparty/libs/armeabi-v7a` to `libs/armeabi-v7a`. + Set your opencv path [here](../../cmake/find_modules.cmake), Search `anakin_find_opencv`, + and set `include_directories` and `LINK_DIRECTORIES` according to your path. + ```cmake + include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) + LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) + ``` +### 3. 
build from source ### + +#### build for Android + + clone the [source code](https://github.com/PaddlePaddle/Anakin/tree/arm) +```bash + cd your_dir + git clone https://github.com/PaddlePaddle/Anakin.git + cd Anakin + git fetch origin arm + git checkout arm + ``` + change the `android_build.sh` +- Set NDK path to yours + ```bash + #modify "your_ndk_path" to your NDK path + export ANDROID_NDK=your_ndk_path + ``` +- Set your ARM target platform + + For 32bits ARM CPU with NEON, Set ANDROID_ABI to `armeabi-v7a with NEON`, + for 64bits ARM CPU, either `arm64-v8a` or `armeabi-v7a with NEON` can work. + Now, we only support `armeabi-v7a with NEON`,`arm64-v8a` is under developing + ```bash + -DANDROID_ABI="armeabi-v7a with NEON" + ``` +- Set Android API level + Choose your API LEVEL according to your android system version + API Level 21 -> Android 5.0.1 + ```bash + -DANDROID_NATIVE_API_LEVEL=21 + ``` + +- build static or shared lib + if building static lib, set `BUILD_SHARED=NO` + if building shared lib, set `BUILD_SHARED=YES` + ```bash + -DBUILD_SHARED=NO + ``` +- OpenMP for multi-threads + set `USE_OPENMP=YES` to use OpenMP multi-threads + ```bash + -DUSE_OPENMP=YES + ``` + +- build unit test + set `BUILD_WITH_UNIT_TEST=YES` to build unit tests + ```bash + -DBUILD_WITH_UNIT_TEST=YES + ``` + +- build examples + set `BUILD_EXAMPLES=YES` to build detection and classification examples + ```bash + -DBUILD_EXAMPLES=YES + ``` + +- use opencv in examples + set `USE_OPENCV=YES` to use opencv in examples + ```bash + -DUSE_OPENCV=YES + ``` + +- build + run `android_build.sh` to build the Anakin + ```bash + ./android_build.sh + ``` + +### 4. 
Verification ### + The libs is in `${Anakin_root}/output`, the unit test and benchmark file is in `${Anakin_root}/output/unit_test` + and the examples is in `${Anakin_root}/output/examples` + Open `USB debug mode` in your Android device, Use ADB to push the test files and model files to `data/local/tmp/your_dir` + run the test \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..b93757289 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,58 @@ +# used for temporary +anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}) +anakin_fetch_include_recursively(${ANAKIN_MODEL_PARSER}) +anakin_fetch_include_recursively(${ANAKIN_SABER}) + +if(NVIDIA_GPU) +anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/cuda "cpp" ANAKIN_TEST_CASE_SRC) +endif() + +if(AMD_GPU) +anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/amd "cpp" ANAKIN_TEST_CASE_SRC) +endif() + +if(USE_X86_PLACE) +anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/x86 "cpp" ANAKIN_TEST_CASE_SRC) +endif() + +if(USE_ARM_PLACE) #build unit test for arm devices + anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/arm "cpp" ANAKIN_TEST_CASE_SRC) + if(USE_OPENMP) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + endif() + if (USE_PROTOBUF) + find_library(log-lib log) + endif() +endif() + +file(REMOVE ${PROJECT_SOURCE_DIR}/output/examples/*) + +# build test cases +foreach(SRC_NAME ${ANAKIN_TEST_CASE_SRC}) + #unpack the dir "/" + string(REPLACE "/" ";" SEXY_LIST ${SRC_NAME}) + list(GET SEXY_LIST -1 TEST_CASE_NAME) + #get the file name without suffix + string(REPLACE "." 
";" SEXY_LIST ${TEST_CASE_NAME}) + list(GET SEXY_LIST 0 TEST_CASE_NAME) + add_executable(${TEST_CASE_NAME} ${SRC_NAME}) + if(BUILD_SHARED) + target_link_libraries(${TEST_CASE_NAME} ${anakin_lib_so} ${ANAKIN_LINKER_LIBS}) + else() + target_link_libraries(${TEST_CASE_NAME} -Wl,--whole-archive ${anakin_lib_static} -Wl,--no-whole-archive ${ANAKIN_LINKER_LIBS}) + endif() + if(USE_ARM_PLACE) + target_link_libraries(${TEST_CASE_NAME} ${log-lib}) + endif() + if(USE_OPENCV) + if (USE_ARM_PLACE) + target_link_libraries(${TEST_CASE_NAME} -lopencv_core -lopencv_highgui -lopencv_imgproc + -ltbb -llibtiff -llibpng -llibjpeg -llibjasper -lIlmImf -lc -lz -llog -ldl) + else() + target_link_libraries(${TEST_CASE_NAME} -lopencv_core -lopencv_highgui -lopencv_imgproc) + endif() + endif() + set_target_properties(${TEST_CASE_NAME} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_SOURCE_DIR}/output/examples) +endforeach() diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..160231de6 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,20 @@ +# hands on examples + +## dependecies + +- opencv2.4.3+ for image reading + +## NV GPU + + + +## ARM +- refer [run on arm](../docs/Manual/run_on_arm_en.md) to set your opencv path +- Enable `USE_OPENCV` in [CMakeList.txt](../CMakeLists.txt) +- Enable building examples in [CMakeList.txt](../CMakeLists.txt) + +### mobilenet_ssd detection + + +### mobilenetv1 classification + diff --git a/examples/amd/classification.cpp b/examples/amd/classification.cpp new file mode 100644 index 000000000..1d64ead71 --- /dev/null +++ b/examples/amd/classification.cpp @@ -0,0 +1,238 @@ +#include "graph_base.h" +#include "graph.h" +#include "scheduler.h" +#include "net.h" +#include "worker.h" +#include "tensor_op.h" +#include "timer.h" +#include "saber/utils.h" + +using namespace anakin::saber; +using namespace anakin::graph; +using namespace anakin; +typedef Tensor Tensor4hf; +typedef Tensor Tensor4df; + +void load_labels(std::string 
path, std::vector& labels) { + + FILE* fp = fopen(path.c_str(), "r"); + if (fp == nullptr) { + LOG(FATAL) << "load label file failed"; + } + while (!feof(fp)) { + char str[1024]; + fgets(str, 1024, fp); + std::string str_s(str); + + if (str_s.length() > 0) { + for (int i = 0; i < str_s.length(); i++) { + if (str_s[i] == ' ') { + std::string strr = str_s.substr(i, str_s.length() - i - 1); + labels.push_back(strr); + i = str_s.length(); + } + } + } + } + fclose(fp); +} + +void print_topk(const float* scores, const int size, const int topk, \ + const std::vector& labels) { + + std::vector< std::pair > vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(scores[i], i); + } + + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), + std::greater< std::pair >()); + + // print topk and score + for (int i = 0; i < topk; i++) { + float score = vec[i].first; + int index = vec[i].second; + LOG(INFO) << i <<": " << index << " " << labels[index] << " " << score; + } +} + +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" + +using namespace cv; + +void fill_tensor_with_cvmat(const Mat& img_in, Tensor4hf& tout, const int num, \ + const int width, const int height, const float* mean, const float* scale) { + cv::Mat im; + cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); + float* ptr_data_in = tout.mutable_data(); + int stride = width * height; + for (int i = 0; i < num; i++) { + float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; + ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; + } + } + } +} +#endif + +void test_net(const std::string model_file_name, const std::string image_file_name, \ + const std::vector& labels, const int topk, const int threads, \ + const int 
test_iter) { + + int batch_size = 1; + + //! create runtime context + LOG(INFO) << "create runtime context"; + std::shared_ptr> ctx1 = std::make_shared>(0,0,0); + + //! load model + LOG(WARNING) << "load anakin model file from " << model_file_name << " ..."; + Graph graph; + auto status = graph.load(model_file_name); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + //! set batch size + graph.ResetBatchSize("input_0", batch_size); + + //! optimize the graph + LOG(INFO) << "optimize the graph"; + graph.Optimize(); + + //! get output name + std::vector& vout_name = graph.get_outs(); + LOG(INFO) << "output size: " << vout_name.size(); + + //! constructs the executer net + LOG(INFO) << "create net to execute"; + Net net_executer(graph, ctx1, true); + + //! get in + LOG(INFO) << "get input"; + auto d_tensor_in_p = net_executer.get_in("input_0"); + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i = 0; i < valid_shape_in.size(); i++) { + LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; + } + Tensor4hf thin(valid_shape_in); + + LOG(INFO) << thin.width() << "x" << thin.height() << " size" << thin.valid_size();; + //! feed input image to input tensor + +#ifdef USE_OPENCV + LOG(INFO) << "loading image " << image_file_name << " ..."; + Mat img = imread(image_file_name, CV_LOAD_IMAGE_COLOR); + if (img.empty()) { + LOG(FATAL) << "opencv read image " << image_file_name << " failed"; + } + //! set your mean value and scale value here + float mean_mb[3] = {103.94f, 116.78f, 123.68f}; + float scale_mb[3] = {0.017f, 0.017f, 0.017f}; + LOG(INFO) << thin.width() << "x" << thin.height(); + fill_tensor_with_cvmat(img, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb); +#else + fill_tensor_host_const(thin, 1.f); +#endif + + //! 
do inference + Context ctx(0, 0, 0); + anakin::saber::SaberTimer my_time; + LOG(INFO) << "run prediction "; + + double to = 0; + double tmin = 1000000; + double tmax = 0; + my_time.start(ctx); + saber::SaberTimer t1; + for (int i = 0; i < test_iter; i++) { + d_tensor_in_p->copy_from(thin); + t1.clear(); + t1.start(ctx); + net_executer.prediction(); + t1.end(ctx); + double tdiff = t1.get_average_ms(); + if (tdiff > tmax) { + tmax = tdiff; + } + if (tdiff < tmin) { + tmin = tdiff; + } + to += tdiff; + } + my_time.end(ctx); + + + LOG(INFO) << model_file_name << " batch_size " << batch_size << \ + " average time " << to / test_iter << \ + ", min time: " << tmin << "ms, max time: " << tmax << " ms"; + + //! get output + //! fixme get output + //std::vector vout = net_executer.get_out_list(); + std::vector vout; + for (auto& it : vout_name) { + vout.push_back(net_executer.get_out(it)); + } + Tensor4df* tensor_out_d = vout[0]; + LOG(INFO) << "output size: " << vout.size(); + + Tensor4hf tensor_out; + tensor_out.re_alloc(tensor_out_d->shape()); + tensor_out.copy_from(*tensor_out_d); +#if 0 //print output tensor data + LOG(INFO) << "extract data: size: " << tensor_out->valid_size() << \ + ", width=" << tensor_out->width() << ", height=" << tensor_out->height(); + const float* ptr_out = tensor_out->data(); + for (int i = 0; i < tensor_out->valid_size(); i++) { + printf("%0.4f ", ptr_out[i]); + if ((i + 1) % 7 == 0) { + printf("\n"); + } + } + printf("\n"); +#endif + print_topk(tensor_out.data(), tensor_out.valid_size(), topk, labels); +} + +int main(int argc, char** argv){ + + LOG(INFO) << "initialized the device"; + Env::env_init(); + + if (argc < 4) { + LOG(ERROR) << "usage: " << argv[0] << ": model_file label_file image_name [topk] [test_iter] [threads]"; + return -1; + } + char* model_file = argv[1]; + char* label_file = argv[2]; + char* image_path = argv[3]; + + std::vector labels; + load_labels(label_file, labels); + + int topk = 5; + if (argc > 4) { + topk = 
atoi(argv[4]); + } + + int test_iter = 10; + if (argc > 5) { + test_iter = atoi(argv[5]); + } + + int threads = 1; + if (argc > 6) { + threads = atoi(argv[6]); + } + + test_net(model_file, image_path, labels, topk, threads, test_iter); + return 0; +} + diff --git a/examples/arm/classification.cpp b/examples/arm/classification.cpp new file mode 100644 index 000000000..27c3ce45d --- /dev/null +++ b/examples/arm/classification.cpp @@ -0,0 +1,234 @@ +#include "graph_base.h" +#include "graph.h" +#include "scheduler.h" +#include "net.h" +#include "worker.h" +#include "tensor_op.h" +#include "timer.h" + +using namespace anakin::saber; +using namespace anakin::graph; +using namespace anakin; +typedef Tensor Tensor4hf; + + +void load_labels(std::string path, std::vector& labels) { + + FILE* fp = fopen(path.c_str(), "r"); + if (fp == nullptr) { + LOG(FATAL) << "load label file failed"; + } + while (!feof(fp)) { + char str[1024]; + fgets(str, 1024, fp); + std::string str_s(str); + + if (str_s.length() > 0) { + for (int i = 0; i < str_s.length(); i++) { + if (str_s[i] == ' ') { + std::string strr = str_s.substr(i, str_s.length() - i - 1); + labels.push_back(strr); + i = str_s.length(); + } + } + } + } + fclose(fp); +} + +void print_topk(const float* scores, const int size, const int topk, \ + const std::vector& labels) { + + std::vector< std::pair > vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(scores[i], i); + } + + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), + std::greater< std::pair >()); + + // print topk and score + for (int i = 0; i < topk; i++) { + float score = vec[i].first; + int index = vec[i].second; + LOG(INFO) << i <<": " << index << " " << labels[index] << " " << score; + } +} + +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" + +using namespace cv; + +void fill_tensor_with_cvmat(const Mat& img_in, Tensor4hf& tout, const int num, \ + const int width, const int height, const float* mean, const float* 
scale) { + cv::Mat im; + cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); + float* ptr_data_in = tout.mutable_data(); + int stride = width * height; + for (int i = 0; i < num; i++) { + float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; + ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; + } + } + } +} +#endif + +void test_net(const std::string model_file_name, const std::string image_file_name, \ + const std::vector& labels, const int topk, const int threads, \ + const int test_iter) { + + int batch_size = 1; + + //! create runtime context + LOG(INFO) << "create runtime context"; + std::shared_ptr> ctx1 = std::make_shared>(); + ctx1->set_run_mode(SABER_POWER_HIGH, threads); + LOG(INFO) << omp_get_num_threads() << " threads is activated"; + + //! load model + LOG(WARNING) << "load anakin model file from " << model_file_name << " ..."; + Graph graph; + auto status = graph.load(model_file_name); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + //! set batch size + graph.ResetBatchSize("input_0", batch_size); + + //! optimize the graph + LOG(INFO) << "optimize the graph"; + graph.Optimize(); + + //! get output name + std::vector& vout_name = graph.get_outs(); + LOG(INFO) << "output size: " << vout_name.size(); + + //! constructs the executer net + LOG(INFO) << "create net to execute"; + Net net_executer(graph, ctx1, true); + + //! get in + LOG(INFO) << "get input"; + auto d_tensor_in_p = net_executer.get_in("input_0"); + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i = 0; i < valid_shape_in.size(); i++) { + LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; + } + Tensor4hf thin(valid_shape_in); + + //! 
feed input image to input tensor +#ifdef USE_OPENCV + LOG(INFO) << "loading image " << image_file_name << " ..."; + Mat img = imread(image_file_name, CV_LOAD_IMAGE_COLOR); + if (img.empty()) { + LOG(FATAL) << "opencv read image " << image_file_name << " failed"; + } + //! set your mean value and scale value here + float mean_mb[3] = {103.94f, 116.78f, 123.68f}; + float scale_mb[3] = {0.017f, 0.017f, 0.017f}; + fill_tensor_with_cvmat(img, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb); + +#else + fill_tensor_host_const(thin, 1.f); +#endif + + //! do inference + Context ctx(0, 0, 0); + anakin::saber::SaberTimer my_time; + LOG(INFO) << "run prediction "; + + double to = 0; + double tmin = 1000000; + double tmax = 0; + my_time.start(ctx); + saber::SaberTimer t1; + for (int i = 0; i < test_iter; i++) { + d_tensor_in_p->copy_from(thin); + t1.clear(); + t1.start(ctx); + net_executer.prediction(); + t1.end(ctx); + double tdiff = t1.get_average_ms(); + if (tdiff > tmax) { + tmax = tdiff; + } + if (tdiff < tmin) { + tmin = tdiff; + } + to += tdiff; + } + my_time.end(ctx); + + + LOG(INFO) << model_file_name << " batch_size " << batch_size << \ + " average time " << to / test_iter << \ + ", min time: " << tmin << "ms, max time: " << tmax << " ms"; + + //! get output + //! 
fixme get output + //std::vector vout = net_executer.get_out_list(); + std::vector vout; + for (auto& it : vout_name) { + vout.push_back(net_executer.get_out(it)); + } + Tensor4hf* tensor_out = vout[0]; + LOG(INFO) << "output size: " << vout.size(); + +#if 0 //print output tensor data + LOG(INFO) << "extract data: size: " << tensor_out->valid_size() << \ + ", width=" << tensor_out->width() << ", height=" << tensor_out->height(); + const float* ptr_out = tensor_out->data(); + for (int i = 0; i < tensor_out->valid_size(); i++) { + printf("%0.4f ", ptr_out[i]); + if ((i + 1) % 7 == 0) { + printf("\n"); + } + } + printf("\n"); +#endif + print_topk(tensor_out->data(), tensor_out->valid_size(), topk, labels); +} + +int main(int argc, char** argv){ + + LOG(INFO) << "initialized the device"; + Env::env_init(); + + if (argc < 4) { + LOG(ERROR) << "usage: " << argv[0] << ": model_file label_file image_name [topk] [test_iter] [threads]"; + return -1; + } + char* model_file = argv[1]; + char* label_file = argv[2]; + char* image_path = argv[3]; + + std::vector labels; + load_labels(label_file, labels); + + int topk = 5; + if (argc > 4) { + topk = atoi(argv[4]); + } + + int test_iter = 10; + if (argc > 5) { + test_iter = atoi(argv[5]); + } + + int threads = 1; + if (argc > 6) { + threads = atoi(argv[6]); + } + + test_net(model_file, image_path, labels, topk, threads, test_iter); + return 0; +} + diff --git a/examples/arm/ssd_detection.cpp b/examples/arm/ssd_detection.cpp new file mode 100644 index 000000000..50b02b396 --- /dev/null +++ b/examples/arm/ssd_detection.cpp @@ -0,0 +1,233 @@ +#include "graph_base.h" +#include "graph.h" +#include "scheduler.h" +#include "net.h" +#include "worker.h" +#include "tensor_op.h" +#include "timer.h" + +using namespace anakin::saber; +using namespace anakin::graph; +using namespace anakin; +typedef Tensor Tensor4hf; + +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" + +using namespace cv; + +struct Object{ + int batch_id; + cv::Rect rec; + int 
class_id; + float prob; +}; + +const char* class_names[] = {"background", + "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", + "motorbike", "person", "pottedplant", + "sheep", "sofa", "train", "tvmonitor"}; + +void fill_tensor_with_cvmat(const Mat& img_in, Tensor4hf& tout, const int num, \ + const int width, const int height, const float* mean, const float* scale) { + cv::Mat im; + cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); + float* ptr_data_in = tout.mutable_data(); + int stride = width * height; + for (int i = 0; i < num; i++) { + float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; + ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; + } + } + } +} + +void detect_object(Tensor4hf& tout, const float thresh, Mat& image) { + std::vector objects; + const float* dout = tout.data(); + for (int iw = 0; iw < tout.height(); iw++) { + Object object; + const float *values = dout + iw * tout.width(); + int batch_id = static_cast(values[0]); + int oriw = image.cols; + int orih = image.rows; + object.batch_id = batch_id; + object.class_id = (int)values[1]; + object.prob = values[2]; + object.rec.x = (int)(values[3] * oriw); + object.rec.y = (int)(values[4] * orih); + object.rec.width = (int)(values[5] * oriw - object.rec.x); + object.rec.height = (int)(values[6] * orih - object.rec.y); + objects.push_back(object); + } + + for (int i = 0; i< objects.size(); ++i) { + Object object = objects.at(i); + if (object.prob > thresh) { + cv::rectangle(image, object.rec, cv::Scalar(255, 0, 0)); + std::ostringstream pro_str; + pro_str << object.prob; + std::string label = std::string(class_names[object.class_id]) + ": " + pro_str.str(); + 
cv::putText(image, label, cv::Point(object.rec.x, object.rec.y), \ + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + LOG(INFO) << "detection in batch: " << object.batch_id << ", image size: " << image.cols << ", " << image.rows << \ + ", detect object: " << class_names[object.class_id] << ", location: x=" << object.rec.x << ", y=" << object.rec.y << \ + ", width=" << object.rec.width << ", height=" << object.rec.height; + cv::imwrite("detection_output.jpg", image); + } + } +} +#endif + +void test_net(const std::string model_file_name, const std::string image_file_name, float thresh, \ + int threads, int test_iter) { + + int batch_size = 1; + + //! create runtime context + LOG(INFO) << "create runtime context"; + std::shared_ptr> ctx1 = std::make_shared>(); + ctx1->set_run_mode(SABER_POWER_HIGH, threads); + LOG(INFO) << omp_get_num_threads() << " threads is activated"; + + //! load model + LOG(WARNING) << "load anakin model file from " << model_file_name << " ..."; + Graph graph; + auto status = graph.load(model_file_name); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + //! set batch size + graph.ResetBatchSize("input_0", batch_size); + + //! optimize the graph + LOG(INFO) << "optimize the graph"; + graph.Optimize(); + + //! get output name + std::vector& vout_name = graph.get_outs(); + LOG(INFO) << "output size: " << vout_name.size(); + + //! constructs the executer net + LOG(INFO) << "create net to execute"; + Net net_executer(graph, ctx1, true); + + //! get in + LOG(INFO) << "get input"; + auto d_tensor_in_p = net_executer.get_in("input_0"); + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i = 0; i < valid_shape_in.size(); i++) { + LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; + } + Tensor4hf thin(valid_shape_in); + + //! 
feed input image to input tensor +#ifdef USE_OPENCV + LOG(INFO) << "loading image " << image_file_name << " ..."; + Mat img = imread(image_file_name, CV_LOAD_IMAGE_COLOR); + if (img.empty()) { + LOG(FATAL) << "opencv read image " << image_file_name << " failed"; + } + float mean_mb[3] = {127.5f, 127.5f, 127.5f}; + float scale_mb[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; + fill_tensor_with_cvmat(img, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb); +#else + fill_tensor_host_const(thin, 1.f); +#endif + + //! do inference + Context ctx(0, 0, 0); + anakin::saber::SaberTimer my_time; + LOG(INFO) << "run prediction "; + + double to = 0; + double tmin = 1000000; + double tmax = 0; + my_time.start(ctx); + saber::SaberTimer t1; + for (int i = 0; i < test_iter; i++) { + d_tensor_in_p->copy_from(thin); + t1.clear(); + t1.start(ctx); + net_executer.prediction(); + t1.end(ctx); + double tdiff = t1.get_average_ms(); + if (tdiff > tmax) { + tmax = tdiff; + } + if (tdiff < tmin) { + tmin = tdiff; + } + to += tdiff; + } + my_time.end(ctx); + + + LOG(INFO) << model_file_name << " batch_size " << batch_size << \ + " average time " << to / test_iter << \ + ", min time: " << tmin << "ms, max time: " << tmax << " ms"; + + //! 
fixme get output + //std::vector vout = net_executer.get_out_list(); + std::vector vout; + for (auto& it : vout_name) { + vout.push_back(net_executer.get_out(it)); + } + Tensor4hf* tensor_out = vout[0]; + LOG(INFO) << "output size: " << vout.size(); +#if 0 //print output data + LOG(INFO) << "extract data: size: " << tensor_out->valid_size() << \ + ", width=" << tensor_out->width() << ", height=" << tensor_out->height(); + const float* ptr_out = tensor_out->data(); + for (int i = 0; i < tensor_out->valid_size(); i++) { + printf("%0.4f ", ptr_out[i]); + if ((i + 1) % 7 == 0) { + printf("\n"); + } + } + printf("\n"); +#endif +#ifdef USE_OPENCV + detect_object(*tensor_out, thresh, img); +#endif +} + +int main(int argc, char** argv){ + + LOG(INFO) << "initialized the device"; + Env::env_init(); + + if (argc < 2) { + LOG(ERROR) << "usage: " << argv[0] << ": model_file image_name [detect_thresh] [test_iter] [threads]"; + return -1; + } + char* model_file = argv[1]; + + char* image_path = argv[2]; + + float thresh = 0.6; + if(argc > 3) { + thresh = (float)atof(argv[3]); + } + + int test_iter = 10; + if (argc > 4) { + test_iter = atoi(argv[4]); + } + + int threads = 1; + if (argc > 5) { + threads = atoi(argv[5]); + } + + test_net(model_file, image_path, thresh, threads, test_iter); + return 0; +} + diff --git a/examples/cat1.jpg b/examples/cat1.jpg new file mode 100755 index 000000000..343aec454 Binary files /dev/null and b/examples/cat1.jpg differ diff --git a/examples/cuda/example_nv_cnn_net.cpp b/examples/cuda/example_nv_cnn_net.cpp new file mode 100644 index 000000000..be7ec6497 --- /dev/null +++ b/examples/cuda/example_nv_cnn_net.cpp @@ -0,0 +1,66 @@ + +#include "utils/logger/logger.h" +#include "graph.h" +#include "net.h" + +#ifdef USE_CUDA +/*util to fill tensor*/ +#include "saber/core/tensor_op.h" +using namespace anakin; +using namespace anakin::graph; +using namespace anakin::saber; + +int main(int argc, const char** argv) { + /*init graph object, graph is the 
skeleton of model*/ + Graph graph; + + /*load model from file to init the graph*/ + auto status = graph.load("Resnet50.anakin.bin"); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + /*set net input shape and use this shape to optimize the graph(fusion and init operator),shape is n,c,h,w*/ + graph.Reshape("input_0", {1, 3, 224, 224}); + graph.Optimize(); + + /*net_executer is the executor object of model. use graph to init Net*/ + Net net_executer(graph, true); + + /*use input string to get the input tensor of net. for we use NV as target, the tensor of net_executer is on GPU memory*/ + auto d_tensor_in_p = net_executer.get_in("input_0"); + auto valid_shape_in = d_tensor_in_p->valid_shape(); + + /*create tensor located in host*/ + Tensor4d h_tensor_in; + + /*alloc for host tensor*/ + h_tensor_in.re_alloc(valid_shape_in); + + /*init host tensor by random*/ + fill_tensor_host_rand(h_tensor_in, -1.0f, 1.0f); + + /*use host tensor to int device tensor which is net input*/ + d_tensor_in_p->copy_from(h_tensor_in); + + /*run infer*/ + net_executer.prediction(); + + LOG(INFO)<<"infer finash"; + + /*get the out put of net, which is a device tensor*/ + auto d_out=net_executer.get_out("prob_out"); + + /*create another host tensor, and copy the content of device tensor to host*/ + Tensor4d h_tensor_out; + h_tensor_out.re_alloc(d_out->valid_shape()); + h_tensor_out.copy_from(*d_out); + + /*show output content*/ + for(int i=0;i workers("Resnet50.anakin.bin", 10); + workers.register_inputs({"input_0"}); + workers.register_outputs({"prob_out"}); + /*set input shape*/ + workers.Reshape("input_0", {1, 3, 224, 224}); + /*start workers*/ + workers.launch(); + + /*fill input*/ + std::vector::type, AK_FLOAT> > host_tensor_p_in_list; + saber::Shape valid_shape_in({1, 3, 224, 224}); + Tensor4dPtr::type, AK_FLOAT> h_tensor_in = new Tensor4d::type, AK_FLOAT>(valid_shape_in); + float* h_data = h_tensor_in->mutable_data(); + for (int i=0; isize(); i++) { + h_data[i] = 
1.0f; + } + host_tensor_p_in_list.push_back(h_tensor_in); + + + /*run infer,send input to worker queue*/ + int epoch = 1000; + for(int i=0; i graph; +auto status = graph.load("Resnet50.anakin.bin"); +``` + +- 根据模型设置网络图的输入尺寸,进行图优化 +```cpp +graph.Reshape("input_0", {1, 3, 224, 224}); +graph.Optimize(); +``` + +- 根据优化后的网络图初始化网络执行器 +```cpp +Net net_executer(graph, true); +``` + +- 取出网络的输入tensor,将数据拷贝到输入tensor,其中copy_from将数据从内存拷贝到显存 +```cpp +auto d_tensor_in_p = net_executer.get_in("input_0"); +Tensor4d h_tensor_in; +h_tensor_in.re_alloc(valid_shape_in); +fill_tensor_host_rand(h_tensor_in, -1.0f, 1.0f); +d_tensor_in_p->copy_from(h_tensor_in); +``` + +- 运行推导 +```cpp +net_executer.prediction(); +``` + +- 取出网络的输出tensor,其中copy_from将数据从显存拷贝到内存 +```cpp +auto d_out=net_executer.get_out("prob_out"); +Tensor4d h_tensor_out; +h_tensor_out.re_alloc(d_out->valid_shape()); +h_tensor_out.copy_from(*d_out); +``` + +示例文件为[example_nv_cnn_net.cpp](cuda/example_nv_cnn_net.cpp) +以NV平台为例演示Anakin框架的使用方法,注意编译时需要打开GPU编译开关和example编译开关,也可以将文件复制到`test/framework/net`下直接编译 +- - - +## 在X86上运行RNN模型 + +整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: +- 使用X86标识初始化图对象和网络执行器对象 +```cpp +Graph graph; +Net net_executer(graph, true); +``` + +- rnn模型的输入尺寸是可变的,初始化图时的输入维度是维度的最大值,输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,10,15,30}表示共有12个词,其中第0到第9个词是第一句话,第10到第14个词是第二句话,第15到第29个词是第三句话 +```cpp +h_tensor_in_p->set_seq_offset({0,10,15,30}); +``` + +示例文件为[example_x86_rnn_net.cpp](x86/example_x86_rnn_net.cpp) +以X86平台为例演示Anakin框架的使用方法,注意编译时需要打开X86编译开关和example编译开关,也可以将文件复制到`test/framework/net`下直接编译 +- - - +## 在NV的GPU上使用Anakin的线程池运行CNN模型 + +整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: +- 用模型地址和线程池大小初始化worker对象,注册输入输出,启动线程池 +```cpp +Worker workers("Resnet50.anakin.bin", 10); +workers.register_inputs({"input_0"}); +workers.register_outputs({"prob_out"}); +workers.Reshape("input_0", {1, 3, 224, 224}); +workers.launch(); +``` +- 将输入tensor注入任务队列,获得输出tensor +```cpp +auto d_tensor_p_out_list = workers.sync_prediction(host_tensor_p_in_list); 
+auto d_tensor_p = d_tensor_p_out_list[0]; +``` + +示例文件为[example_nv_cnn_net_multi_thread.cpp](cuda/example_nv_cnn_net_multi_thread.cpp) 示例使用worker的同步预测接口 + +以NV平台为例演示Anakin框架的使用方法,注意编译时需要打开GPU编译开关和example编译开关,也可以将文件复制到`test/framework/net`下直接编译 diff --git a/examples/labels.txt b/examples/labels.txt new file mode 100755 index 000000000..a9e8c7f50 --- /dev/null +++ b/examples/labels.txt @@ -0,0 +1,1000 @@ +n01440764 tench, Tinca tinca +n01443537 goldfish, Carassius auratus +n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +n01491361 tiger shark, Galeocerdo cuvieri +n01494475 hammerhead, hammerhead shark +n01496331 electric ray, crampfish, numbfish, torpedo +n01498041 stingray +n01514668 cock +n01514859 hen +n01518878 ostrich, Struthio camelus +n01530575 brambling, Fringilla montifringilla +n01531178 goldfinch, Carduelis carduelis +n01532829 house finch, linnet, Carpodacus mexicanus +n01534433 junco, snowbird +n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea +n01558993 robin, American robin, Turdus migratorius +n01560419 bulbul +n01580077 jay +n01582220 magpie +n01592084 chickadee +n01601694 water ouzel, dipper +n01608432 kite +n01614925 bald eagle, American eagle, Haliaeetus leucocephalus +n01616318 vulture +n01622779 great grey owl, great gray owl, Strix nebulosa +n01629819 European fire salamander, Salamandra salamandra +n01630670 common newt, Triturus vulgaris +n01631663 eft +n01632458 spotted salamander, Ambystoma maculatum +n01632777 axolotl, mud puppy, Ambystoma mexicanum +n01641577 bullfrog, Rana catesbeiana +n01644373 tree frog, tree-frog +n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +n01664065 loggerhead, loggerhead turtle, Caretta caretta +n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +n01667114 mud turtle +n01667778 terrapin +n01669191 box turtle, box tortoise +n01675722 banded gecko +n01677366 common iguana, iguana, Iguana 
iguana +n01682714 American chameleon, anole, Anolis carolinensis +n01685808 whiptail, whiptail lizard +n01687978 agama +n01688243 frilled lizard, Chlamydosaurus kingi +n01689811 alligator lizard +n01692333 Gila monster, Heloderma suspectum +n01693334 green lizard, Lacerta viridis +n01694178 African chameleon, Chamaeleo chamaeleon +n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +n01697457 African crocodile, Nile crocodile, Crocodylus niloticus +n01698640 American alligator, Alligator mississipiensis +n01704323 triceratops +n01728572 thunder snake, worm snake, Carphophis amoenus +n01728920 ringneck snake, ring-necked snake, ring snake +n01729322 hognose snake, puff adder, sand viper +n01729977 green snake, grass snake +n01734418 king snake, kingsnake +n01735189 garter snake, grass snake +n01737021 water snake +n01739381 vine snake +n01740131 night snake, Hypsiglena torquata +n01742172 boa constrictor, Constrictor constrictor +n01744401 rock python, rock snake, Python sebae +n01748264 Indian cobra, Naja naja +n01749939 green mamba +n01751748 sea snake +n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus +n01756291 sidewinder, horned rattlesnake, Crotalus cerastes +n01768244 trilobite +n01770081 harvestman, daddy longlegs, Phalangium opilio +n01770393 scorpion +n01773157 black and gold garden spider, Argiope aurantia +n01773549 barn spider, Araneus cavaticus +n01773797 garden spider, Aranea diademata +n01774384 black widow, Latrodectus mactans +n01774750 tarantula +n01775062 wolf spider, hunting spider +n01776313 tick +n01784675 centipede +n01795545 black grouse +n01796340 ptarmigan +n01797886 ruffed grouse, partridge, Bonasa umbellus +n01798484 prairie chicken, prairie grouse, prairie fowl +n01806143 peacock +n01806567 quail +n01807496 partridge +n01817953 African grey, African gray, Psittacus erithacus +n01818515 macaw +n01819313 
sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +n01820546 lorikeet +n01824575 coucal +n01828970 bee eater +n01829413 hornbill +n01833805 hummingbird +n01843065 jacamar +n01843383 toucan +n01847000 drake +n01855032 red-breasted merganser, Mergus serrator +n01855672 goose +n01860187 black swan, Cygnus atratus +n01871265 tusker +n01872401 echidna, spiny anteater, anteater +n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +n01877812 wallaby, brush kangaroo +n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +n01883070 wombat +n01910747 jellyfish +n01914609 sea anemone, anemone +n01917289 brain coral +n01924916 flatworm, platyhelminth +n01930112 nematode, nematode worm, roundworm +n01943899 conch +n01944390 snail +n01945685 slug +n01950731 sea slug, nudibranch +n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore +n01968897 chambered nautilus, pearly nautilus, nautilus +n01978287 Dungeness crab, Cancer magister +n01978455 rock crab, Cancer irroratus +n01980166 fiddler crab +n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus +n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +n01985128 crayfish, crawfish, crawdad, crawdaddy +n01986214 hermit crab +n01990800 isopod +n02002556 white stork, Ciconia ciconia +n02002724 black stork, Ciconia nigra +n02006656 spoonbill +n02007558 flamingo +n02009229 little blue heron, Egretta caerulea +n02009912 American egret, great white heron, Egretta albus +n02011460 bittern +n02012849 crane +n02013706 limpkin, Aramus pictus +n02017213 European gallinule, Porphyrio porphyrio +n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana +n02018795 bustard +n02025239 ruddy turnstone, Arenaria interpres +n02027492 red-backed sandpiper, dunlin, Erolia alpina +n02028035 
redshank, Tringa totanus +n02033041 dowitcher +n02037110 oystercatcher, oyster catcher +n02051845 pelican +n02056570 king penguin, Aptenodytes patagonica +n02058221 albatross, mollymawk +n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +n02074367 dugong, Dugong dugon +n02077923 sea lion +n02085620 Chihuahua +n02085782 Japanese spaniel +n02085936 Maltese dog, Maltese terrier, Maltese +n02086079 Pekinese, Pekingese, Peke +n02086240 Shih-Tzu +n02086646 Blenheim spaniel +n02086910 papillon +n02087046 toy terrier +n02087394 Rhodesian ridgeback +n02088094 Afghan hound, Afghan +n02088238 basset, basset hound +n02088364 beagle +n02088466 bloodhound, sleuthhound +n02088632 bluetick +n02089078 black-and-tan coonhound +n02089867 Walker hound, Walker foxhound +n02089973 English foxhound +n02090379 redbone +n02090622 borzoi, Russian wolfhound +n02090721 Irish wolfhound +n02091032 Italian greyhound +n02091134 whippet +n02091244 Ibizan hound, Ibizan Podenco +n02091467 Norwegian elkhound, elkhound +n02091635 otterhound, otter hound +n02091831 Saluki, gazelle hound +n02092002 Scottish deerhound, deerhound +n02092339 Weimaraner +n02093256 Staffordshire bullterrier, Staffordshire bull terrier +n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +n02093647 Bedlington terrier +n02093754 Border terrier +n02093859 Kerry blue terrier +n02093991 Irish terrier +n02094114 Norfolk terrier +n02094258 Norwich terrier +n02094433 Yorkshire terrier +n02095314 wire-haired fox terrier +n02095570 Lakeland terrier +n02095889 Sealyham terrier, Sealyham +n02096051 Airedale, Airedale terrier +n02096177 cairn, cairn terrier +n02096294 Australian terrier +n02096437 Dandie Dinmont, Dandie Dinmont terrier +n02096585 Boston bull, Boston terrier +n02097047 miniature schnauzer +n02097130 giant schnauzer +n02097209 standard schnauzer +n02097298 
Scotch terrier, Scottish terrier, Scottie +n02097474 Tibetan terrier, chrysanthemum dog +n02097658 silky terrier, Sydney silky +n02098105 soft-coated wheaten terrier +n02098286 West Highland white terrier +n02098413 Lhasa, Lhasa apso +n02099267 flat-coated retriever +n02099429 curly-coated retriever +n02099601 golden retriever +n02099712 Labrador retriever +n02099849 Chesapeake Bay retriever +n02100236 German short-haired pointer +n02100583 vizsla, Hungarian pointer +n02100735 English setter +n02100877 Irish setter, red setter +n02101006 Gordon setter +n02101388 Brittany spaniel +n02101556 clumber, clumber spaniel +n02102040 English springer, English springer spaniel +n02102177 Welsh springer spaniel +n02102318 cocker spaniel, English cocker spaniel, cocker +n02102480 Sussex spaniel +n02102973 Irish water spaniel +n02104029 kuvasz +n02104365 schipperke +n02105056 groenendael +n02105162 malinois +n02105251 briard +n02105412 kelpie +n02105505 komondor +n02105641 Old English sheepdog, bobtail +n02105855 Shetland sheepdog, Shetland sheep dog, Shetland +n02106030 collie +n02106166 Border collie +n02106382 Bouvier des Flandres, Bouviers des Flandres +n02106550 Rottweiler +n02106662 German shepherd, German shepherd dog, German police dog, alsatian +n02107142 Doberman, Doberman pinscher +n02107312 miniature pinscher +n02107574 Greater Swiss Mountain dog +n02107683 Bernese mountain dog +n02107908 Appenzeller +n02108000 EntleBucher +n02108089 boxer +n02108422 bull mastiff +n02108551 Tibetan mastiff +n02108915 French bulldog +n02109047 Great Dane +n02109525 Saint Bernard, St Bernard +n02109961 Eskimo dog, husky +n02110063 malamute, malemute, Alaskan malamute +n02110185 Siberian husky +n02110341 dalmatian, coach dog, carriage dog +n02110627 affenpinscher, monkey pinscher, monkey dog +n02110806 basenji +n02110958 pug, pug-dog +n02111129 Leonberg +n02111277 Newfoundland, Newfoundland dog +n02111500 Great Pyrenees +n02111889 Samoyed, Samoyede +n02112018 Pomeranian +n02112137 
chow, chow chow +n02112350 keeshond +n02112706 Brabancon griffon +n02113023 Pembroke, Pembroke Welsh corgi +n02113186 Cardigan, Cardigan Welsh corgi +n02113624 toy poodle +n02113712 miniature poodle +n02113799 standard poodle +n02113978 Mexican hairless +n02114367 timber wolf, grey wolf, gray wolf, Canis lupus +n02114548 white wolf, Arctic wolf, Canis lupus tundrarum +n02114712 red wolf, maned wolf, Canis rufus, Canis niger +n02114855 coyote, prairie wolf, brush wolf, Canis latrans +n02115641 dingo, warrigal, warragal, Canis dingo +n02115913 dhole, Cuon alpinus +n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +n02117135 hyena, hyaena +n02119022 red fox, Vulpes vulpes +n02119789 kit fox, Vulpes macrotis +n02120079 Arctic fox, white fox, Alopex lagopus +n02120505 grey fox, gray fox, Urocyon cinereoargenteus +n02123045 tabby, tabby cat +n02123159 tiger cat +n02123394 Persian cat +n02123597 Siamese cat, Siamese +n02124075 Egyptian cat +n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +n02127052 lynx, catamount +n02128385 leopard, Panthera pardus +n02128757 snow leopard, ounce, Panthera uncia +n02128925 jaguar, panther, Panthera onca, Felis onca +n02129165 lion, king of beasts, Panthera leo +n02129604 tiger, Panthera tigris +n02130308 cheetah, chetah, Acinonyx jubatus +n02132136 brown bear, bruin, Ursus arctos +n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus +n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +n02134418 sloth bear, Melursus ursinus, Ursus ursinus +n02137549 mongoose +n02138441 meerkat, mierkat +n02165105 tiger beetle +n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +n02167151 ground beetle, carabid beetle +n02168699 long-horned beetle, longicorn, longicorn beetle +n02169497 leaf beetle, chrysomelid +n02172182 dung beetle +n02174001 rhinoceros beetle +n02177972 weevil +n02190166 fly +n02206856 bee +n02219486 ant, emmet, 
pismire +n02226429 grasshopper, hopper +n02229544 cricket +n02231487 walking stick, walkingstick, stick insect +n02233338 cockroach, roach +n02236044 mantis, mantid +n02256656 cicada, cicala +n02259212 leafhopper +n02264363 lacewing, lacewing fly +n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +n02268853 damselfly +n02276258 admiral +n02277742 ringlet, ringlet butterfly +n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +n02280649 cabbage butterfly +n02281406 sulphur butterfly, sulfur butterfly +n02281787 lycaenid, lycaenid butterfly +n02317335 starfish, sea star +n02319095 sea urchin +n02321529 sea cucumber, holothurian +n02325366 wood rabbit, cottontail, cottontail rabbit +n02326432 hare +n02328150 Angora, Angora rabbit +n02342885 hamster +n02346627 porcupine, hedgehog +n02356798 fox squirrel, eastern fox squirrel, Sciurus niger +n02361337 marmot +n02363005 beaver +n02364673 guinea pig, Cavia cobaya +n02389026 sorrel +n02391049 zebra +n02395406 hog, pig, grunter, squealer, Sus scrofa +n02396427 wild boar, boar, Sus scrofa +n02397096 warthog +n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius +n02403003 ox +n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +n02410509 bison +n02412080 ram, tup +n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +n02417914 ibex, Capra ibex +n02422106 hartebeest +n02422699 impala, Aepyceros melampus +n02423022 gazelle +n02437312 Arabian camel, dromedary, Camelus dromedarius +n02437616 llama +n02441942 weasel +n02442845 mink +n02443114 polecat, fitch, foulmart, foumart, Mustela putorius +n02443484 black-footed ferret, ferret, Mustela nigripes +n02444819 otter +n02445715 skunk, polecat, wood pussy +n02447366 badger +n02454379 armadillo +n02457408 three-toed sloth, ai, Bradypus tridactylus +n02480495 orangutan, orang, orangutang, Pongo 
pygmaeus +n02480855 gorilla, Gorilla gorilla +n02481823 chimpanzee, chimp, Pan troglodytes +n02483362 gibbon, Hylobates lar +n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus +n02484975 guenon, guenon monkey +n02486261 patas, hussar monkey, Erythrocebus patas +n02486410 baboon +n02487347 macaque +n02488291 langur +n02488702 colobus, colobus monkey +n02489166 proboscis monkey, Nasalis larvatus +n02490219 marmoset +n02492035 capuchin, ringtail, Cebus capucinus +n02492660 howler monkey, howler +n02493509 titi, titi monkey +n02493793 spider monkey, Ateles geoffroyi +n02494079 squirrel monkey, Saimiri sciureus +n02497673 Madagascar cat, ring-tailed lemur, Lemur catta +n02500267 indri, indris, Indri indri, Indri brevicaudatus +n02504013 Indian elephant, Elephas maximus +n02504458 African elephant, Loxodonta africana +n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +n02514041 barracouta, snoek +n02526121 eel +n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +n02606052 rock beauty, Holocanthus tricolor +n02607072 anemone fish +n02640242 sturgeon +n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus +n02643566 lionfish +n02655020 puffer, pufferfish, blowfish, globefish +n02666196 abacus +n02667093 abaya +n02669723 academic gown, academic robe, judge's robe +n02672831 accordion, piano accordion, squeeze box +n02676566 acoustic guitar +n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier +n02690373 airliner +n02692877 airship, dirigible +n02699494 altar +n02701002 ambulance +n02704792 amphibian, amphibious vehicle +n02708093 analog clock +n02727426 apiary, bee house +n02730930 apron +n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +n02749479 assault rifle, assault gun +n02769748 backpack, back pack, knapsack, packsack, rucksack, 
haversack +n02776631 bakery, bakeshop, bakehouse +n02777292 balance beam, beam +n02782093 balloon +n02783161 ballpoint, ballpoint pen, ballpen, Biro +n02786058 Band Aid +n02787622 banjo +n02788148 bannister, banister, balustrade, balusters, handrail +n02790996 barbell +n02791124 barber chair +n02791270 barbershop +n02793495 barn +n02794156 barometer +n02795169 barrel, cask +n02797295 barrow, garden cart, lawn cart, wheelbarrow +n02799071 baseball +n02802426 basketball +n02804414 bassinet +n02804610 bassoon +n02807133 bathing cap, swimming cap +n02808304 bath towel +n02808440 bathtub, bathing tub, bath, tub +n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +n02814860 beacon, lighthouse, beacon light, pharos +n02815834 beaker +n02817516 bearskin, busby, shako +n02823428 beer bottle +n02823750 beer glass +n02825657 bell cote, bell cot +n02834397 bib +n02835271 bicycle-built-for-two, tandem bicycle, tandem +n02837789 bikini, two-piece +n02840245 binder, ring-binder +n02841315 binoculars, field glasses, opera glasses +n02843684 birdhouse +n02859443 boathouse +n02860847 bobsled, bobsleigh, bob +n02865351 bolo tie, bolo, bola tie, bola +n02869837 bonnet, poke bonnet +n02870880 bookcase +n02871525 bookshop, bookstore, bookstall +n02877765 bottlecap +n02879718 bow +n02883205 bow tie, bow-tie, bowtie +n02892201 brass, memorial tablet, plaque +n02892767 brassiere, bra, bandeau +n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty +n02895154 breastplate, aegis, egis +n02906734 broom +n02909870 bucket, pail +n02910353 buckle +n02916936 bulletproof vest +n02917067 bullet train, bullet +n02927161 butcher shop, meat market +n02930766 cab, hack, taxi, taxicab +n02939185 caldron, cauldron +n02948072 candle, taper, wax light +n02950826 cannon +n02951358 canoe +n02951585 can opener, tin opener +n02963159 cardigan +n02965783 car mirror +n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig +n02966687 carpenter's 
kit, tool kit +n02971356 carton +n02974003 car wheel +n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +n02978881 cassette +n02979186 cassette player +n02980441 castle +n02981792 catamaran +n02988304 CD player +n02992211 cello, violoncello +n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone +n02999410 chain +n03000134 chainlink fence +n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +n03000684 chain saw, chainsaw +n03014705 chest +n03016953 chiffonier, commode +n03017168 chime, bell, gong +n03018349 china cabinet, china closet +n03026506 Christmas stocking +n03028079 church, church building +n03032252 cinema, movie theater, movie theatre, movie house, picture palace +n03041632 cleaver, meat cleaver, chopper +n03042490 cliff dwelling +n03045698 cloak +n03047690 clog, geta, patten, sabot +n03062245 cocktail shaker +n03063599 coffee mug +n03063689 coffeepot +n03065424 coil, spiral, volute, whorl, helix +n03075370 combination lock +n03085013 computer keyboard, keypad +n03089624 confectionery, confectionary, candy store +n03095699 container ship, containership, container vessel +n03100240 convertible +n03109150 corkscrew, bottle screw +n03110669 cornet, horn, trumpet, trump +n03124043 cowboy boot +n03124170 cowboy hat, ten-gallon hat +n03125729 cradle +n03126707 crane +n03127747 crash helmet +n03127925 crate +n03131574 crib, cot +n03133878 Crock Pot +n03134739 croquet ball +n03141823 crutch +n03146219 cuirass +n03160309 dam, dike, dyke +n03179701 desk +n03180011 desktop computer +n03187595 dial telephone, dial phone +n03188531 diaper, nappy, napkin +n03196217 digital clock +n03197337 digital watch +n03201208 dining table, board +n03207743 dishrag, dishcloth +n03207941 dishwasher, dish washer, dishwashing machine +n03208938 disk brake, disc brake +n03216828 dock, dockage, docking facility +n03218198 dogsled, dog sled, dog 
sleigh +n03220513 dome +n03223299 doormat, welcome mat +n03240683 drilling platform, offshore rig +n03249569 drum, membranophone, tympan +n03250847 drumstick +n03255030 dumbbell +n03259280 Dutch oven +n03271574 electric fan, blower +n03272010 electric guitar +n03272562 electric locomotive +n03290653 entertainment center +n03291819 envelope +n03297495 espresso maker +n03314780 face powder +n03325584 feather boa, boa +n03337140 file, file cabinet, filing cabinet +n03344393 fireboat +n03345487 fire engine, fire truck +n03347037 fire screen, fireguard +n03355925 flagpole, flagstaff +n03372029 flute, transverse flute +n03376595 folding chair +n03379051 football helmet +n03384352 forklift +n03388043 fountain +n03388183 fountain pen +n03388549 four-poster +n03393912 freight car +n03394916 French horn, horn +n03400231 frying pan, frypan, skillet +n03404251 fur coat +n03417042 garbage truck, dustcart +n03424325 gasmask, respirator, gas helmet +n03425413 gas pump, gasoline pump, petrol pump, island dispenser +n03443371 goblet +n03444034 go-kart +n03445777 golf ball +n03445924 golfcart, golf cart +n03447447 gondola +n03447721 gong, tam-tam +n03450230 gown +n03452741 grand piano, grand +n03457902 greenhouse, nursery, glasshouse +n03459775 grille, radiator grille +n03461385 grocery store, grocery, food market, market +n03467068 guillotine +n03476684 hair slide +n03476991 hair spray +n03478589 half track +n03481172 hammer +n03482405 hamper +n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier +n03485407 hand-held computer, hand-held microcomputer +n03485794 handkerchief, hankie, hanky, hankey +n03492542 hard disc, hard disk, fixed disk +n03494278 harmonica, mouth organ, harp, mouth harp +n03495258 harp +n03496892 harvester, reaper +n03498962 hatchet +n03527444 holster +n03529860 home theater, home theatre +n03530642 honeycomb +n03532672 hook, claw +n03534580 hoopskirt, crinoline +n03535780 horizontal bar, high bar +n03538406 horse cart, horse-cart +n03544143 
hourglass +n03584254 iPod +n03584829 iron, smoothing iron +n03590841 jack-o'-lantern +n03594734 jean, blue jean, denim +n03594945 jeep, landrover +n03595614 jersey, T-shirt, tee shirt +n03598930 jigsaw puzzle +n03599486 jinrikisha, ricksha, rickshaw +n03602883 joystick +n03617480 kimono +n03623198 knee pad +n03627232 knot +n03630383 lab coat, laboratory coat +n03633091 ladle +n03637318 lampshade, lamp shade +n03642806 laptop, laptop computer +n03649909 lawn mower, mower +n03657121 lens cap, lens cover +n03658185 letter opener, paper knife, paperknife +n03661043 library +n03662601 lifeboat +n03666591 lighter, light, igniter, ignitor +n03670208 limousine, limo +n03673027 liner, ocean liner +n03676483 lipstick, lip rouge +n03680355 Loafer +n03690938 lotion +n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +n03692522 loupe, jeweler's loupe +n03697007 lumbermill, sawmill +n03706229 magnetic compass +n03709823 mailbag, postbag +n03710193 mailbox, letter box +n03710637 maillot +n03710721 maillot, tank suit +n03717622 manhole cover +n03720891 maraca +n03721384 marimba, xylophone +n03724870 mask +n03729826 matchstick +n03733131 maypole +n03733281 maze, labyrinth +n03733805 measuring cup +n03742115 medicine chest, medicine cabinet +n03743016 megalith, megalithic structure +n03759954 microphone, mike +n03761084 microwave, microwave oven +n03763968 military uniform +n03764736 milk can +n03769881 minibus +n03770439 miniskirt, mini +n03770679 minivan +n03773504 missile +n03775071 mitten +n03775546 mixing bowl +n03776460 mobile home, manufactured home +n03777568 Model T +n03777754 modem +n03781244 monastery +n03782006 monitor +n03785016 moped +n03786901 mortar +n03787032 mortarboard +n03788195 mosque +n03788365 mosquito net +n03791053 motor scooter, scooter +n03792782 mountain bike, all-terrain bike, off-roader +n03792972 mountain tent +n03793489 mouse, computer mouse +n03794056 mousetrap +n03796401 moving van +n03803284 muzzle +n03804744 nail 
+n03814639 neck brace +n03814906 necklace +n03825788 nipple +n03832673 notebook, notebook computer +n03837869 obelisk +n03838899 oboe, hautboy, hautbois +n03840681 ocarina, sweet potato +n03841143 odometer, hodometer, mileometer, milometer +n03843555 oil filter +n03854065 organ, pipe organ +n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO +n03866082 overskirt +n03868242 oxcart +n03868863 oxygen mask +n03871628 packet +n03873416 paddle, boat paddle +n03874293 paddlewheel, paddle wheel +n03874599 padlock +n03876231 paintbrush +n03877472 pajama, pyjama, pj's, jammies +n03877845 palace +n03884397 panpipe, pandean pipe, syrinx +n03887697 paper towel +n03888257 parachute, chute +n03888605 parallel bars, bars +n03891251 park bench +n03891332 parking meter +n03895866 passenger car, coach, carriage +n03899768 patio, terrace +n03902125 pay-phone, pay-station +n03903868 pedestal, plinth, footstall +n03908618 pencil box, pencil case +n03908714 pencil sharpener +n03916031 perfume, essence +n03920288 Petri dish +n03924679 photocopier +n03929660 pick, plectrum, plectron +n03929855 pickelhaube +n03930313 picket fence, paling +n03930630 pickup, pickup truck +n03933933 pier +n03935335 piggy bank, penny bank +n03937543 pill bottle +n03938244 pillow +n03942813 ping-pong ball +n03944341 pinwheel +n03947888 pirate, pirate ship +n03950228 pitcher, ewer +n03954731 plane, carpenter's plane, woodworking plane +n03956157 planetarium +n03958227 plastic bag +n03961711 plate rack +n03967562 plow, plough +n03970156 plunger, plumber's helper +n03976467 Polaroid camera, Polaroid Land camera +n03976657 pole +n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +n03980874 poncho +n03982430 pool table, billiard table, snooker table +n03983396 pop bottle, soda bottle +n03991062 pot, flowerpot +n03992509 potter's wheel +n03995372 power drill +n03998194 prayer rug, prayer mat +n04004767 printer +n04005630 prison, prison house +n04008634 projectile, missile 
+n04009552 projector +n04019541 puck, hockey puck +n04023962 punching bag, punch bag, punching ball, punchball +n04026417 purse +n04033901 quill, quill pen +n04033995 quilt, comforter, comfort, puff +n04037443 racer, race car, racing car +n04039381 racket, racquet +n04040759 radiator +n04041544 radio, wireless +n04044716 radio telescope, radio reflector +n04049303 rain barrel +n04065272 recreational vehicle, RV, R.V. +n04067472 reel +n04069434 reflex camera +n04070727 refrigerator, icebox +n04074963 remote control, remote +n04081281 restaurant, eating house, eating place, eatery +n04086273 revolver, six-gun, six-shooter +n04090263 rifle +n04099969 rocking chair, rocker +n04111531 rotisserie +n04116512 rubber eraser, rubber, pencil eraser +n04118538 rugby ball +n04118776 rule, ruler +n04120489 running shoe +n04125021 safe +n04127249 safety pin +n04131690 saltshaker, salt shaker +n04133789 sandal +n04136333 sarong +n04141076 sax, saxophone +n04141327 scabbard +n04141975 scale, weighing machine +n04146614 school bus +n04147183 schooner +n04149813 scoreboard +n04152593 screen, CRT screen +n04153751 screw +n04154565 screwdriver +n04162706 seat belt, seatbelt +n04179913 sewing machine +n04192698 shield, buckler +n04200800 shoe shop, shoe-shop, shoe store +n04201297 shoji +n04204238 shopping basket +n04204347 shopping cart +n04208210 shovel +n04209133 shower cap +n04209239 shower curtain +n04228054 ski +n04229816 ski mask +n04235860 sleeping bag +n04238763 slide rule, slipstick +n04239074 sliding door +n04243546 slot, one-armed bandit +n04251144 snorkel +n04252077 snowmobile +n04252225 snowplow, snowplough +n04254120 soap dispenser +n04254680 soccer ball +n04254777 sock +n04258138 solar dish, solar collector, solar furnace +n04259630 sombrero +n04263257 soup bowl +n04264628 space bar +n04265275 space heater +n04266014 space shuttle +n04270147 spatula +n04273569 speedboat +n04275548 spider web, spider's web +n04277352 spindle +n04285008 sports car, sport car +n04286575 
spotlight, spot +n04296562 stage +n04310018 steam locomotive +n04311004 steel arch bridge +n04311174 steel drum +n04317175 stethoscope +n04325704 stole +n04326547 stone wall +n04328186 stopwatch, stop watch +n04330267 stove +n04332243 strainer +n04335435 streetcar, tram, tramcar, trolley, trolley car +n04336792 stretcher +n04344873 studio couch, day bed +n04346328 stupa, tope +n04347754 submarine, pigboat, sub, U-boat +n04350905 suit, suit of clothes +n04355338 sundial +n04355933 sunglass +n04356056 sunglasses, dark glasses, shades +n04357314 sunscreen, sunblock, sun blocker +n04366367 suspension bridge +n04367480 swab, swob, mop +n04370456 sweatshirt +n04371430 swimming trunks, bathing trunks +n04371774 swing +n04372370 switch, electric switch, electrical switch +n04376876 syringe +n04380533 table lamp +n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle +n04392985 tape player +n04398044 teapot +n04399382 teddy, teddy bear +n04404412 television, television system +n04409515 tennis ball +n04417672 thatch, thatched roof +n04418357 theater curtain, theatre curtain +n04423845 thimble +n04428191 thresher, thrasher, threshing machine +n04429376 throne +n04435653 tile roof +n04442312 toaster +n04443257 tobacco shop, tobacconist shop, tobacconist +n04447861 toilet seat +n04456115 torch +n04458633 totem pole +n04461696 tow truck, tow car, wrecker +n04462240 toyshop +n04465501 tractor +n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +n04476259 tray +n04479046 trench coat +n04482393 tricycle, trike, velocipede +n04483307 trimaran +n04485082 tripod +n04486054 triumphal arch +n04487081 trolleybus, trolley coach, trackless trolley +n04487394 trombone +n04493381 tub, vat +n04501370 turnstile +n04505470 typewriter keyboard +n04507155 umbrella +n04509417 unicycle, monocycle +n04515003 upright, upright piano +n04517823 vacuum, vacuum cleaner +n04522168 vase +n04523525 vault +n04525038 velvet +n04525305 vending machine 
+n04532106 vestment +n04532670 viaduct +n04536866 violin, fiddle +n04540053 volleyball +n04542943 waffle iron +n04548280 wall clock +n04548362 wallet, billfold, notecase, pocketbook +n04550184 wardrobe, closet, press +n04552348 warplane, military plane +n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin +n04554684 washer, automatic washer, washing machine +n04557648 water bottle +n04560804 water jug +n04562935 water tower +n04579145 whiskey jug +n04579432 whistle +n04584207 wig +n04589890 window screen +n04590129 window shade +n04591157 Windsor tie +n04591713 wine bottle +n04592741 wing +n04596742 wok +n04597913 wooden spoon +n04599235 wool, woolen, woollen +n04604644 worm fence, snake fence, snake-rail fence, Virginia fence +n04606251 wreck +n04612504 yawl +n04613696 yurt +n06359193 web site, website, internet site, site +n06596364 comic book +n06785654 crossword puzzle, crossword +n06794110 street sign +n06874185 traffic light, traffic signal, stoplight +n07248320 book jacket, dust cover, dust jacket, dust wrapper +n07565083 menu +n07579787 plate +n07583066 guacamole +n07584110 consomme +n07590611 hot pot, hotpot +n07613480 trifle +n07614500 ice cream, icecream +n07615774 ice lolly, lolly, lollipop, popsicle +n07684084 French loaf +n07693725 bagel, beigel +n07695742 pretzel +n07697313 cheeseburger +n07697537 hotdog, hot dog, red hot +n07711569 mashed potato +n07714571 head cabbage +n07714990 broccoli +n07715103 cauliflower +n07716358 zucchini, courgette +n07716906 spaghetti squash +n07717410 acorn squash +n07717556 butternut squash +n07718472 cucumber, cuke +n07718747 artichoke, globe artichoke +n07720875 bell pepper +n07730033 cardoon +n07734744 mushroom +n07742313 Granny Smith +n07745940 strawberry +n07747607 orange +n07749582 lemon +n07753113 fig +n07753275 pineapple, ananas +n07753592 banana +n07754684 jackfruit, jak, jack +n07760859 custard apple +n07768694 pomegranate +n07802026 hay +n07831146 carbonara +n07836838 chocolate sauce, chocolate 
syrup +n07860988 dough +n07871810 meat loaf, meatloaf +n07873807 pizza, pizza pie +n07875152 potpie +n07880968 burrito +n07892512 red wine +n07920052 espresso +n07930864 cup +n07932039 eggnog +n09193705 alp +n09229709 bubble +n09246464 cliff, drop, drop-off +n09256479 coral reef +n09288635 geyser +n09332890 lakeside, lakeshore +n09399592 promontory, headland, head, foreland +n09421951 sandbar, sand bar +n09428293 seashore, coast, seacoast, sea-coast +n09468604 valley, vale +n09472597 volcano +n09835506 ballplayer, baseball player +n10148035 groom, bridegroom +n10565667 scuba diver +n11879895 rapeseed +n11939491 daisy +n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +n12144580 corn +n12267677 acorn +n12620546 hip, rose hip, rosehip +n12768682 buckeye, horse chestnut, conker +n12985857 coral fungus +n12998815 agaric +n13037406 gyromitra +n13040303 stinkhorn, carrion fungus +n13044778 earthstar +n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +n13054560 bolete +n13133613 ear, spike, capitulum +n15075141 toilet tissue, toilet paper, bathroom tissue diff --git a/examples/x86/example_x86_rnn_net.cpp b/examples/x86/example_x86_rnn_net.cpp new file mode 100644 index 000000000..3ba2d61da --- /dev/null +++ b/examples/x86/example_x86_rnn_net.cpp @@ -0,0 +1,56 @@ + +#include "utils/logger/logger.h" +#include "graph.h" +#include "net.h" + +#ifdef USE_X86_PLACE +/*util to fill tensor*/ +#include "saber/core/tensor_op.h" +using namespace anakin; +using namespace anakin::graph; +using namespace anakin::saber; + +int main(int argc, const char** argv) { + /*init graph object, graph is the skeleton of model*/ + Graph graph; + + /*load model from file to init the graph*/ + auto status = graph.load("language_model.anakin2.bin"); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + /*set net input shape and use this shape to optimize the graph(fusion and init operator), shape is 
n,c,h,w. n=sum of words*/ + graph.Reshape("input_0", {30, 1, 1, 1}); + graph.Optimize(); + + /*net_executer is the executor object of model. use graph to init Net*/ + Net net_executer(graph, true); + + /*use input string to get the input tensor of net. for we use X86 as target, the tensor of net_executer is on host memory*/ + auto h_tensor_in_p = net_executer.get_in("input_0"); + + /*init host tensor by continue int*/ + fill_tensor_host_seq(*h_tensor_in_p); + + /*seq offset of tensor means offset of sentence, 0,10,15,30 means sentence0 = 0-9, sentence 1 = 10-14, sentence2 = 15-29*/ + h_tensor_in_p->set_seq_offset({0,10,15,30}); + + + /*run infer*/ + net_executer.prediction(); + + LOG(INFO)<<"infer finash"; + + /*get the out put of net, which is a host tensor*/ + auto h_out=net_executer.get_out("fc_1.tmp_2_out"); + + + /*show some output content*/ + for(int i=0;i<10;i++){ + LOG(INFO)<<"out ["<data()[i]; + } +} +#else +int main(){} +#endif \ No newline at end of file diff --git a/framework/CMakeLists.txt b/framework/CMakeLists.txt index 324cf292d..f7a645d29 100644 --- a/framework/CMakeLists.txt +++ b/framework/CMakeLists.txt @@ -1,13 +1,27 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file CMakeLists files in the framework directory of project -# @auther cuichaowen -# @date 2017-10-24 -# ---------------------------------------------------------------------------- -anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}) +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. anakin_fetch_include_recursively(${ANAKIN_SABER}) anakin_fetch_include_recursively(${ANAKIN_MODEL_PARSER}) anakin_fetch_include_recursively(${ANAKIN_UTILS}) +anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}/core) +anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}/graph) +anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}/model_parser) +anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}/operators) +if(BUILD_RPC) + anakin_fetch_include_recursively(${ANAKIN_SERVICE}) +endif() + set(ANAKIN_BASE_SRC "") @@ -48,15 +62,15 @@ if(UNIX OR APPLE) endif() if(BUILD_STATIC) add_library(${anakin_lib_static} STATIC ${ANAKIN_SRC}) - add_dependencies(${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET}) - set_target_properties(${anakin_lib_static} PROPERTIES VERSION ${VERSION}) + add_dependencies(${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET})# ${anakin_framework_static}) + #set_target_properties(${anakin_lib_static} PROPERTIES VERSION ${VERSION}) target_link_libraries(${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET} ${ANAKIN_LINKER_LIBS}) - set_target_properties(${anakin_lib_static} PROPERTIES LINK_FLAGS "") + set_target_properties(${anakin_lib_static} PROPERTIES LINK_FLAGS "") set_target_properties(${anakin_lib_static} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/output/) - install(DIRECTORY ${ANAKIN_FRAMEWORK} ${ANAKIN_SABER} ${ANAKIN_UTILS} - DESTINATION ${PROJECT_SOURCE_DIR}/output/ - FILES_MATCHING - PATTERN "*.h" - PATTERN "*.inl") + install(DIRECTORY ${ANAKIN_FRAMEWORK} ${ANAKIN_SABER} ${ANAKIN_UTILS} + DESTINATION ${PROJECT_SOURCE_DIR}/output/ + 
FILES_MATCHING + PATTERN "*.h" + PATTERN "*.inl") endif() endif() diff --git a/framework/core/any.h b/framework/core/any.h index 80fc038af..bef127079 100644 --- a/framework/core/any.h +++ b/framework/core/any.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/base.h b/framework/core/base.h index dd59a1100..6cf5d9f25 100644 --- a/framework/core/base.h +++ b/framework/core/base.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/common_macros.h b/framework/core/common_macros.h index d7f3c81d3..0c8b4430e 100644 --- a/framework/core/common_macros.h +++ b/framework/core/common_macros.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/data_types.h b/framework/core/data_types.h index 16bfccd08..fb8df0e2f 100644 --- a/framework/core/data_types.h +++ b/framework/core/data_types.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,7 +17,6 @@ #define ANAKIN_DATA_TYPES_H #include "framework/core/parameter.h" -#include "bmlib_runtime.h" #include namespace anakin { @@ -46,7 +45,6 @@ SABER_TO_BASE_TYPE(AK_UINT16, uint16_t); SABER_TO_BASE_TYPE(AK_UINT32, uint32_t); SABER_TO_BASE_TYPE(AK_BOOL, bool); SABER_TO_BASE_TYPE(AK_STRING, std::string); -SABER_TO_BASE_TYPE(AK_BM, bm_device_mem_t); template struct DataTypeRecover { @@ -71,7 +69,6 @@ BASE_TYPE_TO_SABER(uint8_t, AK_UINT8); BASE_TYPE_TO_SABER(uint32_t, AK_UINT32); BASE_TYPE_TO_SABER(bool, AK_BOOL); BASE_TYPE_TO_SABER(std::string, AK_STRING); -BASE_TYPE_TO_SABER(bm_device_mem_t, AK_BM); template struct TypeWarpper { @@ -99,7 +96,6 @@ ANAKIN_TO_TYPE_ID(long long, anakin_int64) ANAKIN_TO_TYPE_ID(unsigned long long, anakin_uint64) ANAKIN_TO_TYPE_ID(bool, anakin_bool) ANAKIN_TO_TYPE_ID(std::string, anakin_string) -ANAKIN_TO_TYPE_ID(bm_device_mem_t, anakin_bm) /// unique type tensor /// ANAKIN_TO_TYPE_ID(tensor, anakin_tensor) @@ -121,27 +117,22 @@ ANAKIN_TO_TYPE_ID(PTuple, anakin_tuple_bool) ANAKIN_TO_TYPE_ID(Enum, anakin_tuple_enum) -#define ANAKIN_PBLOCK_TO_TYPE_ID(type, target, type_id) \ - using type##target = PBlock; \ - ANAKIN_TO_TYPE_ID(type##target, type_id) +#define ANAKIN_PBLOCK_TO_TYPE_ID(target, type_id) \ + using __type##target = PBlock; \ + ANAKIN_TO_TYPE_ID(__type##target, type_id) #ifdef USE_CUDA - ANAKIN_PBLOCK_TO_TYPE_ID(float, NV, anakin_block_float) + ANAKIN_PBLOCK_TO_TYPE_ID(NV, anakin_block) #endif #ifdef USE_X86_PLACE - ANAKIN_PBLOCK_TO_TYPE_ID(float, X86, anakin_block_float) + ANAKIN_PBLOCK_TO_TYPE_ID(X86, anakin_block) #endif #ifdef USE_ARM_PLACE - ANAKIN_PBLOCK_TO_TYPE_ID(float, ARM, anakin_block_float) + ANAKIN_PBLOCK_TO_TYPE_ID(ARM, anakin_block) #endif -#ifdef USE_BM - ANAKIN_PBLOCK_TO_TYPE_ID(bm_device_mem_t, BM, anakin_block_float) -#endif - - template struct type_id { typedef T type; diff --git a/framework/core/factory.h b/framework/core/factory.h index f91684f0f..81771b15e 100644 --- a/framework/core/factory.h 
+++ b/framework/core/factory.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,6 +36,9 @@ class FactoryBase { if (_container.count(type_id) == 0) { LOG(FATAL) << type_id << " has not been registered! "; } + //LOG(INFO) << "create " << type_id << " fuction " << &_container.at(type_id); + //auto ptr = _container.at(type_id)(); + //return ptr; return (_container.at(type_id))(); } void __ALIAS__(const TypeIdentifier& ori_type_id, const TypeIdentifier& type_id) { @@ -51,9 +54,11 @@ class FactoryBase { bool Register(TypeIdentifier type_id, PolicyCreator creator) EXCLUSIVE_LOCKS_REQUIRED(container_mutex_) { std::lock_guard guard(container_mutex_); - CHECK_EQ(_container.count(type_id), 0) << type_id << " has not been registered! "; - _type_id_list.push_back(type_id); - _container[type_id] = creator; + //LOG(ERROR) << "register " << type_id; + if (_container.count(type_id) == 0) { + _type_id_list.push_back(type_id); + _container[type_id] = creator; + } return true; } void UnRegister(const TypeIdentifier& type_id) @@ -103,9 +108,8 @@ class ObjectRegisterBase { PolicyType* Get(const TypeIdentifier& type_id) { if (_container.count(type_id) == 0) { LOG(FATAL) << type_id << " has not been registered! "; - } else { - return _container.at(type_id); } + return _container.at(type_id); } void __ALIAS__(const TypeIdentifier& ori_type_id, const TypeIdentifier& type_id) { if (_container.count(ori_type_id) == 0) { @@ -119,11 +123,17 @@ class ObjectRegisterBase { } PolicyType& Register(TypeIdentifier type_id) EXCLUSIVE_LOCKS_REQUIRED(_container_mutex) { std::lock_guard guard(_container_mutex); - CHECK_EQ(_container.count(type_id), 0) << type_id << " has been registered! 
"; - PolicyType* object= new PolicyType(); - _container[type_id] = object; - _type_id_list.push_back(type_id); - return *object; + //CHECK_EQ(_container.count(type_id), 0) << type_id << " has been registered! "; + if (_container.count(type_id) == 0) { + PolicyType* object= new PolicyType(); + _container[type_id] = object; + _type_id_list.push_back(type_id); + return *object; + } else { + PolicyType* object = _container[type_id]; + return *object; + } + } void UnRegister(const TypeIdentifier& type_id) EXCLUSIVE_LOCKS_REQUIRED(_container_mutex) { std::lock_guard guard(_container_mutex); diff --git a/framework/core/functor.h b/framework/core/functor.h index 0e4232e35..7aacdd927 100644 --- a/framework/core/functor.h +++ b/framework/core/functor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/mem_info.h b/framework/core/mem_info.h new file mode 100644 index 000000000..6055621c1 --- /dev/null +++ b/framework/core/mem_info.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_MEM_INFO_H +#define ANAKIN_MEM_INFO_H + +#include "framework/core/parameter.h" +#include "framework/core/singleton.h" + +namespace anakin { + +/** + * \brief memory management + */ +template +class MemInfo { +public: + MemInfo() {} + ~MemInfo() {} + + /// get used mem in MB + double get_used_mem_in_mb() { + return mem_used; + } + +private: + double mem_used{0.f}; ///< mem in mb + double mem_total{0.f}; //< mem in mb +}; + +#ifdef USE_CUDA +template<> +double MemInfo::get_used_mem_in_mb() { + size_t free_bytes; + size_t total_bytes; + auto cuda_status = cudaMemGetInfo(&free_bytes, &total_bytes); + if(cudaSuccess != cuda_status) { + LOG(FATAL) <<" cudaMemGetInfo fails: %s" << cudaGetErrorString(cuda_status); + } + this->mem_used = (double)(total_bytes - free_bytes)/1e6; + this->mem_total = (double)total_bytes/1e6; + return this->mem_used; +}; +#endif + +template +using MemoryInfo= Singleton>; + +} /* namespace anakin */ + +#endif diff --git a/framework/core/net/net.cpp b/framework/core/net/net.cpp index a86b385e3..80f3ecab7 100644 --- a/framework/core/net/net.cpp +++ b/framework/core/net/net.cpp @@ -1,59 +1,153 @@ #include "framework/core/net/net.h" #include "saber/funcs/timer.h" #include "saber/funcs/debug.h" +#include "framework/core/mem_info.h" + namespace anakin { -template -Net::~Net() { - if(_graph_p) { - delete _graph_p; - _graph_p = nullptr; - } +template +Net::~Net() { + if(_graph_p) { + delete _graph_p; + _graph_p = nullptr; + } } -template -double tensor_average(Tensor4dPtr& out_tensor_p) { +template +double tensor_average(Tensor4dPtr& out_tensor_p) { double sum = 0.0f; -#ifdef USE_CUDA - float* h_data = new float[out_tensor_p->valid_size()]; - const float* d_data = out_tensor_p->data(); - CUDA_CHECK(cudaMemcpy(h_data, d_data, out_tensor_p->valid_size()*sizeof(float), cudaMemcpyDeviceToHost)); -#else - float* h_data = out_tensor_p->data(); -#endif + const float* hptr = nullptr; + + Shape shin = out_tensor_p->valid_shape(); + Shape rs 
= out_tensor_p->shape(); + PBlock tensorptr(shin); + /*LOG(INFO) << " ---> get valid_shape("<< shin[0]<< ", " << shin[1] << ", " << shin[2] << ", " << shin[3] << ") " + <<" real_shape: (" << rs[0] << ", "<< rs[1] << ", "<< rs[2] << ", "<< rs[3] << ") ";*/ + tensorptr.h_tensor().copy_from(*out_tensor_p); + hptr = (const float* )(tensorptr.h_tensor().data()); for (int i=0; ivalid_size(); i++) { - sum+=h_data[i]; + sum += hptr[i]; } return sum/out_tensor_p->valid_size(); } - -template -Net::Net(bool need_summary) { - _graph_p = new graph::Graph(); +template <> +double tensor_average(Tensor4dPtr& out_tensor_p) { + double sum = 0.0f; + CHECK_NOTNULL(out_tensor_p)<<"out_tensor_p can not be null"; + const float* hptr = (const float*)(out_tensor_p->data()); + for (int i=0; ivalid_size(); i++) { + sum += hptr[i]; + } + return sum/out_tensor_p->valid_size(); +} +template +Net::Net(bool need_summary) { + _graph_p = new graph::Graph(); _need_summary = need_summary; } -template -Net::Net(graph::Graph& graph, bool need_summary) { - _graph_p = new graph::Graph(); +template +Net::Net(graph::Graph& graph, bool need_summary) { + _graph_p = new graph::Graph(); _need_summary = need_summary; - init_env(graph); + //init_env(graph); init(graph); } -template -void Net::init(graph::Graph& graph) { +template +Net::Net(\ + graph::Graph& graph, OpContextPtr ctx, bool need_summary) { + _graph_p = new graph::Graph(); + _need_summary = need_summary; + //init_env(graph); + init(graph, ctx); +} + +template +void Net::init(graph::Graph& graph, \ + OpContextPtr ctx) { + init_env(graph); // shallow copy _graph_p->CopyFrom(graph); - auto node_names_in_exec_order = graph.get_nodes_in_order(); // infer basic shape and parsing parameter from graph for (auto& node_name : node_names_in_exec_order) { auto node_ptr = (*_graph_p)[node_name]; - if (node_ptr->get_op_name() == "Output") { + //LOG(ERROR) << "get node " << node_name << ", op type " << node_ptr->get_op_name(); + /*if (node_ptr->get_op_name() == 
"Output") { continue; + }*/ + + // create operations + auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + if (op_pointer == nullptr) { + LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; + } + node_ptr->set_op(op_pointer); + op_pointer = nullptr; + + static_cast*>(node_ptr->Op())->_helper->BindParam(node_ptr); + // parsing parameter + static_cast*>(node_ptr->Op())->_helper->InitParam(); + } + + // remove null op node + for (auto it = node_names_in_exec_order.begin(); it != node_names_in_exec_order.end(); ){ + if (!(*_graph_p)[*it]->Op()) { + it = node_names_in_exec_order.erase(it); + } else { + ++it; + } + } + _exec_funcs.resize(node_names_in_exec_order.size()); + for(int i = 0; i < node_names_in_exec_order.size(); i++) { + auto& node_name = node_names_in_exec_order[i]; + auto& op_func = _exec_funcs[i]; + op_func.name = node_name; + auto& edge_in_its = _graph_p->get_in_arc_its(node_name); + DLOG(ERROR) << " node : " << op_func.name << " (" << (*_graph_p)[node_name]->get_op_name() << ") "; + for(auto& edge_it : edge_in_its) { + DLOG(INFO) << " => find in arc : " << edge_it->bottom() << " --> " << edge_it->top(); + op_func.ins.push_back(edge_it->weight().get()); + op_func.in_lanes.push_back(edge_it->lane()); + } + auto& edge_out_its = _graph_p->get_out_arc_its(node_name); + for(auto& edge_it : edge_out_its) { + DLOG(INFO) << " <= find out arc : " << edge_it->bottom() << " --> " << edge_it->top(); + op_func.outs.push_back(edge_it->weight().get()); + op_func.out_lanes.push_back(edge_it->lane()); } + op_func.current_lane = (*_graph_p)[node_name]->lane(); + op_func.need_sync = (*_graph_p)[node_name]->need_wait(); + op_func.op = static_cast* >((*_graph_p)[node_name]->Op()); + op_func.op_name = (*_graph_p)[node_name]->get_op_name(); + op_func.ctx_p = ctx; + // call init of operator + CHECK_NOTNULL(op_func.op) << "Node(node_name) doesn't have op pointer! 
"; + + op_func.op->_helper->InferShape(op_func.ins, op_func.outs); + op_func.op->_helper->Init(*(op_func.ctx_p), op_func.ins, op_func.outs); + } + + // init memory of _graph_p + init_memory(); +} + + +template +void Net::init(graph::Graph& graph) { + init_env(graph); + // shallow copy + _graph_p->CopyFrom(graph); + + double curr_mem_in_mb_start = MemoryInfo::Global().get_used_mem_in_mb(); + + auto node_names_in_exec_order = graph.get_nodes_in_order(); + // infer basic shape and parsing parameter from graph + for (auto& node_name : node_names_in_exec_order) { + auto node_ptr = (*_graph_p)[node_name]; + #ifdef ENABLE_OP_TIMER if (std::string::npos != (node_ptr->get_op_name()).find("Conv") || std::string::npos != (node_ptr->get_op_name()).find("Deconv")) { @@ -90,31 +184,47 @@ void Net::init(graph::Graph& #endif // create operations -#if defined(USE_CUDA) - if (node_ptr->get_op_name() == "ConvBatchnormScaleRelu" || node_ptr->get_op_name() == "ConvRelu" || node_ptr->get_op_name() == "Convolution") { - std::string group = "group"; - auto group_val = node_ptr->template get_attr(group); - if (group_val == 1) { - node_ptr->set_op(OpFactory::Global()["Sass"+node_ptr->get_op_name()]); - node_ptr->get_op_name() = "Sass" + node_ptr->get_op_name(); - } else { - LOG(ERROR) << "node_ptr->get_op_name() sass not support yet."; - auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; - node_ptr->set_op(op_pointer); - } + + if(std::is_same::value) { + if (node_ptr->get_op_name() == "ConvBatchnormScale" || + node_ptr->get_op_name() == "ConvBatchnormScaleRelu" || node_ptr->get_op_name() == "ConvRelu" || + node_ptr->get_op_name() == "Convolution") { + std::string group = "group"; + auto group_val = node_ptr->template get_attr(group); + std::string dilation = "dilation_rate"; + auto dilation_rate_val = node_ptr->template get_attr >(dilation); + std::string weight_name = "weight_1"; + auto weights = node_ptr->template get_attr >(weight_name); + + int k_w = 
weights.d_tensor().width(); + int k_h = weights.d_tensor().height(); + int dil_h = dilation_rate_val.vector()[0]; + int dil_w = dilation_rate_val.vector()[1]; + + if ((group_val == 1) && (k_w == 3 && k_h == 3 && dil_h == 1 && dil_w == 1)) { + node_ptr->set_op(OpFactory::Global()["Sass"+node_ptr->get_op_name()]); + node_ptr->get_op_name() = "Sass" + node_ptr->get_op_name(); + } else { + LOG(ERROR) << "node_ptr->get_op_name() sass not support yet."; + auto *op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + node_ptr->set_op(op_pointer); + } + } else { + auto *op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + node_ptr->set_op(op_pointer); + } } else { - auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + auto *op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + if (op_pointer == nullptr) { + CHECK(false)<< node_name << ", type " << node_ptr->get_op_name() << " is null"; + LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; + } node_ptr->set_op(op_pointer); } -#else - auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; - node_ptr->set_op(op_pointer); - op_pointer = nullptr; -#endif // bind parameter structure - static_cast*>(node_ptr->Op())->_helper->BindParam(node_ptr); + static_cast*>(node_ptr->Op())->_helper->BindParam(node_ptr); // parsing parameter - static_cast*>(node_ptr->Op())->_helper->InitParam(); + static_cast*>(node_ptr->Op())->_helper->InitParam(); } // remove null op node @@ -145,13 +255,13 @@ void Net::init(graph::Graph& } op_func.current_lane = (*_graph_p)[node_name]->lane(); op_func.need_sync = (*_graph_p)[node_name]->need_wait(); - op_func.op = static_cast* >((*_graph_p)[node_name]->Op()); + op_func.op = static_cast* >((*_graph_p)[node_name]->Op()); op_func.op_name = (*_graph_p)[node_name]->get_op_name(); op_func.ctx_p = std::make_shared>(TargetWrapper::get_device_id(), op_func.current_lane, op_func.current_lane); // call init of operator - CHECK_NOTNULL_S(op_func.op) 
<< "Node(node_name) doesn't have op pointer! "; + CHECK_NOTNULL(op_func.op) << "Node(node_name) doesn't have op pointer! "; op_func.op->_helper->InferShape(op_func.ins, op_func.outs); @@ -161,19 +271,33 @@ void Net::init(graph::Graph& << " " << in->valid_shape()[1] << " " << in->valid_shape()[2] << " " << in->valid_shape()[3]; + LOG(INFO) <<"in offset size = "<get_seq_offset().size(); } for(auto& out : op_func.outs) { LOG(INFO) << " <= [shape]: " << out->valid_shape()[0] << " " << out->valid_shape()[1] << " " << out->valid_shape()[2] << " " << out->valid_shape()[3]; + LOG(INFO) <<"out offset size = "<get_seq_offset().size(); } + #endif op_func.op->_helper->Init(*(op_func.ctx_p), op_func.ins, op_func.outs); +#ifdef ENABLE_DEBUG + DLOG(INFO)<<"op init success "<::Global().get_used_mem_in_mb(); + this->_graph_p->statistics.template set_info(curr_mem_in_mb_end - curr_mem_in_mb_start); // init memory of _graph_p init_memory(); + + graph.statistics = _graph_p->statistics; // copy statistic back + LOG(INFO) << "Temp mem used: " << this->_graph_p->statistics.template get_info() << " MB"; + LOG(INFO) << "Original mem used: " << this->_graph_p->statistics.template get_info() << " MB"; + LOG(INFO) << "Model mem used: " << this->_graph_p->statistics.template get_info() << " MB"; + LOG(INFO) << "System mem used: " << this->_graph_p->statistics.template get_info() << " MB"; #ifdef ENABLE_OP_TIMER _op_time = std::vector(_exec_funcs.size(), 0.0f); #endif @@ -191,105 +315,92 @@ void Net::init(graph::Graph& LOG(WARNING) << " Inspect memory of " << executer.name << " (" << executer.op_name << ") "; executer.infer_shape(); - for (auto out : executer.outs) { - LOG(INFO) << " |-- out tensor avg " << tensor_average(out); - } + for (auto out : executer.outs) { + LOG(INFO) << " |-- out tensor avg " << tensor_average(out); + } #ifdef USE_CUDA - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaPeekAtLastError()); #endif } #endif } -template -void 
Net::prediction() { +template +void Net::prediction() { #ifdef ENABLE_OP_TIMER int op_id = 0; #endif + + int i = 0; for(auto& executer : _exec_funcs) { - if (RunType == OpRunType::SYNC || executer.need_sync) { + if (RunType == OpRunType::SYNC || executer.need_sync || executer.op_name == "Output") { for(int i = 0; i < executer.ins.size(); i++) { - // record - executer.ins[i]->record_event(executer.ctx_p->get_compute_stream()); + // sync event record in multi_stream or syn when encountering output op executer.ins[i]->sync(); } } + #ifdef ENABLE_DEBUG LOG(ERROR) << " executer : " << executer.name << " (" << executer.op_name << ") "; - for(auto in : executer.ins) { - LOG(ERROR) << " \\in shape " << in->valid_shape()[0] - << " " << in->valid_shape()[1] - << " " << in->valid_shape()[2] - << " " << in->valid_shape()[3] - << " valid_size: " << in->valid_size() - << " realsize: " << in->size() - << " offset_size "<get_seq_offset().size(); + for(auto in : executer.ins) { + LOG(ERROR) << " \\in shape " << in->valid_shape()[0] + << " " << in->valid_shape()[1] + << " " << in->valid_shape()[2] + << " " << in->valid_shape()[3] + << " valid_size: " << in->valid_size() + << " realsize: " << in->size() + << " offset_size "<get_seq_offset().size(); } #endif -#ifdef ENABLE_OP_TIMER - Context ctx(0, 0, 0); - saber::SaberTimer my_time; - my_time.start(ctx); -#endif - if (executer.op_name != "Input") { - executer.infer_shape(); - executer.launch(); - } - - for(int i = 0; i < executer.outs.size(); i++) { - executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); - } + #ifdef ENABLE_OP_TIMER - for (int i = 0; i < executer.outs.size(); i++) { - // record - executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); - executer.outs[i]->sync(); - } - my_time.end(ctx); - _op_time[op_id++] += my_time.get_average_ms(); -#endif - //LOG(INFO)<< "op: " << executer.name<<"(" << executer.op_name <<") === infer+launch time "<data(); - record_dev_tensorfile(out->data(), 
out->valid_size(), - ("net_record_" + executer.name + ".txt").data()); - LOG(ERROR) << " |---out avg " << tensor_average(out); - } - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + my_time.start(ctx); #endif -#ifdef USE_X86_PLACE - for (auto out : executer.outs) { - LOG(INFO) <data(); - const float* out_data = out->data(); - std::cout << "seq_offset size: " << out->get_seq_offset().size()<<" "; - for (int i = 0; i < 10; ++i) { - std::cout << out_data[i] << " "; - } - std::cout << std::endl; + if (executer.op_name != "Input" || executer.op_name != "Output") { + executer.infer_shape(); + executer.launch(); + } - } + for(int i = 0; i < executer.outs.size(); i++) { + executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); + } +#ifdef ENABLE_DEBUG + for(auto out : executer.outs) { + LOG(WARNING) << " \\out shape(" << out->valid_shape()[0] << "," + << out->valid_shape()[1] << "," + << out->valid_shape()[2] << "," + << out->valid_shape()[3] << ") " + << " avg: " << tensor_average(out); + } #endif + +#ifdef ENABLE_OP_TIMER + for (int i = 0; i < executer.outs.size(); i++) { + // record + executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); + executer.outs[i]->sync(); + } + my_time.end(ctx); + _op_time[op_id++] += my_time.get_average_ms(); #endif - } + + } // for } -template -void Net::execute_stop_at_node(std::string node_name) { - if(_suspended_point==-1) { - for(int i=0; i<_exec_funcs.size(); i++) { - if(_exec_funcs[i].name == node_name) { - _suspended_point = i; - } - } - } - for(int i=0; i<_suspended_point; i++) { - auto& executer = _exec_funcs[i]; +template +void Net::execute_stop_at_node(std::string node_name) { + if(_suspended_point==-1) { + for(int i=0; i<_exec_funcs.size(); i++) { + if(_exec_funcs[i].name == node_name) { + _suspended_point = i; + } + } + } + for(int i=0; i<_suspended_point; i++) { + auto& executer = _exec_funcs[i]; if (RunType == OpRunType::SYNC || 
executer.need_sync) { for(int i = 0; i < executer.ins.size(); i++) { // record @@ -304,37 +415,37 @@ void Net::execute_stop_at_node(std::string node_na << " " << in->valid_shape()[1] << " " << in->valid_shape()[2] << " " << in->valid_shape()[3] - << " valid_size: " << in->valid_size() - << " realsize: " << in->size() - << " offset_size "<get_seq_offset().size(); + << " valid_size: " << in->valid_size() + << " realsize: " << in->size() + << " offset_size "<get_seq_offset().size(); + } + for (auto out : executer.outs) { + LOG(INFO) << " |-- out tensor avg " << tensor_average(out); } - for (auto out : executer.outs) { - LOG(INFO) << " |-- out tensor avg " << tensor_average(out); - } #endif - if (executer.op_name != "Input") { - executer.infer_shape(); - executer.launch(); - } + if (executer.op_name != "Input") { + executer.infer_shape(); + executer.launch(); + } - for(int i = 0; i < executer.outs.size(); i++) { - executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); - } - } + for(int i = 0; i < executer.outs.size(); i++) { + executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); + } + } } -template -void Net::execute_start_from_node(std::string node_name) { - if(_start_point == -1) { - for(int i=0; i<_exec_funcs.size(); i++) { - if(_exec_funcs[i].name == node_name) { - _start_point = i; - } - } - } - for(int i=_start_point; i<_exec_funcs.size(); i++) { - auto& executer = _exec_funcs[i]; +template +void Net::execute_start_from_node(std::string node_name) { + if(_start_point == -1) { + for(int i=0; i<_exec_funcs.size(); i++) { + if(_exec_funcs[i].name == node_name) { + _start_point = i; + } + } + } + for(int i=_start_point; i<_exec_funcs.size(); i++) { + auto& executer = _exec_funcs[i]; if (RunType == OpRunType::SYNC || executer.need_sync) { for(int i = 0; i < executer.ins.size(); i++) { // record @@ -349,35 +460,35 @@ void Net::execute_start_from_node(std::string node << " " << in->valid_shape()[1] << " " << in->valid_shape()[2] << " " << 
in->valid_shape()[3] - << " valid_size: " << in->valid_size() - << " realsize: " << in->size() - << " offset_size "<get_seq_offset().size(); + << " valid_size: " << in->valid_size() + << " realsize: " << in->size() + << " offset_size "<get_seq_offset().size(); + } + for (auto out : executer.outs) { + LOG(INFO) << " |-- out tensor avg " << tensor_average(out); } - for (auto out : executer.outs) { - LOG(INFO) << " |-- out tensor avg " << tensor_average(out); - } #endif - if (executer.op_name != "Input") { - executer.infer_shape(); - executer.launch(); - } + if (executer.op_name != "Input") { + executer.infer_shape(); + executer.launch(); + } - for(int i = 0; i < executer.outs.size(); i++) { - executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); - } - } + for(int i = 0; i < executer.outs.size(); i++) { + executer.outs[i]->record_event(executer.ctx_p->get_compute_stream()); + } + } } -template -Tensor4dPtr Net::get_out(std::string out_name) { +template +Tensor4dPtr Net::get_out(std::string out_name) { auto& edge_it_list = _graph_p->get_in_arc_its(out_name); CHECK_EQ(edge_it_list.size(), 1) << " Node(" << out_name << ") should have 1 in edge."; return edge_it_list[0]->weight().get(); } -template -std::vector > Net::get_out_list() { +template +std::vector > Net::get_out_list() { auto& out_list_vec = _graph_p->get_outs(); for (auto& out : out_list_vec) { _out_tensor_list.push_back(get_out(out.c_str())); @@ -385,15 +496,15 @@ std::vector > Net::get_o return _out_tensor_list; } -template -Tensor4dPtr Net::get_in(std::string in_name) { +template +Tensor4dPtr Net::get_in(std::string in_name) { auto& edge_it_list = _graph_p->get_out_arc_its(in_name); CHECK_EQ(edge_it_list.size(), 1) << " Node(" << in_name << ") should have 1 out edge."; return edge_it_list[0]->weight().get(); } -template -std::vector > Net::get_in_list() { +template +std::vector > Net::get_in_list() { auto& in_list_vec = _graph_p->get_ins(); for (auto& in : in_list_vec) { 
_in_tensor_list.push_back(get_in(in.c_str())); @@ -401,14 +512,14 @@ std::vector > Net::get_i return _in_tensor_list; } -template -Tensor4dPtr Net::get_tensor_from_edge(const char* from, const char* to) { +template +Tensor4dPtr Net::get_tensor_from_edge(const char* from, const char* to) { return _graph_p->get_arc(std::string(from), std::string(to)).weight().get(); } -template -Status Net::init_memory() { - auto alloc_memory = [this](graph::Edge& edge) { +template +Status Net::init_memory() { + auto alloc_memory = [this](graph::Edge& edge) { auto& tensor_p = edge.weight(); if(!edge.shared()) { tensor_p->re_alloc(tensor_p->shape()); @@ -417,30 +528,30 @@ Status Net::init_memory() { }; _graph_p->Scanner->BFS_Edge(alloc_memory); - auto share_memory = [this](graph::Edge& edge) { + auto share_memory = [this](graph::Edge& edge) { if(edge.shared()) { auto& edge_name = edge.share_from(); - bool continue_search = true; - while(continue_search) { - auto match_edge = [&](graph::Edge& inner_edge) { - if(inner_edge.name() == edge_name) { - if(inner_edge.shared()) { - edge_name = inner_edge.share_from(); - return Status::EXIT(" Continue to find next . "); - } - if (inner_edge.weight()->size() < edge.weight()->valid_size()) { - auto inner_original_shape = inner_edge.weight()->valid_shape(); - inner_edge.weight()->re_alloc(edge.weight()->valid_shape()); - inner_edge.weight()->set_shape(inner_original_shape, inner_edge.weight()->shape()); - } - edge.weight()->share_from(*(inner_edge.weight())); - continue_search = false; - return Status::EXIT(" Find the matched target edge. "); - } - return Status::OK(); - }; - this->_graph_p->Scanner->BFS_Edge(match_edge); - } + bool continue_search = true; + while(continue_search) { + auto match_edge = [&](graph::Edge& inner_edge) { + if(inner_edge.name() == edge_name) { + if(inner_edge.shared()) { + edge_name = inner_edge.share_from(); + return Status::EXIT(" Continue to find next . 
"); + } + if (inner_edge.weight()->size() < edge.weight()->valid_size()) { + auto inner_original_shape = inner_edge.weight()->valid_shape(); + inner_edge.weight()->re_alloc(edge.weight()->valid_shape()); + inner_edge.weight()->set_shape(inner_original_shape, inner_edge.weight()->shape()); + } + edge.weight()->share_from(*(inner_edge.weight())); + continue_search = false; + return Status::EXIT(" Find the matched target edge. "); + } + return Status::OK(); + }; + this->_graph_p->Scanner->BFS_Edge(match_edge); + } } }; _graph_p->Scanner->BFS_Edge(share_memory); @@ -448,7 +559,7 @@ Status Net::init_memory() { if (_need_summary) { size_t temp_mem_in_mbytes = 0; size_t ori_temp_mem_in_mbytes = 0; - auto analysis_used_of_temp_mem = [&](graph::Edge& edge) { + auto analysis_used_of_temp_mem = [&](graph::Edge& edge) { auto& tensor_p = edge.weight(); if (!edge.shared()) { temp_mem_in_mbytes += (tensor_p->size() * 4); @@ -456,50 +567,58 @@ Status Net::init_memory() { ori_temp_mem_in_mbytes += (tensor_p->valid_shape().count() * 4); }; this->_graph_p->Scanner->BFS_Edge(analysis_used_of_temp_mem); - LOG(ERROR) << " temp !!!!!! " << temp_mem_in_mbytes / 1e6 << " mb "; - LOG(ERROR) << " origin temp !!!!!! 
" << ori_temp_mem_in_mbytes / 1e6 << " mb "; + + this->_graph_p->statistics.template set_info(temp_mem_in_mbytes / 1e6); + this->_graph_p->statistics.template set_info(ori_temp_mem_in_mbytes / 1e6); } return Status::OK(); } -template -Status Net::init_env(graph::Graph& graph) { +template +Status Net::init_env(graph::Graph& graph) { LOG(WARNING) << "Detect and initial " << graph.get_ins().size() << " lanes."; - Env::env_init(graph.get_ins().size()); + Env::env_init(graph.get_ins().size()); LOG(WARNING) << "Current used device id : " << TargetWrapper::get_device_id(); return Status::OK(); } #ifdef USE_CUDA -template class Net; -template class Net; -template class Net; +template class Net; +template class Net; +template class Net; -template class Net; -template class Net; -template class Net; +template class Net; +template class Net; +template class Net; #endif #ifdef USE_X86_PLACE -template class Net; -template class Net; -template class Net; +template class Net; +template class Net; +template class Net; -template class Net; -template class Net; -template class Net; +template class Net; +template class Net; +template class Net; #endif #ifdef USE_ARM_PLACE -template class Net; -template class Net; -template class Net; +#ifdef ANAKIN_TYPE_FP32 +template class Net; +template class Net; +#endif -template class Net; -template class Net; -template class Net; +#ifdef ANAKIN_TYPE_FP16 +template class Net; +template class Net; #endif -} /* namespace anakin */ +#ifdef ANAKIN_TYPE_INT8 +template class Net; +template class Net; +#endif //int8 + +#endif //arm +} /* namespace anakin */ diff --git a/framework/core/net/net.h b/framework/core/net/net.h index 3930e757e..b01c26136 100644 --- a/framework/core/net/net.h +++ b/framework/core/net/net.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ namespace anakin { /** * \brief Net class used for execution of graph and it is thread safety. */ -template +template class Net { public: explicit Net(bool need_summary = false); @@ -34,17 +34,28 @@ class Net { * \brief Construct a net by graph. * This construction should be use in thread call and make sure thread safety. */ - explicit Net(graph::Graph&, bool need_summary = false); + explicit Net(graph::Graph&, bool need_summary = false); + + /** + * \brief Construct a net by graph, init with specified context. + * This construction should be use in thread call and make sure thread safety. + */ + explicit Net(graph::Graph&, OpContextPtr ctx, bool need_summary = false); ~Net(); public: - - /** - * \brief init execute net from graph. + /** + * \brief init execute net from graph, init with specified context. + * you can use Net(Graph&) instead. + */ + void init(graph::Graph& graph, OpContextPtr ctx); + + /** + * \brief init execute net from graph. * you can use Net(Graph&) instead. */ - void init(graph::Graph&); + void init(graph::Graph&); /** * \brief do inference. @@ -77,7 +88,7 @@ class Net { void reset_op_time() {_op_time = std::vector(_exec_funcs.size(), 0.0f);} std::vector get_op_time() {return _op_time;} std::vector get_op_param() {return _op_param;} - std::vector > get_exec_funcs() { + std::vector > get_exec_funcs() { return _exec_funcs; } #endif @@ -87,19 +98,19 @@ class Net { /** * \brief Get out by name. */ - Tensor4dPtr get_out(std::string out_name); - std::vector > get_out_list(); + Tensor4dPtr get_out(std::string out_name); + std::vector > get_out_list(); /** * \brief Get in by name. */ - Tensor4dPtr get_in(std::string in_name); - std::vector > get_in_list(); + Tensor4dPtr get_in(std::string in_name); + std::vector > get_in_list(); /** * \brief Get tensor from a given edge. 
*/ - Tensor4dPtr get_tensor_from_edge(const char* from, const char* to); + Tensor4dPtr get_tensor_from_edge(const char* from, const char* to); private: /** @@ -110,22 +121,22 @@ class Net { /** * \brief Initial context environments. */ - Status init_env(graph::Graph&); + Status init_env(graph::Graph&); private: ///< executor for operators in node. - std::vector > _exec_funcs; + std::vector > _exec_funcs; ///< suspended point is set when you invoke execute_stop_at_node int _suspended_point{-1}; ///< start point is set when you invoke execute_start_from_node int _start_point{-1}; ///< The pointer to Context. OpContextPtr _ctx_p; - graph::Graph* _graph_p{nullptr}; + graph::Graph* _graph_p{nullptr}; ///< A list of in tensor. - std::vector > _in_tensor_list; + std::vector > _in_tensor_list; ///< A list of out tensor. - std::vector > _out_tensor_list; + std::vector > _out_tensor_list; bool _need_summary{false}; #ifdef ENABLE_OP_TIMER diff --git a/framework/core/net/operator_func.cpp b/framework/core/net/operator_func.cpp index 7bc7f44c5..42a402d5a 100644 --- a/framework/core/net/operator_func.cpp +++ b/framework/core/net/operator_func.cpp @@ -2,33 +2,47 @@ namespace anakin { -template -void OperatorFunc::launch() { +template +void OperatorFunc::launch() { (*op)(*ctx_p, ins, outs); } -template -void OperatorFunc::infer_shape() { +template +void OperatorFunc::infer_shape() { op->_helper->InferShape(ins, outs); } #ifdef USE_CUDA -template class OperatorFunc; -template class OperatorFunc; -template class OperatorFunc; +template class OperatorFunc; +template class OperatorFunc; +template class OperatorFunc; #endif #ifdef USE_X86_PLACE -template class OperatorFunc; -template class OperatorFunc; -template class OperatorFunc; +template class OperatorFunc; +template class OperatorFunc; +template class OperatorFunc; +#endif + +#ifdef AMD_GPU +template class OperatorFunc; +template class OperatorFunc; +template class OperatorFunc; #endif #ifdef USE_ARM_PLACE -template class 
OperatorFunc; -template class OperatorFunc; -template class OperatorFunc; +#ifdef ANAKIN_TYPE_FP32 +template class OperatorFunc; +#endif + +#ifdef ANAKIN_TYPE_FP16 +template class OperatorFunc; +#endif + +#ifdef ANAKIN_TYPE_INT8 +template class OperatorFunc; #endif +#endif //arm } /* namespace */ diff --git a/framework/core/net/operator_func.h b/framework/core/net/operator_func.h index 822bfc37c..8973a7c5f 100644 --- a/framework/core/net/operator_func.h +++ b/framework/core/net/operator_func.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ namespace anakin { /** * \brief Operator executor class. */ -template +template struct OperatorFunc { OperatorFunc() {} @@ -44,14 +44,13 @@ struct OperatorFunc { ///< request list for operators. ///< std::vector > requests. ///< input data of operator. - ///< std::vector *> ins. - std::vector> ins; + std::vector > ins; ///< the lanes int data resides in std::vector in_lanes; ///< output data of operator - std::vector> outs; + std::vector > outs; ///< the lanes out data resides in std::vector out_lanes; @@ -61,7 +60,7 @@ struct OperatorFunc { bool need_sync{false}; - Operator* op; + Operator* op; ///< node name std::string name; diff --git a/framework/core/net/worker.cpp b/framework/core/net/worker.cpp index be981af6c..0af132fa2 100644 --- a/framework/core/net/worker.cpp +++ b/framework/core/net/worker.cpp @@ -4,7 +4,7 @@ namespace anakin { //! 
\brief a model map between thread_id and net model -template +template struct NetGraphWrapper { typedef std::thread::id key; @@ -36,67 +36,80 @@ struct NetGraphWrapper { } } - inline Net& get_net(key id) { + inline Net& get_net(key id) { if(_thread_to_net.find(id) != _thread_to_net.end()) { return _thread_to_net[id]; } LOG(FATAL) << " target key(thread_id) not found in NetGraphWrapper"; + return _thread_to_net[id]; } private: - std::unordered_map> _graph_map; - std::unordered_map> _thread_to_net GUARDED_BY(this->_mut); + std::unordered_map> _graph_map; + std::unordered_map> _thread_to_net GUARDED_BY(this->_mut); std::mutex _mut; }; -template -using MultiThreadModel = Singleton>; +template +using MultiThreadModel = Singleton>; -template -Worker::Worker(std::string model_path, int num_thread) : _model_path(model_path), ThreadPool(num_thread) {} +template +Worker::Worker(std::string model_path, int num_thread) : _model_path(model_path), ThreadPool(num_thread) {} -template -Worker::~Worker() {} +template +Worker::~Worker() {} -template -void Worker::pause(size_t time) { +template +void Worker::pause(size_t time) { std::function sleep = [](size_t time) { std::this_thread::sleep_for(std::chrono::milliseconds(time)); }; this->RunSync(sleep, time); } -template -void Worker::Reshape(std::string in_name, std::vector new_shape) { +template +void Worker::Reshape(std::string in_name, std::vector new_shape) { _in_shapes[in_name] = new_shape; } -template -void Worker::register_inputs(std::vector input_names) { +template +void Worker::register_inputs(std::vector input_names) { _inputs_in_order = input_names; } -template -void Worker::register_outputs(std::vector output_names) { +template +void Worker::register_outputs(std::vector output_names) { _outputs_in_order = output_names; } -template -void Worker::register_interior_edges(std::string bottom, std::string top) { +template +void Worker::register_interior_edges(std::string bottom, std::string top) { graph::Arc arc(bottom, top); 
_edges_in_order.push_back(arc); } -template -std::vector > Worker::sync_prediction(std::vector::type, Dtype> >& net_ins_list) { - auto task = [&](std::vector::type, Dtype> >& ins) -> std::vector > { - auto& net = MultiThreadModel::Global().get_net(std::this_thread::get_id()); - //fill the graph inputs +template +std::future::type> > > +Worker::sync_prediction(std::vector::type> >& net_ins_list) { + auto task = [&](std::vector::type> >& ins) + -> std::vector::type> > { + auto& net = MultiThreadModel::Global().get_net(std::this_thread::get_id()); + //fill the graph inputs + for(int i = 0; i < _inputs_in_order.size(); i++) { - auto d_tensor_in_p = net.get_in(_inputs_in_order[i]); - d_tensor_in_p->copy_from(*ins[i]); + float* data = (float*)(ins[i].mutable_data()); + for(int j=0; j<10; j++) { + LOG(INFO) << "------> data " << data[j];; + } + auto d_tensor_in_p = net.get_in(_inputs_in_order[i]); + d_tensor_in_p->reshape(ins[i].valid_shape()); + d_tensor_in_p->copy_from(ins[i]); + d_tensor_in_p->set_seq_offset(ins[i].get_seq_offset()); } + /*Context ctx(0, 0, 0); + saber::SaberTimer my_time; + my_time.start(ctx); #ifdef ENABLE_OP_TIMER Context ctx(0, 0, 0); saber::SaberTimer my_time; @@ -104,29 +117,40 @@ std::vector > Worker::sy #endif net.prediction(); + my_time.end(ctx); + LOG(ERROR) << " exec << time: " << my_time.get_average_ms() << " ms ";*/ + #ifdef ENABLE_OP_TIMER my_time.end(ctx); { std::lock_guard guard(_mut); _thead_id_to_prediction_times_vec_in_ms[std::this_thread::get_id()].push_back(my_time.get_average_ms()); + LOG(ERROR) << " exec << time: " << my_time.get_average_ms() << " ms "; } #endif // get outputs of graph - std::vector> ret; - for (auto out : _outputs_in_order) { - auto d_tensor_out_p = net.get_out(out); - ret.push_back(d_tensor_out_p); + std::vector::type>> ret; + ret.resize(_outputs_in_order.size()); + for (int out_idx = 0; out_idx < _outputs_in_order.size(); out_idx++) { + auto d_tensor_out_p = net.get_out(_outputs_in_order[out_idx]); + 
ret[out_idx].re_alloc(d_tensor_out_p->valid_shape()); + ret[out_idx].copy_from(*d_tensor_out_p); + float* data = (float*)(ret[out_idx].mutable_data()); + LOG(INFO) << "this thread: " << std::this_thread::get_id(); + for(int i=0; i< 10; i++) { + LOG(INFO) << "????? data " << data[i]; + } } return ret; - }; - return this->RunSync(task, net_ins_list); + }; + return this->RunAsync(task, net_ins_list); } -template -std::vector > Worker::sync_prediction_device(std::vector >& net_ins_list) { - auto task = [&](std::vector >& ins) -> std::vector > { - auto& net = MultiThreadModel::Global().get_net(std::this_thread::get_id()); +template +std::future > > Worker::sync_prediction_device(std::vector >& net_ins_list) { + auto task = [&](std::vector >& ins) -> std::vector > { + auto& net = MultiThreadModel::Global().get_net(std::this_thread::get_id()); //fill the graph inputs for (int i = 0; i < _inputs_in_order.size(); i++) { auto d_tensor_in_p = net.get_in(_inputs_in_order[i]); @@ -134,7 +158,7 @@ std::vector > Worker::sy } net.prediction(); // get outputs of graph - std::vector> ret; + std::vector> ret; for (auto out : _outputs_in_order) { auto d_tensor_out_p = net.get_out(out); ret.push_back(d_tensor_out_p); @@ -142,24 +166,26 @@ std::vector > Worker::sy return ret; }; - return this->RunSync(task, net_ins_list); + return this->RunAsync(task, net_ins_list); } -template -void Worker::async_prediction(std::vector::type, Dtype> >& net_ins_list) { +template +void Worker::async_prediction(std::vector::type> >& net_ins_list) { std::lock_guard guard(this->_async_que_mut); - auto task = [&](std::vector::type, Dtype> >& ins) -> std::vector > { - auto& net = MultiThreadModel::Global().get_net(std::this_thread::get_id()); + auto task = [&](std::vector::type> >& ins) -> std::vector > { + auto& net = MultiThreadModel::Global().get_net(std::this_thread::get_id()); //fill the graph inputs for(int i = 0; i < _inputs_in_order.size(); i++) { auto d_tensor_in_p = net.get_in(_inputs_in_order[i]); 
+ d_tensor_in_p->reshape(ins[i]->valid_shape()); d_tensor_in_p->copy_from(*ins[i]); + d_tensor_in_p->set_seq_offset(ins[i]->get_seq_offset()); } net.prediction(); // get outputs of graph - std::vector> ret; + std::vector> ret; for(auto out : _outputs_in_order) { auto d_tensor_out_p = net.get_out(out); ret.push_back(d_tensor_out_p); @@ -170,54 +196,63 @@ void Worker::async_prediction(std::vectorRunAsync(task, net_ins_list)); } -template -std::vector > Worker::async_get_result() { +template +std::vector > Worker::async_get_result() { std::lock_guard guard(this->_async_que_mut); auto result = std::move(_async_que.front()); _async_que.pop(); return result.get(); } -template -void Worker::init() { - MultiThreadModel::Global().initial(_model_path, _in_shapes); +template +void Worker::init() { + MultiThreadModel::Global().initial(_model_path, _in_shapes); } -template -void Worker::auxiliary_funcs() { +template +void Worker::auxiliary_funcs() { for(auto func : _auxiliary_funcs) { func(); } } #ifdef USE_CUDA -template class Worker; -template class Worker; -template class Worker; +template class Worker; +template class Worker; +template class Worker; -template class Worker; -template class Worker; -template class Worker; +template class Worker; +template class Worker; +template class Worker; #endif #ifdef USE_X86_PLACE -template class Worker; -template class Worker; -template class Worker; +template class Worker; +template class Worker; +template class Worker; -template class Worker; -template class Worker; -template class Worker; +template class Worker; +template class Worker; +template class Worker; #endif #ifdef USE_ARM_PLACE -template class Worker; -template class Worker; -template class Worker; -template class Worker; -template class Worker; -template class Worker; +#ifdef ANAKIN_TYPE_FP32 +template class Worker; +template class Worker; +#endif + +#ifdef ANAKIN_TYPE_FP16 +template class Worker; +template class Worker; +#endif + +#ifdef ANAKIN_TYPE_INT8 +template class 
Worker; +template class Worker; +#endif + #endif } /* namespace */ diff --git a/framework/core/net/worker.h b/framework/core/net/worker.h index 9ecf23842..b64f1cdbb 100644 --- a/framework/core/net/worker.h +++ b/framework/core/net/worker.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -62,7 +62,7 @@ namespace anakin { * \endcode * */ -template +template class Worker : public ThreadPool { public: Worker(std::string model_path, int thread_num); @@ -106,23 +106,23 @@ class Worker : public ThreadPool { * \param host net_in_list the inputs of net graph (note: the len of net_in_list should be equal to the net inputs). * \return the net graph outputs. */ - std::vector > sync_prediction(\ - std::vector::type, Dtype> >& net_in_list); + std::future::type> > > sync_prediction(\ + std::vector::type> >& net_in_list); /** * \brief Do sync prediction in multi-thread worker useful in sync rpc server, this function need * \param device net_in_list the inputs of net graph (note: the len of net_in_list should be equal to the net inputs). * \return the net graph outputs. */ - std::vector > sync_prediction_device(\ - std::vector >& net_in_list); + std::future > > sync_prediction_device(\ + std::vector >& net_in_list); /** * \brief do async prediction in multi-thread worker, the result will be save to que * \param net_in_list the inputs of net graph (note: the len of net_in_list should be equal to the net inputs) * \return void */ - void async_prediction(std::vector::type, Dtype> >& net_in_list); + void async_prediction(std::vector::type> >& net_in_list); /** * \brief Judge if the async queue is empty. 
@@ -135,7 +135,7 @@ class Worker : public ThreadPool { * the return order of results from async_get_result is the same as the order of net_in_list called by async_prediction. * \return the net inference result. */ - std::vector > async_get_result(); + std::vector > async_get_result(); public: /** @@ -175,7 +175,7 @@ class Worker : public ThreadPool { std::vector _outputs_in_order; ///< vector of edges in order. std::vector> _edges_in_order; - std::queue< std::future< std::vector > > > _async_que GUARDED_BY(_async_que_mut); + std::queue< std::future< std::vector > > > _async_que GUARDED_BY(_async_que_mut); std::mutex _async_que_mut; std::vector > _auxiliary_funcs; std::unordered_map> _in_shapes; @@ -185,17 +185,17 @@ class Worker : public ThreadPool { #endif }; -template +template template -void Worker::register_aux_function(functor function, ParamTypes ...args) { +void Worker::register_aux_function(functor function, ParamTypes ...args) { auto task = std::bind(function, std::forward(args)...); _auxiliary_funcs.push_back(task); } ///< global singleton worker -template -using GlobalWorker = Singleton>; +template +using GlobalWorker = Singleton>; } /* namespace */ diff --git a/framework/core/operator/operator.cpp b/framework/core/operator/operator.cpp index 7f5ca42f3..f7d518aff 100644 --- a/framework/core/operator/operator.cpp +++ b/framework/core/operator/operator.cpp @@ -2,16 +2,16 @@ namespace anakin { -inline std::vector& OpAttrObjectRegister::get_list_op_name() { +std::vector& OpAttrObjectRegister::get_list_op_name() { return this->get_list_name(); } // Operator attributes warpper object register -inline OpAttrWarpper* OpAttrObjectRegister::operator[](const std::string op_name) { +OpAttrWarpper* OpAttrObjectRegister::operator[](const std::string op_name) { return ObjectRegister::operator[](op_name); } -inline void OpAttrObjectRegister::add_alias(const std::string& ori_op_name, +void OpAttrObjectRegister::add_alias(const std::string& ori_op_name, const std::string& 
op_name_alias) { this->__alias__(ori_op_name, op_name_alias); } diff --git a/framework/core/operator/operator.h b/framework/core/operator/operator.h index c9b68d271..aeebbdefc 100644 --- a/framework/core/operator/operator.h +++ b/framework/core/operator/operator.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,25 +22,20 @@ #include "framework/core/factory.h" #include "framework/core/parameter.h" #include "framework/core/singleton.h" +#include "framework/utils/parameter_fusion.h" +#include "framework/graph/graph_global_mem.h" namespace anakin { -template -class OperatorHelper; +using namespace std::placeholders; -/** - * \brief Basic operation class. - */ -class OperatorBase { -public: - OperatorBase() {} - virtual ~OperatorBase() {} -}; +template +class OperatorHelper; /** * \brief Operator class, it's a base class for other op defined by anakin. */ -template +template class Operator : public OperatorBase { public: Operator() {} @@ -52,27 +47,27 @@ class Operator : public OperatorBase { } virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { + const std::vector >& ins, + std::vector >& outs) { LOG(ERROR) << "The Operator is basic"; } /** * \brief Bind helper. */ - Operator* operator>>(OperatorHelper* helper) { + Operator* operator>>(OperatorHelper* helper) { _helper = helper; return this; } ///< Receive helper and attr from outside define. - OperatorHelper* _helper{nullptr}; + OperatorHelper* _helper{nullptr}; }; /** * \brief Helper for operator, user defined helper should derived from it. 
*/ -template +template class OperatorHelper { public: OperatorHelper() {} @@ -83,31 +78,37 @@ class OperatorHelper { */ virtual Status InitParam() { DLOG(ERROR) << " Target ParserParam not overriden."; + return Status::FAIL(); } /** * \brief Initial all the resource needed by operator and it's also need to be overrided. */ virtual Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs){ + const std::vector >& ins, + std::vector >& outs){ DLOG(ERROR) << " Target init not overriden."; + return Status::FAIL(); } /** * \brief Infer the shape of output and input and it's also need to be overrided. */ - virtual Status InferShape(const std::vector >& ins, - std::vector >& outs){ + virtual Status InferShape(const std::vector >& ins, + std::vector >& outs){ DLOG(ERROR) << " Target infershape not overriden."; + return Status::FAIL(); } /** * \brief Bind parameter pack from graph. */ - void BindParam(graph::NodePtr& node_p) { - _node_p = std::make_shared>(); - *_node_p = *node_p; + void BindParam(graph::NodePtr& node_p) { + // Shareptr shallow copy + // Note: We can also use deep copy by using node operator=, + // but if change the node attrs through net class, + // the base graph can't detect it. + _node_p = node_p; } /** @@ -116,9 +117,31 @@ class OperatorHelper { template T get_attr(std::string attr_name) { return _node_p->get_attr(attr_name); } + /** + * \brief set target attr + */ + template + void set_sttr(const std::string& attr_name, const T val) { + _node_p->set_attr(attr_name, val); + } + + /** + * \brief Judge if op access target attr + */ + inline bool check_attr(const std::string& attr_name) { + return _node_p->inspect_attr(attr_name); + } + + /** + * \brief remove attr if it exists + */ + inline void remove_attr(const std::string& attr_name) { + _node_p->remove_attr(attr_name); + } + private: ///< Pointer to graph node. 
- graph::NodePtr _node_p; + graph::NodePtr _node_p; }; /** @@ -127,15 +150,33 @@ class OperatorHelper { #define GET_PARAMETER(type, name) \ this->template get_attr(#name) +/** + * \brief Call set_sttr from derived class. + */ +#define SET_PARAMETER(name, val, type) \ + this->template set_sttr(#name, val) + +/** + * \brief Call check_attr from derived class. + */ +#define CHECK_PARAMETER(name) \ + this->check_attr(#name) + +/** + * \brief Call remove_attr from derived class. + */ +#define REMOVE_PARAMETER(name) \ + this->remove_attr(#name) + /** * \brief Operator creator. * Typedef std::function OperatorCreator. */ -template -using OperatorCreator = std::function*()>; +template +using OperatorCreator = std::function*()>; -template -class OperatorFactory : public Factory, OperatorCreator> { +template +class OperatorFactory : public Factory, OperatorCreator> { public: /** @@ -145,11 +186,30 @@ class OperatorFactory : public Factory, OperatorCr return this->get_list_name(); } + /** + * \brief judge if op factory has target op by it's name + */ + virtual inline bool has_op(const std::string& op_name) { + auto& supp_op_name_vec = get_list_op_name(); + auto ret_it = std::find(supp_op_name_vec.begin(), supp_op_name_vec.end(), op_name); + if(ret_it != supp_op_name_vec.end()) { + return true; + } + return false; + } + /** * \brief Create Operator object by op_name. + * + * note: If Ptype is low precision( < FP32) and the low precise op doesn't exist, + * this function will return nullptr. + * */ - virtual Operator* operator[](const std::string op_name) { - Factory, OperatorCreator>::operator[](op_name); + virtual Operator* operator[](const std::string op_name) { + if(has_op(op_name)) { + return Factory, OperatorCreator>::operator[](op_name); + } + return nullptr; } /** @@ -161,8 +221,8 @@ class OperatorFactory : public Factory, OperatorCr }; ///< Typedef Singleton OpFactory. 
-template -using OpFactory = Singleton >; +template +using OpFactory = Singleton >; /** * \brief Operator objector register type. @@ -209,20 +269,20 @@ typedef Singleton OpAttrRegister; /// .set_in(1) /// .set_out(1) /// .Args("axis", " the axis in input dim index. ") -/// .Arg("bias_term", " whether include bias parameter.") +/// .Args("bias_term", " whether include bias parameter.") /// .Args>("weight", " the weight name.") /// .Args>("bias", " the bias name."); #define ANAKIN_REGISTER_OP(OpName) \ - static AK_ATTRIBUTE_UNUSED OpAttrWarpper& AK_MAKE_UNIQ_OPERATOR_NAME(OpName) = \ - OpAttrRegister::Global().Register(#OpName).name(#OpName) - -#define ANAKIN_REGISTER_OP_HELPER(OpName, OpHelperName, TargetT, DataT, PrecisionT) \ - static AK_ATTRIBUTE_UNUSED bool AK_MAKE_UNIQ_OPERATOR_NAME(OpName##_##OpHelperName##TargetT##DataT) = \ - OpFactory::Global().Register(#OpName, \ - []() { \ - OpName* tmpop = new OpName(); \ - (*tmpop)>>(new OpHelperName()); \ - return tmpop; \ + static AK_ATTRIBUTE_UNUSED OpAttrWarpper& AK_MAKE_UNIQ_OPERATOR_NAME(OpName) = \ + OpAttrRegister::Global().Register(#OpName).name(#OpName) + +#define ANAKIN_REGISTER_OP_HELPER(OpName, OpHelperName, TargetT, PrecisionT) \ + static AK_ATTRIBUTE_UNUSED bool AK_MAKE_UNIQ_OPERATOR_NAME(OpName##_##OpHelperName##TargetT) = \ + OpFactory::Global().Register(#OpName, \ + []() { \ + OpName* tmpop = new OpName(); \ + (*tmpop)>>(new OpHelperName()); \ + return tmpop; \ } ) diff --git a/framework/core/operator/operator_attr.cpp b/framework/core/operator/operator_attr.cpp index 885eb572c..9e7a3ad38 100644 --- a/framework/core/operator/operator_attr.cpp +++ b/framework/core/operator/operator_attr.cpp @@ -3,66 +3,79 @@ namespace anakin { -OpAttrWarpper& OpAttrWarpper::name(std::string op_name) { +OpAttrWarpper& OpAttrWarpper::name(const std::string& op_name) { //! set the origin op name. 
opAttr_.name = op_name; + return *this; } -template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name) { +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name) { OpAttrRegister::Global().add_alias(this->opAttr_.name, op_name); - OpFactory::Global().add_alias(this->opAttr_.name, op_name); - return *(this); + OpFactory::Global().add_alias(this->opAttr_.name, op_name); + return *this; } +//#ifdef USE_CUDA +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +//#endif +//#ifdef USE_X86_PLACE template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +//#endif +//#ifdef USE_ARM_PLACE +//#ifdef ANAKIN_TYPE_FP32 template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +//#endif + +//#ifdef ANAKIN_TYPE_FP16 template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +//#endif + +//#ifdef ANAKIN_TYPE_INT8 template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +//#endif + +//#endif template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const 
std::string& op_name); template -OpAttrWarpper& OpAttrWarpper::__alias__(std::string op_name); +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); -OpAttrWarpper& OpAttrWarpper::Doc(std::string doc) { +OpAttrWarpper& OpAttrWarpper::Doc(const std::string& doc) { opAttr_.doc = doc; - return *(this); + return *this; } OpAttrWarpper& OpAttrWarpper::num_in(size_t num) { opAttr_.num_in = num; - return *(this); + return *this; } OpAttrWarpper& OpAttrWarpper::num_out(size_t num) { opAttr_.num_out = num; - return *(this); + return *this; } OpAttrWarpper& OpAttrWarpper::commutative(bool is_commutative) { opAttr_.is_commutative = is_commutative; - return *(this); -} - -template -T& OpAttrWarpper::GetArg(std::string arg_name, graph::AttrInfo& info) { - CHECK(this->has_arg(arg_name)) << " the operator doesn't have target argument: " << arg_name; - CHECK(info.parameter.count(arg_name) > 0) << " Attr info doesn't have target argument: " << - arg_name; - any& target_arg = info.parameter[arg_name]; - return any_cast(target_arg); + return *this; } } /* namespace anakin */ diff --git a/framework/core/operator/operator_attr.h b/framework/core/operator/operator_attr.h index 23af931e7..32ae4b942 100644 --- a/framework/core/operator/operator_attr.h +++ b/framework/core/operator/operator_attr.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -59,13 +59,13 @@ class OpAttrWarpper { /** * \brief Set origin op name (opAttr_ = op_name) */ - OpAttrWarpper& name(std::string op_name); + OpAttrWarpper& name(const std::string&); /// set alias name of Operator. - template - OpAttrWarpper& __alias__(std::string); + template + OpAttrWarpper& __alias__(const std::string&); /// set description doc for target op. 
- OpAttrWarpper& Doc( std::string ); + OpAttrWarpper& Doc(const std::string&); /// set and get number input and output. OpAttrWarpper& num_in(size_t); OpAttrWarpper& num_out(size_t); @@ -83,7 +83,7 @@ class OpAttrWarpper { * param arg_doc : the doc for argument [default = ""]. */ template - OpAttrWarpper& Args(std::string arg_name, std::string arg_doc = "") { + OpAttrWarpper& Args(const std::string& arg_name, const std::string& arg_doc = "") { Argument arg; arg.name = arg_name; arg.type = type_id().type_info(); @@ -101,7 +101,12 @@ class OpAttrWarpper { * \brief Get arg value from attributes info (AttrInfo) in node. */ template - T& GetArg(std::string arg_name, graph::AttrInfo& info); + T& GetArg(std::string arg_name, graph::AttrInfo& info) { + CHECK(this->has_arg(arg_name)) << " the operator doesn't have target argument: " << arg_name; + CHECK(info.inspect(arg_name)) << " Attr info doesn't have target argument: " + << arg_name; + return info.get(arg_name); + } /** * \brief Get name of operator. @@ -125,7 +130,7 @@ class OpAttrWarpper { /** * \brief Judge if OperatorAttr the argument's name. */ - bool has_arg(std::string arg_name) { return opAttr_.Args_map.count(arg_name) > 0; } + bool has_arg(const std::string& arg_name) { return opAttr_.Args_map.count(arg_name) > 0; } friend class OpAttrHelper; private: diff --git a/framework/core/operator/operator_help.cpp b/framework/core/operator/operator_help.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/framework/core/operator/operator_help.h b/framework/core/operator/operator_help.h new file mode 100644 index 000000000..0a0f42100 --- /dev/null +++ b/framework/core/operator/operator_help.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_HELP_H +#define ANAKIN_OPERATOR_HELP_H + +namespace anakin { + +class MixPrecOpGen { +}; + + +} /* namespace anakin */ + +#endif diff --git a/framework/core/operator/request.h b/framework/core/operator/request.h index ba6b0e886..d7a173281 100644 --- a/framework/core/operator/request.h +++ b/framework/core/operator/request.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/parameter.h b/framework/core/parameter.h index 4fc6b8bdc..0973c9fec 100644 --- a/framework/core/parameter.h +++ b/framework/core/parameter.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,14 +33,14 @@ using namespace saber; /// Basic type define comes from lib saber by hac-sys-baidu. #ifndef USE_SABER #define USE_SABER -template -using Tensor4d = Tensor;/// saber::Tensor<4, Ttype, Dtype, NCHW>; +template +using Tensor4d = Tensor; /// Global type to tensor pointer. 
-template -using Tensor4dPtr = Tensor4d*;/// std::shared_ptr >; +template +using Tensor4dPtr = Tensor4d*; -template -using TensorSharedPtr = std::shared_ptr >; +template +using TensorSharedPtr = std::shared_ptr >; using Shape4d = Shape;/// saber::Shape<4>; @@ -165,47 +165,58 @@ class PTuple { template struct DataTypeRecover; /// declare for PBlock -/** +/** * \brief a simple wrapper of tensor use in weights parameter. * default layout [ NCHW ] */ -template +template class PBlock { public: inline bool host_only() { return true; } + + inline void map_to_host() {} }; #ifdef USE_CUDA -template -class PBlock { +template<> +class PBlock { public: - typedef Tensor4d::type> d_type; - typedef Tensor4d::type> h_type; + typedef Tensor4d d_type; + typedef Tensor4d h_type; - PBlock() { - _d_inner_tensor = std::make_shared(); - _h_inner_tensor = std::make_shared(); + PBlock(DataType Dtype = AK_FLOAT) { + _d_inner_tensor = std::make_shared(Dtype); + _h_inner_tensor = std::make_shared(Dtype); } - PBlock(Shape4d& shape) { - _d_inner_tensor = std::make_shared(shape); - _h_inner_tensor = std::make_shared(shape); + PBlock(Shape4d& shape, DataType Dtype = AK_FLOAT) { + _d_inner_tensor = std::make_shared(shape, Dtype); + _h_inner_tensor = std::make_shared(shape, Dtype); } inline bool host_only() { return false; } + inline void map_to_host() { + _h_inner_tensor->re_alloc(this->real_shape()); + auto save_valid_shape = _d_inner_tensor->valid_shape(); + _d_inner_tensor->set_shape(this->real_shape()); + _h_inner_tensor->copy_from(*_d_inner_tensor); + _d_inner_tensor->set_shape(save_valid_shape); + _h_inner_tensor->set_shape(save_valid_shape); + } + /// shallow copy construction - PBlock(PBlock& p_block) { *this = p_block; } + PBlock(PBlock& p_block) { *this = p_block; } - PBlock(const PBlock& p_block) { *this = p_block; } + PBlock(const PBlock& p_block) { *this = p_block; } /// assign - PBlock& operator=(const PBlock& p_block) { + PBlock& operator=(const PBlock& p_block) { _d_inner_tensor = 
p_block._d_inner_tensor; _h_inner_tensor = p_block._h_inner_tensor; } - PBlock& operator=(PBlock& p_block) { + PBlock& operator=(PBlock& p_block) { _d_inner_tensor = p_block._d_inner_tensor; _h_inner_tensor = p_block._h_inner_tensor; } @@ -215,11 +226,100 @@ class PBlock { h_type& h_tensor() { return *(_h_inner_tensor); } /// Get host data to vector. - std::vector vector() { - std::vector ret; - auto* data = _h_inner_tensor->mutable_data(); + std::vector vector() { + std::vector ret; + DataTraitBase::PtrDtype data = _h_inner_tensor->mutable_data(); for (int i = 0; i <_h_inner_tensor->valid_size(); i++) { - ret.push_back(data[i]); + ret.push_back(((float*)data)[i]); + } + return ret; + } + + /// reallocate the storage + void re_alloc(Shape4d shape) { + _d_inner_tensor->re_alloc(shape); + _h_inner_tensor->re_alloc(shape); + } + + /// Get shape. + Shape4d shape() const { + CHECK(_d_inner_tensor->valid_shape() == _h_inner_tensor->valid_shape()) + << " [Fatal Err] device shape is not equal to that of host in PBlock"; + return _d_inner_tensor->valid_shape(); + } + + /// get real shape + Shape4d real_shape() { + return _d_inner_tensor->shape(); + } + + /// Get size. 
+ size_t count() const { + return this->shape().count(); + } + + ~PBlock() {} + +private: + std::shared_ptr _d_inner_tensor; + std::shared_ptr _h_inner_tensor; +}; +#endif + +#ifdef AMD_GPU +template<> +class PBlock { +public: + typedef Tensor4d d_type; + typedef Tensor4d h_type; + + PBlock(DataType Dtype = AK_FLOAT) { + _d_inner_tensor = std::make_shared(Dtype); + _h_inner_tensor = std::make_shared(Dtype); + } + + PBlock(Shape4d& shape, DataType Dtype = AK_FLOAT) { + _d_inner_tensor = std::make_shared(shape, Dtype); + _h_inner_tensor = std::make_shared(shape, Dtype); + } + + inline bool host_only() { return false; } + + inline void map_to_host() { + _h_inner_tensor->re_alloc(this->real_shape()); + auto save_valid_shape = _d_inner_tensor->valid_shape(); + _d_inner_tensor->set_shape(this->real_shape()); + _h_inner_tensor->copy_from(*_d_inner_tensor); + _d_inner_tensor->set_shape(save_valid_shape); + _h_inner_tensor->set_shape(save_valid_shape); + } + + /// shallow copy construction + PBlock(PBlock& p_block) { *this = p_block; } + + PBlock(const PBlock& p_block) { *this = p_block; } + + /// assign + PBlock& operator=(const PBlock& p_block) { + _d_inner_tensor = p_block._d_inner_tensor; + _h_inner_tensor = p_block._h_inner_tensor; + } + + PBlock& operator=(PBlock& p_block) { + _d_inner_tensor = p_block._d_inner_tensor; + _h_inner_tensor = p_block._h_inner_tensor; + } + + /// Get tensor. + d_type& d_tensor() { return *(_d_inner_tensor); } + h_type& h_tensor() { return *(_h_inner_tensor); } + + /// Get host data to vector. + std::vector vector() { + std::vector ret; + DataTraitBase::PtrDtype data = _h_inner_tensor->mutable_data(); + for (int i = 0; i <_h_inner_tensor->valid_size(); i++) { + ret.push_back(((float*)data)[i]); } return ret; } @@ -231,6 +331,12 @@ class PBlock { return _d_inner_tensor->valid_shape(); } + /// get real shape + Shape4d real_shape() { + return _d_inner_tensor->shape(); + } + + /// Get size. 
size_t count() { return this->shape().count(); @@ -245,32 +351,34 @@ class PBlock { #endif #ifdef USE_X86_PLACE -template -class PBlock { +template<> +class PBlock { public: - typedef Tensor4d::type> type; + typedef Tensor4d type; - PBlock() { - _inner_tensor = std::make_shared(); + PBlock(DataType Dtype = AK_FLOAT) { + _inner_tensor = std::make_shared(Dtype); } - PBlock(Shape4d& shape) { - _inner_tensor = std::make_shared(shape); + PBlock(Shape4d& shape, DataType Dtype = AK_FLOAT) { + _inner_tensor = std::make_shared(shape, Dtype); } inline bool host_only() { return true; } + + inline void map_to_host() {} /// shallow copy construction - PBlock(PBlock& p_block) { *this = p_block; } + PBlock(PBlock& p_block) { *this = p_block; } - PBlock(const PBlock& p_block) { *this = p_block; } + PBlock(const PBlock& p_block) { *this = p_block; } /// assign - PBlock& operator=(const PBlock& p_block) { + PBlock& operator=(const PBlock& p_block) { _inner_tensor = p_block._inner_tensor; } - PBlock& operator=(PBlock& p_block) { + PBlock& operator=(PBlock& p_block) { _inner_tensor = p_block._inner_tensor; } @@ -279,60 +387,74 @@ class PBlock { type& h_tensor() { return *(_inner_tensor); } /// Get host data to vector. - std::vector vector() { - std::vector ret; - auto* data = _inner_tensor->mutable_data(); + std::vector vector() { + std::vector ret; + DataTraitBase::PtrDtype data = _inner_tensor->mutable_data(); for (int i = 0; i <_inner_tensor->valid_size(); i++) { - ret.push_back(data[i]); + ret.push_back(((float*)data)[i]); } return ret; } + // reallocate storage + void re_alloc(Shape4d shape) { + _inner_tensor->re_alloc(shape); + } + /// Get shape. - Shape4d shape() { - return _inner_tensor->valid_shape(); + Shape4d shape() { + return _inner_tensor->valid_shape(); } + /// get real shape + Shape4d real_shape() { + return _inner_tensor->shape(); + } + + /// Get size. 
- size_t count() { + size_t count() { return this->shape().count(); } ~PBlock() {} -private: - std::shared_ptr _inner_tensor; +private: + std::shared_ptr _inner_tensor; }; #endif #ifdef USE_ARM_PLACE -template -class PBlock { +template<> +class PBlock { public: - typedef Tensor4d::type> type; + typedef Tensor4d type; - PBlock() { - _inner_tensor = std::make_shared(); + PBlock(DataType Dtype = AK_FLOAT) { + _inner_tensor = std::make_shared(Dtype); } - PBlock(Shape4d& shape) { - _inner_tensor = std::make_shared(shape); + PBlock(Shape4d& shape, DataType Dtype = AK_FLOAT) { + _inner_tensor = std::make_shared(shape, Dtype); } inline bool host_only() { return true; } + inline void map_to_host() {} /// shallow copy construction - PBlock(PBlock& p_block) { *this = p_block; } + PBlock(PBlock& p_block) { *this = p_block; } - PBlock(const PBlock& p_block) { *this = p_block; } + PBlock(const PBlock& p_block) { *this = p_block; } /// assign - PBlock& operator=(const PBlock& p_block) { - _inner_tensor = p_block._inner_tensor; + PBlock& operator=(const PBlock& p_block) { + this->_inner_tensor = p_block._inner_tensor; + return *this; } - PBlock& operator=(PBlock& p_block) { - _inner_tensor = p_block._inner_tensor; + PBlock& operator=(PBlock& p_block) { + this->_inner_tensor = p_block._inner_tensor; + return *this; } /// Get tensor. @@ -340,34 +462,42 @@ class PBlock { type& h_tensor() { return *(_inner_tensor); } /// Get host data to vector. - std::vector vector() { - std::vector ret; - auto* data = _inner_tensor->mutable_data(); + std::vector vector() { + std::vector ret; + DataTraitBase::PtrDtype data = _inner_tensor->mutable_data(); for (int i = 0; i <_inner_tensor->valid_size(); i++) { - ret.push_back(data[i]); + ret.push_back(((float*)data)[i]); } return ret; } + // reallocate the storage + void re_alloc(Shape4d shape) { + _inner_tensor->re_alloc(shape); + } + /// Get shape. 
- Shape4d shape() { - return _inner_tensor->valid_shape(); + Shape4d shape() { + return _inner_tensor->valid_shape(); + } + + Shape4d real_shape() { + return _inner_tensor->shape(); } /// Get size. - size_t count() { + size_t count() { return this->shape().count(); } ~PBlock() {} -private: - std::shared_ptr _inner_tensor; +private: + std::shared_ptr _inner_tensor; }; #endif - -/** +/** * \brief Enum type. */ struct Enum { diff --git a/framework/core/singleton.h b/framework/core/singleton.h index f0f8b5dd9..b79b94042 100644 --- a/framework/core/singleton.h +++ b/framework/core/singleton.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/thread_pool.h b/framework/core/thread_pool.h index f4687bf33..aea288aec 100644 --- a/framework/core/thread_pool.h +++ b/framework/core/thread_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -32,32 +32,88 @@ namespace anakin { class ThreadPool { public: ThreadPool(int num_thread):_num_thread(num_thread) {} - virtual ~ThreadPool(); - - void launch(); + virtual ~ThreadPool(){ + stop(); + this->_cv.notify_all(); + for(auto & worker: _workers){ + worker.join(); + } + } + + void launch() { + for(size_t i = 0; i<_num_thread; ++i) { + _workers.emplace_back( + [i ,this]() { + // initial + this->init(); + for(;;) { + std::function task; + { + std::unique_lock lock(this->_mut); + while(!this->_stop && this->_tasks.empty()) { + this->_cv.wait(lock); + } + if(this->_stop) { + return ; + } + task = std::move(this->_tasks.front()); + this->_tasks.pop(); + } + DLOG(INFO) << " Thread (" << i <<") processing"; + auxiliary_funcs(); + task(); + } + } + ); + } + } /** * \brief Lanuch the normal function task in sync. */ template - typename function_traits::return_type RunSync(functor function, ParamTypes ...args); + typename function_traits::return_type RunSync(functor function, ParamTypes ...args) { + auto task = std::make_shared::return_type(void)> >( \ + std::bind(function, std::forward(args)...) + ); + std::future::return_type> result = task->get_future(); + { + std::unique_lock lock(this->_mut); + this->_tasks.emplace( [&]() { (*task)(); } ); + } + this->_cv.notify_one(); + return result.get(); + } /** * \brief Lanuch the normal function task in async. */ template - typename std::future::return_type> RunAsync(functor function, ParamTypes ...args); - + typename std::future::return_type> RunAsync(functor function, ParamTypes ...args) { + auto task = std::make_shared::return_type(void)> >( \ + std::bind(function, std::forward(args)...) + ); + std::future::return_type> result = task->get_future(); + { + std::unique_lock lock(this->_mut); + this->_tasks.emplace( [=]() { (*task)(); } ); + } + this->_cv.notify_one(); + return result; + } /// Stop the pool. 
- void stop(); + void stop() { + std::unique_lock lock(this->_mut); + _stop = true; + } private: /// The initial function should be overrided by user who derive the ThreadPool class. - virtual void init(); + virtual void init(){} /// Auxiliary function should be overrided when you want to do other things in the derived class. - virtual void auxiliary_funcs(); + virtual void auxiliary_funcs(){} private: int _num_thread; @@ -70,6 +126,6 @@ class ThreadPool { } /* namespace anakin */ -#include "thread_pool.inl" +//#include "thread_pool.inl" #endif diff --git a/framework/core/thread_safe_macros.h b/framework/core/thread_safe_macros.h index 6f08cdd4e..e8d9b6f97 100644 --- a/framework/core/thread_safe_macros.h +++ b/framework/core/thread_safe_macros.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/tls.h b/framework/core/tls.h index df38fa2ae..888fce80b 100644 --- a/framework/core/tls.h +++ b/framework/core/tls.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/core/type_traits_extend.h b/framework/core/type_traits_extend.h index 2c22bca86..0afa52c7e 100644 --- a/framework/core/type_traits_extend.h +++ b/framework/core/type_traits_extend.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -121,16 +121,19 @@ struct is_status_function : template struct PrecisionWrapper { typedef float type; + const static saber::DataType saber_type = saber::AK_FLOAT; }; template<> struct PrecisionWrapper { typedef int8_t type; + const static saber::DataType saber_type = saber::AK_INT8; }; template<> struct PrecisionWrapper { typedef unsigned short type; + const static saber::DataType saber_type = saber::AK_HALF; }; template @@ -157,7 +160,55 @@ struct target_host { typedef saber::NVHX86 type; }; +template +struct ServiceRunPatternToType{}; +template +struct target_name { + static constexpr const char* value = "INVALID"; +}; + +#define TARGET_NAME_SET(Ttype, TargetName) \ + template<>\ + struct target_name {\ + static constexpr const char* value = #TargetName;\ + }; + +TARGET_NAME_SET(saber::NV, saber_NV) +TARGET_NAME_SET(saber::NVHX86, saber_NVHX86) +TARGET_NAME_SET(saber::X86, saber_X86) +TARGET_NAME_SET(saber::ARM, saber_ARM) +TARGET_NAME_SET(saber::BM, saber_BM) +TARGET_NAME_SET(saber::AMD, saber_AMD) + +/*template +struct target_name { + static const char* value() { + return ret(Ttype()); + } +private: + static const char* ret(saber::NV) { + return "saber::NV"; + } + static const char* ret(saber::NVHX86) { + return "saber::NVHX86"; + } + static const char* ret(saber::X86) { + return "saber::X86"; + } + static const char* ret(saber::ARM) { + return "saber::ARM"; + } + static const char* ret(saber::BM) { + return "saber::BM"; + } + static const char* ret(saber::AMD) { + return "saber::AMD"; + } + static const char* ret(saber::INVLD) { + return "saber::INVLD"; + } +};*/ } /* namespace anakin */ diff --git a/framework/core/types.h b/framework/core/types.h index 3d872ce40..590b9c4f3 100644 --- a/framework/core/types.h +++ b/framework/core/types.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -38,6 +38,14 @@ enum class OpRunType : int { ASYNC ///< ASYNC the net exec asynchronous (for GPU, means mutli-stream) }; +/** + * \brief service run pattern + */ +enum class ServiceRunPattern: int { + SYNC, + ASYNC +}; + /** * \brief Inner return type used by Status type. */ diff --git a/framework/graph/algorithm.h b/framework/graph/algorithm.h index 39adca92b..4523acf4a 100644 --- a/framework/graph/algorithm.h +++ b/framework/graph/algorithm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/graph/arc.h b/framework/graph/arc.h index c42293eda..3d86edd44 100644 --- a/framework/graph/arc.h +++ b/framework/graph/arc.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ class Arc { Arc(VertexNameType vertex_1, VertexNameType vertex_2); Arc(VertexNameType vertex_1, VertexNameType vertex_2, WeightType weight); Arc(const Arc& otherArc); - virtual ~Arc() {}; + virtual ~Arc() {} /// judge if one arc equal to another bool operator==(const Arc& otherArc) const { diff --git a/framework/graph/graph.cpp b/framework/graph/graph.cpp index e0d9361b1..122ceb788 100644 --- a/framework/graph/graph.cpp +++ b/framework/graph/graph.cpp @@ -1,6 +1,7 @@ #include "framework/graph/graph.h" #include "framework/model_parser/parser/parser.h" #include "framework/graph/llvm/scheduler.h" +#include "framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.h" #include "framework/graph/llvm/optimizer/parall_scheduler.h" #include "framework/graph/llvm/optimizer/memory_scheduler.h" #include "framework/graph/llvm/fusion/graph_pattern.h" @@ -10,41 +11,41 @@ namespace anakin { namespace graph { -template -Status Graph::load(std::string model_path) EXCLUSIVE_LOCKS_REQUIRED(_mut) { +template +Status Graph::load(std::string model_path) EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); Status ret = Status::OK(); if(model_path != _model_path) { this->Clean(); - ret = parser::load(this, model_path); + ret = parser::load(this, model_path); _model_path = model_path; } return ret; } -template -Status Graph::load(const char* model_path) { - return parser::load(this, model_path); +template +Status Graph::load(const char* model_path) { + return parser::load(this, model_path); } -template -Status Graph::save(std::string model_path) { - return parser::save(this, model_path); +template +Status Graph::save(std::string model_path) { + return parser::save(this, model_path); } -template -Status Graph::save(const char* model_path) { - return parser::save(this, model_path); +template +Status Graph::save(const char* model_path) { + return parser::save(this, model_path); } -template -std::vector& Graph::get_nodes_in_order() { +template +std::vector& 
Graph::get_nodes_in_order() { return _nodes_exec_order; } -template -void Graph::Reshape(std::string in_name, +template +void Graph::Reshape(std::string in_name, std::vector shape) EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); auto input_node_p = (*this)[in_name]; @@ -61,8 +62,8 @@ void Graph::Reshape(std::string in_name, input_node_p->set_attr(in_shape, input_dim); } -template -void Graph::ResetBatchSize(std::string in_name, +template +void Graph::ResetBatchSize(std::string in_name, const int batch_size) EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); auto input_node_p = (*this)[in_name]; @@ -73,17 +74,17 @@ void Graph::ResetBatchSize(std::string in_name, input_node_p->set_attr(in_shape, input_dim); } -template -Status Graph::RegistOut(std::string node_bottom_name, +template +Status Graph::RegistOut(std::string node_bottom_name, std::string node_top_name) { std::pair tmp_pair(node_bottom_name, node_top_name); _registed_outs.push_back(tmp_pair); return Status::OK();; } -template -Status Graph::RegistAllOut() { - auto register_edge = [&, this](Edge& edge) { +template +Status Graph::RegistAllOut() { + auto register_edge = [&, this](Edge& edge) { this->RegistOut(edge.bottom(), edge.top()); return Status::OK(); }; @@ -94,8 +95,8 @@ Status Graph::RegistAllOut() { return Status::OK();; } -template -Status Graph::Optimize() EXCLUSIVE_LOCKS_REQUIRED(_mut) { +template +Status Graph::Optimize() EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); if (!_has_graph_optimized) { @@ -106,7 +107,7 @@ Status Graph::Optimize() EXCLUSIVE_LOCKS_REQUIRED(_mut) { //! 
decide wheter the vgraph is optimized auto is_optimized = statistics.get_info(); - if (is_optimized && _registed_outs.size() == 0) { + if (is_optimized && (_registed_outs.size() == 0)) { // schedule for exec order Scheduler scheduler; scheduler.RegIOResource(_vgraph); @@ -129,10 +130,21 @@ Status Graph::Optimize() EXCLUSIVE_LOCKS_REQUIRED(_mut) { scheduler.RegIOResource(_vgraph); scheduler.Run(); + _nodes_exec_order = scheduler.get_exec_node_in_order(); + + +#if 0 // get node exec in order _nodes_exec_order = scheduler.get_exec_node_in_order(); - +#else // enable conv+eltwise fusion // optimization + ConvElsFusionScheduler conv_eltwise_fusion_scheduler; + conv_eltwise_fusion_scheduler.RegIOResource(_vgraph); + conv_eltwise_fusion_scheduler.Run(); + // get node exec in order + //_nodes_exec_order = conv_eltwise_fusion_scheduler.get_exec_node_in_order(); +#endif + // optimization again MemoryScheduler mem_scheduler; mem_scheduler.RegIOResource(_vgraph); mem_scheduler.Run(); @@ -140,8 +152,6 @@ Status Graph::Optimize() EXCLUSIVE_LOCKS_REQUIRED(_mut) { para_scheduler.RegIOResource(_vgraph); para_scheduler.Run(); - LOG(INFO) << "input_0 name: " << (*_vgraph)["input_0"].name << " input_0 lane: " << (*_vgraph)["input_0"].lane << " wait: " << (*_vgraph)["input_0"].need_wait; - // set info for graph statistics.set_info(true); DLOG(INFO) << " model size : " << graph::GraphGlobalMem::Global().get_sum_mbyte() << " mb "; @@ -155,12 +165,12 @@ Status Graph::Optimize() EXCLUSIVE_LOCKS_REQUIRED(_mut) { } #ifdef ENABLE_DEBUG - auto print_edge_debug_string = [](Edge& edge) { + auto print_edge_debug_string = [](Edge& edge) { DLOG(INFO) << "Real Graph Edge : " << edge.ToString(); return Status::OK(); }; this->Scanner->BFS_Edge(print_edge_debug_string); - auto print_Node_debug_string = [](NodePtr& target_node) { + auto print_Node_debug_string = [](NodePtr& target_node) { DLOG(INFO) << "Real Graph Node : " << target_node->ToString(); return Status::OK(); }; @@ -169,10 +179,10 @@ Status 
Graph::Optimize() EXCLUSIVE_LOCKS_REQUIRED(_mut) { return Status::OK(); } -template -VGraph& Graph::get_vgraph() { +template +VGraph& Graph::get_vgraph() { _vgraph = new VGraph(); - auto set_nodes = [&](NodePtr& node_p) { + auto set_nodes = [&](NodePtr& node_p) { node v_node; v_node.name = node_p->name(); v_node.opName = node_p->get_op_name(); @@ -181,14 +191,8 @@ VGraph& Graph::get_vgraph() { }; // add node this->Scanner->BFS(set_nodes); - /*auto set_edge_io = [&](Edge& edge) { // this funciton may damage the construction in op input order - io v_io; - v_io.name = edge.name(); - Arc arc(edge.bottom(), edge.top(), v_io); - _vgraph->add_arc(arc); - return Status::OK(); - };*/ - auto set_edge_io_in = [&](NodePtr& node_p) { + + auto set_edge_io_in = [&](NodePtr& node_p) { auto& edge_its = this->get_in_arc_its(node_p->name()); for (auto& edge_it : edge_its) { @@ -201,7 +205,7 @@ VGraph& Graph::get_vgraph() { return Status::OK(); }; - auto set_edge_io_out = [&](NodePtr& node_p) { + auto set_edge_io_out = [&](NodePtr& node_p) { auto& edge_its = this->get_out_arc_its(node_p->name()); for (auto& edge_it : edge_its) { @@ -226,8 +230,8 @@ VGraph& Graph::get_vgraph() { return *_vgraph; } -template -Status Graph::restore_from_vgraph(VGraph* vgraph) { +template +Status Graph::restore_from_vgraph(VGraph* vgraph) { //! 
need to clear graph edge first this->arcs_clear(); @@ -237,7 +241,7 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { auto& tmp_io = arc_it->weight(); auto& bottom = arc_it->bottom(); auto& top = arc_it->top(); - Edge edge(bottom, top); + Edge edge(bottom, top); auto& shared = edge.shared(); shared = tmp_io.shared; auto& share_from = edge.share_from(); @@ -258,15 +262,15 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { auto& tmp_io = arc_it->weight(); auto& bottom = arc_it->bottom(); auto& top = arc_it->top(); - Edge edge(bottom, top); + Edge edge(bottom, top); auto& shared = edge.shared(); shared = tmp_io.shared; auto& share_from = edge.share_from(); share_from = tmp_io.share_from; auto& lane = edge.lane(); lane = tmp_io.lane; - //edge.weight() = new Tensor4d(); - //edge.weight() = std::make_shared >(); + //edge.weight() = new Tensor4d(); + //edge.weight() = std::make_shared >(); this->add_out_arc(edge); } @@ -276,7 +280,7 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { vgraph->Scanner->BFS(interpreter_io_out); // this will change this real graph // interpreter for node, more complicated - auto map_node_to_node_ptr = [this](NodePtr& node_p, + auto map_node_to_node_ptr = [this](NodePtr& node_p, node & target_node) -> Status { if (node_p->name() == target_node.name) { CHECK_EQ(target_node.mergeNodes.size(), target_node.mergeNodeNames.size()) @@ -288,6 +292,11 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { this->_pattern_name_merges[target_node.name].push_back(target_node.mergeNodeNames[i]); } } + if(target_node.idx_keep_in_merge_nodes.size()) { + for(auto& idx : target_node.idx_keep_in_merge_nodes) { + this->_node_merges_keep[target_node.name].push_back(idx); + } + } auto& need_wait = node_p->need_wait(); need_wait = target_node.need_wait; @@ -307,7 +316,7 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { vgraph->Scanner->BFS(interpreter_node); //! 
merge the attr of nodes which need to merge - auto merge_node_attrs = [this](NodePtr& node_p) -> Status { + auto merge_node_attrs = [this](NodePtr& node_p) -> Status { auto& target_node_name = node_p->name(); if (this->_node_merges.count(target_node_name) > 0 && this->_node_merges[target_node_name].size()) { @@ -315,7 +324,14 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { auto& tmp_node_p = this->operator[](this->_node_merges[target_node_name][i]); (*node_p).Merge(*tmp_node_p, this->_pattern_name_merges[target_node_name][i]); // add the merge node's attr - this->remove(this->_node_merges[target_node_name][i]); // remove merge node which is useless + + // detect if the i-th node in _node_merges should be saved in Graph + auto ret = std::find(this->_node_merges_keep[target_node_name].begin(), + this->_node_merges_keep[target_node_name].end(), + i); + if(ret == this->_node_merges_keep[target_node_name].end()) { + this->remove(this->_node_merges[target_node_name][i]); // remove merge node which is useless + } } } @@ -325,42 +341,47 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { return Status::OK(); } -template -Status Graph::CopyFrom(Graph& graph) { +template +Status Graph::CopyFrom(Graph& graph) { // this clear all the edges and nodes this->all_clear(); - auto shallow_copy_node = [&, this](NodePtr& node_p) { + auto shallow_copy_node = [&, this](NodePtr& node_p) { // create and copy node - NodePtr node_new_p = std::make_shared>(); + NodePtr node_new_p = std::make_shared(); *node_new_p = *node_p; this->add_vertex(node_new_p->name(), node_new_p); }; graph.Scanner->BFS(shallow_copy_node); - auto shallow_copy_edge = [&, this](NodePtr& node_p) { + auto shallow_copy_edge = [&, this](NodePtr& node_p) { // create and copy edges auto edge_in_its = graph.get_in_arc_its(node_p->name()); for (auto in_it : edge_in_its) { - in_it->weight() = std::make_shared >(); + in_it->weight() = std::make_shared >(); this->add_in_arc(*in_it); } auto edge_out_its = 
graph.get_out_arc_its(node_p->name()); for (auto out_it : edge_out_its) { - out_it->weight() = std::make_shared >(); + out_it->weight() = std::make_shared >(); this->add_out_arc(*out_it); } }; graph.Scanner->BFS(shallow_copy_edge); // get node execution order _nodes_exec_order = graph.get_nodes_in_order(); + // get graph inputs and outputs + _ins = graph._ins; + _outs = graph._outs; + // get statistic + statistics = graph.statistics; return Status::OK(); } -template -Status Graph::Clean() { +template +Status Graph::Clean() { // this clear all the edges and nodes this->all_clear(); // delete _vgraph pointer @@ -373,23 +394,34 @@ Status Graph::Clean() { } #ifdef USE_CUDA -template class Graph; -template class Graph; -template class Graph; +template class Graph; +template class Graph; +template class Graph; #endif #ifdef USE_X86_PLACE -template class Graph; -template class Graph; -template class Graph; +template class Graph; +template class Graph; +template class Graph; #endif #ifdef USE_ARM_PLACE -template class Graph; -template class Graph; -template class Graph; +#ifdef ANAKIN_TYPE_FP32 +template class Graph; +#endif +#ifdef ANAKIN_TYPE_FP16 +template class Graph; +#endif +#ifdef ANAKIN_TYPE_INT8 +template class Graph; +#endif #endif +#ifdef AMD_GPU +template class Graph; +template class Graph; +template class Graph; +#endif } /* namespace graph */ } /* namespace anakin */ diff --git a/framework/graph/graph.h b/framework/graph/graph.h index 9aa549197..c232ddea8 100644 --- a/framework/graph/graph.h +++ b/framework/graph/graph.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -31,20 +31,20 @@ namespace graph { * \brief Graph class * public inherit GraphBase */ -template +template class Graph : public GraphBase, - Tensor4dPtr, - Edge > { + NodePtr, + Tensor4dPtr, + Edge > { public: Graph():GraphBase, - Tensor4dPtr, - Edge >() {} + NodePtr, + Tensor4dPtr, + Edge >() {} Graph(size_t size):GraphBase, - Tensor4dPtr, - Edge >(size) {} + NodePtr, + Tensor4dPtr, + Edge >(size) {} ~Graph() { if(_vgraph) { @@ -54,6 +54,7 @@ class Graph : public GraphBase& graph); + Status CopyFrom(Graph& graph); ///< statistics stand for Statistics info of anakin graph Statistics statistics; @@ -126,7 +127,7 @@ class Graph : public GraphBase _ins; ///< graph output node name @@ -135,6 +136,8 @@ class Graph : public GraphBase _nodes_exec_order; ///< node_merges map: target node map to all its fusion node std::unordered_map > _node_merges; + ///< _node_merges_keep map: target node map to all its fusion node that shouldn't be removed + std::unordered_map > _node_merges_keep; ///< _pattern_name_merges map: target node map to all its fusion pattern node std::unordered_map > _pattern_name_merges; diff --git a/framework/graph/graph_base.h b/framework/graph/graph_base.h index 0920ffce1..d30db00ca 100644 --- a/framework/graph/graph_base.h +++ b/framework/graph/graph_base.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -58,6 +58,7 @@ class GraphBase { /// add vertex to graph virtual void add_vertex(VertexNameType vertexName, VertexType vertex); + virtual void add_alias(VertexNameType vertexNameOri, VertexNameType vertexNameAlias); /// add in/out arc to graph, if you in/out arc need order virtual void add_in_arc(ArcType& arc); diff --git a/framework/graph/graph_base.inl b/framework/graph/graph_base.inl index ebeaafa6f..4cb48966e 100644 --- a/framework/graph/graph_base.inl +++ b/framework/graph/graph_base.inl @@ -49,6 +49,25 @@ void GraphBase::add_vertex(Vert } } +template +void GraphBase::add_alias(VertexNameType vertexNameOri, VertexNameType vertexNameAlias) { + if(vertexNameOri == vertexNameAlias) { + return; + } + if(!this->has_vertex(vertexNameOri)) { + LOG(FATAL) << "The graph doesn't have vertext " << vertexNameOri; + return; + } + if(this->has_vertex(vertexNameAlias)) { + LOG(FATAL) <<"The graph shouldn't have alias vertex(" + < void GraphBase::add_in_arc(ArcType& arc) { @@ -197,9 +216,10 @@ ArcType& GraphBase::get_arc(Ver } Arc_iterator it_end = _arcs.end(); Arc_iterator it = find(vertex_name_from, vertex_name_to); - if(it != it_end) { - return *it; - } +// if(it != it_end) { +// return *it; +// } + return *it; } template diff --git a/framework/graph/graph_global_mem.h b/framework/graph/graph_global_mem.h index 5796bc5f9..adfe707f0 100644 --- a/framework/graph/graph_global_mem.h +++ b/framework/graph/graph_global_mem.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,8 +26,53 @@ namespace anakin { using namespace saber; +/** +* \brief global resource level +*/ +enum Level { + Level_0 = 0, + Level_1, + Level_2, + Level_3, + Level_4, + Level_5 +}; + namespace graph { +/** +* \brief global resource level stage +*/ +template +struct LevelStage { + std::mutex _mut; + bool accessible = true; +}; + +/** +* \brief global resource multi level stage and restraint +*/ +template +struct GlobalResRestrain : public LevelStage... { + GlobalResRestrain() {} + GlobalResRestrain& operator=(const GlobalResRestrain& other){ + return *this; + } + + template + std::mutex& get_mut() { + return LevelStage::_mut; + } + template + bool& check_access() { + return LevelStage::accessible; + } + template + void use() { + LevelStage::accessible = false; + } +}; + /** * \brief GraphGlobalMemBase class */ @@ -39,13 +84,89 @@ class GraphGlobalMemBase { /// create Block memory template - PBlock::type, Ttype>* new_block(saber::Shape& shape) EXCLUSIVE_LOCKS_REQUIRED(_mut) { + PBlock* new_block(saber::Shape& shape) EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); - PBlock::type, Ttype>* block_p = new PBlock::type, Ttype>(shape); + PBlock* block_p = new PBlock(shape, Dtype); + // register new block_p for resource guard + _res_guard[block_p->h_tensor().data()] = LevelList(); _push_mem_pool(block_p, DataTypeWarpper()); return block_p; } + /// apply arbitrary function to two memory block + /// note: that args may contain target PBlock pointer + /// so we need to set mutex for mem management + template + void apply(functor func, PBlock tensor_1 , PBlock tensor_2, ParamTypes ...args) { + std::unique_lock lock(this->_mut); + void* key_1 = tensor_1.h_tensor().data(); + void* key_2 = tensor_1.h_tensor().data(); + if(_res_guard[key_1].check_access()) { + std::unique_lock lock(_res_guard[key_1].get_mut()); + _res_guard[key_1].use(); + _res_guard[key_2].use(); + func(tensor_1, tensor_2, std::forward(args)...); + void* new_key_1 = 
tensor_1.h_tensor().data(); + void* new_key_2 = tensor_2.h_tensor().data(); + if(new_key_1 != key_1) { + _res_guard[new_key_1] = _res_guard[key_1]; + if(_res_guard.erase(key_1) != 1) { // delete old key-vale + LOG(FATAL) << "target key_1(" << key_1 << ") doesn't exist."; + } + } + if(new_key_2 != key_2) { + _res_guard[new_key_2] = _res_guard[key_2]; + if(_res_guard.erase(key_2) != 1) { // delete old key-vale + LOG(FATAL) << "target key_2(" << key_2 << ") doesn't exist."; + } + } + } + } + /// apply arbitrary function to one memory block + /// note: that args may contain target PBlock pointer + /// so we need to set mutex for mem management + template + void apply(functor func, PBlock tensor , ParamTypes ...args) { + std::unique_lock lock(this->_mut); + void* key = tensor.h_tensor().data(); + if(_res_guard[key].check_access()) { + std::unique_lock lock(_res_guard[key].get_mut()); + _res_guard[key].use(); + func(tensor, std::forward(args)...); + void* new_key = tensor.data(); + if(new_key != key) { + _res_guard[new_key] = _res_guard[key]; + if(_res_guard.erase(key) != 1) { // delete old key-vale + LOG(FATAL) << "target key(" << key << ") doesn't exist."; + } + } + } + } + + /// apply arbitrary function to one memory tensor + /// note: that args may contain target PBlock pointer + /// so we need to set mutex for mem management + template + void apply(functor func, Tensor4d& tensor , ParamTypes ...args) { + std::unique_lock lock(this->_mut); + void* key = tensor.data(); + if(_res_guard[key].check_access()) { + std::unique_lock lock(_res_guard[key].get_mut()); + _res_guard[key].use(); + func(tensor, std::forward(args)...); + void* new_key = tensor.data(); // check if tensor data has changed + if(key != new_key) { + _res_guard[new_key] = _res_guard[key]; + if(_res_guard.erase(key) != 1) { // delete old key-vale + LOG(FATAL) << "target key(" << key << ") doesn't exist."; + } + } + } + if(key == nullptr) { + func(tensor, std::forward(args)...); + } + } + /// get sum size 
in m-btyes size_t get_sum_mbyte() EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); @@ -85,15 +206,15 @@ class GraphGlobalMemBase { private: /// push int8_mem operaiton - void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { + void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { _int8_mem_pool.push_back(block_p); } /// push fp16_mem operaiton - void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { + void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { _fp16_mem_pool.push_back(block_p); } /// push fp32_mem operaiton - void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { + void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { _fp32_mem_pool.push_back(block_p); } @@ -111,12 +232,14 @@ class GraphGlobalMemBase { } private: + typedef GlobalResRestrain LevelList; + std::unordered_map _res_guard; ///< _int8_mem_pool stand for int8 type memory - std::vector::type, Ttype>* > _int8_mem_pool GUARDED_BY(_mut); + std::vector* > _int8_mem_pool GUARDED_BY(_mut); ///< _fp16_mem_pool stand for fp16 type memory - std::vector::type, Ttype>* > _fp16_mem_pool GUARDED_BY(_mut); + std::vector* > _fp16_mem_pool GUARDED_BY(_mut); ///< _fp32_mem_pool stand for fp32 type memory - std::vector::type, Ttype>* > _fp32_mem_pool GUARDED_BY(_mut); + std::vector* > _fp32_mem_pool GUARDED_BY(_mut); ///< _mut std::mutex _mut; }; diff --git a/framework/graph/llvm/base.h b/framework/graph/llvm/base.h index 8b4e8f6ed..f96d69ddf 100644 --- a/framework/graph/llvm/base.h +++ b/framework/graph/llvm/base.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/framework/graph/llvm/fusion/fusion_op_register.cpp b/framework/graph/llvm/fusion/fusion_op_register.cpp index 106c88db4..1b0c4e449 100644 --- a/framework/graph/llvm/fusion/fusion_op_register.cpp +++ b/framework/graph/llvm/fusion/fusion_op_register.cpp @@ -4,6 +4,8 @@ namespace anakin { namespace graph { +/// in straight order + REGISTER_GRAPH_FUSION_PATTERN(DeconvRelu) .Type(IN_ORDER) .AddOpNode("conv_0", "Deconvolution") @@ -74,13 +76,6 @@ REGISTER_GRAPH_FUSION_PATTERN(EltwiseRelu) .AddConnect("eltwise_0", "relu_0") .CreatePattern([](VGraph* graph) {}); - -/*REGISTER_GRAPH_FUSION_PATTERN(Dense) - .Type(IN_PARELLEL) - .CreatePattern([](VGraph* graph){ - })*/ - - } /* namespace graph */ } /* namespace anakin */ diff --git a/framework/graph/llvm/fusion/graph_pattern.cpp b/framework/graph/llvm/fusion/graph_pattern.cpp index d49c10025..9ff2b87a6 100644 --- a/framework/graph/llvm/fusion/graph_pattern.cpp +++ b/framework/graph/llvm/fusion/graph_pattern.cpp @@ -4,8 +4,7 @@ namespace anakin { namespace graph { -const std::unordered_map, FusionHash> FusionSniffer -= { +const std::unordered_map, FusionHash> FusionSniffer = { { IN_ORDER, [](VGraph * vgraph, Pattern * pattern) -> int { @@ -92,19 +91,22 @@ const std::unordered_map, FusionHa } }; vgraph->Scanner->BFS(search_vgraph, pattern); + return 0; } }, { IN_PARELLEL, [](VGraph * vgraph, Pattern * pattern) ->int { + return 0; } }, { GRAPH, [](VGraph * vgraph, Pattern * pattern) ->int { + return 0; } }, - { None, [](VGraph*, Pattern*) ->int {} } + { None, [](VGraph*, Pattern*) ->int { return 0;} } }; Pattern& Pattern::name(std::string fusion_op_name) { diff --git a/framework/graph/llvm/fusion/graph_pattern.h b/framework/graph/llvm/fusion/graph_pattern.h index 1513c211b..2363578e9 100644 --- a/framework/graph/llvm/fusion/graph_pattern.h +++ b/framework/graph/llvm/fusion/graph_pattern.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. 
All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.cpp b/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.cpp new file mode 100644 index 000000000..a9e240492 --- /dev/null +++ b/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.cpp @@ -0,0 +1,89 @@ +#include "framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.h" + +namespace anakin { + +namespace graph { + +bool ConvElsFusionScheduler::callable(node& node_arg) { + if(_helper.has_node(node_arg)) { + auto& node_arc_out_its = _vgraph->get_out_arc_its(node_arg.name); + auto& node_arc_in_its = _vgraph->get_in_arc_its(node_arg.name); + CHECK_EQ(node_arc_out_its.size(), 1)<<"Conv+eltwise analysis: Convolution like op should have only one output."; + auto& node_next = (*_vgraph)[node_arc_out_its[0]->top()]; + if(node_next.opName == "EltwiseRelu" /*|| node_next.opName == "Eltwise"*/) { + auto& elt_node_in_its = _vgraph->get_in_arc_its(node_next.name); + for(auto& it : elt_node_in_its) { + if(it->bottom() != node_arg.name) { + if(!_helper.need_wait(it->bottom())) { + _helper.push_wait(node_arg.name); + if(!this->have_launched((*_vgraph)[it->bottom()])) { + /*std::vector io_in; + for (auto& arc_it : node_arc_in_its) { + io_in.push_back(arc_it->weight()); + } + _helper.set_holder(io_in, _vgraph);*/ + //_helper.register_pair(node_arg.name, node_next.name); + if ((*_vgraph)[it->bottom()].opName == "Split") { + _helper.register_pair(node_arg.name, node_next.name); } + else { + _helper.register_pair(it->bottom(), node_next.name); + } + + //_helper.register_pair(it->bottom(), node_next.name); + return false; + } else { + _helper.release(node_arg.name); + } + break; + } + } + } + } + } + + // original code + auto& node_arc_in_its = _vgraph->get_in_arc_its(node_arg.name); + std::vector io_in; + + for (auto& arc_it : 
node_arc_in_its) { + io_in.push_back(arc_it->weight()); + } + + return this->check_access(io_in); +} + +void ConvElsFusionScheduler::Run() { + while (!(this->_wait_que.empty())) { + // lanuch the acessible op and remove it from wait que. + for (auto op_it = this->_wait_que.begin(); op_it != this->_wait_que.end();) { + if (callable(*op_it)) { + launch(*op_it); + op_it = this->_wait_que.erase(op_it); + } else { + ++op_it; + } + } + } + + // complete fusion replacement for conv+eltwise + auto& pairs = _helper.get_replace_pairs(); + for(auto& tmp_pair : pairs) { + auto& node_conv = (*_vgraph)[tmp_pair.conv_name]; + auto& node_eltwise = (*_vgraph)[tmp_pair.eltwise_name]; + node_conv += node_eltwise; // merge node parameter + node_conv.register_keep(node_conv.mergeNodes.size()-1); // keep eltwise node in reconstruction + node_conv.mergeNodeNames.push_back("merge"); // eltwise op's pattern name is equal to its original attr's name + + node_eltwise.opName = "Gather"; // change eltwise op to Gather op + } + // set exec order for vgraph + auto exec_node_order = this->get_exec_node_in_order(); + _vgraph->set_exec_order(exec_node_order); +} + + +} /* namespace graph */ + +} /* namespace anakin */ + + diff --git a/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.h b/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.h new file mode 100644 index 000000000..a3ca2bbf5 --- /dev/null +++ b/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_LLVM_SCHEDULER_CONV_ELEWISE_FUSION_H +#define ANAKIN_LLVM_SCHEDULER_CONV_ELEWISE_FUSION_H + +#include "utils/logger/logger.h" +#include "framework/graph/llvm/schedule_base.h" +#include "framework/graph/llvm/virtual_graph.h" +#include "framework/graph/llvm/scheduler.h" + +namespace anakin { + +namespace graph { + +/** + * \brief ConvElsFusionScheduler helper class + */ +struct ConvElsFusionHelper { +private: + std::vector ops { + "ConvBatchnormScale", + }; + struct conv_eltwise_pair { + std::string conv_name; + std::string eltwise_name; + inline bool operator==(const conv_eltwise_pair& pair_other) { + return (conv_name == pair_other.conv_name) && (eltwise_name == pair_other.eltwise_name); + } + }; + std::vector _pairs; + + std::vector _node_need_to_wait; + +public: + /** + * \brief judge if meet target op + */ + inline bool has_node(node& node_arg) { + for(auto& op : ops) { + if(op == node_arg.opName) { + return true; + } + } + return false; + } + + bool need_wait(std::string& node_name) { + auto ret = std::find(_node_need_to_wait.begin(), _node_need_to_wait.end(), node_name); + if(ret != _node_need_to_wait.end()) { + return true; + } + return false; + } + + void push_wait(std::string& node_name) { + if(!need_wait(node_name)) { + _node_need_to_wait.push_back(node_name); + } + } + + void release(std::string& node_name) { + int index = -1; + for(int i=0; i<_node_need_to_wait.size();i++) { + if(_node_need_to_wait[i] == node_name) { + index = i; + } + } + if(index != -1) { + _node_need_to_wait.erase(_node_need_to_wait.begin()+index); + } + } + + /*void set_holder(std::vector& io_vec, VGraph* graph) { + for(auto io : io_vec) { + io.holder = true; + } + for (auto& io_res : io_vec) { + auto replace_arc = [&](Arc& arc) { + if (arc.weight() == io_res) { + auto& io_tmp = arc.weight(); + io_tmp = io_res; + return Status::EXIT(" Find the matched target 
arc io. "); + } + return Status::OK(); + }; + graph->Scanner->BFS_Edge(replace_arc); + } + }*/ + + void register_pair(std::string& conv_name, std::string& eltwise_name) { + conv_eltwise_pair tmp_pair; + tmp_pair.conv_name = conv_name; + tmp_pair.eltwise_name = eltwise_name; + auto ret = std::find(_pairs.begin(), _pairs.end(), tmp_pair); + if(ret == _pairs.end()) { + _pairs.push_back(tmp_pair); + } + } + + std::vector& get_replace_pairs() { + return _pairs; + } +}; + +/** + * \brief Dependency scheduler for analysing the possibility of conv+eltwise fusion in graph + */ +class ConvElsFusionScheduler : public Scheduler { +public: + ConvElsFusionScheduler() {} + virtual ~ConvElsFusionScheduler() {} + + /// decide if the target node's op is callable + virtual bool callable(node&); + + /// run scheduler + virtual void Run(); + + +private: + ConvElsFusionHelper _helper; +}; + + +} /* namespace graph */ + +} /* namespace anakin */ + +#endif diff --git a/framework/graph/llvm/optimizer/memory_scheduler.cpp b/framework/graph/llvm/optimizer/memory_scheduler.cpp index 239ae5343..cc1a88e20 100644 --- a/framework/graph/llvm/optimizer/memory_scheduler.cpp +++ b/framework/graph/llvm/optimizer/memory_scheduler.cpp @@ -29,7 +29,6 @@ void IOBlockResource::rm_self_lock_tree(io& io_in) { } } - void IOBlockResource::free_self(std::vector& self_shared_edges, VGraph* vgraph_p) { for (auto& io : self_shared_edges) { rm_self_lock_tree(io); @@ -113,7 +112,6 @@ void IOBlockResource::free(std::vector& io_vec, VGraph* vgraph_p) { tmp_io.name = io_res.name; if ((*it) == tmp_io) { - //_free.push(*it); push_free(*it, vgraph_p); it = _lock.erase(it); } else { @@ -144,6 +142,16 @@ void IOBlockResource::lock(std::vector& io_vec) { } } +bool IOBlockResource::is_locked(io& io_in) { + for(auto it = _lock.begin(); it != _lock.end();) { + if((*it) == io_in) { + return true; + } else { + ++it; + } + } +} + void IOBlockResource::map_ios_to_vgraph(std::vector& io_vec, VGraph* vgraph_p) { for (auto& io_res : 
io_vec) { auto replace_arc = [&](Arc& arc) { @@ -174,26 +182,84 @@ void MemoryScheduler::launch(node& node_arg) { set_fix_io(io_out); if (_need_self_shared(node_arg)) { - auto& node_arc_in_its = _vgraph->get_in_arc_its(node_arg.name); - CHECK_EQ(node_arc_in_its.size(), - 1) << "Self shared node(" << node_arg.name << ")'s input size should be 1"; - - for (auto& arc_it : node_arc_in_its) { - _io_block_res.push_self_lock(arc_it->weight()); - } - - for (auto& io_tmp : io_out) { - io_tmp.shared = true; - - if (node_arc_in_its[0]->weight().shared) { - io_tmp.share_from = node_arc_in_its[0]->weight().share_from; - } else { - io_tmp.share_from = node_arc_in_its[0]->weight().name; - } - } - - _io_block_res.reg_self_lock_tree(node_arc_in_its[0]->weight(), io_out); - _io_block_res.map_ios_to_vgraph(io_out, _vgraph); // map changes to _vgraph + auto& node_arc_in_its = _vgraph->get_in_arc_its(node_arg.name); + if(node_arc_in_its.size() > 1) { + int selected = 0; + std::vector io_locked_idx; + for(int i=0; i < node_arc_in_its.size(); i++) { + //if(_io_block_res.is_locked(node_arc_in_its[i]->weight())) { + io_locked_idx.push_back(i); + //} + } + // collect all locked io bottom node's inputs io + std::vector all_collected; + for(auto idx : io_locked_idx) { + auto& arc_select = node_arc_in_its[idx]; + auto& temp_arc_in_its = _vgraph->get_in_arc_its(arc_select->bottom()); + for(auto& it : temp_arc_in_its) { + all_collected.push_back(it->weight()); + } + } + for(auto idx : io_locked_idx) { + bool dismiss = false; + for(auto& io : all_collected) { + if(node_arc_in_its[idx]->weight().shared) { + auto& node_btm = (*_vgraph)[node_arc_in_its[idx]->bottom()]; + if(_need_self_shared(node_btm)) { + dismiss = false; + break; + } + if((io.share_from == node_arc_in_its[idx]->weight().share_from) || \ + (io.name == node_arc_in_its[idx]->weight().share_from)) { + dismiss = true; + break; + } + } else { + dismiss = false; + break; + } + } + if(!dismiss) { + selected = idx; + break; + } + } + 
_io_block_res.push_self_lock(node_arc_in_its[selected]->weight()); + for(int i=0; iweight()); + } + } + for(auto& io_tmp : io_out) { + io_tmp.shared = true; + if (node_arc_in_its[selected]->weight().shared) { + io_tmp.share_from = node_arc_in_its[selected]->weight().share_from; + } else { + io_tmp.share_from = node_arc_in_its[selected]->weight().name; + } + } + _io_block_res.reg_self_lock_tree(node_arc_in_its[selected]->weight(), io_out); + _io_block_res.map_ios_to_vgraph(io_out, _vgraph); // map changes to _vgraph + } else { + // original impl + auto& node_arc_in_its = _vgraph->get_in_arc_its(node_arg.name); + CHECK_EQ(node_arc_in_its.size(), + 1) << "Self shared node(" << node_arg.name << ")'s input size should be 1"; + + for (auto& arc_it : node_arc_in_its) { + _io_block_res.push_self_lock(arc_it->weight()); + } + for (auto& io_tmp : io_out) { + io_tmp.shared = true; + if (node_arc_in_its[0]->weight().shared) { + io_tmp.share_from = node_arc_in_its[0]->weight().share_from; + } else { + io_tmp.share_from = node_arc_in_its[0]->weight().name; + } + } + _io_block_res.reg_self_lock_tree(node_arc_in_its[0]->weight(), io_out); + _io_block_res.map_ios_to_vgraph(io_out, _vgraph); // map changes to _vgraph + } } else { _io_block_res.lock(io_out); // lock out _io_block_res.map_ios_to_vgraph(io_out, _vgraph); // map changes to _vgraph @@ -208,9 +274,6 @@ void MemoryScheduler::launch(node& node_arg) { _io_block_res.free(io_in, _vgraph); } - /*if (!_need_self_shared.last_op_is_self_shared(_vgraph, node_arg)) { - _io_block_res.free_self(); - }*/ std::vector self_shared_edges; if (_need_self_shared.last_op_is_self_shared(_vgraph, node_arg, self_shared_edges)) { diff --git a/framework/graph/llvm/optimizer/memory_scheduler.h b/framework/graph/llvm/optimizer/memory_scheduler.h index b3a3352aa..e4a0e7592 100644 --- a/framework/graph/llvm/optimizer/memory_scheduler.h +++ b/framework/graph/llvm/optimizer/memory_scheduler.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. 
All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,22 +24,24 @@ namespace anakin { namespace graph { + /** -* \brief check_self_shared struct -* used to check arcs in graph whether is shared -*/ + * \brief check_self_shared struct + * used to check arcs in graph whether is shared + */ struct check_self_shared { /// ops : Split and Reshape std::vector ops{ "Split", "Reshape", + "Gather", "Flatten" }; /** - * \brief whether node_arg's op is in ops - * \param node_arg stand for certain node - * \return bool the value of ops == node_arg.opName - */ + * \brief whether node_arg's op is in ops + * \param node_arg stand for certain node + * \return bool the value of ops == node_arg.opName + */ inline bool operator()(node& node_arg) { for (auto& op_type : ops) { if (op_type == node_arg.opName) { @@ -50,12 +52,12 @@ struct check_self_shared { } /** - * \brief whether bottom_node's op is in ops - * \param graph stand for current graph - * \param node_tmp stand for certain node - * \param self_shared_ios stand for shared ios queue - * \return bool the value of ret - */ + * \brief whether bottom_node's op is in ops + * \param graph stand for current graph + * \param node_tmp stand for certain node + * \param self_shared_ios stand for shared ios queue + * \return bool the value of ret + */ inline bool last_op_is_self_shared(VGraph* graph, node& node_tmp, std::vector& self_shared_ios) { bool ret = false; auto node_arc_in_its = graph->get_in_arc_its(node_tmp.name); @@ -63,7 +65,7 @@ struct check_self_shared { auto& node_ref = (*graph)[arc_in_it->bottom()]; for (auto& op_type : ops) { if (op_type == node_ref.opName) { - self_shared_ios.push_back(arc_in_it->weight()); + self_shared_ios.push_back(arc_in_it->weight()); ret = true; } } @@ -73,8 +75,8 @@ struct check_self_shared { }; /** -* \brief io block resource 
class used for scheduler of VGraph memory usage -*/ + * \brief io block resource class used for scheduler of VGraph memory usage + */ class IOBlockResource { public: IOBlockResource() {} @@ -85,9 +87,11 @@ class IOBlockResource { bool is_same_target(io&, io&, VGraph*); void push_free(io&, VGraph*); void lock(std::vector&); + bool is_locked(io&); inline void push_self_lock(io& io_tmp) { _self_lock.push_back(io_tmp);} void reg_self_lock_tree(io&, std::vector&); void rm_self_lock_tree(io&); + bool is_in_self_tree(io&); void free_self(std::vector&, VGraph*); void map_ios_to_vgraph(std::vector&, VGraph*); diff --git a/framework/graph/llvm/optimizer/parall_scheduler.h b/framework/graph/llvm/optimizer/parall_scheduler.h index c1ec3f79f..9460da698 100644 --- a/framework/graph/llvm/optimizer/parall_scheduler.h +++ b/framework/graph/llvm/optimizer/parall_scheduler.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/graph/llvm/schedule_base.h b/framework/graph/llvm/schedule_base.h index c9990f263..9a7d8ecd0 100644 --- a/framework/graph/llvm/schedule_base.h +++ b/framework/graph/llvm/schedule_base.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -201,6 +201,7 @@ class ScheduleBase { inline void exe_push(OpType& op) { _exec_que.push(op); } + /** * \brief operations of op queue * push_back operation @@ -218,6 +219,23 @@ class ScheduleBase { */ virtual void launch(OpType&) = 0; + /** + * \brief judge if target op have been launched + * + * \param op stand for operation type + * \return bool + */ + inline bool have_launched(OpType& op) { + for(auto it = _wait_que.begin(); it != _wait_que.end();) { + if(*it == op) { + return false; + } + ++it; + } + return true; + } + + /** * \brief get exec queue. * queue operation such as push,push_back,pop diff --git a/framework/graph/llvm/scheduler.cpp b/framework/graph/llvm/scheduler.cpp index 33c37ecbb..970f51249 100644 --- a/framework/graph/llvm/scheduler.cpp +++ b/framework/graph/llvm/scheduler.cpp @@ -14,12 +14,19 @@ void Scheduler::RegIOResource(VGraph* vgraph) { // register io resources. vgraph->Scanner->BFS_Edge(register_io_f); - auto push_wait_que_f = [this](node & node_arg) { - this->wait_push(node_arg); - return 0; - }; - // push all node op to wait que and disable the out resources. - vgraph->Scanner->BFS(push_wait_que_f); + /*if(vgraph->has_exec_order()) { + auto node_exec_order = vgraph->get_exec_order(); + for(auto& node_name : node_exec_order) { + this->wait_push((*vgraph)[node_name]); + } + } else {*/ + auto push_wait_que_f = [this](node & node_arg) { + this->wait_push(node_arg); + return 0; + }; + // push all node op to wait que and disable the out resources. + vgraph->Scanner->BFS(push_wait_que_f); + //} // scheduler add fix arc io auto& regist_outs = vgraph->get_registed_outs(); diff --git a/framework/graph/llvm/scheduler.h b/framework/graph/llvm/scheduler.h index f6d454871..8ffeb6e06 100644 --- a/framework/graph/llvm/scheduler.h +++ b/framework/graph/llvm/scheduler.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/framework/graph/llvm/virtual_graph.h b/framework/graph/llvm/virtual_graph.h index 30a555842..5a5f26e02 100644 --- a/framework/graph/llvm/virtual_graph.h +++ b/framework/graph/llvm/virtual_graph.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -80,10 +80,12 @@ struct node { ///< mergeNodes stand for sub merged nodes std::vector mergeNodes; + ///< save node's index in mergeNodes which shouldn't be removed in reconstructing Graph + std::vector idx_keep_in_merge_nodes; /// mergeNodeNames; - + ///< lane stand for the stream of lane the node operator occurs. default 0 int lane{0}; ///mergeNodes.push_back(rhs); return *this; } + + // register node index should keep + inline void register_keep(int idx) { + idx_keep_in_merge_nodes.push_back(idx); + } }; /** @@ -161,9 +168,17 @@ class VGraph : public GraphBase { std::vector>& get_registed_outs() { return _registed_outs; } + bool has_exec_order() { return _nodes_exec_order.size() == 0 ? false : true; } + + void set_exec_order(std::vector& exe_order) { _nodes_exec_order = exe_order; } + + std::vector& get_exec_order() { return _nodes_exec_order; } + private: ///< _registed_outs :outs that needs to be exported std::vector> _registed_outs; + ///< node execute order + std::vector _nodes_exec_order; }; diff --git a/framework/graph/node.h b/framework/graph/node.h index be5a17fd8..e5c9716af 100644 --- a/framework/graph/node.h +++ b/framework/graph/node.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,10 +23,14 @@ namespace anakin { -class OperatorBase; - -template -class Operator; +/** + * \brief Basic operation class. + */ +class OperatorBase { +public: + OperatorBase() {} + virtual ~OperatorBase() {} +}; namespace graph { @@ -34,8 +38,75 @@ namespace graph { * \brief struct AttrInfo of node */ struct AttrInfo { +public: + AttrInfo() { + parameter_p = + std::make_shared >(); + } + + inline bool inspect(const std::string& attr_name) { + auto it_end = parameter_p->end(); + auto it_find = parameter_p->find(attr_name); + if(it_find != it_end) { + return true; + } + return false; + } + + template + T get(const std::string& attr_name) { + auto it_end = parameter_p->end(); + auto it_find = parameter_p->find(attr_name); + if(it_find == it_end) { + LOG(FATAL) << "Target attr name(" << attr_name << ") not found."; + return T(); + } + return any_cast((*parameter_p)[attr_name]); + } + + template + Status set(const std::string& attr_name, const T val) { + (*parameter_p)[attr_name] = val; + return Status::OK(); + } + + Status remove(const std::string& attr_name) { + auto it_end = parameter_p->end(); + auto it_find = parameter_p->find(attr_name); + if(it_find != it_end) { + parameter_p->erase(attr_name); + return Status::OK(); + } else { + return Status::OK("target attr_name not included in attrs"); + } + } + + inline void MergeWithPattern(AttrInfo& operand, const std::string& pattern_name) { + auto it_begin = operand.parameter_p->begin(); + auto it_end = operand.parameter_p->end(); + for(auto it = it_begin; it != it_end; ++it ) { + // operand name has been changed! 
+ std::string new_name = pattern_name + "_" + it->first; + (*parameter_p)[new_name] = it->second; + } + } + + std::unordered_map::iterator begin() { + return parameter_p->begin(); + } + + std::unordered_map::iterator end() { + return parameter_p->end(); + } + + /// shallow copy from other AttrInfo + AttrInfo& operator=(const AttrInfo& other_attr_info) { + this->parameter_p = other_attr_info.parameter_p; + return *this; + } +private: /// map : parameter ---> value - std::unordered_map parameter; + std::shared_ptr > parameter_p; }; /** @@ -56,19 +127,19 @@ struct Lane { * \brief Edge class used for Global edge type * public inherit Arc */ -template -class Edge : public Arc > { +template +class Edge : public Arc > { public: - Edge():Arc >() {} - Edge(const Edge& edge):Arc >(edge) { + Edge():Arc >() {} + Edge(const Edge& edge):Arc >(edge) { _shared = edge._shared; _share_from = edge._share_from; _current_lane = edge._current_lane; } - explicit Edge(std::string first, std::string second):Arc >(first, second) {} - explicit Edge(std::string first, std::string second, TensorSharedPtr tensor_ptr) - :Arc >(first, second, tensor_ptr) {} + explicit Edge(std::string first, std::string second):Arc >(first, second) {} + explicit Edge(std::string first, std::string second, TensorSharedPtr tensor_ptr) + :Arc >(first, second, tensor_ptr) {} /// Get first node name of the edge. inline std::string& first() { return this->bottom(); } @@ -77,7 +148,7 @@ class Edge : public Arc > { inline std::string& second() { return this->top(); } /// get data weigts of the edge. - inline TensorSharedPtr data() { return this->weight(); } + inline TensorSharedPtr data() { return this->weight(); } /// If edge's data is shared from the others. 
bool& shared() { return _shared; } @@ -102,7 +173,7 @@ class Edge : public Arc > { _shared = edge._shared; _share_from = edge._share_from; _current_lane = edge._current_lane; - Arc >::operator=(edge); + Arc >::operator=(edge); } private: @@ -117,7 +188,6 @@ class Edge : public Arc > { /** * \brief Node class used for Graph */ -template class Node { public: Node() {} @@ -139,12 +209,10 @@ class Node { void set_name(std::string name) { _name = name; } /// Node operator - //Operator* Op() { return _Op; } OperatorBase* Op() { return _Op; } /// set node operator - //void set_op(Operator* other) { _Op = other; } void set_op(OperatorBase* other) { _Op = other; } /// Node need wait @@ -155,21 +223,19 @@ class Node { /// Access to attributes. AttrInfo& attr() { return _attr; } + /// inspect if node attr have target attr name + inline bool inspect_attr(const std::string& attr_name) { + return this->_attr.inspect(attr_name); + } + /** * \brief Get target attr by name * \param attr_name stand for target_attr name * \return T the value of target attribute */ template - T get_attr(std::string& attr_name) { - auto& attrs = this->attr(); - const auto& it_end = attrs.parameter.end(); - auto it_find = attrs.parameter.find(attr_name); - if(it_find == it_end) { - LOG(FATAL) << "Target attr name(" << attr_name << ") not found."; - return T(); - } - return any_cast(attrs.parameter[attr_name]); + T get_attr(const std::string& attr_name) { + return this->_attr.get(attr_name); } /** * \brief Set target attr by name and value @@ -179,14 +245,8 @@ class Node { */ template Status set_attr(const std::string& attr_name, const T val) { - auto& attrs = this->attr(); - const auto& it_end = attrs.parameter.end(); - auto it_find = attrs.parameter.find(attr_name); - if(it_find != it_end) { - return Status::FAIL(); - } - attrs.parameter[attr_name] = val; - return Status::OK(); + std::unique_lock lock(this->_mut); + return this->_attr.set(attr_name, val); } /** @@ -195,15 +255,8 @@ class Node { * 
\return Status */ Status remove_attr(const std::string& attr_name) { - auto& attrs = this->attr(); - const auto& it_end = attrs.parameter.end(); - auto it_find = attrs.parameter.find(attr_name); - if(it_find != it_end) { - attrs.parameter.erase(attr_name); - return Status::OK(); - } else { - return Status::OK("target attr_name not included in attrs"); - } + std::unique_lock lock(this->_mut); + return this->_attr.remove(attr_name); } /// get lane @@ -215,39 +268,25 @@ class Node { * \param pattern_name * \return Node */ - inline Node& Merge(const Node& operand, std::string& pattern_name) { - auto it_begin = operand._attr.parameter.begin(); - auto it_end = operand._attr.parameter.end(); - for(auto it = it_begin; it != it_end; ++it ) { - // operand name has been changed! - std::string new_name = pattern_name + "_" + it->first; - _attr.parameter[new_name] = it->second; - /*if(_attr.parameter.count(it->first) > 0) { - // change the parameter name if fusion node have same parameter name - std::string new_name = operand._name + "_" + it->first; - _attr.parameter[new_name] = it->second; - } else { - _attr.parameter[it->first] = it->second; - }*/ - } + inline Node& Merge(Node& operand, const std::string& pattern_name) { + std::unique_lock lock(this->_mut); + this->_attr.MergeWithPattern(operand.attr(), pattern_name); return *this; } /// copy construction [ shallow copy ] - inline Node& operator=(const Node& operand) { + inline Node& operator=(Node& operand) { _name = operand._name; _current_lane = operand._current_lane; _Op = nullptr; // Assign the op pointer with operand's should be disabled, because it causes double free after binding the nodeptr by op itself. 
_op_name = operand._op_name; - // copy attributes - auto it_begin = operand._attr.parameter.begin(); - auto it_end = operand._attr.parameter.end(); - for(auto it = it_begin; it != it_end; ++it ) { - _attr.parameter[it->first] = it->second; - } + // shallow copy of attributes + this->_attr = operand.attr(); + // copy others _need_wait = operand._need_wait; _in_degree = operand._in_degree; _out_degree = operand._out_degree; + return *this; } /// print message @@ -263,7 +302,6 @@ class Node { ///< _current_lane stand for Current lane the node resides in. Lane _current_lane; ///< _Op stand for Operator in node.default bullptr - //Operator* _Op{nullptr}; OperatorBase* _Op{nullptr}; ///< _op_name stand for op name std::string _op_name; @@ -274,15 +312,16 @@ class Node { ///< _in_degree stand for number input degree size_t _in_degree; - ///< _out_degree stand for number output degree + ///< _out_degree stand for number output degree size_t _out_degree; + + std::mutex _mut; }; /// global node pointer type //typedef std::shared_ptr NodePtr; -template -using NodePtr = std::shared_ptr>; +using NodePtr = std::shared_ptr; } /* namespace graph */ diff --git a/framework/lite/CMakeLists.txt b/framework/lite/CMakeLists.txt new file mode 100644 index 000000000..d6358e85e --- /dev/null +++ b/framework/lite/CMakeLists.txt @@ -0,0 +1,57 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016 Baidu.com, Inc. 
All Rights Reserved +# ---------------------------------------------------------------------------- + +# used for temporary +anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}) +anakin_fetch_include_recursively(${ANAKIN_SABER}) +anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK}/lite) + +#if(USE_ARM_PLACE) +# anakin_fetch_files_with_suffix(${ANAKIN_SABER}/lite/core "cpp" ANAKIN_SABER_ARM_LITE_SRC) +# anakin_fetch_files_with_suffix(${ANAKIN_SABER}/lite/funcs "cpp" ANAKIN_SABER_ARM_LITE_SRC) +# anakin_fetch_files_with_suffix(${ANAKIN_SABER}/lite/funcs/neon "cpp" ANAKIN_SABER_ARM_LITE_SRC) +# anakin_fetch_files_with_suffix(${ANAKIN_SABER}/lite/funcs/neon/impl "cpp" ANAKIN_SABER_ARM_LITE_SRC) +# +# set(anakin_saber_arm_lite_static "anakin_saber_arm_lite_static") +# add_library(${anakin_saber_arm_lite_static} STATIC ${ANAKIN_SABER_ARM_LITE_SRC}) +# set_target_properties(${anakin_saber_arm_lite_static} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/output/) +# install(DIRECTORY ${ANAKIN_SABER}/lite +# DESTINATION ${PROJECT_SOURCE_DIR}/tools/anakin-lite +# FILES_MATCHING +# PATTERN "*.h" +# PATTERN "*.inl") +#endif() + +# install saber arm lite to tools/anakin-lite +install(FILES ${ANAKIN_SABER}/saber_types.h DESTINATION ${PROJECT_SOURCE_DIR}/tools/anakin-lite/saber) +install(DIRECTORY ${ANAKIN_SABER}/lite DESTINATION ${PROJECT_SOURCE_DIR}/tools/anakin-lite/saber) +install(DIRECTORY ${ANAKIN_UTILS}/logger DESTINATION ${PROJECT_SOURCE_DIR}/tools/anakin-lite/utils) + +anakin_fetch_files_with_suffix(${ANAKIN_LITE}/generator/src "cpp" ANAKIN_LITE_EXE_SRC) +anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/lite "cpp" ANAKIN_LITE_SRC) + +# build test cases +foreach(SRC_NAME ${ANAKIN_LITE_EXE_SRC}) + #unpack the dir "/" + string(REPLACE "/" ";" SEXY_LIST ${SRC_NAME}) + list(GET SEXY_LIST -1 LITE_EXE_NAME) + #get the file name without suffix + string(REPLACE "." 
";" SEXY_LIST ${LITE_EXE_NAME}) + list(GET SEXY_LIST 0 LITE_EXE_NAME) + add_executable(${LITE_EXE_NAME} ${SRC_NAME} ${ANAKIN_LITE_SRC}) + if(BUILD_SHARED) + target_link_libraries(${LITE_EXE_NAME} ${anakin_lib_so}) + else() + target_link_libraries(${LITE_EXE_NAME} ${anakin_lib_static}) + endif() + set_target_properties(${LITE_EXE_NAME} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_SOURCE_DIR}/output/generator/src) + install(DIRECTORY ${ANAKIN_LITE}/generator + DESTINATION ${PROJECT_SOURCE_DIR}/output/ + FILES_MATCHING + PATTERN "*.sh" + PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ + GROUP_EXECUTE GROUP_READ) +endforeach() diff --git a/framework/lite/binary_writter.h b/framework/lite/binary_writter.h new file mode 100644 index 000000000..f8cdb8888 --- /dev/null +++ b/framework/lite/binary_writter.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_FRAMEWORK_LITE_BINARY_WRITTER_H +#define ANAKIN_FRAMEWORK_LITE_BINARY_WRITTER_H + +#include "framework/lite/file_stream.h" +#include "framework/graph/graph.h" + +namespace anakin { + +namespace lite { + +/** + * \brief class to help generating binary file. + * + */ +class BinaryWritter { +public: + BinaryWritter() {} + + explicit BinaryWritter(std::string path) { + this->open(path); + } + + // BinaryWritteropen file for code generating. 
+ void open(std::string& path, const char* file_mode = "wb") { + _file_io.open(path, file_mode); + } + + // write data list to file + inline bool write(void* ptr, size_t size, size_t count) { + return _file_io.write(ptr, size, count); + } + + // read data list from file + inline bool read(void* ptr, size_t size, size_t count) { + return _file_io.read(ptr, size, count); + } + +private: + LiteFileIO _file_io; +}; + +/** + * \brief class Weghts + */ +struct WeghtOffset { + struct Offset{ + size_t offset{0}; // offset from start + size_t length{0}; // weight length + }; + std::vector weights; +}; + +/** + * \brief class to help generating model weigth file. + * + */ +class WeightsWritter : public BinaryWritter { +public: + WeightsWritter() {} + ~WeightsWritter() {} + + // set weight + template + void register_weights(const std::string& node_name, PBlock& weight) { + WeghtOffset::Offset offset_tmp; + offset_tmp.offset = _offset; + offset_tmp.length = weight.count(); + _offset += offset_tmp.length; + _node_weights_map[node_name].weights.push_back(offset_tmp); + write(weight.h_tensor().mutable_data(), weight.h_tensor().get_dtype_size(), offset_tmp.length); + } + + bool has_node(std::string node_name) { + return _node_weights_map.count(node_name) > 0 ? 
true : false; + } + + WeghtOffset get_weights_by_name(std::string node_name) { + if(!has_node(node_name)) { + LOG(FATAL) << "WeightsWritter doesn't have target node name: " << node_name; + return WeghtOffset(); + } + return _node_weights_map[node_name]; + } + +private: + size_t _offset{0}; + std::unordered_map _node_weights_map; +}; + + + +} /* namespace lite */ + +} /* namespace anakin */ + +#endif diff --git a/framework/lite/code_gen_base.cpp b/framework/lite/code_gen_base.cpp new file mode 100644 index 000000000..cc87a3154 --- /dev/null +++ b/framework/lite/code_gen_base.cpp @@ -0,0 +1,229 @@ +#include "framework/lite/code_gen_base.h" +#include "framework/graph/graph_global_mem.h" +#include "framework/core/net/net.h" +#include "framework/graph/llvm/scheduler.h" +#include "framework/graph/llvm/optimizer/parall_scheduler.h" +#include "framework/graph/llvm/optimizer/memory_scheduler.h" + +namespace anakin { + +namespace lite { + +/** + * this full specialization use for help generating lite device running api + */ +template +bool CodeGenBase::extract_graph(const std::string& model_path) { + graph::Graph graph; + auto status = graph.load(model_path); + if(!status ) { + LOG(ERROR) << " [ERROR] " << status.info(); + return false; + } + + // change graph node and edge name to standard of c(or others)variable name + change_name(graph); + + // Optimize +#ifdef USE_ARM_PLACE + auto vgraph = graph.get_vgraph(); + graph::Scheduler scheduler; + // schedule for exec order + scheduler.RegIOResource(&vgraph); + scheduler.Run(); + scheduler.get_exec_node_in_order(); + // optimize mem + graph::MemoryScheduler mem_scheduler; + mem_scheduler.RegIOResource(&vgraph); + mem_scheduler.Run(); + // analyse parallel + graph::ParallScheduler para_scheduler; + para_scheduler.RegIOResource(&vgraph); + para_scheduler.Run(); + // restore from vgraph + graph.restore_from_vgraph(&vgraph); +#else + // Optimize + graph.Optimize(); +#endif + + // get graph io + _ins = graph.get_ins(); + _outs = 
graph.get_outs(); + + // copy graph + _graph.CopyFrom(graph); + // getting execution order + auto& node_names_in_exec_order = _graph.get_nodes_in_order(); + for (auto& node_name : node_names_in_exec_order) { + auto node_ptr = _graph[node_name]; + //if(node_ptr->get_op_name() == "Output") { + // continue; + //} + // op execution order + _exec_node_order.push_back(node_name); + _graph_node_map[node_name].name = node_name; + _graph_node_map[node_name].op_name = node_ptr->get_op_name(); + // set node op pointer + auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + node_ptr->set_op(op_pointer); + op_pointer = nullptr; + // bind parameter structure + static_cast*>(node_ptr->Op())->_helper->BindParam(node_ptr); + // parsing parameter + static_cast*>(node_ptr->Op())->_helper->InitParam(); + } + // remove null op node + for (auto it = node_names_in_exec_order.begin(); it != node_names_in_exec_order.end(); ){ + if (!_graph[*it]->Op()) { + it = node_names_in_exec_order.erase(it); + } else { + ++it; + } + } + + // compute in/out shape and initialize the _graph + std::vector > exec_funcs; + exec_funcs.resize(node_names_in_exec_order.size()); + for(int i = 0; i < node_names_in_exec_order.size(); i++) { + auto& node_name = node_names_in_exec_order[i]; + auto& op_func = exec_funcs[i]; + auto& edge_in_its = _graph.get_in_arc_its(node_name); + DLOG(ERROR) << " node : " << node_name << " (" << _graph[node_name]->get_op_name() << ") "; + for(auto& edge_it : edge_in_its) { + DLOG(INFO) << " => find in arc : " << edge_it->bottom() << " --> " << edge_it->top(); + _graph_node_map[node_name].ins.push_back(edge_it->name()); + op_func.ins.push_back(edge_it->weight().get()); + op_func.in_lanes.push_back(edge_it->lane()); + } + auto& edge_out_its = _graph.get_out_arc_its(node_name); + for(auto& edge_it : edge_out_its) { + DLOG(INFO) << " <= find out arc : " << edge_it->bottom() << " --> " << edge_it->top(); + _graph_node_map[node_name].outs.push_back(edge_it->name()); + 
op_func.outs.push_back(edge_it->weight().get()); + op_func.out_lanes.push_back(edge_it->lane()); + } + op_func.current_lane = _graph[node_name]->lane(); + op_func.need_sync = _graph[node_name]->need_wait(); + op_func.op = static_cast* >(_graph[node_name]->Op()); + op_func.op_name = _graph[node_name]->get_op_name(); + + CHECK_NOTNULL(op_func.op) << "Node(node_name) doesn't have op pointer! "; + op_func.op->_helper->InferShape(op_func.ins, op_func.outs); + } + + // initialize memory info + if(!init_memory_info()) { + return false; + } + return true; +} + +template +void CodeGenBase::change_name(graph::Graph& graph) { + auto convert2underline = [&](std::string& name, char converter_char) -> std::string { + char* target_p = strdup(name.c_str()); + for(char* p = strchr(target_p + 1, converter_char); p!=NULL; p = strchr(p + 1, converter_char)) { + *p = '_'; + } + return std::string(target_p); + }; + auto change_node_name = [&, this](graph::NodePtr& node_p) { + auto & name = node_p->name(); + // add_alias is an important api for changing node's name and edge + // and add_alias is useful only at this place so far. 
+ graph.add_alias(name, convert2underline(name, '/')); + name = convert2underline(name, '/'); + }; + graph.Scanner->BFS(change_node_name); + + auto change_edge_name = [&, this](graph::Edge& edge) { + auto & first = edge.first(); + auto & second = edge.second(); + first = convert2underline(first, '/'); + second = convert2underline(second, '/'); + }; + graph.Scanner->BFS_Edge(change_edge_name); +} + +template +bool CodeGenBase::init_memory_info() { + auto alloc_memory = [this](graph::Edge& edge) { + EdgeInfo edge_info; + edge_info.name = edge.name(); + + auto& tensor_p = edge.weight(); + if(!edge.shared()) { + tensor_p->re_alloc(tensor_p->shape()); + + edge_info.valid_shape = tensor_p->shape(); + edge_info.real_shape = tensor_p->shape(); + edge_info.is_shared = false; + } else { + edge_info.is_shared = true; + } + _tensor_map[edge_info.name] = edge_info; + return 0; + }; + _graph.Scanner->BFS_Edge(alloc_memory); + + auto share_memory = [this](graph::Edge& edge) { + if(edge.shared()) { + auto& edge_name = edge.share_from(); + + _tensor_map[edge.name()].valid_shape = edge.weight()->valid_shape(); + _tensor_map[edge.name()].real_shape = edge.weight()->shape(); + + bool continue_search = true; + while(continue_search) { + auto match_edge = [&](graph::Edge& inner_edge) { + if(inner_edge.name() == edge_name) { + if(inner_edge.shared()) { + edge_name = inner_edge.share_from(); + return Status::EXIT(" Continue to find next . 
"); + } + if (inner_edge.weight()->size() < edge.weight()->valid_size()) { + auto inner_original_shape = inner_edge.weight()->valid_shape(); + inner_edge.weight()->re_alloc(edge.weight()->valid_shape()); + inner_edge.weight()->set_shape(inner_original_shape, inner_edge.weight()->shape()); + + _tensor_map[edge_name].valid_shape = inner_edge.weight()->valid_shape(); + _tensor_map[edge_name].real_shape = edge.weight()->valid_shape(); + } + edge.weight()->share_from(*(inner_edge.weight())); + _tensor_map[edge.name()].share_from= edge_name; + continue_search = false; + return Status::EXIT(" Find the matched target edge. "); + } + return Status::OK(); + }; + _graph.Scanner->BFS_Edge(match_edge); + } + } + }; + _graph.Scanner->BFS_Edge(share_memory); + return true; +} + +#ifdef USE_CUDA +template class CodeGenBase; +template class CodeGenBase; +template class CodeGenBase; +#endif + +#ifdef USE_X86_PLACE +template class CodeGenBase; +template class CodeGenBase; +template class CodeGenBase; +#endif + +#ifdef USE_ARM_PLACE +template class CodeGenBase; +template class CodeGenBase; +template class CodeGenBase; +#endif + +} /* namespace lite */ + +} /* namespace anakin */ + diff --git a/framework/lite/code_gen_base.h b/framework/lite/code_gen_base.h new file mode 100644 index 000000000..f3cfa4fdc --- /dev/null +++ b/framework/lite/code_gen_base.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_FRAMEWORK_LITE_CODE_GEN_BASE_H +#define ANAKIN_FRAMEWORK_LITE_CODE_GEN_BASE_H + +#include +#include +#include + +#include "framework/graph/graph.h" + +namespace anakin { + +namespace lite { + +/** + * \brief Node information for generating executor + */ +struct NodeInfo { + std::string name; // node name + std::string op_name; // op name + std::vector ins; // input edge name + std::vector outs; // output edge name +}; + + +/** + * \brief Edge information for generating edge tensors. + */ +struct EdgeInfo { + std::string name; // edge name + std::vector valid_shape; // edge valid shape + std::vector real_shape; // edge real shape + bool is_shared{false}; // if the edge is shared by others + std::string share_from{""}; // if the edge is_shared(true), share_from will hold the target edge name. +}; + +/** + * \brief class for target language code generator. + * + * The class CodeGenBase hold base information for running model. + * There exists several base info: + * 1. Operatoin name in execution order. + * 2. All the tensor model needs and share info between those tensors. + * 3. 
Model weights + */ +template +class CodeGenBase { +public: + CodeGenBase() {} + virtual ~CodeGenBase(){} + + /** + * \biref extract graph msg + */ + bool extract_graph(const std::string& model_path); + + /** + * \brief generate all source files + */ + virtual void gen_files() = 0; + + +private: + /** + * \brief analyse the memory reuse info + */ + bool init_memory_info(); + + /** + * \brief change graph edge and node name to match the standard of c variable name + */ + void change_name(graph::Graph&); + + /** + * \brief generate ops of graph + */ + virtual void gen_ops() = 0; + +protected: + graph::Graph _graph; + std::vector _exec_node_order; /// running order of operation's name + std::vector _ins; /// graph ins + std::vector _outs; /// graph outs + std::unordered_map _graph_node_map; + /// graph base arch + std::unordered_map _tensor_map; +}; + +} /* namespace lite */ + +} /* namespace anakin */ + +#endif + diff --git a/framework/lite/code_gen_cpp.cpp b/framework/lite/code_gen_cpp.cpp new file mode 100644 index 000000000..73785b3d5 --- /dev/null +++ b/framework/lite/code_gen_cpp.cpp @@ -0,0 +1,391 @@ +#include "framework/lite/code_gen_cpp.h" + +namespace anakin { + +namespace lite { + +template +void GenCPP::gen_license() { + _code<< "/* Copyright (c) 2018 Baidu, Inc. 
All Rights Reserved.\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n*/\n\n"; +} + +template +void GenCPP::gen_header_start() { + _code.Clean(); + gen_license(); + _code.feed("#ifndef ANAKIN_%s_H \n", _code_name.c_str()); + _code.feed("#define ANAKIN_%s_H \n\n", _code_name.c_str()); + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n\n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n"; + _code<<"#include \n\n"; + _code<<"using namespace anakin;\n"; + _code<<"using namespace anakin::saber;\n"; + _code<<"using namespace anakin::saber::lite;\n\n"; + _code<<"namespace anakin { \n\n"; +} + +template +void GenCPP::gen_header_end() { + _code<<"} /* namespace anakin */\n"; + _code<<"\n#endif\n"; +} + +template +void GenCPP::gen_source_start() { + _code.Clean(); + _code.feed("#include \"%s.h\" \n\n", _code_name.c_str()); + _code<<"namespace anakin { \n\n"; + // add running impl for model api +} + +template +void GenCPP::gen_source_end() { + _code<<"} /* namespace anakin */\n"; +} + +template +void GenCPP::gen_tensors() { + _code<<"\n// generating tensors \n"; + for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { + auto& edge_name = 
it->first; + auto& edge_info = it->second; + if(! edge_info.is_shared) { + _code.feed("Tensor %s;\n", edge_name.c_str()); + _code.feed("Shape %s_real_shape(%d,%d,%d,%d);\n", edge_name.c_str(), edge_info.real_shape[0], + edge_info.real_shape[1], + edge_info.real_shape[2], + edge_info.real_shape[3]); + _code.feed("Shape %s_valid_shape(%d,%d,%d,%d);\n", edge_name.c_str(), edge_info.valid_shape[0], + edge_info.valid_shape[1], + edge_info.valid_shape[2], + edge_info.valid_shape[3]); + } + } + for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { + auto& edge_name = it->first; + auto& edge_info = it->second; + if(edge_info.is_shared) { + _code.feed("Tensor %s;\n", edge_name.c_str()); + _code.feed("Shape %s_valid_shape(%d,%d,%d,%d);\n", edge_name.c_str(), edge_info.valid_shape[0], + edge_info.valid_shape[1], + edge_info.valid_shape[2], + edge_info.valid_shape[3]); + } + } +} + +template +void GenCPP::tensors_init() { + _code<<"\n// initialize tensors \n"; + _code.feed("void %s_tensors_init() {\n", _code_name.c_str()); + for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { + auto& edge_name = it->first; + auto& edge_info = it->second; + if(! 
edge_info.is_shared) { + _code.feed(" %s.re_alloc(%s_real_shape);\n", edge_name.c_str(), edge_name.c_str()); + _code.feed(" %s.set_shape(%s_valid_shape);\n", edge_name.c_str(), edge_name.c_str()); + } + } + for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { + auto& edge_name = it->first; + auto& edge_info = it->second; + if(edge_info.is_shared) { + _code.feed(" %s.set_shape(%s_valid_shape);\n", edge_name.c_str(), edge_name.c_str()); + _code.feed(" %s.share_from(%s);\n", edge_name.c_str(), edge_info.share_from.c_str()); + } + } + _code<<"}\n"; + +} + +template +void GenCPP::gen_model_ios() { + _code<<"\n// generating model's I/O \n"; + for(auto & node_name : this->_exec_node_order) { + auto& node_info = this->_graph_node_map[node_name]; + _code.feed("std::vector*> %s_ins;\n", node_name.c_str()); + _code.feed("std::vector*> %s_outs;\n", node_name.c_str()); + } +} + +template +void GenCPP::model_ios_init() { + _code<<"\n// initialize model's I/O \n"; + _code.feed("void %s_model_ios_init() {\n", _code_name.c_str()); + for(auto & node_name : this->_exec_node_order) { + auto& node_info = this->_graph_node_map[node_name]; + for(auto &edge_in : node_info.ins) { + _code.feed(" %s_ins.push_back(&%s);\n", node_name.c_str(), edge_in.c_str()); + } + for(auto &edge_out : node_info.outs) { + _code.feed(" %s_outs.push_back(&%s);\n", node_name.c_str(), edge_out.c_str()); + } + } + _code<<"}\n"; +} + +template +void GenCPP::gen_ops() { + _code<<"\n// generating model's operations\n"; + for(auto & node_name : this->_exec_node_order) { + if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { + continue; + } + auto& node_info = this->_graph_node_map[node_name]; + if(OPERATION_MAP.count(node_info.op_name) > 0) { + _code.feed("%s %s; \n", OPERATION_MAP[node_info.op_name].OpClassName.c_str(), node_name.c_str()); + } + } +} + +template +void GenCPP::gen_init_impl() { + _code<<"// initial function for 
model.\n"; + _code.feed("void %s_init(Context& ctx) {\n", _code_name.c_str()); + for(auto & node_name : this->_exec_node_order) { + if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { + continue; + } + auto& node_info = this->_graph_node_map[node_name]; + if(OPERATION_MAP.count(node_info.op_name) > 0) { + _code.feed(" %s.compute_output_shape(%s_ins,%s_outs); \n", node_name.c_str(), + node_name.c_str(), + node_name.c_str()); + _code.feed(" %s.init(%s_ins,%s_outs,ctx); \n", node_name.c_str(), + node_name.c_str(), + node_name.c_str()); + } + } + _code << "}\n"; +} + +template +void GenCPP::gen_run_impl() { + _code << "// Running prediction for model. \n"; + _code.feed("void %s_prediction() {\n", _code_name.c_str()); + for(auto & node_name : this->_exec_node_order) { + if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { + continue; + } + auto& node_info = this->_graph_node_map[node_name]; + if(OPERATION_MAP.count(node_info.op_name) > 0) { + _code.feed(" %s.compute_output_shape(%s_ins,%s_outs); \n", node_name.c_str(), + node_name.c_str(), + node_name.c_str()); + _code.feed(" %s.dispatch(%s_ins,%s_outs); \n", node_name.c_str(), + node_name.c_str(), + node_name.c_str()); + } + } + _code << "}\n"; +} + +template +void GenCPP::gen_head_api() { + // gen gloss for graph ins + _code << "/// Model "<< _code_name << " have " << this->_ins.size() << " inputs.\n"; + for(auto in : this->_ins) { + auto& node_info = this->_graph_node_map[in]; + auto& edge_info = this->_tensor_map[node_info.outs[0]]; + _code << "/// |-- input name : " << in << " -- Shape("; + std::string shape_str; + for(int i=0; i 0) { + _code << edge_info.valid_shape[edge_info.valid_shape.size() - 1] << ")\n"; + } else { + _code << ")\n"; + } + } + + // gen api for getting graph input tensor + _code << "LITE_EXPORT Tensor* get_in(const char* in_name);\n\n"; + + // gen gloss for graph outs + 
_code << "/// Model " << _code_name << " have " << this->_outs.size() << " outputs.\n"; + for(auto out : this->_outs) { + auto& node_info = this->_graph_node_map[out]; + auto& edge_info = this->_tensor_map[node_info.ins[0]]; + _code << "/// |-- output name : " << out << " -- Shape("; + for(int i=0; i 0) { + _code << edge_info.valid_shape[edge_info.valid_shape.size() - 1] << ")\n"; + } else { + _code << ")\n"; + } + } + // gen api for getting graph output tensor + _code << "LITE_EXPORT Tensor* get_out(const char* out_name);\n\n"; + + // gen weights loading function + _code.feed("LITE_EXPORT bool %s_load_param(const char* param_path);\n\n", _code_name.c_str()); + + // gen api for model init + _code.feed("/// %s_init should only be invoked once when input shape changes.\n", _code_name.c_str()); + _code.feed("LITE_EXPORT void %s_init(Context& ctx);\n\n", _code_name.c_str()); + + // gen api for model prediction + _code.feed("/// Running prediction for model %s.\n", _code_name.c_str()); + _code.feed("LITE_EXPORT void %s_prediction();\n\n", _code_name.c_str()); + + // gen free function + _code.feed("/// Release all resource used by model %s.\n", _code_name.c_str()); + _code.feed("LITE_EXPORT void %s_release_resource();\n\n", _code_name.c_str()); + +} + +template +void GenCPP::gen_head_api_impl() { + // gen api for getting graph input tensor + _code << "\n// gen api for getting graph input tensor \n"; + _code << "Tensor* get_in(const char* in_name) {\n"; + _code.feed(" if(strcmp(in_name, \"%s\") == 0) {\n", this->_ins[0].c_str()); + auto node_info = this->_graph_node_map[this->_ins[0]]; + auto edge_info = this->_tensor_map[node_info.outs[0]]; + _code.feed(" return &%s;\n }", edge_info.name.c_str()); + for(int i = 1; i < this->_ins.size(); i++) { + node_info = this->_graph_node_map[this->_ins[i]]; + edge_info = this->_tensor_map[node_info.outs[0]]; + _code.feed(" else if(strcmp(in_name, \"%s\") == 0) {\n", this->_ins[i].c_str()); + _code.feed(" return &%s;\n }\n", 
edge_info.name.c_str()); + } + _code <<" else {\n return nullptr;\n }\n"; + _code <<"}\n"; + + // gen api for getting graph output tensor + _code << "\n// gen api for getting graph output tensor \n"; + _code << "Tensor* get_out(const char* out_name) {\n"; + _code.feed(" if(strcmp(out_name, \"%s\") == 0) {\n", this->_outs[0].c_str()); + node_info = this->_graph_node_map[this->_outs[0]]; + edge_info = this->_tensor_map[node_info.ins[0]]; + _code.feed(" return &%s;\n }", edge_info.name.c_str()); + for(int i = 1; i < this->_outs.size(); i++) { + node_info = this->_graph_node_map[this->_outs[i]]; + edge_info = this->_tensor_map[node_info.ins[0]]; + _code.feed(" else if(strcmp(out_name ,\"%s\") == 0) {\n", this->_outs[i].c_str()); + _code.feed(" return &%s;\n }\n", edge_info.name.c_str()); + } + _code <<" else {\n return nullptr;\n }\n"; + _code <<"}\n\n"; + + // gen weights loading function + _code.feed("float *%s = nullptr; // global weights start pointer \n", _g_weights_ptr_name.c_str()); + _code.feed("bool %s_load_param(const char* param_path) {\n", _code_name.c_str()); + _code << " FILE *f = fopen(param_path, \"rb\"); \n"; + _code << " if(!f) {\n"; + _code << " return false;\n }\n"; + _code << " fseek(f, 0, SEEK_END);\n"; + _code << " long fsize = ftell(f);\n"; + _code << " fseek(f, 0, SEEK_SET);\n"; + _code.feed(" %s = new float[fsize + 1];\n", _g_weights_ptr_name.c_str()); + _code.feed(" fread(%s, fsize, sizeof(float), f);\n", _g_weights_ptr_name.c_str()); + _code << " fclose(f);\n"; + _code.feed(" %s_tensors_init();\n", _code_name.c_str()); // invoke (model_name)_tensors_init() + _code.feed(" %s_model_ios_init();\n", _code_name.c_str()); // invoke (model_name)_model_ios_init() + for(auto & node_name : this->_exec_node_order) { + if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { + continue; + } + auto& node_info = this->_graph_node_map[node_name]; + auto& attr_info = this->_graph[node_name]->attr(); + 
if(OPERATION_MAP.count(node_info.op_name) > 0) { + LOG(INFO) << "Target op type : " << this->_graph_node_map[node_name].op_name << " parsing ..."; + auto str = OPERATION_MAP[node_info.op_name].parse(attr_info, + OPERATION_MAP[node_info.op_name].OpClassName, + node_name, + _g_weights_ptr_name, + _weights); + if(!str.empty()) { + _code.feed(" %s", str.c_str()); + } + } else { + LOG(WARNING) << "Target op type : " << this->_graph_node_map[node_name].op_name << " not support"; + } + } + _code << " return true;\n"; + _code <<"}\n\n"; + + // release all resource function impl + _code.feed("void %s_release_resource() {\n", _code_name.c_str()); + _code.feed(" delete %s;\n", _g_weights_ptr_name.c_str()); + _code.feed(" %s = nullptr;\n", _g_weights_ptr_name.c_str()); + _code <<"}\n\n"; +} + +template +void GenCPP::gen_header() { + _code.Clean(); + _code.open(_h_file_name); + gen_header_start(); + // gen api + gen_head_api(); + gen_header_end(); + _code.save(); +} + +template +void GenCPP::gen_source() { + _code.Clean(); + _code.open(_cpp_file_name); + gen_source_start(); + // generate tensors + gen_tensors(); + // tensors init + tensors_init(); + // generate i/o + gen_model_ios(); + // initial model i/o + model_ios_init(); + // generate ops + gen_ops(); + // gen head api implement + gen_head_api_impl(); + // gen initial api impl + gen_init_impl(); + // gen running api impl + gen_run_impl(); + gen_source_end(); + _code.save(); +} + +#ifdef USE_CUDA +template class GenCPP; +template class GenCPP; +template class GenCPP; +#endif + +#ifdef USE_X86_PLACE +template class GenCPP; +template class GenCPP; +template class GenCPP; +#endif + +#ifdef USE_ARM_PLACE +template class GenCPP; +template class GenCPP; +template class GenCPP; +#endif + +} /* namespace lite */ + +} /* namespace anakin */ + diff --git a/framework/lite/code_gen_cpp.h b/framework/lite/code_gen_cpp.h new file mode 100644 index 000000000..2985e8e84 --- /dev/null +++ b/framework/lite/code_gen_cpp.h @@ -0,0 +1,126 @@ 
+/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_FRAMEWORK_LITE_CODE_GENERATE_CPP_H +#define ANAKIN_FRAMEWORK_LITE_CODE_GENERATE_CPP_H + +#include "framework/lite/op_map.h" +#include "framework/lite/code_gen_base.h" + +namespace anakin { + +namespace lite { + +/** + * \brief class to generate cpp files. + * + */ +template +class GenCPP : public CodeGenBase { +public: + explicit GenCPP(std::string model_name, std::string model_dir = ".") { + _cpp_file_name = model_dir + '/' + model_name + ".cpp"; + _h_file_name = model_dir + '/' + model_name + ".h"; + _model_file_name = model_dir + '/' + model_name + ".bin"; + _weights.open(_model_file_name); + _code_name = model_name; + _g_weights_ptr_name = _code_name+"_weights_ptr"; + } + ~GenCPP()=default; + + /// generate all cpp files + virtual void gen_files() { + gen_header(); + gen_source(); + } + +private: + void gen_license(); + void gen_header_start(); + void gen_header_end(); + void gen_source_start(); + void gen_source_end(); + + /** + * \brief generate tensors for edges + */ + void gen_tensors(); + + /** + * \brief initialize tensors for edges + */ + void tensors_init(); + + /** + * \brief generate model's inputs and outputs + */ + void gen_model_ios(); + + /** + * \brief initialize model's inputs and outputs + */ + void model_ios_init(); + + /** + * \brief generate operations for model + */ + virtual void gen_ops(); + + /** + * \brief generate initial impl 
api for model + */ + void gen_init_impl(); + + /** + * \brief generate running api impl for model + */ + void gen_run_impl(); + + + /** + * \brief generate api for model + */ + void gen_head_api(); + + /** + * \brief generate head api implement + */ + void gen_head_api_impl(); + + /** + * \biref generata header file + */ + void gen_header(); + + /** + * \biref generata source file + */ + void gen_source(); + +private: + std::string _cpp_file_name; + std::string _h_file_name; + std::string _model_file_name; + std::string _code_name; + std::string _g_weights_ptr_name; + CodeWritter _code; + WeightsWritter _weights; +}; + +} /* namespace lite */ + +} /* namespace anakin */ + +#endif diff --git a/framework/lite/code_writter.h b/framework/lite/code_writter.h new file mode 100644 index 000000000..9dd03705e --- /dev/null +++ b/framework/lite/code_writter.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_FRAMEWORK_LITE_CODE_WRITTER_H +#define ANAKIN_FRAMEWORK_LITE_CODE_WRITTER_H + +#include +#include "framework/lite/file_stream.h" + +namespace anakin { + +namespace lite { + +/** + * \brief class to help generating code string. + * + */ +class CodeWritter { +public: + CodeWritter() {} + explicit CodeWritter(std::string path) { + this->open(path); + } + + // CodeWritter open file for code generating. 
+ void open(std::string& path, const char* file_mode = "w" ) { + _file_io.open(path, file_mode); + } + + // get CodeWritter's target name + std::string get_code_name() { + auto path = _file_io.get_file_path(); + char* file_path = strdup(path.c_str()); + char* pos_end = file_path + path.size()-1; + char* split_idx = nullptr; + while(*pos_end != '/') { + if(*pos_end == '.') { + *pos_end = '\0'; + split_idx = pos_end; + } + pos_end--; + } + std::string name = std::string(pos_end+1); + *split_idx='/'; + free(file_path); + return name; + } + + /// feed format string for code writter. + void feed(const char* format, ...) { + va_list vlist; + va_start(vlist, format); + auto code_str_p = pick_format(format, vlist); + // get msg + _code< + CodeWritter& operator<<(const T& var) { + _code<open(path, file_mode); + } + + ~LiteFileIO() { + if(_file_p) { + fflush(this->_file_p); + fclose(this->_file_p); + this->_file_p = nullptr; + } + } + + // write msg to file + inline bool write(const std::string& msg) { + fprintf(this->_file_p, "%s\n", msg.c_str()); + fflush(this->_file_p); + return true; + } + + // write data list to file + inline bool write(const void* ptr, size_t size, size_t count) { + size_t ret = fwrite(ptr, size, count, this->_file_p); + fflush(this->_file_p); + if(ret != count) { + LOG(ERROR) << "Writing error " << stderr; + return false; + } + return true; + } + + // read data list from file + inline bool read(void* ptr, size_t size, size_t count) { + size_t ret = fread(ptr, size, count, this->_file_p); + if(ret != count) { + LOG(ERROR) << "Reading error " << stderr; + return false; + } + return true; + } + + inline bool is_file_open() { + return _file_p != nullptr ? 
true:false; + } + + inline std::string get_file_path() { + return _file_path; + } + + /// open the target file path + void open(const std::string& path, const char* file_mode) { + // close old + if(is_file_open()) { + fflush(this->_file_p); + fclose(this->_file_p); + this->_file_p = nullptr; + } + // open new + if (!this->is_file_open()) { + _file_path = path; + char* file_path = strdup(path.c_str()); + for (char* p = strchr(file_path + 1, '/'); p!=NULL; p = strchr(p + 1, '/')){ + *p = '\0'; + struct stat st; + if ((stat(file_path, &st) == 0) && (((st.st_mode) & S_IFMT) == S_IFDIR)){ + // file_path exists and is a directory. do nothing + *p = '/'; + continue; + } else { + if(mkdir(file_path,0755)==-1){ + LOG(FATAL) << "Failed to ceate the path "<< file_path; + } + } + *p = '/'; + } + free(file_path); + this->_file_p = fopen(path.c_str(), file_mode); + if (!this->_file_p){ + LOG(FATAL)<< "Failed to open " << path.c_str(); + } + } + } + +private: + std::string _file_path{""}; + FILE* _file_p{nullptr}; +}; + +} /* namespace lite */ + +} /* namespace anakin */ + +#endif diff --git a/framework/lite/generator/gen_code.sh b/framework/lite/generator/gen_code.sh new file mode 100755 index 000000000..acbcfa714 --- /dev/null +++ b/framework/lite/generator/gen_code.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +################################################# +# +# Usage: sh gen_code.sh -n -m -o +# +################################################# +# print help info +help_gen_code() { + echo "Usage: sh gen_code.sh [-h] [-n MODEL_NAME] [-m MODEL_PATH] [-o OUTPUT_PATH]" + echo "" + echo " Generating lite code for target model." + echo "" + echo "optional arguments:" + echo "" + echo " -h help info" + echo " -n model name used as the name of generating codes." + echo " -m path to model " + echo " -o path to save the generating codes. 
[ default './']" + exit 1 +} + +# generating code function +gen_code() { + if [ $# -lt 3 ]; then + exit 1 + fi + mode_name=$1 + mode_path=$2 + out_path=$3 + executor="$( cd "$(dirname "$0")"/src ; pwd -P)"/anakin_lite_executer + $executor $mode_name $mode_path $out_path +} + +# get args +if [ $# -lt 3 ]; then + help_gen_code + exit 1 +fi + +mode_name=0 +mode_path=0 +out_path="./" +while getopts h:n:m:o:hold opt +do + case $opt in + n) mode_name=$OPTARG;; + m) mode_path=$OPTARG;; + o) out_path=${OPTARG};; + *) help_gen_code;; + esac +done + +echo "User set model name: $mode_name" +echo "User set model path: $mode_path" +echo "User set out_path: $out_path" + +if [ ! -f $mode_path ];then + echo "mode_path: $mode_path not exists." + exit 1 +fi + +if [ ! -d $out_path ];then + echo "out path: $out_path not exists." + exit 1 +fi + +gen_code $mode_name $mode_path $out_path diff --git a/framework/lite/generator/src/anakin_lite_executer.cpp b/framework/lite/generator/src/anakin_lite_executer.cpp new file mode 100644 index 000000000..05146cd06 --- /dev/null +++ b/framework/lite/generator/src/anakin_lite_executer.cpp @@ -0,0 +1,36 @@ +#include "saber/saber_types.h" +#include "framework/lite/code_gen_cpp.h" +#include "framework/core/types.h" + +using namespace anakin; +using namespace anakin::saber; +using namespace anakin::lite; + +void anakin_lite_executer(const char* model_name, const char* model_path, const char* output_path = "./") { + // constructs + GenCPP code_gen(model_name, output_path); + if(! 
code_gen.extract_graph(model_path)) { + LOG(ERROR) << "extract error on : " << model_path; + } + // gen + code_gen.gen_files(); +} + + +int main(int argc, const char** argv){ + // initial logger + logger::init(argv[0]); + if(argc < 3) { + LOG(ERROR) << "Some arguments not supplied!"; + return 1; + } + const char* model_name = argv[1]; + const char* model_path = argv[2]; + if(argc == 3) { + anakin_lite_executer(model_name, model_path); + } else { // > 3 + const char* output_path = argv[3]; + anakin_lite_executer(model_name, model_path, output_path); + } + return 0; +} diff --git a/framework/lite/op_map.h b/framework/lite/op_map.h new file mode 100644 index 000000000..734508759 --- /dev/null +++ b/framework/lite/op_map.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_FRAMEWORK_LITE_OPERATION_MAP_H +#define ANAKIN_FRAMEWORK_LITE_OPERATION_MAP_H + +#include +#include + +#include "framework/lite/code_writter.h" +#include "framework/lite/binary_writter.h" + +namespace anakin { + +namespace lite { + +template +inline T get_attr(std::string attr_name, graph::AttrInfo& attrs) { + return attrs.get(attr_name); +} + +/// function type for parser +typedef std::function ParseParamFunctor; +/** + * \brief class OpParser + */ +struct OpParser { + std::string OpClassName; + ParseParamFunctor parse; +}; + +/// operations map +extern std::unordered_map OPERATION_MAP; + +} /* namespace lite */ + +} /* namespace anakin */ + +#endif diff --git a/framework/lite/op_map_cpp.cpp b/framework/lite/op_map_cpp.cpp new file mode 100644 index 000000000..f7375b965 --- /dev/null +++ b/framework/lite/op_map_cpp.cpp @@ -0,0 +1,659 @@ +#include "framework/lite/op_map.h" +#include "framework/utils/parameter_fusion.h" + +namespace anakin { + +namespace lite { + +std::string not_impl_yet(graph::AttrInfo&, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + LOG(INFO) << "Target "<< op_class_name << "Parsing not impl yet. 
continue ..."; + return ""; +} + +// SaberConv2D +std::string ParserConvolution(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto group = get_attr("group", attr); + auto bias_term = get_attr("bias_term", attr); + auto padding = get_attr>("padding", attr); + auto strides = get_attr>("strides", attr); + auto dilation_rate = get_attr>("dilation_rate", attr); + auto filter_num = get_attr("filter_num", attr); + auto kernel_size = get_attr>("kernel_size", attr); + auto axis = get_attr("axis", attr); + + auto weights = get_attr>("weight_1", attr); + auto weights_shape = weights.shape(); + int weights_size = weights_shape[2]*weights_shape[3]; + int num_output = weights_shape[0]*weights_shape[1]; + + writter.register_weights(node_name, weights); + if(bias_term) { + auto bias = get_attr>("weight_2", attr); + writter.register_weights(node_name, bias); + } + + auto offset_info = writter.get_weights_by_name(node_name); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d);\n", node_name.c_str(), + weights_size, + num_output, + group, + kernel_size[1], + kernel_size[0], + strides[1], + strides[0], + padding[1], + padding[0], + dilation_rate[1], + dilation_rate[0], + bias_term ? "true":"false", + weights_ptr_name.c_str(), + offset_info.weights[0].offset, + weights_ptr_name.c_str(), + bias_term ? 
offset_info.weights[1].offset : 0); + return code_w.get_code_string(); +} + +// ParserConvolutionRelu +std::string ParserConvolutionRelu(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto group = get_attr("group", attr); + auto bias_term = get_attr("bias_term", attr); + auto padding = get_attr>("padding", attr); + auto strides = get_attr>("strides", attr); + auto dilation_rate = get_attr>("dilation_rate", attr); + auto filter_num = get_attr("filter_num", attr); + auto kernel_size = get_attr>("kernel_size", attr); + auto axis = get_attr("axis", attr); + + auto weights = get_attr>("weight_1", attr); + auto weights_shape = weights.shape(); + int weights_size = weights_shape[2]*weights_shape[3]; + int num_output = weights_shape[0]*weights_shape[1]; + + writter.register_weights(node_name, weights); + if(bias_term) { + auto bias = get_attr>("weight_2", attr); + writter.register_weights(node_name, bias); + } + + auto offset_info = writter.get_weights_by_name(node_name); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,Active_relu,%s+%d,%s+%d);\n", node_name.c_str(), + weights_size, + num_output, + group, + kernel_size[1], + kernel_size[0], + strides[1], + strides[0], + padding[1], + padding[0], + dilation_rate[1], + dilation_rate[0], + bias_term ? "true":"false", + weights_ptr_name.c_str(), + offset_info.weights[0].offset, + weights_ptr_name.c_str(), + bias_term ? 
offset_info.weights[1].offset : 0); + return code_w.get_code_string(); +} + +std::string ParserConvBatchnormScale(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto group = get_attr("group", attr); + auto bias_term = get_attr("bias_term", attr); + auto padding = get_attr>("padding", attr); + auto strides = get_attr>("strides", attr); + auto dilation_rate = get_attr>("dilation_rate", attr); + auto filter_num = get_attr("filter_num", attr); + auto kernel_size = get_attr>("kernel_size", attr); + auto axis = get_attr("axis", attr); + + auto weights = get_attr>("weight_1", attr); + auto weights_shape = weights.shape(); + int weights_size = weights_shape[2]*weights_shape[3]; + int num_output = weights_shape[0]*weights_shape[1]; + + // get batchnorm param + auto epsilon = get_attr("batchnorm_0_epsilon", attr); + auto momentum = get_attr("batchnorm_0_momentum", attr); + auto batch_norm_weight_1 = get_attr>("batchnorm_0_weight_1", attr); + auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); + auto batch_norm_weight_2 = get_attr>("batchnorm_0_weight_2", attr); + auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); + auto batch_norm_weight_3 = get_attr>("batchnorm_0_weight_3", attr); + auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); + + // get scale param + auto scale_num_axes = get_attr("scale_0_num_axes", attr); + auto scale_bias_term = get_attr("scale_0_bias_term", attr); + auto scale_axis = get_attr("scale_0_axis", attr); + auto scale_weight_1 = get_attr>("scale_0_weight_1", attr); + auto scale_weight_1_vector = scale_weight_1.vector(); + auto scale_weight_2 = get_attr>("scale_0_weight_2", attr); + auto scale_weight_2_vector = scale_weight_2.vector(); + + + if(bias_term) { + auto bias = get_attr>("weight_2", attr); + update_weights(weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], 
weights_shape[3], + bias_term, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + + + writter.register_weights(node_name, weights); + writter.register_weights(node_name, bias); + } else { + auto bias = PBlock(); + update_weights(weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + + writter.register_weights(node_name, weights); + writter.register_weights(node_name, bias); + } + + auto offset_info = writter.get_weights_by_name(node_name); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d);\n", node_name.c_str(), + weights_size, + num_output, + group, + kernel_size[1], + kernel_size[0], + strides[1], + strides[0], + padding[1], + padding[0], + dilation_rate[1], + dilation_rate[0], + "true", + weights_ptr_name.c_str(), + offset_info.weights[0].offset, + weights_ptr_name.c_str(), + bias_term ? 
offset_info.weights[1].offset : 0); + return code_w.get_code_string(); +} + +// SaberConvBatchnormScaleRelu +std::string ParserConvBatchnormScaleRelu(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto group = get_attr("group", attr); + auto bias_term = get_attr("bias_term", attr); + auto padding = get_attr>("padding", attr); + auto strides = get_attr>("strides", attr); + auto dilation_rate = get_attr>("dilation_rate", attr); + auto filter_num = get_attr("filter_num", attr); + auto kernel_size = get_attr>("kernel_size", attr); + auto axis = get_attr("axis", attr); + + auto weights = get_attr>("weight_1", attr); + auto weights_shape = weights.shape(); + int weights_size = weights_shape[2]*weights_shape[3]; + int num_output = weights_shape[0]*weights_shape[1]; + + // get batchnorm param + auto epsilon = get_attr("batchnorm_0_epsilon", attr); + auto momentum = get_attr("batchnorm_0_momentum", attr); + auto batch_norm_weight_1 = get_attr>("batchnorm_0_weight_1", attr); + auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); + auto batch_norm_weight_2 = get_attr>("batchnorm_0_weight_2", attr); + auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); + auto batch_norm_weight_3 = get_attr>("batchnorm_0_weight_3", attr); + auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); + + // get scale param + auto scale_num_axes = get_attr("scale_0_num_axes", attr); + auto scale_bias_term = get_attr("scale_0_bias_term", attr); + auto scale_axis = get_attr("scale_0_axis", attr); + auto scale_weight_1 = get_attr>("scale_0_weight_1", attr); + auto scale_weight_1_vector = scale_weight_1.vector(); + auto scale_weight_2 = get_attr>("scale_0_weight_2", attr); + auto scale_weight_2_vector = scale_weight_2.vector(); + + if(bias_term) { + auto bias = get_attr>("weight_2", attr); + update_weights(weights, bias, + weights_shape[0], 
weights_shape[1], weights_shape[2], weights_shape[3], + bias_term, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + + + writter.register_weights(node_name, weights); + writter.register_weights(node_name, bias); + } else { + auto bias = PBlock(); + update_weights(weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + + writter.register_weights(node_name, weights); + writter.register_weights(node_name, bias); + } + + auto offset_info = writter.get_weights_by_name(node_name); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,Active_relu,%s+%d,%s+%d);\n", node_name.c_str(), + weights_size, + num_output, + group, + kernel_size[1], + kernel_size[0], + strides[1], + strides[0], + padding[1], + padding[0], + dilation_rate[1], + dilation_rate[0], + "true", + weights_ptr_name.c_str(), + offset_info.weights[0].offset, + weights_ptr_name.c_str(), + bias_term ? 
offset_info.weights[1].offset : 0); + return code_w.get_code_string(); +} + +// SaberConcat +std::string ParserConcat(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto axis = get_attr("axis", attr); + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%d);\n", node_name.c_str(), axis); + return code_w.get_code_string(); +} + +// SaberDectionOutput +std::string ParserDectionOutput(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto flag_share_location = get_attr("share_location", attr); + auto flag_var_in_target = get_attr("variance_encode_in_target", attr); + auto classes_num = get_attr("class_num", attr); + auto background_id = get_attr("background_id", attr); + auto keep_top_k = get_attr("keep_top_k", attr); + auto code_type = get_attr("code_type", attr); + auto conf_thresh = get_attr("conf_thresh", attr); + auto nms_top_k = get_attr("nms_top_k", attr); + auto nms_thresh = get_attr("nms_thresh", attr); + auto nms_eta = get_attr("nms_eta", attr); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%s,%s,%d,%d,%d,%s,%f,%d,%f,%f);\n", node_name.c_str(), + flag_share_location ? "true":"false", + flag_var_in_target ? 
"true":"false", + classes_num, + background_id, + keep_top_k, + code_type.c_str(), + conf_thresh, + nms_top_k, + nms_thresh, + nms_eta); + return code_w.get_code_string(); +} + +// SaberEltwise +std::string ParserEltwise(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto type = get_attr("type", attr); + auto coeff = get_attr>("coeff", attr); + + std::string eltwise_type_str("Eltwise_unknow"); + + if (type == "Add") { + eltwise_type_str = "Eltwise_sum"; + } else if (type == "Max") { + eltwise_type_str = "Eltwise_max"; + } else { + eltwise_type_str = "Eltwise_prod"; + } + + CodeWritter coeff_vec_code; + coeff_vec_code<<"{"; + for(int i=0; i 0) { + coeff_vec_code<("type", attr); + + std::string act_type("Active_unknow"); + + if (type == "TanH") { + act_type = "Active_tanh"; + } else if (type == "Sigmoid") { + act_type = "Active_sigmoid"; + } else { + LOG(FATAL) << "Other Activation type" << type << " unknown."; + } + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%s);\n", node_name.c_str(), act_type.c_str()); + return code_w.get_code_string(); +} + +std::string ParserRelu(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto alpha = get_attr("alpha", attr); + + std::string act_type("Active_relu"); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%s,%f);\n", node_name.c_str(), act_type.c_str(),alpha); + return code_w.get_code_string(); +} + +// SaberFc +std::string ParserFc(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto axis = get_attr("axis", attr); + auto out_dim = get_attr("out_dim", attr); + auto bias_term = get_attr("bias_term", attr); + + auto weights = 
get_attr>("weight_1", attr); + writter.register_weights(node_name, weights); + if(bias_term) { + auto bias = get_attr>("weight_2", attr); + writter.register_weights(node_name, bias); + } + + auto offset_info = writter.get_weights_by_name(node_name); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%d,%d,false,%s,%s+%d,%s+%d);\n", node_name.c_str(), axis, out_dim, + bias_term ? "true":"false", + weights_ptr_name.c_str(), + offset_info.weights[0].offset, + weights_ptr_name.c_str(), + bias_term ? offset_info.weights[1].offset : 0); + return code_w.get_code_string(); +} + +// SaberPermute +std::string ParserPermute(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto dims = get_attr>("dims", attr); + + CodeWritter dims_vec_code; + dims_vec_code<<"{"; + for(int i=0; i 0) { + dims_vec_code<("global_pooling", attr); + auto pool_padding = get_attr>("padding", attr); + auto pool_strides = get_attr>("strides", attr); + auto pool_size = get_attr>("pool_size", attr); + auto pool_method = get_attr("method", attr); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%s,%s,%d,%d,%d,%d,%d,%d);\n", node_name.c_str(), + pool_method.c_str(), + global_pooling ? 
"true" : "false", + pool_size[1], + pool_size[0], + pool_strides[1], + pool_strides[0], + pool_padding[1], + pool_padding[0]); + return code_w.get_code_string(); +} + +// SaberPrelu +std::string ParserPrelu(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto channel_shared = get_attr("channel_shared", attr); + + auto weights = get_attr>("weight_1", attr); + writter.register_weights(node_name, weights); + + auto offset_info = writter.get_weights_by_name(node_name); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%s,%s+%d);\n", node_name.c_str(), channel_shared ? "true":"false", + weights_ptr_name.c_str(), + offset_info.weights[0].offset); + return code_w.get_code_string(); +} + +// SaberPriorBox +std::string ParserPriorBox(graph::AttrInfo& attr, + std::string& op_class_name, + std::string& node_name, + std::string& weights_ptr_name, + WeightsWritter& writter) { + // parsing parameter + auto min_size = get_attr>("min_size", attr); + auto max_size = get_attr>("max_size", attr); + auto as_ratio = get_attr>("aspect_ratio", attr); + auto flip_flag = get_attr("is_flip", attr); + auto clip_flag = get_attr("is_clip", attr); + auto var = get_attr>("variance", attr); + auto image_h = get_attr("img_h", attr); + auto image_w = get_attr("img_w", attr); + auto step_h = get_attr("step_h", attr); + auto step_w = get_attr("step_w", attr); + auto offset = get_attr("offset", attr); + + auto gen_vec_code = [](PTuple ptuple) -> std::string { + CodeWritter dims_vec_code; + dims_vec_code<<"{"; + for(int i=0; i 0) { + dims_vec_code<("slice_dim", attr); + auto slice_point = get_attr>("slice_point", attr); + auto axis = get_attr("axis", attr); + + CodeWritter slice_point_vec_code; + slice_point_vec_code<<"{"; + for(int i=0; i 0) { + slice_point_vec_code<("axis", attr); + + // gen cpp code + CodeWritter code_w; + code_w.feed("%s.load_param(%d);\n", 
node_name.c_str(), axis); + return code_w.get_code_string(); +} + +std::unordered_map OPERATION_MAP({ + {"Input", {"Input", not_impl_yet} }, + {"Convolution", {"SaberConv2D", ParserConvolution} }, // done + {"Activation", {"SaberActivation", ParserActivation} }, // done + {"ReLU", {"SaberActivation",ParserRelu}}, // done + {"ConvRelu", {"SaberConvAct2D", ParserConvolutionRelu} }, // done + {"ConvBatchnormScaleRelu", {"SaberConvAct2D", ParserConvBatchnormScaleRelu}}, // done have question ?? + {"ConvBatchnormScale", {"SaberConv2D", ParserConvBatchnormScale}}, //done + {"Concat", {"SaberConcat", ParserConcat} }, // done + {"DetectionOutput", {"SaberDectionOutput", ParserDectionOutput} }, // done + {"Eltwise", {"SaberEltwise", ParserEltwise} }, //done + {"Eltwise", {"SaberEltwiseRelu", not_impl_yet}}, // not impl ?? + {"Dense", {"SaberFc", ParserFc} }, // done + {"Permute", {"SaberPermute", ParserPermute} }, // done + {"Pooling", {"SaberPooling", ParserPooling} }, // done + {"ReLU", {"SaberPrelu", ParserPrelu} }, // done + {"PriorBox", {"SaberPriorBox", ParserPriorBox} }, // done + {"Slice", {"SaberSlice", ParserSlice} }, // done + {"Softmax", {"SaberSoftmax", ParserSoftmax} } // done +}); + +} /* namespace lite */ + +} /* namespace anakin */ + diff --git a/framework/model_parser/CMakeLists.txt b/framework/model_parser/CMakeLists.txt index dd0e1a9e5..c6bc3e721 100644 --- a/framework/model_parser/CMakeLists.txt +++ b/framework/model_parser/CMakeLists.txt @@ -1,10 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file CMakeLists files in the model parser directory of project -# @auther cuichaowen -# @date 2017-10-24 -# ---------------------------------------------------------------------------- - +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. set(ANAKIN_BASE_SRC "") # add ak_base_source files diff --git a/framework/model_parser/parser/model_io.cpp b/framework/model_parser/parser/model_io.cpp index 795f5317c..de094d0d8 100644 --- a/framework/model_parser/parser/model_io.cpp +++ b/framework/model_parser/parser/model_io.cpp @@ -6,14 +6,14 @@ namespace anakin { namespace parser { -template -NodeIO& NodeIO::operator>>(const NodeProto& node_proto) { - graph::NodePtr node_p = std::make_shared>(); +template +NodeIO& NodeIO::operator>>(const NodeProto& node_proto) { + graph::NodePtr node_p = std::make_shared(); node_p->name() = node_proto.name(); node_p->need_wait() = node_proto.need_wait(); node_p->lane() = node_proto.lane(); auto it = node_proto.attr().begin(); - + DLOG(INFO)<<"read :"<name(); for (; it != node_proto.attr().end(); ++it) { auto& key = it->first; auto& value = it->second; @@ -130,17 +130,18 @@ NodeIO& NodeIO::operator>>(const NodeP case TENSOR: { auto& tensor = value.tensor(); - auto& shape = tensor.shape(); - CHECK_EQ(shape.dim().size(), 4) << "Weights parameter's shape len must equal to 4."; + auto& real_shape = tensor.shape(); + auto& valid_shape = tensor.valid_shape(); + CHECK_EQ(real_shape.dim().size(), 4) << "Weights parameter's shape len must equal to 4."; auto& data = tensor.data(); switch (data.type()) { case FLOAT: { /* At so far, we only support weights saved as float. 
*/ - saber::Shape saber_shape(1, 1, 1, 1); + saber::Shape saber_shape({1, 1, 1, 1}); - // get shape + // get real_shape for (int i = 0; i < 4; i++) { - saber_shape[i] = shape.dim().value()[i]; + saber_shape[i] = real_shape.dim().value()[i]; } auto* block = graph::GraphGlobalMem::Global().template new_block(saber_shape); @@ -151,12 +152,25 @@ NodeIO& NodeIO::operator>>(const NodeP cpu_data[i] = data.f()[i]; } -#ifdef USE_CUDA - //! map cpu data to GPU - //block->tensor().get_gpu_data(); +#if defined(USE_CUDA) || defined(AMD_GPU) + // map cpu data to GPU block->d_tensor().set_shape(saber_shape); block->d_tensor().copy_from(block->h_tensor()); #endif + if(valid_shape.dim().size() == 0) { + // set valid shape (== real shape) for host and device + block->d_tensor().set_shape(saber_shape); + block->h_tensor().set_shape(saber_shape); + } else { + saber::Shape saber_valid_shape({1, 1, 1, 1}); + for (int i=0; i < 4; i++) { + saber_valid_shape[i] = valid_shape.dim().value()[i]; + } + // set valid shape for host and device + block->d_tensor().set_shape(saber_valid_shape); + block->h_tensor().set_shape(saber_valid_shape); + } + node_p->set_attr(key, *block); } break; @@ -183,15 +197,15 @@ NodeIO& NodeIO::operator>>(const NodeP return *this; } -template -NodeIO& NodeIO::operator>>(const - graph::NodePtr node_p) { +template +NodeIO& NodeIO::operator>>(const + graph::NodePtr& node_p) { _que.push(node_p); return *this; } -template -Status NodeIO::operator<<(graph::Graph& graph) { +template +Status NodeIO::operator<<(graph::Graph& graph) { while (!this->empty()) { auto& node_p = _que.front(); DLOG(WARNING) << "[NODE] Graph get node: " << node_p->name(); @@ -207,8 +221,8 @@ Status NodeIO::operator<<(graph::Graph return Status::OK(); } -template -Status NodeIO::operator<<(GraphProto& graph) { +template +Status NodeIO::operator<<(GraphProto& graph) { while (!this->empty()) { auto& node_p = _que.front(); NodeProto* node_proto = graph.add_nodes(); @@ -222,9 +236,9 @@ Status 
NodeIO::operator<<(GraphProto& graph) { // set node proto's attr auto node_proto_attr = node_proto->mutable_attr(); - auto it = node_p->attr().parameter.begin(); + auto it = node_p->attr().begin(); - for (; it != node_p->attr().parameter.end(); ++it) { + for (; it != node_p->attr().end(); ++it) { auto& key = it->first; auto& value = it->second; @@ -298,30 +312,62 @@ Status NodeIO::operator<<(GraphProto& graph) { (*node_proto_attr)[key].set_type(CACHE_LIST); (*node_proto_attr)[key].mutable_cache_list()->set_type(BOOLEN); (*node_proto_attr)[key].mutable_cache_list()->set_size(tuple_bool.size()); - } else if (value.type() == "anakin_block_float") { // default block have float data - auto block_float = any_cast>(value); + } else if (value.type() == "anakin_block") { // default block have float data + auto block_float = any_cast>(value); float* cpu_data = static_cast(block_float.h_tensor().mutable_data()); - auto shape_saber = block_float.shape(); + auto valid_shape = block_float.shape(); + auto real_shape = block_float.real_shape(); - // set proto tensor shape - for (int i = 0; i < shape_saber.dims(); i++) { - (*node_proto_attr)[key].mutable_tensor()->mutable_shape()->mutable_dim()->add_value(shape_saber[i]); - } + if(valid_shape == real_shape) { + // set proto tensor shape + for (int i = 0; i < valid_shape.dims(); i++) { + (*node_proto_attr)[key].mutable_tensor()->mutable_shape()->mutable_dim()->add_value(valid_shape[i]); + } - (*node_proto_attr)[key].mutable_tensor()->mutable_shape()->mutable_dim()->set_size( - shape_saber.size()); + (*node_proto_attr)[key].mutable_tensor()->mutable_shape()->mutable_dim()->set_size( + valid_shape.size()); - // set proto tensor data - for (int i = 0; i < shape_saber.count(); i++) { - (*node_proto_attr)[key].mutable_tensor()->mutable_data()->add_f(cpu_data[i]); - } + // set proto tensor data + for (int i = 0; i < valid_shape.count(); i++) { + (*node_proto_attr)[key].mutable_tensor()->mutable_data()->add_f(cpu_data[i]); + } + + 
(*node_proto_attr)[key].mutable_tensor()->mutable_data()->set_type(FLOAT); + (*node_proto_attr)[key].mutable_tensor()->mutable_data()->set_size(valid_shape.count()); + (*node_proto_attr)[key].set_type(TENSOR); + } else { + // set proto tensor valid shape + for (int i = 0; i < valid_shape.dims(); i++) { + (*node_proto_attr)[key].mutable_tensor()->mutable_valid_shape()->mutable_dim()->add_value(valid_shape[i]); + } + (*node_proto_attr)[key].mutable_tensor()->mutable_valid_shape()->mutable_dim()->set_size( + valid_shape.size()); + + // set proto tensor real shape + for (int i = 0; i < real_shape.dims(); i++) { + (*node_proto_attr)[key].mutable_tensor()->mutable_shape()->mutable_dim()->add_value(real_shape[i]); + } + (*node_proto_attr)[key].mutable_tensor()->mutable_shape()->mutable_dim()->set_size( + real_shape.size()); + + + // set proto tensor data + for (int i = 0; i < real_shape.count(); i++) { + (*node_proto_attr)[key].mutable_tensor()->mutable_data()->add_f(cpu_data[i]); + } - (*node_proto_attr)[key].mutable_tensor()->mutable_data()->set_type(FLOAT); - (*node_proto_attr)[key].mutable_tensor()->mutable_data()->set_size(shape_saber.count()); - (*node_proto_attr)[key].set_type(TENSOR); + (*node_proto_attr)[key].mutable_tensor()->mutable_data()->set_type(FLOAT); + (*node_proto_attr)[key].mutable_tensor()->mutable_data()->set_size(real_shape.count()); + (*node_proto_attr)[key].set_type(TENSOR); + } } else { + auto tuple_float = any_cast>(value); + (*node_proto_attr)[key].set_type(CACHE_LIST); + (*node_proto_attr)[key].mutable_cache_list()->set_type(FLOAT); + (*node_proto_attr)[key].mutable_cache_list()->set_size(tuple_float.size()); + LOG(ERROR) << "node: " << node_p->name() << " (" << node_p->get_op_name() << ") \ - key : " << key << " value_type: " << value.type(); + key : " << key << " value_type: " << value.type(); } } @@ -332,21 +378,36 @@ Status NodeIO::operator<<(GraphProto& graph) { } #ifdef USE_CUDA -template class NodeIO; -template class NodeIO; -template 
class NodeIO; +template class NodeIO; +template class NodeIO; +template class NodeIO; +#endif + +#ifdef AMD_GPU +template class NodeIO; +template class NodeIO; +template class NodeIO; #endif #ifdef USE_X86_PLACE -template class NodeIO; -template class NodeIO; -template class NodeIO; +template class NodeIO; +template class NodeIO; +template class NodeIO; #endif #ifdef USE_ARM_PLACE -template class NodeIO; -template class NodeIO; -template class NodeIO; +#ifdef ANAKIN_TYPE_FP32 +template class NodeIO; +#endif + +#ifdef ANAKIN_TYPE_FP16 +template class NodeIO; +#endif + +#ifdef ANAKIN_TYPE_INT8 +template class NodeIO; +#endif + #endif } /* parser */ diff --git a/framework/model_parser/parser/model_io.h b/framework/model_parser/parser/model_io.h index 7ba9ba7f3..e3394c486 100644 --- a/framework/model_parser/parser/model_io.h +++ b/framework/model_parser/parser/model_io.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -32,7 +32,7 @@ namespace anakin { namespace parser { -template +template class NodeIO { public: NodeIO() {} @@ -44,10 +44,10 @@ class NodeIO { // read NodeProto NodeIO& operator>>(const NodeProto& node_proto); // read Node - NodeIO& operator>>(const graph::NodePtr node_p); + NodeIO& operator>>(const graph::NodePtr& node_p); // output to Graph - Status operator<<(graph::Graph& graph); + Status operator<<(graph::Graph& graph); // output to GraphProto Status operator<<(GraphProto& graph); @@ -56,7 +56,7 @@ class NodeIO { std::vector& get_node_name_in_order() { return _que_node_name_in_order; } private: - std::queue> _que; + std::queue _que; std::vector _que_node_name_in_order; }; diff --git a/framework/model_parser/parser/parser.cpp b/framework/model_parser/parser/parser.cpp index f70ee2704..1bc110986 100644 --- a/framework/model_parser/parser/parser.cpp +++ b/framework/model_parser/parser/parser.cpp @@ -17,49 +17,13 @@ namespace anakin { namespace parser { -template -Status load(graph::Graph* graph, std::string& model_path) { +template +Status load(graph::Graph* graph, std::string& model_path) { return load(graph, model_path.c_str()); } -#ifdef USE_CUDA -template -Status load(graph::Graph* graph, - std::string& model_path); -template -Status load(graph::Graph* graph, - std::string& model_path); -template -Status load(graph::Graph* graph, - std::string& model_path); -#endif - -#ifdef USE_X86_PLACE -template -Status load(graph::Graph* graph, - std::string& model_path); -template -Status load(graph::Graph* graph, - std::string& model_path); -template -Status load(graph::Graph* graph, - std::string& model_path); -#endif - -#ifdef USE_ARM_PLACE -template -Status load(graph::Graph* graph, - std::string& model_path); -template -Status load(graph::Graph* graph, - std::string& model_path); -template -Status load(graph::Graph* graph, - std::string& model_path); -#endif - -template -Status load(graph::Graph* graph, const char* model_path) { +template +Status 
load(graph::Graph* graph, const char* model_path) { #if 0 std::fstream input(model_path, std::ios::in | std::ios::binary); @@ -86,13 +50,16 @@ Status load(graph::Graph* graph, const char* model_path) { google::protobuf::io::ZeroCopyInputStream* raw_input = new google::protobuf::io::FileInputStream( file_descriptor); + google::protobuf::io::CodedInputStream* coded_input = new google::protobuf::io::CodedInputStream( raw_input); + coded_input->SetTotalBytesLimit(ProtoReadBytesLimit, 536870912); + bool success = graph_proto.ParseFromCodedStream(coded_input); if (!success) { - LOG(FATAL) << " Parsing GraphProto " << model_path; + LOG(FATAL) << " Parsing GraphProto " << model_path << " ERROR"; } delete coded_input; @@ -100,8 +67,8 @@ Status load(graph::Graph* graph, const char* model_path) { close(file_descriptor); #endif // fill the graph with name - LOG(INFO) << " graph name: " << graph_proto.name(); - graph->name() = graph_proto.name(); + LOG(INFO) << "graph name: " << graph_proto.name(); + graph->set_name(graph_proto.name()); // fill the graph with ins/outs for (int i = 0; i < graph_proto.ins().size(); i++) { @@ -117,7 +84,7 @@ Status load(graph::Graph* graph, const char* model_path) { } // fill the graph with nodes - NodeIO node_io; + NodeIO node_io; for (int i = 0; i < graph_proto.nodes().size(); i++) { node_io >> graph_proto.nodes()[i]; @@ -134,10 +101,10 @@ Status load(graph::Graph* graph, const char* model_path) { auto& second = it_in->second; for (int i = 0; i < second.val().size(); i++) { - //Tensor4dPtr tensor_p = std::make_shared>(); - graph::Edge edge(second.val()[i], key); - //edge.weight() = new Tensor4d(); - //edge.weight() = std::make_shared >(); + //Tensor4dPtr tensor_p = std::make_shared>(); + graph::Edge edge(second.val()[i], key); + //edge.weight() = new Tensor4d(); + //edge.weight() = std::make_shared >(); edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); edge.share_from() = 
(*graph_proto.mutable_edges_info())[edge.name()].share_from(); graph->add_in_arc(edge); @@ -151,10 +118,10 @@ Status load(graph::Graph* graph, const char* model_path) { auto& second = it_out->second; for (int i = 0; i < second.val().size(); i++) { - //Tensor4dPtr tensor_p = std::make_shared>(); - graph::Edge edge(key, second.val()[i]); - //edge.weight() = new Tensor4d(); - //edge.weight() = std::make_shared >(); + //Tensor4dPtr tensor_p = std::make_shared>(); + graph::Edge edge(key, second.val()[i]); + //edge.weight() = new Tensor4d(); + //edge.weight() = std::make_shared >(); edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); graph->add_out_arc(edge); @@ -168,8 +135,8 @@ Status load(graph::Graph* graph, const char* model_path) { if (graph_proto.edges().count(node_name) > 0) { auto& second_node_name_list = graph_proto.edges().at(node_name); for(int j = 0; j < second_node_name_list.val().size(); j++) { - graph::Edge edge(node_name, second_node_name_list.val()[j]); - edge.weight() = std::make_shared >(); + graph::Edge edge(node_name, second_node_name_list.val()[j]); + edge.weight() = std::make_shared >(); edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); graph->add_arc(edge); @@ -190,85 +157,13 @@ Status load(graph::Graph* graph, const char* model_path) { return Status::OK(); } -#ifdef USE_CUDA -template -Status load(graph::Graph* graph, - const char* model_path); -template -Status load(graph::Graph* graph, - const char* model_path); -template -Status load(graph::Graph* graph, - const char* model_path); -#endif - -#ifdef USE_X86_PLACE -template -Status load(graph::Graph* graph, - const char* model_path); -template -Status load(graph::Graph* graph, - const char* model_path); -template -Status load(graph::Graph* graph, - const char* model_path); -#endif - 
-#ifdef USE_ARM_PLACE -template -Status load(graph::Graph* graph, - const char* model_path); -template -Status load(graph::Graph* graph, - const char* model_path); -template -Status load(graph::Graph* graph, - const char* model_path); -#endif - -template -Status save(graph::Graph* graph, std::string& model_path) { +template +Status save(graph::Graph* graph, std::string& model_path) { return save(graph, model_path.c_str()); } -#ifdef USE_CUDA -template -Status save(graph::Graph* graph, - std::string& model_path); -template -Status save(graph::Graph* graph, - std::string& model_path); -template -Status save(graph::Graph* graph, - std::string& model_path); -#endif - -#ifdef USE_X86_PLACE -template -Status save(graph::Graph* graph, - std::string& model_path); -template -Status save(graph::Graph* graph, - std::string& model_path); -template -Status save(graph::Graph* graph, - std::string& model_path); -#endif - -#ifdef USE_ARM_PLACE -template -Status save(graph::Graph* graph, - std::string& model_path); -template -Status save(graph::Graph* graph, - std::string& model_path); -template -Status save(graph::Graph* graph, - std::string& model_path); -#endif - -template -Status save(graph::Graph* graph, const char* model_path) { +template +Status save(graph::Graph* graph, const char* model_path) { std::fstream output(model_path, std::ios::out | std::ios::trunc | std::ios::binary); if (!output) { @@ -291,7 +186,7 @@ Status save(graph::Graph* graph, const char* model_path) { } // fill the graph proto nodes with NodePtr in exec order - NodeIO node_io; + NodeIO node_io; auto nodes_in_exec_order = graph->get_nodes_in_order(); for (int i = 0; i < nodes_in_exec_order.size(); i++) { @@ -304,7 +199,7 @@ Status save(graph::Graph* graph, const char* model_path) { auto edges_in = graph_proto.mutable_edges_in(); auto edges_out = graph_proto.mutable_edges_out(); auto edges_info = graph_proto.mutable_edges_info(); - /*auto insert_edge = [&](graph::Edge& edge) { + /*auto insert_edge = 
[&](graph::Edge& edge) { (*edges)[edge.first()].add_val(edge.second()); TensorProto ts; ts.set_name(edge.name()); @@ -312,7 +207,7 @@ Status save(graph::Graph* graph, const char* model_path) { ts.set_share_from(edge.share_from()); (*edges_info)[edge.name()].CopyFrom(ts); };*/ - auto insert_edge = [&](graph::NodePtr& node_p) { + auto insert_edge = [&](graph::NodePtr& node_p) { auto& arcs_it_in = graph->get_in_arc_its(node_p->name()); auto& arcs_it_out = graph->get_out_arc_its(node_p->name()); @@ -352,40 +247,131 @@ Status save(graph::Graph* graph, const char* model_path) { return Status::OK(); } + #ifdef USE_CUDA template -Status save(graph::Graph* graph, - const char* model_path); +Status load(graph::Graph* graph, const char* model_path); +template +Status load(graph::Graph* graph, const char* model_path); +template +Status load(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, std::string& model_path); + +template +Status load(graph::Graph* graph, std::string& model_path); template -Status save(graph::Graph* graph, - const char* model_path); +Status load(graph::Graph* graph, std::string& model_path); template -Status save(graph::Graph* graph, - const char* model_path); +Status load(graph::Graph* graph, std::string& model_path); + +template +Status save(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, const char* model_path); #endif #ifdef USE_X86_PLACE template -Status save(graph::Graph* graph, - const char* model_path); +Status load(graph::Graph* graph, const char* model_path); +template +Status load(graph::Graph* graph, const char* model_path); +template +Status load(graph::Graph* graph, const char* model_path); + template -Status save(graph::Graph* graph, - const char* 
model_path); +Status save(graph::Graph* graph, std::string& model_path); template -Status save(graph::Graph* graph, - const char* model_path); +Status save(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, std::string& model_path); + +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, std::string& model_path); + +template +Status save(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, const char* model_path); #endif #ifdef USE_ARM_PLACE +#ifdef ANAKIN_TYPE_FP32 +template +Status load(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, const char* model_path); +#endif + +#ifdef ANAKIN_TYPE_FP16 +template +Status load(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, const char* model_path); +#endif + +#ifdef ANAKIN_TYPE_INT8 +template +Status load(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, const char* model_path); +#endif + +#endif + + +#ifdef AMD_GPU +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, std::string& model_path); + +template +Status load(graph::Graph* graph, const char* model_path); +template +Status 
load(graph::Graph* graph, const char* model_path); +template +Status load(graph::Graph* graph, const char* model_path); + +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, std::string& model_path); + template -Status save(graph::Graph* graph, - const char* model_path); +Status save(graph::Graph* graph, const char* model_path); template -Status save(graph::Graph* graph, - const char* model_path); +Status save(graph::Graph* graph, const char* model_path); template -Status save(graph::Graph* graph, - const char* model_path); +Status save(graph::Graph* graph, const char* model_path); #endif } /* parser */ diff --git a/framework/model_parser/parser/parser.h b/framework/model_parser/parser/parser.h index 59d091700..2a2df23fc 100644 --- a/framework/model_parser/parser/parser.h +++ b/framework/model_parser/parser/parser.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,16 +28,16 @@ namespace anakin { namespace parser { //! parse data of external model_path file into graph. -template -Status load(graph::Graph* graph, std::string& model_path); -template -Status load(graph::Graph* graph, const char* model_path); +template +Status load(graph::Graph* graph, std::string& model_path); +template +Status load(graph::Graph* graph, const char* model_path); //! save graph to disk. use to save improved Graph. 
-template -Status save(graph::Graph* graph, std::string& model_path); -template -Status save(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, const char* model_path); } /* parser */ diff --git a/framework/model_parser/proto/tensor.proto b/framework/model_parser/proto/tensor.proto index cf50d3c85..f46c643ca 100644 --- a/framework/model_parser/proto/tensor.proto +++ b/framework/model_parser/proto/tensor.proto @@ -47,9 +47,12 @@ message TensorProto { // ( only used when anakin generates optimized model) bytes share_from = 3; - // tensor shape + // tensor real shape TensorShape shape = 8; + // tensor valid shape + TensorShape valid_shape = 9; + // tensor data cache. CacheDate data = 10; }; diff --git a/framework/operators/activation.cpp b/framework/operators/activation.cpp index 8df3e6bc1..92900b5d9 100644 --- a/framework/operators/activation.cpp +++ b/framework/operators/activation.cpp @@ -4,39 +4,54 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Activation::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_activation; - impl->_funcs_activation(ins, outs, param, ctx); +#define INSTANCE_ACTIVATION(Ttype, Ptype) \ +template<> \ +void Activation::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_activation; \ + impl->_funcs_activation(ins, outs, param, ctx); \ } -#endif - -/// TODO ... 
specialization other type of operator - /// set helper -template -ActivationHelper::~ActivationHelper() { +template +ActivationHelper::~ActivationHelper() { } -template -Status ActivationHelper::InitParam() { +template +Status ActivationHelper::InitParam() { DLOG(WARNING) << "Parsing Activation op parameter."; auto type = GET_PARAMETER(std::string, type); - if (type == "TanH") { - ActivationParam> param_activation(Active_tanh); + ActivationParam param_activation(Active_tanh); _param_activation = param_activation; } else if (type == "Sigmoid") { - ActivationParam> param_activation(Active_sigmoid); + ActivationParam param_activation(Active_sigmoid); + _param_activation = param_activation; + } else if (type == "PReLU") { + auto channel_shared = GET_PARAMETER(bool, channel_shared); + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + + PreluParam prelu_param(channel_shared, &(weights.d_tensor())); + + ActivationParam param_activation(Active_prelu, 0, 0, prelu_param); + _param_activation = param_activation; + } else if (type == "Stanh") { + ActivationParam param_activation(Active_stanh); _param_activation = param_activation; + } else if (type == "Relu") { + ActivationParam param_activation(Active_relu); + _param_activation = param_activation; + } else if (type == "ClippedRelu") { + ActivationParam param_activation(Active_clipped_relu); + _param_activation = param_activation; + } else if (type == "Elu") { + ActivationParam param_activation(Active_elu); + _param_activation = param_activation; } else { LOG(FATAL) << "Other Activation type" << type << " should be replace by other ops."; } @@ -44,54 +59,76 @@ Status ActivationHelper::InitParam() { return Status::OK(); } -template -Status ActivationHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, VENDER_IMPL, ctx)); +template +Status ActivationHelper::Init(OpContext& ctx, + const 
std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status ActivationHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status ActivationHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_activation.compute_output_shape(ins, outs, _param_activation)); return Status::OK(); } #ifdef USE_CUDA -template class ActivationHelper; -template class ActivationHelper; -template class ActivationHelper; -#endif -#ifdef USE_ARM_PLACE -template class ActivationHelper; -template class ActivationHelper; -template class ActivationHelper; +INSTANCE_ACTIVATION(NV, Precision::FP32); + +template<> +Status ActivationHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, STATIC, VENDER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, NV, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, NV, AK_FLOAT, Precision::FP32); + +#ifdef USE_X86_PLACE +INSTANCE_ACTIVATION(X86, Precision::FP32); +INSTANCE_ACTIVATION(X86, Precision::FP16); +INSTANCE_ACTIVATION(X86, Precision::INT8); +template class ActivationHelper; +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, X86, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_ACTIVATION(ARM, Precision::FP32); +template class ActivationHelper; +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ACTIVATION(AMD, Precision::FP32); +template class ActivationHelper; +template class ActivationHelper; +template class ActivationHelper; +ANAKIN_REGISTER_OP_HELPER(Activation, 
ActivationHelper, AMD, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(Activation) .Doc("Activation operator") #ifdef USE_CUDA -.__alias__("activation") +.__alias__("activation") #endif #ifdef USE_ARM_PLACE -.__alias__("activation") +.__alias__("activation") +#endif +#ifdef USE_X86_PLACE +.__alias__("activation") +#endif +#ifdef AMD_GPU +.__alias__("activation") #endif .num_in(1) .num_out(1) -.Args("type", " type of Activation "); +.Args("type", " type of Activation ") +.Args("channel_shared", "prelu channel is shared or not "); } /* namespace ops */ } /* namespace anakin */ - diff --git a/framework/operators/activation.h b/framework/operators/activation.h index 42bdac06f..bcc1100aa 100644 --- a/framework/operators/activation.h +++ b/framework/operators/activation.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class ActivationHelper; /// pooling op @@ -34,20 +34,20 @@ class ActivationHelper; * \brief operation of ops class * public inheritance Operator */ -template -class Activation : public Operator { +template +class Activation : public Operator { public: Activation() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Activation< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ActivationHelper; + friend class ActivationHelper; }; /** @@ -55,8 +55,8 @@ class Activation : public Operator { * public inheritance OperatorHelper * including init operation context and the size of shape */ -template -class ActivationHelper : public OperatorHelper { +template +class ActivationHelper : public OperatorHelper { public: ActivationHelper()=default; @@ -72,8 +72,8 @@ class ActivationHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class ActivationHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_activation stand for activation parameter - saber::ActivationParam> _param_activation; + saber::ActivationParam _param_activation; ///< _funcs_activation stand for activation function - saber::Activation _funcs_activation; + saber::Activation::saber_type> _funcs_activation; }; diff --git a/framework/operators/arg_max.cpp b/framework/operators/arg_max.cpp index b85251791..ce88d4596 100644 --- a/framework/operators/arg_max.cpp +++ b/framework/operators/arg_max.cpp @@ -4,82 +4,116 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Argmax::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = impl->_param_argmax; - impl->_funcs_argmax(ins, outs, param, ctx); -} -#endif +//#ifdef USE_CUDA +//template<> +//void Argmax::operator()( +// OpContext& ctx, +// const std::vector >& ins, +// std::vector >& outs) { +// auto* impl = +// static_cast*>(this->_helper); +// auto& param = impl->_param_argmax; +// impl->_funcs_argmax(ins, outs, param, ctx); +//} +//#endif /// TODO ... 
specialization other type of operator - +#define INSTANCE_ARGMAX(Ttype, Ptype) \ +template<> \ +void Argmax::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = impl->_param_argmax; \ + impl->_funcs_argmax(ins, outs, param, ctx); \ +} /// set helper -template -ArgmaxHelper::~ArgmaxHelper() { +template +ArgmaxHelper::~ArgmaxHelper() { } -template -Status ArgmaxHelper::InitParam() { +template +Status ArgmaxHelper::InitParam() { DLOG(WARNING) << "Parsing Argmax op parameter."; auto out_max_val = GET_PARAMETER(bool, out_max_val); auto top_k = GET_PARAMETER(int, top_k); - auto axis = GET_PARAMETER(int, axis); - saber::ArgmaxParam> argmax_param(out_max_val, top_k, axis); - _param_argmax = argmax_param; + auto axis_term = GET_PARAMETER(bool, axis_term); + + if (axis_term == true) { + auto axis = GET_PARAMETER(int, axis); + saber::ArgmaxParam argmax_param(out_max_val, top_k, axis); + _param_argmax = argmax_param; + } else { + saber::ArgmaxParam argmax_param(out_max_val, top_k); + _param_argmax = argmax_param; + } + return Status::OK(); } -template -Status ArgmaxHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status ArgmaxHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_argmax.init(ins, outs, _param_argmax, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status ArgmaxHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status ArgmaxHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_argmax.compute_output_shape(ins, outs, _param_argmax)); return Status::OK(); } #ifdef USE_CUDA -template class ArgmaxHelper; -template class ArgmaxHelper; -template class ArgmaxHelper; +INSTANCE_ARGMAX(NV, Precision::FP32); +template class ArgmaxHelper; +template class ArgmaxHelper; +template class ArgmaxHelper; 
+ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, NV, Precision::FP32); #endif -#ifdef USE_ARM_PLACE -template class ArgmaxHelper; -template class ArgmaxHelper; -template class ArgmaxHelper; +#ifdef USE_X86_PLACE +INSTANCE_ARGMAX(X86, Precision::FP32); +template class ArgmaxHelper; +template class ArgmaxHelper; +template class ArgmaxHelper; +ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, X86, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, NV, AK_FLOAT, Precision::FP32); -#endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, ARM, AK_FLOAT, Precision::FP32); -#endif + +#ifdef ANAKIN_TYPE_FP32 +INSTANCE_ARGMAX(ARM, Precision::FP32); +template class ArgmaxHelper; +ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, ARM, Precision::FP32); +#endif //fp32 + +#ifdef ANAKIN_TYPE_FP16 +template class ArgmaxHelper; +#endif //fp16 + +#ifdef ANAKIN_TYPE_INT8 +template class ArgmaxHelper; +#endif //int8 + +#endif //arm //! register op ANAKIN_REGISTER_OP(Argmax) .Doc("Argmax operator") #ifdef USE_CUDA -.__alias__("Argmax") +.__alias__("Argmax") #endif #ifdef USE_ARM_PLACE -.__alias__("Argmax") +.__alias__("Argmax") +#endif + +#ifdef USE_X86_PLACE +.__alias__("Argmax") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/arg_max.h b/framework/operators/arg_max.h index 05ef7ca4f..e772fa163 100644 --- a/framework/operators/arg_max.h +++ b/framework/operators/arg_max.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class ArgmaxHelper; /// axpy op @@ -34,20 +34,20 @@ class ArgmaxHelper; * \brief operation of argMax class * public inheritance Operator */ -template -class Argmax : public Operator { +template +class Argmax : public Operator { public: Argmax() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Argmax::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Argmax< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ArgmaxHelper; + friend class ArgmaxHelper; }; /** @@ -55,8 +55,8 @@ class Argmax : public Operator { * public inheritance OperatorHelper * including init operation context and the size of shape */ -template -class ArgmaxHelper : public OperatorHelper { +template +class ArgmaxHelper : public OperatorHelper { public: ArgmaxHelper()=default; @@ -72,8 +72,8 @@ class ArgmaxHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class ArgmaxHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_argmax stand for argmax parameter - saber::ArgmaxParam> _param_argmax; + saber::ArgmaxParam _param_argmax; ///< _funcs_argmax stand for argmax function - saber::Argmax _funcs_argmax; + saber::Argmax::saber_type> _funcs_argmax; private: ///< _dims stand for argmax size diff --git a/framework/operators/axpy.cpp b/framework/operators/axpy.cpp index 56d7a9e14..67768775c 100644 --- a/framework/operators/axpy.cpp +++ b/framework/operators/axpy.cpp @@ -4,80 +4,106 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Axpy::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = impl->_param_axpy; - impl->_funcs_axpy(ins, outs, param, ctx); -} -#endif +//#ifdef USE_CUDA +//template<> +//void Axpy::operator()( +// OpContext& ctx, +// const std::vector >& ins, +// std::vector >& outs) { +// auto* impl = +// static_cast*>(this->_helper); +// auto& param = impl->_param_axpy; +// impl->_funcs_axpy(ins, outs, param, ctx); +//} +//#endif /// TODO ... 
specialization other type of operator - +#define INSTANCE_AXPY(Ttype, Ptype) \ +template<> \ +void Axpy::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = impl->_param_axpy; \ + impl->_funcs_axpy(ins, outs, param, ctx); \ +} /// set helper -template -AxpyHelper::~AxpyHelper() { +template +AxpyHelper::~AxpyHelper() { } -template -Status AxpyHelper::InitParam() { +template +Status AxpyHelper::InitParam() { DLOG(WARNING) << "Parsing Axpy op parameter."; - saber::AxpyParam> axpy_param; + saber::AxpyParam axpy_param; _param_axpy = axpy_param; return Status::OK(); } -template -Status AxpyHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status AxpyHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_axpy.init(ins, outs, _param_axpy, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status AxpyHelper::InferShape(const std::vector >& +template +Status AxpyHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { + std::vector >& outs) { SABER_CHECK(_funcs_axpy.compute_output_shape(ins, outs, _param_axpy)); return Status::OK(); } #ifdef USE_CUDA -template class AxpyHelper; -template class AxpyHelper; -template class AxpyHelper; +INSTANCE_AXPY(NV, Precision::FP32); +template class AxpyHelper; +template class AxpyHelper; +template class AxpyHelper; +ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, NV, Precision::FP32); +#endif + +#ifdef USE_X86_PLACE +INSTANCE_AXPY(X86, Precision::FP32); +template class AxpyHelper; +template class AxpyHelper; +template class AxpyHelper; +ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -template class AxpyHelper; -template class AxpyHelper; -template class AxpyHelper; + +#ifdef ANAKIN_TYPE_FP32 +INSTANCE_AXPY(ARM, Precision::FP32); +template class AxpyHelper; 
+ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, ARM, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, NV, AK_FLOAT, Precision::FP32); +#ifdef ANAKIN_TYPE_FP16 +template class AxpyHelper; #endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, ARM, AK_FLOAT, Precision::FP32); + +#ifdef ANAKIN_TYPE_INT8 +template class AxpyHelper; #endif +#endif//arm + //! register op ANAKIN_REGISTER_OP(Axpy) .Doc("Axpy operator") #ifdef USE_CUDA -.__alias__("axpy") +.__alias__("axpy") #endif #ifdef USE_ARM_PLACE -.__alias__("axpy") +.__alias__("axpy") +#endif +#ifdef USE_X86_PLACE +.__alias__("axpy") #endif .num_in(3) .num_out(1); diff --git a/framework/operators/axpy.h b/framework/operators/axpy.h index 6a77dc7d4..8e36c3da0 100644 --- a/framework/operators/axpy.h +++ b/framework/operators/axpy.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class AxpyHelper; /// axpy op @@ -34,20 +34,20 @@ class AxpyHelper; * \brief operation of Axpy class * public inheritance Operator */ -template -class Axpy : public Operator { +template +class Axpy : public Operator { public: Axpy() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator axpy::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Axpy< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class AxpyHelper; + friend class AxpyHelper; }; /** @@ -55,8 +55,8 @@ class Axpy : public Operator { * public inheritance OperatorHelper * including init operation context and the size of shape */ -template -class AxpyHelper : public OperatorHelper { +template +class AxpyHelper : public OperatorHelper { public: AxpyHelper()=default; @@ -72,8 +72,8 @@ class AxpyHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class AxpyHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_axpy stand for axpy parameter - saber::AxpyParam> _param_axpy; + saber::AxpyParam _param_axpy; ///< _funcs_axpy stand for axpy function - saber::Axpy _funcs_axpy; + saber::Axpy::saber_type> _funcs_axpy; private: ///< _dims stand for axpy size diff --git a/framework/operators/batch_norm.cpp b/framework/operators/batch_norm.cpp index d590449ff..33f5d9956 100644 --- a/framework/operators/batch_norm.cpp +++ b/framework/operators/batch_norm.cpp @@ -4,88 +4,101 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA +#define INSTANCE_BATCHNORM(Ttype, Ptype) \ +template<> \ +void BatchNorm::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>(this->_helper)->_param_scale; \ + impl->_funcs_scale(ins, outs, param, ctx); \ +} + +#if 0//def USE_CUDA template<> void BatchNorm::operator()( OpContext& ctx, const std::vector >& ins, std::vector >& outs) { - /*auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_permute; - impl->_funcs_permute(ins, outs, param, ctx);*/ + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_scale; + impl->_funcs_scale(ins, outs, param, ctx); } #endif -/// TODO ... 
specialization other type of operator - - -/// set helper -template -BatchNormHelper::~BatchNormHelper() { - LOG(INFO) << "Decons permute_cpu_float"; -} +template +Status BatchNormHelper::InitParam() { + DLOG(WARNING) << "Parsing Scale op parameter."; + using pblock_type = PBlock; + + auto eps = GET_PARAMETER(float, epsilon); + auto mean = GET_PARAMETER(pblock_type, weight_1); + auto var = GET_PARAMETER(pblock_type, weight_2); + auto scale_factor = GET_PARAMETER(pblock_type, weight_3); + auto mean_vec = mean.vector(); + auto var_vec = var.vector(); + auto scale_factor_vec = scale_factor.vector(); + std::vector::type> scale; + std::vector::type> bias; + scale.resize(mean.count()); + bias.resize(mean.count()); + auto scale_val = scale_factor_vec[0] == 0 ? 0 : 1 / scale_factor_vec[0]; + for (int i = 0; i < mean.count(); i++) { + scale[i] = 1.0f / std::sqrt(var_vec[i] * scale_val + eps); + bias[i] = - mean_vec[i] * scale_val / std::sqrt(var_vec[i] * scale_val + eps); + } -template -Status BatchNormHelper::InitParam() { + saber::ScaleParam param_scale(scale, bias, true, 1, 1); + _param_scale = param_scale; return Status::OK(); } -template -Status BatchNormHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - //_funcs_permute.init(ins, outs, _param_permute, SPECIFY, VENDER_IMPL, ctx); +template +Status BatchNormHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_scale.init(ins, outs, _param_scale, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status BatchNormHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - std::vector shape; - - //_funcs_permute.compute_output_shape(shape, ins, _param_permute); - //CHECK_EQ(shape.size(), outs.size()) << " size of (out) should be equal to that of vector (shape)."; - for (int i = 0; i < outs.size(); i++) { - // set tensor shape tensor->set_shape(shape[i]); - outs[i]->set_shape(ins[i]->shape()); - } - +template +Status 
BatchNormHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_scale.compute_output_shape(ins, outs, _param_scale)); return Status::OK(); } -#ifdef USE_CUDA -template class BatchNormHelper; -template class BatchNormHelper; -template class BatchNormHelper; -#endif - -#ifdef USE_ARM_PLACE -template class BatchNormHelper; -template class BatchNormHelper; -template class BatchNormHelper; -#endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, NV, AK_FLOAT, Precision::FP32); +INSTANCE_BATCHNORM(NV, Precision::FP32); +template class BatchNormHelper; +ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, NV, Precision::FP32); +#endif + +#ifdef USE_X86_PLACE +INSTANCE_BATCHNORM(X86, Precision::FP32); +template class BatchNormHelper; +ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_BATCHNORM(ARM, Precision::FP32); +template class BatchNormHelper; +ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(BatchNorm) - .Doc("BatchNorm operator") +.Doc("BatchNorm operator") #ifdef USE_CUDA - .__alias__("power") +.__alias__("eps") #endif #ifdef USE_ARM_PLACE - .__alias__("power") +.__alias__("eps") #endif - .num_in(1) - .num_out(1) - .Args>("dims", " dims for permuting the order of input "); +.num_in(1) +.num_out(1); } /* namespace ops */ diff --git a/framework/operators/batch_norm.h b/framework/operators/batch_norm.h index b578af503..ff778a1a5 100644 --- a/framework/operators/batch_norm.h +++ b/framework/operators/batch_norm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -//#include "saber/funcs/permute.h" +#include "saber/funcs/scale.h" namespace anakin { namespace ops { -template +template class BatchNormHelper; /// pooling op @@ -34,20 +34,20 @@ class BatchNormHelper; * \brief Batch normalization class * public inherit Operator */ -template -class BatchNorm : public Operator { +template +class BatchNorm : public Operator { public: BatchNorm() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator BatchNorm< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class BatchNormHelper; + friend class BatchNormHelper; }; /** @@ -55,12 +55,12 @@ class BatchNorm : public Operator { * public inherit OperatorHelper * including init resource and shape size in BatchNorm processing */ -template -class BatchNormHelper : public OperatorHelper { +template +class BatchNormHelper : public OperatorHelper { public: BatchNormHelper()=default; - ~BatchNormHelper(); + ~BatchNormHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class BatchNormHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,20 +81,15 @@ class BatchNormHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; - + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: - //PermuteParam _param_permute; - //saber::Permute _funcs_permute; + saber::ScaleParam _param_scale; + ///< _funcs_scale stand for scale function + saber::Scale::saber_type > _funcs_scale; -private: - ///< _dims stand for batchNorm size - PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/bk/gru.cpp b/framework/operators/bk/gru.cpp new file mode 100644 index 000000000..e36cda50f --- /dev/null +++ b/framework/operators/bk/gru.cpp @@ -0,0 +1,136 @@ +#include "framework/operators/gru.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void Gru::operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_gru; + impl->_funcs_gru(ins, outs, param, ctx); +} +#endif +#ifdef USE_X86_PLACE +template<> +void Gru::operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_gru; + impl->_funcs_gru(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator +/// set helper +template +GruHelper::~GruHelper() { +} + +template +Status GruHelper::InitParam() { + DLOG(WARNING) << "Parsing Gru op parameter."; + auto is_reverse = GET_PARAMETER(bool, is_reverse); + auto gate_act = GET_PARAMETER(std::string, gate_activation); + auto hidden_act = GET_PARAMETER(std::string, activation); + auto formula = GET_PARAMETER(std::string, gru_formula); + + using pblock_type = PBlock; + auto weight_wu = GET_PARAMETER(pblock_type, weight_1); + auto bias = GET_PARAMETER(pblock_type, weight_2); + + CHECK((formula != "") && (formula == "gru_origin" + || formula == "gru_cudnn")) << "formula illegal"; + + std::unordered_map act_map = { + {"sigmoid_fluid", Active_sigmoid}, + {"relu_fluid", Active_relu}, + {"tanh_fluid", Active_tanh}, + {"identity_fluid", Active_identity} + }; + std::unordered_map formula_map = { + {"gru_origin", GRU_ORIGIN}, + {"gru_cudnn", GRU_CUDNN}, + }; + CHECK_GT(weight_wu.d_tensor().valid_size(),0)<<"weights size must > 0"; + CHECK_GT(bias.d_tensor().valid_size(),0)<<"bias size must > 0"; + + GruParam gru_param(&(weight_wu.d_tensor()), &(bias.d_tensor()), + formula_map[formula], act_map[gate_act], + act_map[hidden_act], is_reverse); + + _param_gru = gru_param; + + return Status::OK(); +} + +template +Status GruHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_gru.init(ins, outs, _param_gru, SPECIFY, SABER_IMPL, ctx)); +// SABER_CHECK(_funcs_gru.init(ins, outs, _param_gru, SPECIFY, VENDER_IMPL, ctx)); + return Status::OK(); +} + +template +Status GruHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_gru.compute_output_shape(ins, outs, _param_gru)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class GruHelper; +template class GruHelper; +template class GruHelper; +#endif + +#ifdef USE_ARM_PLACE +template class GruHelper; +template class GruHelper; +template class GruHelper; +#endif + 
+#ifdef USE_X86_PLACE +template class GruHelper; +template class GruHelper; +template class GruHelper; +#endif + +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, ARM, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, X86, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Gru) + .Doc("Gru operator") +#ifdef USE_CUDA + .__alias__("gru") +#endif +#ifdef USE_ARM_PLACE + .__alias__("gru") +#endif +#ifdef USE_X86_PLACE + .__alias__("gru") +#endif + .num_in(1) + .num_out(1) + .Args("is_reverse", " is_reverse for gru.") + .Args("gate_activation", "gate_activation for gru.") + .Args("activation", "hidden_activation for gru."); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/gru.h b/framework/operators/bk/gru.h similarity index 65% rename from framework/operators/gru.h rename to framework/operators/bk/gru.h index 3a5ebb977..4fc1171a0 100644 --- a/framework/operators/gru.h +++ b/framework/operators/bk/gru.h @@ -1,6 +1,6 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -28,7 +28,7 @@ namespace anakin { namespace ops { -template +template class GruHelper; @@ -37,20 +37,20 @@ class GruHelper; * \brief Gru implementation class * public inherit Operator */ -template -class Gru : public Operator { +template +class Gru : public Operator { public: Gru() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Gru::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Gru< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class GruHelper; + friend class GruHelper; }; /** @@ -58,8 +58,8 @@ class Gru : public Operator { * public inherit OperatorHelper * including init resource and shape size in Gru context */ -template -class GruHelper : public OperatorHelper { +template +class GruHelper : public OperatorHelper { public: GruHelper()=default; @@ -75,8 +75,8 @@ class GruHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -84,14 +84,14 @@ class GruHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_gru stand for Gru parameter - saber::GruParam> _param_gru; + saber::GruParam _param_gru; ///< _funcs_gru stand for Gru function - saber::Gru _funcs_gru; + saber::Gru::saber_type> _funcs_gru; }; } /* namespace ops */ diff --git a/framework/operators/concat.cpp b/framework/operators/concat.cpp index 1af835f0d..426941d5c 100644 --- a/framework/operators/concat.cpp +++ b/framework/operators/concat.cpp @@ -4,100 +4,71 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Concat::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_concat; - impl->_funcs_concat(ins, outs, param, ctx); -} -#endif -#ifdef USE_X86_PLACE -template<> -void Concat::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_concat; - impl->_funcs_concat(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - -/// set helper -template -ConcatHelper::~ConcatHelper() { -} - -template -Status ConcatHelper::InitParam() { +template +Status ConcatHelper::InitParam() { DLOG(WARNING) << "Parsing Concat op parameter."; auto axis = GET_PARAMETER(int, axis); - ConcatParam> param_concat(axis); + ConcatParam param_concat(axis); _param_concat = param_concat; return Status::OK(); } -template -Status ConcatHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status ConcatHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs){ SABER_CHECK(_funcs_concat.init(ins, outs, _param_concat, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status ConcatHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status ConcatHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_concat.compute_output_shape(ins, outs, _param_concat)); return Status::OK(); } + +#define INSTANCE_CONCAT(Ttype, Ptype) \ +template<> \ +void Concat::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_concat; \ + impl->_funcs_concat(ins, outs, param, ctx); \ +} + #ifdef USE_CUDA -template class ConcatHelper; -template class ConcatHelper; -template class ConcatHelper; -#endif -#ifdef USE_ARM_PLACE -template class ConcatHelper; -template class ConcatHelper; -template class ConcatHelper; -#endif -#ifdef USE_X86_PLACE -template class ConcatHelper; -template class ConcatHelper; -template class ConcatHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, NV, AK_FLOAT, Precision::FP32); +INSTANCE_CONCAT(NV, Precision::FP32); +template class ConcatHelper; +ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, NV, Precision::FP32); #endif + #ifdef USE_ARM_PLACE 
-ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_CONCAT(ARM, Precision::FP32); +template class ConcatHelper; +ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, ARM, Precision::FP32); #endif + #ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, X86, AK_FLOAT, Precision::FP32); +INSTANCE_CONCAT(X86, Precision::FP32); +template class ConcatHelper; +ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, X86, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(Concat) .Doc("Concat operator") #ifdef USE_CUDA -.__alias__("concat") +.__alias__("concat") #endif #ifdef USE_ARM_PLACE -.__alias__("concat") +.__alias__("concat") #endif #ifdef USE_X86_PLACE -.__alias__("concat") +.__alias__("concat") #endif .num_in(2) .num_out(1) diff --git a/framework/operators/concat.h b/framework/operators/concat.h index e191bd35d..c937f758b 100644 --- a/framework/operators/concat.h +++ b/framework/operators/concat.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class ConcatHelper; /// pooling op @@ -34,20 +34,20 @@ class ConcatHelper; * \brief contct class * public inherit Operator */ -template -class Concat : public Operator { +template +class Concat : public Operator { public: Concat() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Concat< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConcatHelper; + friend class ConcatHelper; }; /** @@ -55,12 +55,12 @@ class Concat : public Operator { * public inherit OperatorHelper * including init resource and shape size in contact context */ -template -class ConcatHelper : public OperatorHelper { +template +class ConcatHelper : public OperatorHelper { public: ConcatHelper()=default; - ~ConcatHelper(); + ~ConcatHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class ConcatHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class ConcatHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_concat stand for contact parameter - saber::ConcatParam> _param_concat; + saber::ConcatParam _param_concat; ///< _funcs_concat stand for contact function - saber::Concat _funcs_concat; + saber::Concat::saber_type> _funcs_concat; private: ///< _dims stand for contact size @@ -96,7 +96,6 @@ class ConcatHelper : public OperatorHelper { }; - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/conv_3x3.cpp b/framework/operators/conv_3x3.cpp index dd5da9b9e..24318ba96 100644 --- a/framework/operators/conv_3x3.cpp +++ b/framework/operators/conv_3x3.cpp @@ -6,27 +6,39 @@ namespace ops { #ifdef USE_CUDA template<> -void SassConvolution::operator()( +void SassConvolution::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> (this->_helper)->_param_conv; impl->_funcs_conv(ins, outs, param, ctx); } #endif +#ifdef AMD_GPU +template<> +void SassConvolution::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> + (this->_helper)->_param_conv; + impl->_funcs_conv(ins, outs, param, ctx); +} +#endif /// TODO ... 
specialization other type of operator /// set helper -template -SassConvolutionHelper::~SassConvolutionHelper() { +template +SassConvolutionHelper::~SassConvolutionHelper() { } -template -Status SassConvolutionHelper::InitParam() { +template +Status SassConvolutionHelper::InitParam() { DLOG(WARNING) << "Parsing SassConvolution op parameter."; auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -36,19 +48,19 @@ Status SassConvolutionHelper::InitParam() { auto filter_num = GET_PARAMETER(int, filter_num); auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), &(bias.d_tensor())); _param_conv = conv_param; } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], + Tensor4d* bias = new Tensor4d();; + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), bias); @@ -58,52 +70,89 @@ Status SassConvolutionHelper::InitParam() { return Status::OK(); } -template -Status SassConvolutionHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status SassConvolutionHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); SABER_CHECK(_funcs_conv.init(ins, outs, _param_conv, SPECIFY, SABER_IMPL, ctx)); + + // check if weights have been transposed + auto is_weights_transed = 
CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply(std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply(std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status SassConvolutionHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status SassConvolutionHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_conv.compute_output_shape(ins, outs, _param_conv)); return Status::OK(); } #ifdef USE_CUDA -template class SassConvolutionHelper; -template class SassConvolutionHelper; -template class SassConvolutionHelper; +template class SassConvolutionHelper; +template class SassConvolutionHelper; +template class SassConvolutionHelper; #endif -#ifdef USE_ARM_PLACE -template class SassConvolutionHelper; -template class SassConvolutionHelper; -template class SassConvolutionHelper; +//#ifdef USE_ARM_PLACE +//template class SassConvolutionHelper; +//template class SassConvolutionHelper; +//template class SassConvolutionHelper; +//#endif + +#ifdef AMD_GPU +template class SassConvolutionHelper; +template class SassConvolutionHelper; +template class SassConvolutionHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, ARM, AK_FLOAT, Precision::FP32); 
+//ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, ARM, Precision::FP32); +#endif + +#ifdef AMD_GPU +ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, AMD, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(SassConvolution) .Doc("SassConvolution operator") #ifdef USE_CUDA -.__alias__("convolution") +.__alias__("convolution") #endif -#ifdef USE_ARM_PLACE -.__alias__("convolution") +#ifdef AMD_GPU +.__alias__("convolution") #endif +//#ifdef USE_ARM_PLACE +//.__alias__("convolution") +//#endif .num_in(1) .num_out(1) .Args("group", " group of conv ") diff --git a/framework/operators/conv_3x3.h b/framework/operators/conv_3x3.h index 251dc5343..acc69ea34 100644 --- a/framework/operators/conv_3x3.h +++ b/framework/operators/conv_3x3.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class SassConvolutionHelper; /// pooling op @@ -34,20 +34,20 @@ class SassConvolutionHelper; * \brief conv_3X3 implementation class * public inherit Operator */ -template -class SassConvolution : public Operator { +template +class SassConvolution : public Operator { public: SassConvolution() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SassConvolution< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvolutionHelper; + friend class SassConvolutionHelper; }; /** @@ -55,8 +55,8 @@ class SassConvolution : public Operator { * public inherit OperatorHelper * including init resource and shape size in conv3X3 context */ -template -class SassConvolutionHelper : public OperatorHelper { +template +class SassConvolutionHelper : public OperatorHelper { public: SassConvolutionHelper()=default; @@ -72,8 +72,8 @@ class SassConvolutionHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class SassConvolutionHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv stand for conv_3X3 parameter - saber::ConvParam> _param_conv; + saber::ConvParam _param_conv; ///< _funcs_conv stand for convolution function - saber::Conv _funcs_conv; + saber::Conv::saber_type> _funcs_conv; private: ///< _dims stand for conv_3X3 size diff --git a/framework/operators/convolution.cpp b/framework/operators/convolution.cpp index 487e812d7..10515b5ec 100644 --- a/framework/operators/convolution.cpp +++ b/framework/operators/convolution.cpp @@ -4,29 +4,19 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Convolution::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_conv; - impl->_funcs_conv(ins, outs, param, ctx); +#define INSTANCE_CONVOLUTION(Ttype, Ptype) \ +template<> \ +void Convolution::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_conv; \ + impl->_funcs_conv(ins, outs, param, ctx); \ } -#endif - -/// TODO ... 
specialization other type of operator - -/// set helper -template -ConvolutionHelper::~ConvolutionHelper() { -} - -template -Status ConvolutionHelper::InitParam() { +template +Status ConvolutionHelper::InitParam() { DLOG(WARNING) << "Parsing Convolution op parameter."; auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -36,29 +26,20 @@ Status ConvolutionHelper::InitParam() { auto filter_num = GET_PARAMETER(int, filter_num); auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - DLOG(INFO) << "conv group : " << group; - DLOG(INFO) << "conv bias_term: " << bias_term; - DLOG(INFO) << "conv padding : [" << padding[0] << " " << padding[1] << "]"; - DLOG(INFO) << "conv strides : [" << strides[0] << " " << strides[1] << "]"; - DLOG(INFO) << "conv dilation_rate : [" << dilation_rate[0] << " " << dilation_rate[1] << "]"; - DLOG(INFO) << "conv filter_num : " << filter_num; - DLOG(INFO) << "conv kernel_size : " << kernel_size[0] << " " << kernel_size[1] << "]"; - DLOG(INFO) << "conv axis : " << axis; - - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), &(bias.d_tensor())); _param_conv = conv_param; } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], + Tensor4d* bias = new Tensor4d();; + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), bias); @@ -68,62 +49,123 @@ Status ConvolutionHelper::InitParam() { return Status::OK(); } -template -Status ConvolutionHelper::Init(OpContext& ctx, - const std::vector >& ins, 
- std::vector >& outs) { - SABER_CHECK(_funcs_conv.init(ins, outs, _param_conv, SPECIFY, VENDER_IMPL, ctx)); +template +Status ConvolutionHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + SABER_CHECK(_funcs_conv.init(ins, outs, _param_conv, SPECIFY, SABER_IMPL, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status ConvolutionHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status ConvolutionHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_conv.compute_output_shape(ins, outs, _param_conv)); return Status::OK(); } #ifdef USE_CUDA -template class ConvolutionHelper; -template class ConvolutionHelper; -template class ConvolutionHelper; -#endif +INSTANCE_CONVOLUTION(NV, Precision::FP32); +template <> +Status ConvolutionHelper::Init(OpContext &ctx, \ + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + SABER_CHECK(_funcs_conv.init(ins, outs, _param_conv, SPECIFY, VENDER_IMPL, ctx)); -#ifdef USE_ARM_PLACE -template class 
ConvolutionHelper; -template class ConvolutionHelper; -template class ConvolutionHelper; + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply(std::bind( + &Conv::saber_type>::trans_weights, + &_funcs_conv, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + weights.map_to_host(); + } + return Status::OK(); +} + +template class ConvolutionHelper; +template class ConvolutionHelper; +template class ConvolutionHelper; + +ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, NV, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, NV, AK_FLOAT, Precision::FP32); +#ifdef USE_X86_PLACE +INSTANCE_CONVOLUTION(X86, Precision::FP32); +template class ConvolutionHelper; +ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_CONVOLUTION(ARM, Precision::FP32); +template class ConvolutionHelper; +ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, ARM, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_CONVOLUTION(AMD, Precision::FP32); +template class ConvolutionHelper; +template class ConvolutionHelper; +template class ConvolutionHelper; +ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, AMD, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(Convolution) .Doc("Convolution operator") #ifdef USE_CUDA -.__alias__("convolution") +.__alias__("convolution") +#endif +#ifdef AMD_GPU +.__alias__("convolution") #endif #ifdef USE_ARM_PLACE -.__alias__("convolution") +.__alias__("convolution") #endif .num_in(1) .num_out(1) .Args("group", " group of conv ") .Args("bias_term", " whether conv weights have bias") .Args>("padding", "padding of conv (x, y)") - .Args>("strides", "strides of conv (x)") - .Args>("dilation_rate", "dilation rate of conv (x)") - .Args("filter_num", "filter(kernel) number of weights") - .Args>("kernel_size", "kernel size of kernel (x, y)") - .Args("axis", "axis of conv"); +.Args>("strides", "strides of conv (x)") +.Args>("dilation_rate", "dilation rate of conv (x)") +.Args("filter_num", "filter(kernel) number of weights") +.Args>("kernel_size", "kernel size of kernel (x, y)") +.Args("axis", "axis of conv"); } /* namespace ops */ diff --git a/framework/operators/convolution.h b/framework/operators/convolution.h index e929c0e11..3322eeeab 100644 --- a/framework/operators/convolution.h +++ b/framework/operators/convolution.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class ConvolutionHelper; /// pooling op @@ -34,20 +34,20 @@ class ConvolutionHelper; * \brief convlution operation class * public inheritance Operator */ -template -class Convolution : public Operator { +template +class Convolution : public Operator { public: Convolution() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Convolution< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConvolutionHelper; + friend class ConvolutionHelper; }; /** @@ -55,12 +55,12 @@ class Convolution : public Operator { * public inherit OperatorHelper * including init resource and shape size in convolution context */ -template -class ConvolutionHelper : public OperatorHelper { +template +class ConvolutionHelper : public OperatorHelper { public: ConvolutionHelper()=default; - ~ConvolutionHelper(); + ~ConvolutionHelper(){} Status InitParam() override; @@ -72,8 +72,8 @@ class ConvolutionHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,22 +81,20 @@ class ConvolutionHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv stand for convolution parameter - saber::ConvParam> _param_conv; + saber::ConvParam _param_conv; ///< _funcs_conv stand for convolution function - saber::Conv _funcs_conv; + saber::Conv::saber_type> _funcs_conv; private: ///< _dims stand for Convolution size PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/crf_decoding.cpp b/framework/operators/crf_decoding.cpp index 8b70ca068..24fee68e6 100644 --- a/framework/operators/crf_decoding.cpp +++ b/framework/operators/crf_decoding.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_X86_PLACE template<> -void CrfDecoding::operator()( +void CrfDecoding::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_crf_decoding; + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_crf_decoding; impl->_funcs_crf_decoding(ins, outs, param, ctx); } #endif @@ -20,80 +20,80 @@ void CrfDecoding::operator()( /// set helper -template -CrfDecodingHelper::~CrfDecodingHelper() { +template +CrfDecodingHelper::~CrfDecodingHelper() { } -template -Status CrfDecodingHelper::InitParam() { +template +Status CrfDecodingHelper::InitParam() { DLOG(WARNING) << "Parsing CrfDecoding op parameter."; - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - saber::CrfDecodingParam> crf_decoding_param(&(weights.d_tensor())); + saber::CrfDecodingParam crf_decoding_param(&(weights.d_tensor())); 
_param_crf_decoding = crf_decoding_param; return Status::OK(); } -template -Status CrfDecodingHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status CrfDecodingHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_crf_decoding.init(ins, outs, _param_crf_decoding, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status CrfDecodingHelper::InferShape( - const std::vector >& ins, - std::vector >& outs) { +template +Status CrfDecodingHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_crf_decoding.compute_output_shape(ins, outs, _param_crf_decoding)); return Status::OK(); } #ifdef USE_CUDA -template class CrfDecodingHelper; -template class CrfDecodingHelper; -template class CrfDecodingHelper; +template class CrfDecodingHelper; +template class CrfDecodingHelper; +template class CrfDecodingHelper; #endif #ifdef USE_ARM_PLACE -template class CrfDecodingHelper; -template class CrfDecodingHelper; -template class CrfDecodingHelper; +template class CrfDecodingHelper; +template class CrfDecodingHelper; +template class CrfDecodingHelper; #endif #ifdef USE_X86_PLACE -template class CrfDecodingHelper; -template class CrfDecodingHelper; -template class CrfDecodingHelper; +template class CrfDecodingHelper; +template class CrfDecodingHelper; +template class CrfDecodingHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, X86, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(CrfDecoding, 
CrfDecodingHelper, X86, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(CrfDecoding) .Doc("CrfDecoding operator") #ifdef USE_CUDA -.__alias__("CrfDecoding") +.__alias__("CrfDecoding") #endif #ifdef USE_ARM_PLACE -.__alias__("CrfDecoding") +.__alias__("CrfDecoding") #endif #ifdef USE_X86_PLACE -.__alias__("CrfDecoding") +.__alias__("CrfDecoding") #endif .num_in(1) .num_out(1); diff --git a/framework/operators/crf_decoding.h b/framework/operators/crf_decoding.h index 2bf966cc5..b6beaf2db 100644 --- a/framework/operators/crf_decoding.h +++ b/framework/operators/crf_decoding.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class CrfDecodingHelper; /// pooling op @@ -34,20 +34,18 @@ class CrfDecodingHelper; * \brief CrfDecoding operation class * public inheritance Operator */ -template -class CrfDecoding : public Operator { +template +class CrfDecoding : public Operator { public: CrfDecoding() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator CrfDecoding::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { } - friend class CrfDecodingHelper; + friend class CrfDecodingHelper; }; /** @@ -55,8 +53,8 @@ class CrfDecoding : public Operator { * public inherit OperatorHelper * including init resource and shape size in crf_decoding context */ -template -class CrfDecodingHelper : public OperatorHelper { +template +class CrfDecodingHelper : public OperatorHelper { public: CrfDecodingHelper()=default; @@ -72,8 +70,8 @@ class CrfDecodingHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - 
std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,14 +79,14 @@ class CrfDecodingHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_crf_decoding stand for CrfDecoding parameter - saber::CrfDecodingParam> _param_crf_decoding; + saber::CrfDecodingParam _param_crf_decoding; ///< _funcs_crf_decoding stand for CrfDecoding function - saber::CrfDecoding _funcs_crf_decoding; + saber::CrfDecoding::saber_type> _funcs_crf_decoding; private: ///< _dims stand for CrfDecoding size diff --git a/framework/operators/ctc_align.cpp b/framework/operators/ctc_align.cpp index 892493f92..1d0de9934 100644 --- a/framework/operators/ctc_align.cpp +++ b/framework/operators/ctc_align.cpp @@ -6,11 +6,11 @@ namespace ops { #ifdef USE_CUDA template<> -void CtcAlign::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_ctc_align; +void CtcAlign::operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_ctc_align; impl->_funcs_ctc_align(ins, outs, param, ctx); } #endif @@ -19,68 +19,68 @@ void CtcAlign::operator() (OpContext &ctx, /// set helper -template -CtcAlignHelper::~CtcAlignHelper() { +template +CtcAlignHelper::~CtcAlignHelper() { } -template -Status CtcAlignHelper::InitParam() { +template +Status CtcAlignHelper::InitParam() { DLOG(WARNING) << "Parsing CtcAlign op parameter."; auto merge_repeated = GET_PARAMETER(bool, merge_repeated); auto blank = GET_PARAMETER(int, blank); - CtcAlignParam> ctc_align_param(blank, merge_repeated); + 
CtcAlignParam ctc_align_param(blank, merge_repeated); _param_ctc_align = ctc_align_param; return Status::OK(); } -template -Status CtcAlignHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status CtcAlignHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_ctc_align.init(ins, outs, _param_ctc_align, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status CtcAlignHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { +template +Status CtcAlignHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_ctc_align.compute_output_shape(ins, outs, _param_ctc_align)); return Status::OK(); } #ifdef USE_CUDA -template class CtcAlignHelper; -template class CtcAlignHelper; -template class CtcAlignHelper; +template class CtcAlignHelper; +template class CtcAlignHelper; +template class CtcAlignHelper; #endif #ifdef USE_ARM_PLACE -template class CtcAlignHelper; -template class CtcAlignHelper; -template class CtcAlignHelper; +template class CtcAlignHelper; +template class CtcAlignHelper; +template class CtcAlignHelper; #endif -//template class CtcAlignHelper; -//template class CtcAlignHelper; -//template class CtcAlignHelper; +//template class CtcAlignHelper; +//template class CtcAlignHelper; +//template class CtcAlignHelper; // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(CtcAlign) .Doc("CtcAlign operator") #ifdef USE_CUDA - .__alias__("ctc_align") + .__alias__("ctc_align") #endif #ifdef USE_ARM_PLACE - .__alias__("ctc_align") + .__alias__("ctc_align") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/ctc_align.h b/framework/operators/ctc_align.h index 9f47b28c8..7fa849f8f 100644 --- a/framework/operators/ctc_align.h +++ b/framework/operators/ctc_align.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class CtcAlignHelper; @@ -35,20 +35,20 @@ class CtcAlignHelper; * \brief CtcAlign implementation class * public inherit Operator */ -template -class CtcAlign : public Operator { +template +class CtcAlign : public Operator { public: CtcAlign() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator CtcAlign::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator CtcAlign< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class CtcAlignHelper; + friend class CtcAlignHelper; }; /** @@ -56,8 +56,8 @@ class CtcAlign : public Operator { * public inherit OperatorHelper * including init resource and shape size in CtcAlign context */ -template -class CtcAlignHelper : public OperatorHelper { +template +class CtcAlignHelper : public OperatorHelper { public: CtcAlignHelper()=default; @@ -73,8 +73,8 @@ class CtcAlignHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer 
the shape of output and input. @@ -82,14 +82,14 @@ class CtcAlignHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_ctc_align stand for CtcAlign parameter - saber::CtcAlignParam> _param_ctc_align; + saber::CtcAlignParam _param_ctc_align; ///< _funcs_ctc_align stand for CtcAlign function - saber::CtcAlign _funcs_ctc_align; + saber::CtcAlign::saber_type> _funcs_ctc_align; }; } /* namespace ops */ diff --git a/framework/operators/deconvolution.cpp b/framework/operators/deconvolution.cpp index e219824fe..826c046de 100644 --- a/framework/operators/deconvolution.cpp +++ b/framework/operators/deconvolution.cpp @@ -4,29 +4,18 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Deconvolution::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_deconv; - impl->_funcs_deconv(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -DeconvolutionHelper::~DeconvolutionHelper() { +#define INSTANCE_DECONV(Ttype, Ptype) \ +template<> \ +void Deconvolution::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>(this->_helper)->_param_deconv; \ + impl->_funcs_deconv(ins, outs, param, ctx); \ } -template -Status DeconvolutionHelper::InitParam() { +template +Status DeconvolutionHelper::InitParam() { DLOG(WARNING) << "Parsing Deconvolution op parameter."; auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -37,19 +26,19 @@ Status DeconvolutionHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), &(bias.d_tensor())); _param_deconv = conv_param; } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], + Tensor4d* bias = new Tensor4d();; + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), bias); @@ -59,10 +48,28 @@ Status DeconvolutionHelper::InitParam() { return Status::OK(); } -template -Status DeconvolutionHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status DeconvolutionHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_deconv.init(ins, outs, _param_deconv, SPECIFY, SABER_IMPL, ctx)); + return 
Status::OK(); +} + +template +Status DeconvolutionHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_deconv.compute_output_shape(ins, outs, _param_deconv)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_DECONV(NV, Precision::FP32); +template<> +Status DeconvolutionHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector>& outs) { bool p = true; p = p && (_param_deconv.weight()->width() == 4); p = p && (_param_deconv.weight()->height() == 4); @@ -86,55 +93,37 @@ Status DeconvolutionHelper::Init(OpContext& ctx, return Status::OK(); } - -template -Status DeconvolutionHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_deconv.compute_output_shape(ins, outs, _param_deconv)); - return Status::OK(); -} - -#ifdef USE_CUDA -template class DeconvolutionHelper; -template class DeconvolutionHelper; -template class DeconvolutionHelper; -#endif - -#ifdef USE_ARM_PLACE -template class DeconvolutionHelper; -template class DeconvolutionHelper; -template class DeconvolutionHelper; -#endif - -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Deconvolution, DeconvolutionHelper, NV, AK_FLOAT, Precision::FP32); +template class DeconvolutionHelper; +template class DeconvolutionHelper; +template class DeconvolutionHelper; +ANAKIN_REGISTER_OP_HELPER(Deconvolution, DeconvolutionHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Deconvolution, DeconvolutionHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_DECONV(ARM, Precision::FP32); +template class DeconvolutionHelper; +ANAKIN_REGISTER_OP_HELPER(Deconvolution, DeconvolutionHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(Deconvolution) .Doc("Deconvolution operator") #ifdef USE_CUDA -.__alias__("deconvolution") +.__alias__("deconvolution") #endif #ifdef USE_ARM_PLACE -.__alias__("deconvolution") +.__alias__("deconvolution") #endif .num_in(1) .num_out(1) .Args("group", " group of conv ") .Args("bias_term", " whether conv weights have bias") .Args>("padding", "padding of conv (x, y)") - .Args>("strides", "strides of conv (x)") - .Args>("dilation_rate", "dilation rate of conv (x)") - .Args("filter_num", "filter(kernel) number of weights") - .Args>("kernel_size", "kernel size of kernel (x, y)") - .Args("axis", "axis of conv"); +.Args>("strides", "strides of conv (x)") +.Args>("dilation_rate", "dilation rate of conv (x)") +.Args("filter_num", "filter(kernel) number of weights") +.Args>("kernel_size", "kernel size of kernel (x, y)") +.Args("axis", "axis of conv"); } /* namespace ops */ diff --git a/framework/operators/deconvolution.h b/framework/operators/deconvolution.h index 5c8b46b4e..d95865965 100644 --- a/framework/operators/deconvolution.h +++ b/framework/operators/deconvolution.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class DeconvolutionHelper; /// pooling op @@ -34,20 +34,20 @@ class DeconvolutionHelper; * \brief Deconvolution operation class * public inheritance Operator */ -template -class Deconvolution : public Operator { +template +class Deconvolution : public Operator { public: Deconvolution() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Deconvolution< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class DeconvolutionHelper; + friend class DeconvolutionHelper; }; /** @@ -55,12 +55,12 @@ class Deconvolution : public Operator { * public inherit OperatorHelper * including init resource and shape size in deconvolution context */ -template -class DeconvolutionHelper : public OperatorHelper { +template +class DeconvolutionHelper : public OperatorHelper { public: DeconvolutionHelper()=default; - ~DeconvolutionHelper(); + ~DeconvolutionHelper(){} Status InitParam() override; @@ -72,8 +72,8 @@ class DeconvolutionHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief initial all the resource needed by pooling @@ -82,22 +82,20 @@ class DeconvolutionHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_deconv stand for deconvolution parameter - saber::ConvParam> _param_deconv; + saber::ConvParam _param_deconv; ///< _funcs_deconv stand for deconvolution function - 
saber::Deconv _funcs_deconv; + saber::Deconv::saber_type> _funcs_deconv; private: ///< _dims stand for batchNorm size PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/deformconvolution.cpp b/framework/operators/deformconvolution.cpp index 1032b2587..4752b5743 100644 --- a/framework/operators/deformconvolution.cpp +++ b/framework/operators/deformconvolution.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void DeformConvolution::operator()( +void DeformConvolution::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> (this->_helper)->_param_deform_conv; impl->_funcs_deform_conv(ins, outs, param, ctx); } @@ -21,12 +21,12 @@ void DeformConvolution::operator()( /// set helper -template -DeformConvolutionHelper::~DeformConvolutionHelper() { +template +DeformConvolutionHelper::~DeformConvolutionHelper() { } -template -Status DeformConvolutionHelper::InitParam() { +template +Status DeformConvolutionHelper::InitParam() { DLOG(WARNING) << "Parsing DeformConvolution op parameter."; auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -37,19 +37,19 @@ Status DeformConvolutionHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::DeformableConvParam> deform_conv_param(group, padding[0], padding[1], + saber::DeformableConvParam deform_conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), &(bias.d_tensor())); 
_param_deform_conv = deform_conv_param; } else { - Tensor4d* bias = new Tensor4d();; - saber::DeformableConvParam> deform_conv_param(group, padding[0], padding[1], + Tensor4d* bias = new Tensor4d();; + saber::DeformableConvParam deform_conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), bias); @@ -59,53 +59,51 @@ Status DeformConvolutionHelper::InitParam() { return Status::OK(); } -template -Status DeformConvolutionHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status DeformConvolutionHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_deform_conv.init(ins, outs, _param_deform_conv, SPECIFY, VENDER_IMPL, ctx)); return Status::OK(); } -template -Status DeformConvolutionHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status DeformConvolutionHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_deform_conv.compute_output_shape(ins, outs, _param_deform_conv)); return Status::OK(); } #ifdef USE_CUDA -template class DeformConvolutionHelper; -template class DeformConvolutionHelper; -template class DeformConvolutionHelper; +template class DeformConvolutionHelper; +template class DeformConvolutionHelper; +template class DeformConvolutionHelper; #endif #ifdef USE_ARM_PLACE -template class DeformConvolutionHelper; -template class DeformConvolutionHelper; -template class DeformConvolutionHelper; +template class DeformConvolutionHelper; +template class DeformConvolutionHelper; +template class DeformConvolutionHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, NV, AK_FLOAT, - Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(DeformConvolution, 
DeformConvolutionHelper, ARM, AK_FLOAT, - Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(DeformConvolution) .Doc("DeformConvolution operator") #ifdef USE_CUDA -.__alias__("deformable_convolution") +.__alias__("deformable_convolution") #endif #ifdef USE_ARM_PLACE -.__alias__("defromable_convolution") +.__alias__("defromable_convolution") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/deformconvolution.h b/framework/operators/deformconvolution.h index 967d3a144..ac5ae681a 100644 --- a/framework/operators/deformconvolution.h +++ b/framework/operators/deformconvolution.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class DeformConvolutionHelper; /// pooling op @@ -34,20 +34,20 @@ class DeformConvolutionHelper; * \brief DeformConvolution operation class * public inheritance Operator */ -template -class DeformConvolution : public Operator { +template +class DeformConvolution : public Operator { public: DeformConvolution() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator DeformConvolution::value << "), Precision("<< Ptype <<") >"; } - friend class DeformConvolutionHelper; + friend class DeformConvolutionHelper; }; /** @@ -55,8 +55,8 @@ class DeformConvolution : public Operator { * public inherit OperatorHelper * including init resource and shape size in deformconvolution context */ -template -class DeformConvolutionHelper : public 
OperatorHelper { +template +class DeformConvolutionHelper : public OperatorHelper { public: DeformConvolutionHelper()=default; @@ -72,8 +72,8 @@ class DeformConvolutionHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,14 +81,14 @@ class DeformConvolutionHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_deform_conv stand for deformconvolution parameter - saber::DeformableConvParam> _param_deform_conv; + saber::DeformableConvParam _param_deform_conv; ///< _funcs_deform_conv stand for deformconvolution function - saber::DeformableConv _funcs_deform_conv; + saber::DeformableConv::saber_type> _funcs_deform_conv; private: ///< _dims stand for batchNorm size diff --git a/framework/operators/dense.cpp b/framework/operators/dense.cpp index 8fc7242a9..ca2ff9895 100644 --- a/framework/operators/dense.cpp +++ b/framework/operators/dense.cpp @@ -4,122 +4,111 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Dense::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_dense; - impl->_funcs_dense(ins, outs, param, ctx); +#define INSTANCE_DENSE(Ttype, Ptype) \ +template<> \ +void Dense::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>(this->_helper)->_param_dense; \ + SABER_CHECK(impl->_funcs_dense(ins, outs, param, ctx)); \ } -#endif -#ifdef USE_X86_PLACE -template<> -void Dense::operator()( - 
OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_dense; - impl->_funcs_dense(ins, outs, param, ctx); -} -#endif - -/// TODO ... specialization other type of operator - - -/// set helper -template -DenseHelper::~DenseHelper() { -} - -template -Status DenseHelper::InitParam() { +template +Status DenseHelper::InitParam() { DLOG(WARNING) << "Parsing Dense op parameter."; auto axis = GET_PARAMETER(int, axis); auto out_dim = GET_PARAMETER(int, out_dim); auto bias_term = GET_PARAMETER(bool, bias_term); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::FcParam> fc_param(&(weights.d_tensor()), &(bias.d_tensor()), out_dim, + saber::FcParam fc_param(&(weights.d_tensor()), &(bias.d_tensor()), out_dim, axis); _param_dense = fc_param; } else { - Tensor4d* bias = nullptr; - saber::FcParam> fc_param(&(weights.d_tensor()), bias, out_dim, axis); + Tensor4d* bias = nullptr; + saber::FcParam fc_param(&(weights.d_tensor()), bias, out_dim, axis); _param_dense = fc_param; } return Status::OK(); } -template -Status DenseHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status DenseHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_dense.init(ins, outs, _param_dense, STATIC, VENDER_IMPL, ctx)); return Status::OK(); } -template -Status DenseHelper::InferShape(const std::vector >& +template +Status DenseHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { + std::vector >& outs) { SABER_CHECK(_funcs_dense.compute_output_shape(ins, outs, _param_dense)); return Status::OK(); } #ifdef USE_CUDA -template class DenseHelper; -template class DenseHelper; -template class DenseHelper; +INSTANCE_DENSE(NV, Precision::FP32); 
+template class DenseHelper; +ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, NV, Precision::FP32); +template class DenseHelper; +template class DenseHelper; #endif #ifdef USE_ARM_PLACE -template class DenseHelper; -template class DenseHelper; -template class DenseHelper; +INSTANCE_DENSE(ARM, Precision::FP32); +template<> +Status DenseHelper::Init(OpContext &ctx,\ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_dense.init(ins, outs, _param_dense, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE -template class DenseHelper; -template class DenseHelper; -template class DenseHelper; +INSTANCE_DENSE(X86, Precision::FP32); +template class DenseHelper; +ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, X86, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, NV, AK_FLOAT, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, ARM, AK_FLOAT, Precision::FP32); -#endif - -#ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, X86, AK_FLOAT, Precision::FP32); +#ifdef AMD_GPU +INSTANCE_DENSE(AMD, Precision::FP32); +template<> +Status DenseHelper::Init(OpContext &ctx,\ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_dense.init(ins, outs, _param_dense, SPECIFY, VENDER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, AMD, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(Dense) .Doc("Dense operator") #ifdef USE_CUDA -.__alias__("fullconnect") -.__alias__("fc") +.__alias__("fullconnect") +.__alias__("fc") #endif #ifdef USE_ARM_PLACE -.__alias__("fullconnect") -.__alias__("fc") +.__alias__("fullconnect") +.__alias__("fc") #endif #ifdef USE_X86_PLACE -.__alias__("fullconnect") -.__alias__("fc") +.__alias__("fullconnect") +.__alias__("fc") +#endif +#ifdef AMD_GPU +.__alias__("fullconnect") +.__alias__("fc") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/dense.h b/framework/operators/dense.h index 9d551b127..7f60108ac 100644 --- a/framework/operators/dense.h +++ b/framework/operators/dense.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class DenseHelper; /// pooling op @@ -34,20 +34,20 @@ class DenseHelper; * \brief Dense operation class * public inheritance Operator */ -template -class Dense : public Operator { +template +class Dense : public Operator { public: Dense() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Dense::value << "), Precision("<< Ptype <<") >"; } - friend class DenseHelper; + friend class DenseHelper; }; /** @@ -55,12 +55,12 @@ class Dense : public Operator { * public inherit OperatorHelper * including init resource and shape size in dense context */ -template -class DenseHelper : public OperatorHelper { +template +class DenseHelper : public OperatorHelper { public: DenseHelper()=default; - ~DenseHelper(); + ~DenseHelper() {} Status InitParam() 
override; @@ -72,8 +72,8 @@ class DenseHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,22 +81,20 @@ class DenseHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_dense stand for Dense parameter - saber::FcParam> _param_dense; + saber::FcParam _param_dense; ///< _funcs_dense stand for Dense function - saber::Fc _funcs_dense; + saber::Fc::saber_type> _funcs_dense; private: ///< _dims stand for Dense size PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/detection_output.cpp b/framework/operators/detection_output.cpp index 10e3ec3ff..ff1722e05 100644 --- a/framework/operators/detection_output.cpp +++ b/framework/operators/detection_output.cpp @@ -4,28 +4,18 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void DetectionOutput::operator()(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_detection_output; - impl->_funcs_detection_output(ins, outs, param, ctx); +#define INSTANCE_DETECTIONOUTPUT(Ttype, Ptype) \ +template<> \ +void DetectionOutput::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>(this->_helper)->_param_detection_output; \ + impl->_funcs_detection_output(ins, outs, param, ctx); \ } -#endif - -/// TODO ... 
specialization other type of operator - -/// set helper -template -DetectionOutputHelper::~DetectionOutputHelper() { -} - -template -Status DetectionOutputHelper::InitParam() { +template +Status DetectionOutputHelper::InitParam() { DLOG(WARNING) << "Parsing Detectionoutput op parameter."; auto flag_share_location = GET_PARAMETER(bool, share_location); auto flag_var_in_target = GET_PARAMETER(bool, variance_encode_in_target); @@ -47,57 +37,59 @@ Status DetectionOutputHelper::InitParam() { } else if (code_type_ == "CORNER_SIZE") { code_type = CORNER_SIZE; } else { - LOG(FATAL) << "unsupport type: " << code_type_; + LOG(FATAL) << "unsupport type: " << code_type_; } - DetectionOutputParam> param_det(classes_num, background_id_, \ + DetectionOutputParam param_det(classes_num, background_id_, \ keep_top_k_, nms_top_k_, nms_thresh_, conf_thresh_, \ flag_share_location, flag_var_in_target, code_type, nms_eta_); _param_detection_output = param_det; return Status::OK(); } -template -Status DetectionOutputHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status DetectionOutputHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_detection_output.init(ins, outs, _param_detection_output, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status DetectionOutputHelper::InferShape(\ - const std::vector >& ins, - std::vector >& outs) { +template +Status DetectionOutputHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_detection_output.compute_output_shape(ins, outs, _param_detection_output)); return Status::OK(); } #ifdef USE_CUDA -template class DetectionOutputHelper; -template class DetectionOutputHelper; -template class DetectionOutputHelper; +INSTANCE_DETECTIONOUTPUT(NV, Precision::FP32); +template class DetectionOutputHelper; +ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, NV, Precision::FP32); #endif -#ifdef USE_ARM_PLACE 
-template class DetectionOutputHelper; -template class DetectionOutputHelper; -template class DetectionOutputHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, NV, AK_FLOAT, Precision::FP32); + +#ifdef USE_X86_PLACE +INSTANCE_DETECTIONOUTPUT(X86, Precision::FP32); +template class DetectionOutputHelper; +ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, X86, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_DETECTIONOUTPUT(ARM, Precision::FP32); +template class DetectionOutputHelper; +ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, ARM, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(DetectionOutput) .Doc("DetectionOutput operator") #ifdef USE_CUDA -.__alias__("detectionoutput") +.__alias__("detectionoutput") #endif #ifdef USE_ARM_PLACE -.__alias__("detectionoutput") +.__alias__("detectionoutput") +#endif +#ifdef USE_X86_PLACE +.__alias__("detectionoutput") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/detection_output.h b/framework/operators/detection_output.h index 633fae93a..2c82e82d9 100644 --- a/framework/operators/detection_output.h +++ b/framework/operators/detection_output.h @@ -25,51 +25,49 @@ namespace anakin { namespace ops { -template +template class DetectionOutputHelper; //! DetectionOutput op -template -class DetectionOutput : public Operator { +template +class DetectionOutput : public Operator { public: DetectionOutput() {} //! 
forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - //LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator DetectionOutput::value << "), Precision("<< Ptype <<") >"; } - friend class DetectionOutputHelper; + friend class DetectionOutputHelper; }; -template -class DetectionOutputHelper : public OperatorHelper { +template +class DetectionOutputHelper : public OperatorHelper { public: DetectionOutputHelper()=default; - ~DetectionOutputHelper(); + ~DetectionOutputHelper(){} Status InitParam() override; //! initial all the resource needed by pooling Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; //! infer the shape of output and input. - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: - saber::DetectionOutputParam> _param_detection_output; - saber::DetectionOutput _funcs_detection_output; + saber::DetectionOutputParam _param_detection_output; + saber::DetectionOutput::saber_type> _funcs_detection_output; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/eltwise_op.cpp b/framework/operators/eltwise_op.cpp index 5dd8953dc..6cd393e12 100644 --- a/framework/operators/eltwise_op.cpp +++ b/framework/operators/eltwise_op.cpp @@ -4,29 +4,20 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Eltwise::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_eltwise; - impl->_funcs_eltwise(ins, outs, param, ctx); +#define INSTANCE_ELTWISE(Ttype, Ptype) \ +template<> \ +void Eltwise::operator()(OpContext& ctx, \ + const 
std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_eltwise; \ + impl->_funcs_eltwise(ins, outs, param, ctx); \ } -#endif - -/// TODO ... specialization other type of operator -/// set helper -template -EltwiseHelper::~EltwiseHelper() { -} - -template -Status EltwiseHelper::InitParam() { +template +Status EltwiseHelper::InitParam() { DLOG(WARNING) << "Parsing Eltwise op parameter."; auto type = GET_PARAMETER(std::string, type); auto coeff = GET_PARAMETER(PTuple, coeff); @@ -39,73 +30,61 @@ Status EltwiseHelper::InitParam() { } else { elt_type = Eltwise_prod; } - - // Shape shape_coeff(1, 1, 1, coeff.size()); - // Tensor thcoeff(shape_coeff); - // for (int i = 0; i < thcoeff.size(); ++i) { - // thcoeff.mutable_data()[i] = coeff[i]; - // } - // Tensor4d * tdcoeff_p = new Tensor4d(); - // tdcoeff_p->re_alloc(shape_coeff); - // tdcoeff_p->copy_from(thcoeff); - - // saber::EltwiseParam> eltwise_param(elt_type, tdcoeff_p); - saber::EltwiseParam > eltwise_param(elt_type, coeff.vector()); + saber::EltwiseParam eltwise_param(elt_type, coeff.vector()); _param_eltwise = eltwise_param; return Status::OK(); } -template -Status EltwiseHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status EltwiseHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_eltwise.init(ins, outs, _param_eltwise, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status EltwiseHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status EltwiseHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_eltwise.compute_output_shape(ins, outs, _param_eltwise)); return Status::OK(); } #ifdef USE_CUDA -template class EltwiseHelper; -template class EltwiseHelper; -template class EltwiseHelper; +INSTANCE_ELTWISE(NV, Precision::FP32); +template class 
EltwiseHelper; +ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, NV, Precision::FP32); #endif -#ifdef USE_ARM_PLACE -template class EltwiseHelper; -template class EltwiseHelper; -template class EltwiseHelper; -#endif - -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, NV, AK_FLOAT, Precision::FP32); +#ifdef USE_X86_PLACE +INSTANCE_ELTWISE(X86, Precision::FP32); +template class EltwiseHelper; +ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_ELTWISE(ARM, Precision::FP32); +template class EltwiseHelper; +ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(Eltwise) .Doc("Eltwise operator") #ifdef USE_CUDA -.__alias__("eltwise") +.__alias__("eltwise") #endif #ifdef USE_ARM_PLACE -.__alias__("eltwise") +.__alias__("eltwise") +#endif +#ifdef USE_X86_PLACE +.__alias__("eltwise") #endif .num_in(1) .num_out(1) .Args("type", " eltwise type( string )") .Args>("coeff", "coeff of eltwise"); + } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/eltwise_op.h b/framework/operators/eltwise_op.h index c17bc9a4e..4982d78fc 100644 --- a/framework/operators/eltwise_op.h +++ b/framework/operators/eltwise_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class EltwiseHelper; /// pooling op @@ -34,20 +34,20 @@ class EltwiseHelper; * \brief Eltwise implementation class * public inherit Operator */ -template -class Eltwise : public Operator { +template +class Eltwise : public Operator { public: Eltwise() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Eltwise< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class EltwiseHelper; + friend class EltwiseHelper; }; /** @@ -55,12 +55,12 @@ class Eltwise : public Operator { * public inherit OperatorHelper * including init resource and shape size in Eltwise context */ -template -class EltwiseHelper : public OperatorHelper { +template +class EltwiseHelper : public OperatorHelper { public: EltwiseHelper()=default; - ~EltwiseHelper(); + ~EltwiseHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class EltwiseHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,22 +81,20 @@ class EltwiseHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_eltwise stand for Eltwise parameter - saber::EltwiseParam> _param_eltwise; + saber::EltwiseParam _param_eltwise; ///< _funcs_eltwise stand for Eltwise function - saber::Eltwise _funcs_eltwise; + saber::Eltwise::saber_type> _funcs_eltwise; private: ///< _dims stand for Eltwise size PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/embedding.cpp b/framework/operators/embedding.cpp index 30f4db159..943c81929 100644 --- a/framework/operators/embedding.cpp +++ b/framework/operators/embedding.cpp @@ -6,28 +6,28 @@ namespace ops { #ifdef USE_CUDA template<> -void Embedding::operator()( +void Embedding::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { + const std::vector >& ins, + std::vector >& outs) { auto* impl = - static_cast*>(this->_helper); + static_cast*>(this->_helper); auto& param = - static_cast*>(this->_helper)->_param_embedding; + static_cast*>(this->_helper)->_param_embedding; impl->_funcs_embedding(ins, outs, param, ctx); } #endif #ifdef USE_X86_PLACE template<> -void Embedding::operator()( +void Embedding::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { + const std::vector >& ins, + std::vector >& outs) { auto* impl = - static_cast*>(this->_helper); + static_cast*>(this->_helper); auto& param = - static_cast*>(this->_helper)->_param_embedding; + static_cast*>(this->_helper)->_param_embedding; impl->_funcs_embedding(ins, outs, param, ctx); } #endif @@ -36,77 +36,77 @@ void Embedding::operator()( /// set helper -template -EmbeddingHelper::~EmbeddingHelper() { +template +EmbeddingHelper::~EmbeddingHelper() { } -template -Status 
EmbeddingHelper::InitParam() { +template +Status EmbeddingHelper::InitParam() { DLOG(WARNING) << "Parsing Embedding op parameter."; auto word_num = GET_PARAMETER(int, word_num); auto emb_dim = GET_PARAMETER(int, emb_dim); auto padding_idx = GET_PARAMETER(int, padding_idx); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - EmbeddingParam> param_embedding(word_num, emb_dim, padding_idx, &(weights.d_tensor())); + EmbeddingParam param_embedding(word_num, emb_dim, padding_idx, &(weights.d_tensor())); _param_embedding = param_embedding; return Status::OK(); } -template -Status EmbeddingHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status EmbeddingHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_embedding.init(ins, outs, _param_embedding, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status EmbeddingHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status EmbeddingHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_embedding.compute_output_shape(ins, outs, _param_embedding)); return Status::OK(); } #ifdef USE_CUDA -template class EmbeddingHelper; -template class EmbeddingHelper; -template class EmbeddingHelper; +template class EmbeddingHelper; +template class EmbeddingHelper; +template class EmbeddingHelper; #endif #ifdef USE_ARM_PLACE -template class EmbeddingHelper; -template class EmbeddingHelper; -template class EmbeddingHelper; +template class EmbeddingHelper; +template class EmbeddingHelper; +template class EmbeddingHelper; #endif #ifdef USE_X86_PLACE -template class EmbeddingHelper; -template class EmbeddingHelper; -template class EmbeddingHelper; +template class EmbeddingHelper; +template class EmbeddingHelper; +template class EmbeddingHelper; #endif // register helper #ifdef USE_CUDA 
-ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, X86, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, X86, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(Embedding) .Doc("Embedding operator") #ifdef USE_CUDA -.__alias__("embedding") +.__alias__("embedding") #endif #ifdef USE_ARM_PLACE -.__alias__("embedding") +.__alias__("embedding") #endif #ifdef USE_X86_PLACE -.__alias__("embedding") +.__alias__("embedding") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/embedding.h b/framework/operators/embedding.h index 23cebf261..f8455f07c 100644 --- a/framework/operators/embedding.h +++ b/framework/operators/embedding.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class EmbeddingHelper; /// pooling op @@ -34,20 +34,20 @@ class EmbeddingHelper; * \brief operation of ops class * public inheritance Operator */ -template -class Embedding : public Operator { +template +class Embedding : public Operator { public: Embedding() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Embedding< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class EmbeddingHelper; + friend class EmbeddingHelper; }; /** @@ -55,8 +55,8 @@ class Embedding : public Operator { * public inheritance OperatorHelper * including init operation context and the size of shape */ -template -class EmbeddingHelper : public OperatorHelper { +template +class EmbeddingHelper : public OperatorHelper { public: EmbeddingHelper()=default; @@ -72,8 +72,8 @@ class EmbeddingHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class EmbeddingHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_embedding stand for embedding parameter - saber::EmbeddingParam> _param_embedding; + saber::EmbeddingParam _param_embedding; ///< _funcs_embedding stand for embedding function - saber::Embedding _funcs_embedding; + saber::Embedding::saber_type> _funcs_embedding; }; diff --git a/framework/operators/flatten.cpp b/framework/operators/flatten.cpp index af25b198f..cd4a9b22c 100644 --- a/framework/operators/flatten.cpp +++ b/framework/operators/flatten.cpp @@ -4,84 +4,70 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Flatten::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_flatten; - impl->_funcs_flatten(ins, outs, param, ctx); +#define INSTANCE_FLATTEN(Ttype, Ptype) \ +template<> \ +void Flatten::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_flatten; \ + impl->_funcs_flatten(ins, outs, param, ctx); \ } -#endif - -/// TODO ... 
specialization other type of operator - -/// set helper -template -FlattenHelper::~FlattenHelper() { -} - -template -Status FlattenHelper::InitParam() { +template +Status FlattenHelper::InitParam() { DLOG(WARNING) << "Parsing Flatten op parameter."; auto start_axis = GET_PARAMETER(int, start_axis); auto end_axis = GET_PARAMETER(int, end_axis); - saber::FlattenParam> flatten_param; + saber::FlattenParam flatten_param; _param_flatten = flatten_param; return Status::OK(); } -template -Status FlattenHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status FlattenHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_flatten.init(ins, outs, _param_flatten, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } - -template -Status FlattenHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status FlattenHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_flatten.compute_output_shape(ins, outs, _param_flatten)); return Status::OK(); } - #ifdef USE_CUDA -template class FlattenHelper; -template class FlattenHelper; -template class FlattenHelper; -#endif - -#ifdef USE_ARM_PLACE -template class FlattenHelper; -template class FlattenHelper; -template class FlattenHelper; +INSTANCE_FLATTEN(NV, Precision::FP32); +template class FlattenHelper; +ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, NV, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, NV, AK_FLOAT, Precision::FP32); +#ifdef USE_X86_PLACE +INSTANCE_FLATTEN(X86, Precision::FP32); +template class FlattenHelper; +ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_FLATTEN(ARM, Precision::FP32); +template class FlattenHelper; +ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, 
ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(Flatten) .Doc("Flatten operator") #ifdef USE_CUDA -.__alias__("flatten") +.__alias__("flatten") #endif #ifdef USE_ARM_PLACE -.__alias__("flatten") +.__alias__("flatten") +#endif +#ifdef USE_X86_PLACE +.__alias__("flatten") #endif .num_in(1) .num_out(1); diff --git a/framework/operators/flatten.h b/framework/operators/flatten.h index eb4e650d7..9a163b3fb 100644 --- a/framework/operators/flatten.h +++ b/framework/operators/flatten.h @@ -26,54 +26,52 @@ namespace anakin { namespace ops { -template +template class FlattenHelper; //! pooling op -template -class Flatten : public Operator { +template +class Flatten : public Operator { public: Flatten() {} //! forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator flatten::type>().type_info() << ">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Flatten< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class FlattenHelper; + friend class FlattenHelper; }; -template -class FlattenHelper : public OperatorHelper { +template +class FlattenHelper : public OperatorHelper { public: FlattenHelper()=default; - ~FlattenHelper(); + ~FlattenHelper() {} Status InitParam() override; //! initial all the resource needed by pooling Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; //! infer the shape of output and input. 
- Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: - saber::FlattenParam> _param_flatten; - saber::Flatten _funcs_flatten; + saber::FlattenParam _param_flatten; + saber::Flatten::saber_type> _funcs_flatten; private: PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/fusion_ops/batchnorm_scale.cpp b/framework/operators/fusion_ops/batchnorm_scale.cpp new file mode 100644 index 000000000..b6cfe96bd --- /dev/null +++ b/framework/operators/fusion_ops/batchnorm_scale.cpp @@ -0,0 +1,140 @@ +#include "framework/operators/fusion_ops/batchnorm_scale.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_BATCHNORMSCALE(Ttype, Ptype) \ +template<> \ +void BatchnormScale::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>(this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_scale;\ + SABER_CHECK(impl->_funcs_scale(ins, outs, param, ctx));\ +} + +template +Status BatchnormScaleHelper::InitParam() { + using pblock_type = PBlock; + LOG(WARNING) << "Parsing BatchnormScale op parameter."; + + // get batchnorm param + auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); + auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); + auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, batchnorm_0_weight_1); + auto mean = batch_norm_weight_1.vector(); + auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, batchnorm_0_weight_2); + auto var = batch_norm_weight_2.vector(); + auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); + auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); + + // get scale param + auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); + auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); + auto scale_axis = GET_PARAMETER(int, scale_0_axis); + auto 
scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); + auto scale = scale_weight_1.vector(); + auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); + auto shift = scale_weight_2.vector(); + + CHECK_EQ(mean.size(), var.size()); + CHECK_EQ(mean.size(), scale.size()); + if (scale_bias_term){ + CHECK_EQ(mean.size(), shift.size()); + } + + auto new_scale = mean; + auto new_shift = var; + auto scale_factor = batch_norm_weight_3_vector[0]; + for (int i = 0; i < mean.size(); i++) { + auto alpha = 1 / sqrtf(var[i] * scale_factor + epsilon); + auto beta = -alpha * mean[i] * scale_factor; + new_scale[i] = alpha * scale[i]; + new_shift[i] = beta * scale[i]; + + if (scale_bias_term) { + new_shift[i] += shift[i]; + } + } + + saber::ScaleParam scale_param(new_scale, + new_shift, + scale_bias_term, + scale_axis, + scale_num_axes); + + _param_scale = scale_param; + + + return Status::OK(); +} + +template +Status BatchnormScaleHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_scale.init(ins, outs, \ + _param_scale, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status BatchnormScaleHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_scale.compute_output_shape(ins, outs, \ + _param_scale)); + return Status::OK(); +} + +#ifdef USE_ARM_PLACE +INSTANCE_BATCHNORMSCALE(ARM, Precision::FP32); +template class BatchnormScaleHelper; +ANAKIN_REGISTER_OP_HELPER(BatchnormScale, BatchnormScaleHelper, ARM, Precision::FP32); +#endif + +#ifdef USE_CUDA +INSTANCE_BATCHNORMSCALE(NV, Precision::FP32); +template<> +Status BatchnormScaleHelper::Init(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + _funcs_scale.init(ins, outs, _param_scale, SPECIFY, VENDER_IMPL, ctx); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(BatchnormScale, BatchnormScaleHelper, NV, Precision::FP32); +#endif +//#ifdef USE_X86_PLACE 
+//INSTANCE_CONVBATCHNORMSCALE(X86, Precision::FP32); +//template class BatchnormScaleHelper; +//ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, BatchnormScaleHelper, X86, AK_FLOAT, +// Precision::FP32); +//#endif + + +//! register op +ANAKIN_REGISTER_OP(BatchnormScale) +.Doc("BatchnormScale fusion operator") +#ifdef USE_CUDA +.__alias__("batchnorm_scale") +#endif +#ifdef USE_ARM_PLACE +.__alias__("batchnorm_scale") +#endif +.num_in(1) +.num_out(1) +.Args("axis", "axis of conv") +.Args("scale_0_num_axes", " num axes for scale") +.Args("scale_0_bias_term", "whether scale has bias") +.Args("scale_0_axis", "axis for scale") +.Args("batchnorm_0_epsilon", "epsilon for batchnorm") +.Args("batchnorm_0_momentum", "momentum for batchnorm"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/fusion_ops/batchnorm_scale.h b/framework/operators/fusion_ops/batchnorm_scale.h new file mode 100644 index 000000000..4b6cd5adb --- /dev/null +++ b/framework/operators/fusion_ops/batchnorm_scale.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_BATCHNORM_SCALE_H +#define ANAKIN_OPERATOR_BATCHNORM_SCALE_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/scale.h" + +namespace anakin { + +namespace ops { + +template +class BatchnormScaleHelper; + +/// pooling op +/** + * \brief BatchnormScaleHelper implementation class + * public inherit Operator + */ +template +class BatchnormScale : public Operator { +public: + BatchnormScale() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator BatchnormScale< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class BatchnormScaleHelper; +}; + +/** + * \brief BatchnormScale helper class to implement it + * public inherit OperatorHelper + * including init resource and shape size in BatchnormScaleHelper context + */ +template +class BatchnormScaleHelper : public OperatorHelper { +public: + BatchnormScaleHelper()=default; + + ~BatchnormScaleHelper() {} + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for BatchnormScale operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_conv_batchnorm_scale stand for BatchnormScale parameter + saber::ScaleParam _param_scale; + ///< _funcs_conv stand for BatchnormScale function + saber::Scale::saber_type> _funcs_scale; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp new file mode 100644 index 000000000..0b41f5b74 --- /dev/null +++ b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp @@ -0,0 +1,216 @@ +#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void SassConvBatchnormScale::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*> + (this->_helper); + auto& param = static_cast*> + (this->_helper)->_param_conv_batchnorm_scale; + impl->_funcs_conv_batchnorm_scale(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator + + +/// set helper +template +SassConvBatchnormScaleHelper::~SassConvBatchnormScaleHelper() { +} + +template +Status SassConvBatchnormScaleHelper::InitParam() { + LOG(WARNING) << "Parsing SassConvBatchnormScale op parameter."; + + // get conv param + auto group = GET_PARAMETER(int, group); + auto bias_term = GET_PARAMETER(bool, bias_term); + auto padding = GET_PARAMETER(PTuple, padding); + auto strides = GET_PARAMETER(PTuple, strides); + auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); + auto filter_num = GET_PARAMETER(int, filter_num); + auto kernel_size = GET_PARAMETER(PTuple, kernel_size); + auto axis = GET_PARAMETER(int, axis); + + + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + auto weights_shape = weights.shape(); + + // get batchnorm param + auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); + auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); + auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, batchnorm_0_weight_1); + auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); + auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, batchnorm_0_weight_2); + auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); + auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); + auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); + + // get scale param + auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); + auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); + auto scale_axis = GET_PARAMETER(int, scale_0_axis); + auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); + auto scale_weight_1_vector = scale_weight_1.vector(); + auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); + auto scale_weight_2_vector = scale_weight_2.vector(); + + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if(!is_param_updated) 
{ + SET_PARAMETER(is_param_updated, true, bool); + if(bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + graph::GraphGlobalMem::Global().template apply(update_weights, + weights,bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + _param_conv_batchnorm_scale = conv_param; + } else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + + graph::GraphGlobalMem::Global().template apply(update_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor())); + _param_conv_batchnorm_scale = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + _param_conv_batchnorm_scale = conv_param; + + } + + return Status::OK(); +} + +template +Status SassConvBatchnormScaleHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + 
_funcs_conv_batchnorm_scale.init(ins, outs, _param_conv_batchnorm_scale, SPECIFY, SABER_IMPL, ctx); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } + return Status::OK(); +} + +template +Status SassConvBatchnormScaleHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + _funcs_conv_batchnorm_scale.compute_output_shape(ins, outs, _param_conv_batchnorm_scale); + return Status::OK(); +} + +#ifdef USE_CUDA +template class SassConvBatchnormScaleHelper; +template class SassConvBatchnormScaleHelper; +template class SassConvBatchnormScaleHelper; +#endif + +#ifdef USE_ARM_PLACE +template class SassConvBatchnormScaleHelper; +template class SassConvBatchnormScaleHelper; +template class SassConvBatchnormScaleHelper; +#endif + +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, NV, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, ARM, Precision::FP32); +#endif + +//! 
register op +ANAKIN_REGISTER_OP(SassConvBatchnormScale) +.Doc("SassConvBatchnormScale fusion operator") +#ifdef USE_CUDA +.__alias__("convolution_batchnorm_scale_relu") +#endif +#ifdef USE_ARM_PLACE +.__alias__("convolution_batchnorm_scale_relu") +#endif +.num_in(1) +.num_out(1) +.Args("group", " group of conv ") +.Args("bias_term", " whether conv weights have bias") +.Args>("padding", "padding of conv (x, y)") + .Args>("strides", "strides of conv (x)") + .Args>("dilation_rate", "dilation rate of conv (x)") + .Args("filter_num", "filter(kernel) number of weights") + .Args>("kernel_size", "kernel size of kernel (x, y)") + .Args("axis", "axis of conv") + .Args("relu_0_alpha", " alpha for relu") + .Args("scale_0_num_axes", " num axes for scale") + .Args("scale_0_bias_term", "whether scale has bias") + .Args("scale_0_axis", "axis for scale") + .Args("batchnorm_0_epsilon", "epsilon for batchnorm") + .Args("batchnorm_0_momentum", "momentum for batchnorm"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h new file mode 100644 index 000000000..49cffab07 --- /dev/null +++ b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_H +#define ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/conv.h" + +namespace anakin { + +namespace ops { + +template +class SassConvBatchnormScaleHelper; + +/// pooling op +/** + * \brief SassConvBatchnormScale implementation class + * public inherit Operator + */ +template +class SassConvBatchnormScale : public Operator { +public: + SassConvBatchnormScale() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SassConvBatchnormScale< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class SassConvBatchnormScaleHelper; +}; + +/** + * \brief SassConvBatchnormScale helper class to implement it + * public inherit OperatorHelper + * including init resource and shape size in SassConvBatchnormScale context + */ +template +class SassConvBatchnormScaleHelper : public OperatorHelper { +public: + SassConvBatchnormScaleHelper()=default; + + ~SassConvBatchnormScaleHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for SassConvBatchnormScale operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + *///! initial all the resource needed by pooling + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_conv_batchnorm_scale stand for SassConvBatchnormScale parameter + saber::ConvParam _param_conv_batchnorm_scale; + ///< _funcs_conv_batchnorm_scale stand for SassConvBatchnormScale function + saber::Conv::saber_type> _funcs_conv_batchnorm_scale; + +private: + ///< _dims stand for SassConvBatchnormScale size + PTuple _dims; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp index b5a058e5d..3b3717b6e 100644 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp +++ b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp @@ -6,13 +6,13 @@ namespace ops { #ifdef USE_CUDA template<> -void SassConvBatchnormScaleRelu::operator()( +void SassConvBatchnormScaleRelu::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*> + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*> (this->_helper); - auto& param = static_cast*> + auto& param = static_cast*> (this->_helper)->_param_conv_batchnorm_scale_relu; impl->_funcs_conv_batchnorm_scale_relu(ins, outs, param, ctx); } @@ -22,14 +22,13 @@ void SassConvBatchnormScaleRelu::operator()( /// set helper -template -SassConvBatchnormScaleReluHelper::~SassConvBatchnormScaleReluHelper() { +template +SassConvBatchnormScaleReluHelper::~SassConvBatchnormScaleReluHelper() { } -template -Status SassConvBatchnormScaleReluHelper::InitParam() { +template +Status SassConvBatchnormScaleReluHelper::InitParam() { DLOG(WARNING) << "Parsing SassConvBatchnormScaleRelu op parameter."; - saber::ConvParam> _conv_param; // get conv param auto group = GET_PARAMETER(int, 
group); @@ -41,25 +40,10 @@ Status SassConvBatchnormScaleReluHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; - } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; - } - + auto weights_shape = weights.shape(); // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -70,10 +54,7 @@ Status SassConvBatchnormScaleReluHelper::InitParam() { auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - BatchnormParam> batchnorm_param(batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - batch_norm_weight_3_vector[0], - momentum, epsilon); + // get scale param auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); @@ -82,69 +63,142 @@ Status SassConvBatchnormScaleReluHelper::InitParam() { auto scale_weight_1_vector = scale_weight_1.vector(); auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); auto scale_weight_2_vector = scale_weight_2.vector(); - saber::ScaleParam> scale_param(scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term, scale_axis, scale_num_axes); // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> 
active_param(Active_relu);//, alpha); // TEMP - - - ConvActiveParam> conv_act_param(_conv_param, active_param, batchnorm_param, - scale_param); - _param_conv_batchnorm_scale_relu = conv_act_param; + ActivationParam active_param(Active_relu);//, alpha); // TEMP + + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if(!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if(bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + graph::GraphGlobalMem::Global().template apply(update_weights, + weights,bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_conv_batchnorm_scale_relu = conv_param; + } else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + + graph::GraphGlobalMem::Global().template apply(update_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor()), + active_param); + _param_conv_batchnorm_scale_relu = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], 
strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_conv_batchnorm_scale_relu = conv_param; + } return Status::OK(); } -template -Status SassConvBatchnormScaleReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status SassConvBatchnormScaleReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); _funcs_conv_batchnorm_scale_relu.init(ins, outs, _param_conv_batchnorm_scale_relu, SPECIFY, SABER_IMPL, ctx); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status SassConvBatchnormScaleReluHelper::InferShape( - const std::vector >& ins, - std::vector >& outs) { +template +Status SassConvBatchnormScaleReluHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { _funcs_conv_batchnorm_scale_relu.compute_output_shape(ins, outs, _param_conv_batchnorm_scale_relu); return Status::OK(); } #ifdef USE_CUDA -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; +template class SassConvBatchnormScaleReluHelper; 
+template class SassConvBatchnormScaleReluHelper; +template class SassConvBatchnormScaleReluHelper; #endif #ifdef USE_ARM_PLACE -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; +template class SassConvBatchnormScaleReluHelper; +template class SassConvBatchnormScaleReluHelper; +template class SassConvBatchnormScaleReluHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, NV, - AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, ARM, - AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(SassConvBatchnormScaleRelu) .Doc("SassConvBatchnormScaleRelu fusion operator") #ifdef USE_CUDA -.__alias__("convolution_batchnorm_scale_relu") +.__alias__("convolution_batchnorm_scale_relu") #endif #ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale_relu") +.__alias__("convolution_batchnorm_scale_relu") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h index b5f29fdfa..c2d401cb3 100644 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h +++ b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act.h" +#include "saber/funcs/conv.h" namespace anakin { namespace ops { -template +template class SassConvBatchnormScaleReluHelper; /// pooling op @@ -34,20 +34,20 @@ class SassConvBatchnormScaleReluHelper; * \brief SassConvBatchnormScaleRelu implementation class * public inherit Operator */ -template -class SassConvBatchnormScaleRelu : public Operator { +template +class SassConvBatchnormScaleRelu : public Operator { public: SassConvBatchnormScaleRelu() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - //LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SassConvBatchnormScaleRelu< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvBatchnormScaleReluHelper; + friend class SassConvBatchnormScaleReluHelper; }; /** @@ -55,8 +55,8 @@ class SassConvBatchnormScaleRelu : public Operator { * public inherit OperatorHelper * including init resource and shape size in SassConvBatchnormScaleRelu context */ -template -class SassConvBatchnormScaleReluHelper : public OperatorHelper { +template +class SassConvBatchnormScaleReluHelper : public OperatorHelper { public: SassConvBatchnormScaleReluHelper()=default; @@ -72,8 +72,8 @@ class SassConvBatchnormScaleReluHelper : public OperatorHelper &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class SassConvBatchnormScaleReluHelper : public OperatorHelper >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_batchnorm_scale_relu stand for SassConvBatchnormScaleRelu parameter - saber::ConvActiveParam> _param_conv_batchnorm_scale_relu; + saber::ConvParam _param_conv_batchnorm_scale_relu; ///< _funcs_conv_batchnorm_scale_relu stand for SassConvBatchnormScaleRelu function - saber::ConvAct _funcs_conv_batchnorm_scale_relu; + saber::Conv::saber_type> _funcs_conv_batchnorm_scale_relu; private: ///< _dims stand for SassConvBatchnormScaleRelu size diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp index bb453a541..bffe5d3aa 100644 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp +++ b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp @@ -6,19 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void SassConvBatchnormScaleReluPool::operator() ( +void SassConvBatchnormScaleReluPool::operator() ( OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - /*LOG(ERROR) << " compute of SassConvBatchnormScaleReluPool "; - float * h_data = new float[outs[0]->size()];//valid_size()]; - LOG(ERROR) << " outs[0]->valid_size() : " << outs[0]->size(); - cudaMemcpy(h_data, outs[0]->mutable_data(), outs[0]->size()*sizeof(float), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); - LOG(ERROR) << "over "; */ - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_conv_batchnorm_scale_relu_pooling; + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_conv_batchnorm_scale_relu_pooling; impl->_funcs_conv_batchnorm_scale_relu_pooling(ins, 
outs, param, ctx); } #endif @@ -27,15 +20,16 @@ void SassConvBatchnormScaleReluPool::operator() ( /// set helper -template -SassConvBatchnormScaleReluPoolHelper::~SassConvBatchnormScaleReluPoolHelper() { +template +SassConvBatchnormScaleReluPoolHelper::~SassConvBatchnormScaleReluPoolHelper() { } -template -Status SassConvBatchnormScaleReluPoolHelper::InitParam() { +template +Status SassConvBatchnormScaleReluPoolHelper::InitParam() { DLOG(WARNING) << "Parsing SassConvBatchnormScaleReluPool op parameter."; - saber::ConvParam> _conv_param; - PoolingParam> _pooling_param; + ConvParam conv_param_temp; + PoolingParam pooling_param_temp; + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -46,28 +40,10 @@ Status SassConvBatchnormScaleReluPoolHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - auto weight_vec = weights.vector(); - - - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; - } else { - Tensor4d* bias = new Tensor4d(); - - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; - } - + auto weights_shape = weights.shape(); // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -78,10 +54,7 @@ Status SassConvBatchnormScaleReluPoolHelper::InitParam() { auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); auto batch_norm_weight_3_vector = 
batch_norm_weight_3.vector(); - BatchnormParam> batchnorm_param(batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - batch_norm_weight_3_vector[0], - momentum, epsilon); + // get scale param auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); @@ -90,12 +63,10 @@ Status SassConvBatchnormScaleReluPoolHelper::InitParam() { auto scale_weight_1_vector = scale_weight_1.vector(); auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); auto scale_weight_2_vector = scale_weight_2.vector(); - saber::ScaleParam> scale_param(scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term, scale_axis, scale_num_axes); // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // Temp + ActivationParam active_param(Active_relu);//, alpha); // Temp // get pooling param auto global_pooling = GET_PARAMETER(bool, pooling_0_global_pooling); @@ -105,74 +76,154 @@ Status SassConvBatchnormScaleReluPoolHelper::InitParam() { auto pool_method = GET_PARAMETER(std::string, pooling_0_method); auto cmp_out_shape_floor_as_conv = GET_PARAMETER(bool, pooling_0_cmp_out_shape_floor_as_conv); if (pool_method == "MAX") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_max, global_pooling, - cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_max, global_pooling, + cmp_out_shape_floor_as_conv); + pooling_param_temp = pooling_param; } else if (pool_method == "AVG") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_average_include_padding, global_pooling, - cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + 
PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_average_include_padding, global_pooling, + cmp_out_shape_floor_as_conv); + pooling_param_temp = pooling_param; } else { LOG(FATAL) << " SassConvBatchnormScaleReluPool fusion op doesn't support : " << pool_method << " pooling."; } - ConvActivePoolingParam> conv_act_pooling_param(_conv_param, batchnorm_param, - scale_param, active_param, - _pooling_param); + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if(!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if(bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + graph::GraphGlobalMem::Global().template apply(update_weights, + weights,bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + conv_param_temp = conv_param; + } else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + + graph::GraphGlobalMem::Global().template apply(update_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), 
&(bias->d_tensor()), + active_param); + conv_param_temp = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + conv_param_temp = conv_param; + + } + + ConvPoolingParam conv_act_pooling_param(conv_param_temp, pooling_param_temp); _param_conv_batchnorm_scale_relu_pooling = conv_act_pooling_param; return Status::OK(); } -template -Status SassConvBatchnormScaleReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status SassConvBatchnormScaleReluPoolHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); _funcs_conv_batchnorm_scale_relu_pooling.init(ins, outs, _param_conv_batchnorm_scale_relu_pooling, SPECIFY, SABER_IMPL/*VENDER_IMPL*/, ctx); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status SassConvBatchnormScaleReluPoolHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { +template +Status 
SassConvBatchnormScaleReluPoolHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_conv_batchnorm_scale_relu_pooling.compute_output_shape(ins, outs, _param_conv_batchnorm_scale_relu_pooling)); return Status::OK(); } #ifdef USE_CUDA -template class SassConvBatchnormScaleReluPoolHelper; -template class SassConvBatchnormScaleReluPoolHelper; -template class SassConvBatchnormScaleReluPoolHelper; +template class SassConvBatchnormScaleReluPoolHelper; +template class SassConvBatchnormScaleReluPoolHelper; +template class SassConvBatchnormScaleReluPoolHelper; #endif #ifdef USE_ARM_PLACE -template class SassConvBatchnormScaleReluPoolHelper; -template class SassConvBatchnormScaleReluPoolHelper; -template class SassConvBatchnormScaleReluPoolHelper; +template class SassConvBatchnormScaleReluPoolHelper; +template class SassConvBatchnormScaleReluPoolHelper; +template class SassConvBatchnormScaleReluPoolHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(SassConvBatchnormScaleReluPool) .Doc("SassConvBatchnormScaleReluPool fusion operator") #ifdef USE_CUDA - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif #ifdef USE_ARM_PLACE - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h index 3030d910c..edcf981a9 100644 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h +++ b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act_pooling.h" +#include "saber/funcs/conv_pooling.h" namespace anakin { namespace ops { -template +template class SassConvBatchnormScaleReluPoolHelper; /// pooling op @@ -34,20 +34,20 @@ class SassConvBatchnormScaleReluPoolHelper; * \brief SassConvBatchnormScaleReluPool implementation class * public inherit Operator */ -template -class SassConvBatchnormScaleReluPool : public Operator { +template +class SassConvBatchnormScaleReluPool : public Operator { public: SassConvBatchnormScaleReluPool() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SassConvBatchnormScaleReluPool< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvBatchnormScaleReluPoolHelper; + friend class SassConvBatchnormScaleReluPoolHelper; }; /** @@ -55,8 +55,8 @@ class SassConvBatchnormScaleReluPool : public Operator { * public inherit OperatorHelper * including init resource and shape size in SassConvBatchnormScaleReluPool context */ -template -class SassConvBatchnormScaleReluPoolHelper : public OperatorHelper { +template +class SassConvBatchnormScaleReluPoolHelper : public OperatorHelper { public: SassConvBatchnormScaleReluPoolHelper()=default; @@ -72,8 +72,8 @@ class SassConvBatchnormScaleReluPoolHelper : public OperatorHelper &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class SassConvBatchnormScaleReluPoolHelper : public OperatorHelper >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_batchnorm_scale_relu_pooling stand for SassConvBatchnormScaleReluPool parameter - saber::ConvActivePoolingParam> _param_conv_batchnorm_scale_relu_pooling; + saber::ConvPoolingParam _param_conv_batchnorm_scale_relu_pooling; ///< _funcs_conv_batchnorm_scale_relu_pooling stand for SassConvBatchnormScaleReluPool function - saber::ConvActPooling _funcs_conv_batchnorm_scale_relu_pooling; + saber::ConvPooling::saber_type> _funcs_conv_batchnorm_scale_relu_pooling; private: ///< _dims stand for SassConvBatchnormScaleReluPool size diff --git a/framework/operators/fusion_ops/conv_3x3_relu.cpp b/framework/operators/fusion_ops/conv_3x3_relu.cpp index 766f0a441..dbfc43abd 100644 --- a/framework/operators/fusion_ops/conv_3x3_relu.cpp +++ b/framework/operators/fusion_ops/conv_3x3_relu.cpp @@ -6,29 +6,40 @@ namespace ops { #ifdef USE_CUDA template<> -void SassConvRelu::operator()( +void SassConvRelu::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> (this->_helper)->_param_conv_relu; impl->_funcs_conv_relu(ins, outs, param, ctx); } #endif +#ifdef AMD_GPU +template<> +void SassConvRelu::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> + (this->_helper)->_param_conv_relu; + impl->_funcs_conv_relu(ins, outs, param, ctx); +} +#endif /// TODO ... 
specialization other type of operator /// set helper -template -SassConvReluHelper::~SassConvReluHelper() { +template +SassConvReluHelper::~SassConvReluHelper() { } -template -Status SassConvReluHelper::InitParam() { +template +Status SassConvReluHelper::InitParam() { DLOG(WARNING) << "Parsing SassConvRelu op parameter."; - saber::ConvParam> _conv_param; // get conv param auto group = GET_PARAMETER(int, group); @@ -40,81 +51,118 @@ Status SassConvReluHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); + // get relu param + auto alpha = GET_PARAMETER(float, relu_0_alpha); + ActivationParam active_param(Active_relu);//, alpha); // TEMP + + if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_conv_relu = conv_param; } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], + Tensor4d* bias = new Tensor4d();; + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; + &(weights.d_tensor()), bias, + active_param); + _param_conv_relu = conv_param; } - // get relu param - auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // TEMP - - - ConvActiveParam> conv_act_param(_conv_param, active_param); - _param_conv_relu = conv_act_param; - return Status::OK(); } -template -Status SassConvReluHelper::Init(OpContext& ctx, - const 
std::vector >& ins, - std::vector >& outs) { +template +Status SassConvReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); _funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, SABER_IMPL, ctx); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status SassConvReluHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status SassConvReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { _funcs_conv_relu.compute_output_shape(ins, outs, _param_conv_relu); return Status::OK(); } #ifdef USE_CUDA -template class SassConvReluHelper; -template class SassConvReluHelper; -template class SassConvReluHelper; +template class SassConvReluHelper; +template class SassConvReluHelper; +template class SassConvReluHelper; #endif #ifdef USE_ARM_PLACE -template class SassConvReluHelper; -template class SassConvReluHelper; -template class SassConvReluHelper; +template class SassConvReluHelper; +template class SassConvReluHelper; +template class SassConvReluHelper; +#endif + +#ifdef AMD_GPU +template class SassConvReluHelper; +template class SassConvReluHelper; +template class SassConvReluHelper; #endif // register 
helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, ARM, Precision::FP32); #endif +#ifdef AMD_GPU +ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, AMD, Precision::FP32); +#endif //! register op ANAKIN_REGISTER_OP(SassConvRelu) .Doc("SassConvRelu fusion operator") #ifdef USE_CUDA -.__alias__("convolution_batchnorm_scale_relu") +.__alias__("convolution_batchnorm_scale_relu") #endif #ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale_relu") +.__alias__("convolution_batchnorm_scale_relu") +#endif +#ifdef AMD_GPU +.__alias__("convolution_batchnorm_scale_relu") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_3x3_relu.h b/framework/operators/fusion_ops/conv_3x3_relu.h index e63f3f414..fc266f116 100644 --- a/framework/operators/fusion_ops/conv_3x3_relu.h +++ b/framework/operators/fusion_ops/conv_3x3_relu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act.h" +#include "saber/funcs/conv.h" namespace anakin { namespace ops { -template +template class SassConvReluHelper; /// pooling op @@ -34,20 +34,20 @@ class SassConvReluHelper; * \brief SassConvRelu implementation class * public inherit Operator */ -template -class SassConvRelu : public Operator { +template +class SassConvRelu : public Operator { public: SassConvRelu() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - //LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SassConvRelu< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvReluHelper; + friend class SassConvReluHelper; }; /** @@ -55,8 +55,8 @@ class SassConvRelu : public Operator { * public inherit OperatorHelper * including init resource and shape size in SassConvRelu context */ -template -class SassConvReluHelper : public OperatorHelper { +template +class SassConvReluHelper : public OperatorHelper { public: SassConvReluHelper()=default; @@ -72,8 +72,8 @@ class SassConvReluHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class SassConvReluHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv stand for SassConvRelu parameter - saber::ConvActiveParam> _param_conv_relu; + saber::ConvParam _param_conv_relu; ///< _funcs_conv_relu stand for SassConvRelu function - saber::ConvAct _funcs_conv_relu; + saber::Conv::saber_type> _funcs_conv_relu; private: ///< _dims stand for SassConvRelu size diff --git a/framework/operators/fusion_ops/conv_3x3_relu_pool.cpp b/framework/operators/fusion_ops/conv_3x3_relu_pool.cpp index c742a9225..6014a51b4 100644 --- a/framework/operators/fusion_ops/conv_3x3_relu_pool.cpp +++ b/framework/operators/fusion_ops/conv_3x3_relu_pool.cpp @@ -6,36 +6,41 @@ namespace ops { #ifdef USE_CUDA template<> -void SassConvReluPool::operator() ( +void SassConvReluPool::operator() ( OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - /*LOG(ERROR) << " compute of SassConvReluPool "; - float * h_data = new float[outs[0]->size()];//valid_size()]; - LOG(ERROR) << " outs[0]->valid_size() : " << outs[0]->size(); - cudaMemcpy(h_data, outs[0]->mutable_data(), outs[0]->size()*sizeof(float), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); - LOG(ERROR) << "over "; */ - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_conv_relu_pooling; + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_conv_relu_pooling; impl->_funcs_conv_relu_pooling(ins, outs, param, ctx); } #endif +#ifdef AMD_GPU +template<> +void SassConvReluPool::operator() ( + OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); 
+ auto& param = static_cast*>(this->_helper)->_param_conv_relu_pooling; + impl->_funcs_conv_relu_pooling(ins, outs, param, ctx); +} +#endif /// TODO ... specialization other type of operator /// set helper -template -SassConvReluPoolHelper::~SassConvReluPoolHelper() { +template +SassConvReluPoolHelper::~SassConvReluPoolHelper() { } -template -Status SassConvReluPoolHelper::InitParam() { +template +Status SassConvReluPoolHelper::InitParam() { DLOG(WARNING) << "Parsing SassConvReluPool op parameter."; - saber::ConvParam> _conv_param; - PoolingParam> _pooling_param; + + saber::ConvParam conv_param_temp; + PoolingParam pooling_param_temp; // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -46,29 +51,13 @@ Status SassConvReluPoolHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weight_vec = weights.vector(); - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; - } else { - Tensor4d* bias = new Tensor4d(); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; - } - // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // Temp + ActivationParam active_param(Active_relu);//, alpha); // Temp // get pooling param auto global_pooling = GET_PARAMETER(bool, pooling_0_global_pooling); @@ -78,74 +67,131 @@ Status SassConvReluPoolHelper::InitParam() { auto pool_method = GET_PARAMETER(std::string, pooling_0_method); 
auto cmp_out_shape_floor_as_conv = GET_PARAMETER(bool, pooling_0_cmp_out_shape_floor_as_conv); if (pool_method == "MAX") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_max, global_pooling, - cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_max, global_pooling, + cmp_out_shape_floor_as_conv); + pooling_param_temp = pooling_param; } else if (pool_method == "AVG") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_average_include_padding, global_pooling, - cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_average_include_padding, global_pooling, + cmp_out_shape_floor_as_conv); + pooling_param_temp = pooling_param; } else { LOG(FATAL) << " SassConvReluPool fusion op doesn't support : " << pool_method << " pooling."; } - ConvActivePoolingParam> conv_act_pooling_param(_conv_param, - active_param, - _pooling_param); + + if (bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + conv_param_temp = conv_param; + } else { + Tensor4d* bias = new Tensor4d(); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), bias, + active_param); + conv_param_temp = conv_param; + } + + ConvPoolingParam conv_act_pooling_param(conv_param_temp, pooling_param_temp); _param_conv_relu_pooling = conv_act_pooling_param; + 
return Status::OK(); } -template -Status SassConvReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status SassConvReluPoolHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); _funcs_conv_relu_pooling.init(ins, outs, _param_conv_relu_pooling, STATIC, SABER_IMPL/*VENDER_IMPL*/, ctx); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status SassConvReluPoolHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { +template +Status SassConvReluPoolHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_conv_relu_pooling.compute_output_shape(ins, outs, _param_conv_relu_pooling)); return Status::OK(); } #ifdef USE_CUDA -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; +template class SassConvReluPoolHelper; +template class SassConvReluPoolHelper; +template class SassConvReluPoolHelper; #endif #ifdef USE_ARM_PLACE -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; +template class 
SassConvReluPoolHelper; +template class SassConvReluPoolHelper; +template class SassConvReluPoolHelper; +#endif + +#ifdef AMD_GPU +template class SassConvReluPoolHelper; +template class SassConvReluPoolHelper; +template class SassConvReluPoolHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, ARM, Precision::FP32); +#endif + +#ifdef AMD_GPU +ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, AMD, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(SassConvReluPool) .Doc("SassConvReluPool fusion operator") #ifdef USE_CUDA - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif #ifdef USE_ARM_PLACE - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") +#endif +#ifdef AMD_GPU + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_3x3_relu_pool.h b/framework/operators/fusion_ops/conv_3x3_relu_pool.h index 258bd35b3..fc71ac12c 100644 --- a/framework/operators/fusion_ops/conv_3x3_relu_pool.h +++ b/framework/operators/fusion_ops/conv_3x3_relu_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act_pooling.h" +#include "saber/funcs/conv_pooling.h" namespace anakin { namespace ops { -template +template class SassConvReluPoolHelper; /// pooling op @@ -34,20 +34,20 @@ class SassConvReluPoolHelper; * \brief SassConvReluPool implementation class * public inherit Operator */ -template -class SassConvReluPool : public Operator { +template +class SassConvReluPool : public Operator { public: SassConvReluPool() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SassConvReluPool< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvReluPoolHelper; + friend class SassConvReluPoolHelper; }; /** @@ -55,8 +55,8 @@ class SassConvReluPool : public Operator { * public inherit OperatorHelper * including init resource and shape size in SassConvReluPool context */ -template -class SassConvReluPoolHelper : public OperatorHelper { +template +class SassConvReluPoolHelper : public OperatorHelper { public: SassConvReluPoolHelper()=default; @@ -72,8 +72,8 @@ class SassConvReluPoolHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class SassConvReluPoolHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_relu_pooling stand for SassConvReluPool parameter - saber::ConvActivePoolingParam> _param_conv_relu_pooling; + saber::ConvPoolingParam _param_conv_relu_pooling; ///< _funcs_conv_relu_pooling stand for SassConvReluPool function - saber::ConvActPooling _funcs_conv_relu_pooling; + saber::ConvPooling::saber_type> _funcs_conv_relu_pooling; private: ///< _dims stand for SassConvReluPool size diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale.cpp b/framework/operators/fusion_ops/conv_batchnorm_scale.cpp index 69c64fd80..ec9512021 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale.cpp +++ b/framework/operators/fusion_ops/conv_batchnorm_scale.cpp @@ -4,32 +4,22 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void ConvBatchnormScale::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_conv_batchnorm_scale; - impl->_funcs_conv_batchnorm_scale(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -ConvBatchnormScaleHelper::~ConvBatchnormScaleHelper() { +#define INSTANCE_CONVBATCHNORMSCALE(Ttype, Ptype) \ +template<> \ +void ConvBatchnormScale::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>(this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_conv_batchnorm_scale;\ + SABER_CHECK(impl->_funcs_conv_batchnorm_scale(ins, outs, param, ctx));\ } -template -Status ConvBatchnormScaleHelper::InitParam() { - DLOG(WARNING) << "Parsing ConvBatchnormScale op parameter."; - saber::ConvParam> _conv_param; - +template +Status ConvBatchnormScaleHelper::InitParam() { + LOG(WARNING) << "Parsing ConvBatchnormScale op parameter."; + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -40,25 +30,10 @@ Status ConvBatchnormScaleHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; - } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; - } - + auto weights_shape = weights.shape(); // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -69,10 +44,7 @@ Status ConvBatchnormScaleHelper::InitParam() { auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, 
batchnorm_0_weight_3); auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - BatchnormParam> batchnorm_param(batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - batch_norm_weight_3_vector[0], - momentum, epsilon); + // get scale param auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); @@ -81,67 +53,164 @@ Status ConvBatchnormScaleHelper::InitParam() { auto scale_weight_1_vector = scale_weight_1.vector(); auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); auto scale_weight_2_vector = scale_weight_2.vector(); - saber::ScaleParam> scale_param(scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term, scale_axis, scale_num_axes); - - // get relu param - /*auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // TEMP */ - - ConvActiveParam> conv_act_param(_conv_param, batchnorm_param, scale_param); - _param_conv_batchnorm_scale = conv_act_param; + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if(!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if(bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + graph::GraphGlobalMem::Global().template apply(update_weights, + weights,bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + _param_conv_batchnorm_scale = conv_param; + } else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // 
gen new bias + + graph::GraphGlobalMem::Global().template apply(update_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor())); + _param_conv_batchnorm_scale = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + _param_conv_batchnorm_scale = conv_param; + } return Status::OK(); } -template -Status ConvBatchnormScaleHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - _funcs_conv_batchnorm_scale.init(ins, outs, _param_conv_batchnorm_scale, SPECIFY, VENDER_IMPL, ctx); +template +Status ConvBatchnormScaleHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + SABER_CHECK(_funcs_conv_batchnorm_scale.init(ins, outs, \ + _param_conv_batchnorm_scale, SPECIFY, SABER_IMPL, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + 
std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status ConvBatchnormScaleHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - _funcs_conv_batchnorm_scale.compute_output_shape(ins, outs, _param_conv_batchnorm_scale); +template +Status ConvBatchnormScaleHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_conv_batchnorm_scale.compute_output_shape(ins, outs, \ + _param_conv_batchnorm_scale)); return Status::OK(); } -#ifdef USE_CUDA -template class ConvBatchnormScaleHelper; -template class ConvBatchnormScaleHelper; -template class ConvBatchnormScaleHelper; -#endif - #ifdef USE_ARM_PLACE -template class ConvBatchnormScaleHelper; -template class ConvBatchnormScaleHelper; -template class ConvBatchnormScaleHelper; +INSTANCE_CONVBATCHNORMSCALE(ARM, Precision::FP32); +template class ConvBatchnormScaleHelper; +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, ARM, Precision::FP32); #endif -// register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, NV, AK_FLOAT, - Precision::FP32); -#endif +INSTANCE_CONVBATCHNORMSCALE(NV, Precision::FP32); +template<> +Status ConvBatchnormScaleHelper::Init(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + _funcs_conv_batchnorm_scale.init(ins, outs, _param_conv_batchnorm_scale, SPECIFY, VENDER_IMPL, ctx); -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, ARM, AK_FLOAT, - Precision::FP32); + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + 
SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + } + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, NV, Precision::FP32); #endif +//#ifdef USE_X86_PLACE +//INSTANCE_CONVBATCHNORMSCALE(X86, Precision::FP32); +//template class ConvBatchnormScaleHelper; +//ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, X86, +// Precision::FP32); +//#endif + //! register op ANAKIN_REGISTER_OP(ConvBatchnormScale) .Doc("ConvBatchnormScale fusion operator") #ifdef USE_CUDA -.__alias__("convolution_batchnorm_scale") +.__alias__("convolution_batchnorm_scale") #endif #ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale") +.__alias__("convolution_batchnorm_scale") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale.h b/framework/operators/fusion_ops/conv_batchnorm_scale.h index 985ec799d..a4003d203 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale.h +++ b/framework/operators/fusion_ops/conv_batchnorm_scale.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act.h" +#include "saber/funcs/conv.h" namespace anakin { namespace ops { -template +template class ConvBatchnormScaleHelper; /// pooling op @@ -34,20 +34,20 @@ class ConvBatchnormScaleHelper; * \brief ConvBatchnormScaleHelper implementation class * public inherit Operator */ -template -class ConvBatchnormScale : public Operator { +template +class ConvBatchnormScale : public Operator { public: ConvBatchnormScale() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ConvBatchnormScale< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConvBatchnormScaleHelper; + friend class ConvBatchnormScaleHelper; }; /** @@ -55,12 +55,12 @@ class ConvBatchnormScale : public Operator { * public inherit OperatorHelper * including init resource and shape size in ConvBatchnormScaleHelper context */ -template -class ConvBatchnormScaleHelper : public OperatorHelper { +template +class ConvBatchnormScaleHelper : public OperatorHelper { public: ConvBatchnormScaleHelper()=default; - ~ConvBatchnormScaleHelper(); + ~ConvBatchnormScaleHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class ConvBatchnormScaleHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,22 +81,16 @@ class ConvBatchnormScaleHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_batchnorm_scale stand for ConvBatchnormScale parameter - saber::ConvActiveParam> _param_conv_batchnorm_scale; + saber::ConvParam _param_conv_batchnorm_scale; ///< _funcs_conv stand for ConvBatchnormScale function - saber::ConvAct _funcs_conv_batchnorm_scale; - -private: - ///< _dims stand for ConvBatchnormScale size - PTuple _dims; + saber::Conv::saber_type> _funcs_conv_batchnorm_scale; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp b/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp index 8f847a429..4f73a25ce 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp +++ b/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp @@ -4,33 +4,23 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void ConvBatchnormScaleRelu::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*> - (this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_conv_batchnorm_scale_relu; - impl->_funcs_conv_batchnorm_scale_relu(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -ConvBatchnormScaleReluHelper::~ConvBatchnormScaleReluHelper() { +#define INSTANCE_CONVBATCHNORMSCALERELU(Ttype, Ptype) \ +template<> \ +void ConvBatchnormScaleRelu::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>\ + (this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_conv_batchnorm_scale_relu;\ + SABER_CHECK(impl->_funcs_conv_batchnorm_scale_relu(ins, outs, param, ctx));\ } -template -Status ConvBatchnormScaleReluHelper::InitParam() { +template +Status ConvBatchnormScaleReluHelper::InitParam() { DLOG(WARNING) << "Parsing ConvBatchnormScaleRelu op parameter."; - saber::ConvParam> _conv_param; - + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -41,123 +31,222 @@ Status ConvBatchnormScaleReluHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; - } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; - } - + auto weights_shape = weights.shape(); // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); - auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, - batchnorm_0_weight_1); + auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, 
batchnorm_0_weight_1); auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); - auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, - batchnorm_0_weight_2); + auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, batchnorm_0_weight_2); auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); - auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, - batchnorm_0_weight_3); + auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - BatchnormParam> batchnorm_param(batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - batch_norm_weight_3_vector[0], - momentum, epsilon); + // get scale param auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); auto scale_axis = GET_PARAMETER(int, scale_0_axis); - auto scale_weight_1 = GET_PARAMETER(pblock_type, - scale_0_weight_1); + auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); auto scale_weight_1_vector = scale_weight_1.vector(); - auto scale_weight_2 = GET_PARAMETER(pblock_type, - scale_0_weight_2); + auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); auto scale_weight_2_vector = scale_weight_2.vector(); - saber::ScaleParam> scale_param(scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term, scale_axis, scale_num_axes); // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // TEMP + ActivationParam active_param(Active_relu);//, alpha); // TEMP + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if(!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); - ConvActiveParam> conv_act_param(_conv_param, active_param, batchnorm_param, - scale_param); - _param_conv_batchnorm_scale_relu = conv_act_param; + if(bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); 
+ graph::GraphGlobalMem::Global().template apply(update_weights, + weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_conv_batchnorm_scale_relu = conv_param; + } else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + graph::GraphGlobalMem::Global().template apply(update_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor()), + active_param); + _param_conv_batchnorm_scale_relu = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_conv_batchnorm_scale_relu = conv_param; + } return Status::OK(); } -template -Status ConvBatchnormScaleReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - if (_param_conv_batchnorm_scale_relu.conv_param.group == ins[0]->channel() && \ - _param_conv_batchnorm_scale_relu.conv_param.group == outs[0]->channel()) { - _funcs_conv_batchnorm_scale_relu.init(ins, outs, 
_param_conv_batchnorm_scale_relu, SPECIFY, - SABER_IMPL, ctx); +template +Status ConvBatchnormScaleReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + SABER_CHECK(_funcs_conv_batchnorm_scale_relu.init(ins, outs, _param_conv_batchnorm_scale_relu, SPECIFY, + SABER_IMPL, ctx)); + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); } else { - _funcs_conv_batchnorm_scale_relu.init(ins, outs, _param_conv_batchnorm_scale_relu, SPECIFY, - VENDER_IMPL, ctx); + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); } - - //_funcs_conv_batchnorm_scale_relu.init(ins, outs, _param_conv_batchnorm_scale_relu, SPECIFY, VENDER_IMPL, ctx); return Status::OK(); } -template -Status ConvBatchnormScaleReluHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - _funcs_conv_batchnorm_scale_relu.compute_output_shape(ins, outs, _param_conv_batchnorm_scale_relu); +template +Status ConvBatchnormScaleReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_conv_batchnorm_scale_relu.compute_output_shape(ins, outs, _param_conv_batchnorm_scale_relu)); return Status::OK(); } #ifdef USE_CUDA -template class ConvBatchnormScaleReluHelper; -template class ConvBatchnormScaleReluHelper; -template class 
ConvBatchnormScaleReluHelper; -#endif +template <> +Status ConvBatchnormScaleReluHelper::Init(OpContext &ctx, \ + const std::vector >& ins, std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + if (_param_conv_batchnorm_scale_relu.group == ins[0]->channel() && \ + _param_conv_batchnorm_scale_relu.group == outs[0]->channel()) { + _funcs_conv_batchnorm_scale_relu.init(ins, outs, _param_conv_batchnorm_scale_relu, SPECIFY, + SABER_IMPL, ctx); -#ifdef USE_ARM_PLACE -template class ConvBatchnormScaleReluHelper; -template class ConvBatchnormScaleReluHelper; -template class ConvBatchnormScaleReluHelper; -#endif + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } + } else { + _funcs_conv_batchnorm_scale_relu.init(ins, outs, _param_conv_batchnorm_scale_relu, SPECIFY, + VENDER_IMPL, ctx); -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, NV, AK_FLOAT, - Precision::FP32); + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + 
&_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + } + } + + return Status::OK(); +} +INSTANCE_CONVBATCHNORMSCALERELU(NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, NV, Precision::FP32); #endif +//#ifdef USE_X86_PLACE +//template class ConvBatchnormScaleReluHelper; +//INSTANCE_CONVBATCHNORMSCALERELU(X86, Precision::FP32); +//ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, X86, +// Precision::FP32); +//#endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, ARM, AK_FLOAT, - Precision::FP32); +INSTANCE_CONVBATCHNORMSCALERELU(ARM, Precision::FP32); +template class ConvBatchnormScaleReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(ConvBatchnormScaleRelu) .Doc("ConvBatchnormScaleRelu fusion operator") #ifdef USE_CUDA -.__alias__("convolution_batchnorm_scale_relu") +.__alias__("convolution_batchnorm_scale_relu") #endif #ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale_relu") +.__alias__("convolution_batchnorm_scale_relu") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale_relu.h b/framework/operators/fusion_ops/conv_batchnorm_scale_relu.h index 76f014499..217b99220 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale_relu.h +++ b/framework/operators/fusion_ops/conv_batchnorm_scale_relu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act.h" +#include "saber/funcs/conv.h" namespace anakin { namespace ops { -template +template class ConvBatchnormScaleReluHelper; /// pooling op @@ -34,20 +34,20 @@ class ConvBatchnormScaleReluHelper; * \brief ConvBatchnormScaleRelu implementation class * public inherit Operator */ -template -class ConvBatchnormScaleRelu : public Operator { +template +class ConvBatchnormScaleRelu : public Operator { public: ConvBatchnormScaleRelu() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ConvBatchnormScaleRelu< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConvBatchnormScaleReluHelper; + friend class ConvBatchnormScaleReluHelper; }; /** @@ -55,12 +55,12 @@ class ConvBatchnormScaleRelu : public Operator { * public inherit OperatorHelper * including init resource and shape size in ConvBatchnormScaleRelu context */ -template -class ConvBatchnormScaleReluHelper : public OperatorHelper { +template +class ConvBatchnormScaleReluHelper : public OperatorHelper { public: ConvBatchnormScaleReluHelper()=default; - ~ConvBatchnormScaleReluHelper(); + ~ConvBatchnormScaleReluHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class ConvBatchnormScaleReluHelper : public OperatorHelper * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * 
\brief infer the shape of output and input. @@ -81,22 +81,20 @@ class ConvBatchnormScaleReluHelper : public OperatorHelper * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_batchnorm_scale_relu stand for ConvBatchnormScaleRelu parameter - saber::ConvActiveParam> _param_conv_batchnorm_scale_relu; + saber::ConvParam _param_conv_batchnorm_scale_relu; ///< _funcs_conv_batchnorm_scale_relu stand for ConvBatchnormScaleRelu function - saber::ConvAct _funcs_conv_batchnorm_scale_relu; + saber::Conv::saber_type> _funcs_conv_batchnorm_scale_relu; private: ///< _dims stand for ConvBatchnormScaleRelu size PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp b/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp index b7df025ec..b024217e0 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp +++ b/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp @@ -4,30 +4,31 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void ConvBatchnormScaleReluPool::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_conv_batchnorm_scale_relu_pooling; - impl->_funcs_conv_batchnorm_scale_relu_pooling(ins, outs, param, ctx); +#define INSTANCE_CONVBATCHNORMSCALERELUPOOLING(Ttype, Ptype) \ +template<> \ +void ConvBatchnormScaleReluPool::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>\ + (this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_conv_batchnorm_scale_relu_pooling;\ + 
SABER_CHECK(impl->_funcs_conv_batchnorm_scale_relu_pooling(ins, outs, param, ctx));\ } -#endif - -/// TODO ... specialization other type of operator - /// set helper -template -ConvBatchnormScaleReluPoolHelper::~ConvBatchnormScaleReluPoolHelper() { +template +ConvBatchnormScaleReluPoolHelper::~ConvBatchnormScaleReluPoolHelper() { } -template -Status ConvBatchnormScaleReluPoolHelper::InitParam() { +template +Status ConvBatchnormScaleReluPoolHelper::InitParam() { DLOG(WARNING) << "Parsing ConvBatchnormScaleReluPool op parameter."; - saber::ConvParam> _conv_param; - PoolingParam> _pooling_param; + + ConvParam conv_param_temp; + PoolingParam pooling_param_temp; + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -38,24 +39,10 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; - } else { - Tensor4d* bias = new Tensor4d(); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; - } - + auto weights_shape = weights.shape(); // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -66,10 +53,7 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - BatchnormParam> 
batchnorm_param(batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - batch_norm_weight_3_vector[0], - momentum, epsilon); + // get scale param auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); @@ -78,12 +62,10 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { auto scale_weight_1_vector = scale_weight_1.vector(); auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); auto scale_weight_2_vector = scale_weight_2.vector(); - saber::ScaleParam> scale_param(scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term, scale_axis, scale_num_axes); // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // Temp + ActivationParam active_param(Active_relu);//, alpha); // Temp // get pooling param auto global_pooling = GET_PARAMETER(bool, pooling_0_global_pooling); @@ -93,75 +75,183 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { auto pool_method = GET_PARAMETER(std::string, pooling_0_method); auto cmp_out_shape_floor_as_conv = GET_PARAMETER(bool, pooling_0_cmp_out_shape_floor_as_conv); if (pool_method == "MAX") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_max, global_pooling, - cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_max, global_pooling, + cmp_out_shape_floor_as_conv); + pooling_param_temp = pooling_param; } else if (pool_method == "AVG") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_average_include_padding, - global_pooling, - cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + PoolingParam pooling_param(pool_size[0], pool_size[1], 
+ pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_average_include_padding, global_pooling, + cmp_out_shape_floor_as_conv); + pooling_param_temp = pooling_param; } else { - LOG(FATAL) << " ConvBatchnormScaleReluPool fusion op doesn't support : " << pool_method << " pooling."; + LOG(FATAL) << " SassConvBatchnormScaleReluPool fusion op doesn't support : " << pool_method << " pooling."; } - ConvActivePoolingParam> conv_act_pooling_param(_conv_param, batchnorm_param, - scale_param, active_param, - _pooling_param); + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if(!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if(bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + graph::GraphGlobalMem::Global().template apply(update_weights, + weights,bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + conv_param_temp = conv_param; + } else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + + graph::GraphGlobalMem::Global().template apply(update_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], 
dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor()), + active_param); + conv_param_temp = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + conv_param_temp = conv_param; + + } + + ConvPoolingParam conv_act_pooling_param(conv_param_temp, pooling_param_temp); + _param_conv_batchnorm_scale_relu_pooling = conv_act_pooling_param; return Status::OK(); } -template -Status ConvBatchnormScaleReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - _funcs_conv_batchnorm_scale_relu_pooling.init(ins, outs, _param_conv_batchnorm_scale_relu_pooling, SPECIFY, VENDER_IMPL, ctx); +template +Status ConvBatchnormScaleReluPoolHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + SABER_CHECK(_funcs_conv_batchnorm_scale_relu_pooling.init(ins, outs, \ + _param_conv_batchnorm_scale_relu_pooling, SPECIFY, SABER_IMPL, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu_pooling, + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL)); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu_pooling, + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL)); + } return Status::OK(); } -template -Status 
ConvBatchnormScaleReluPoolHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { - _funcs_conv_batchnorm_scale_relu_pooling.compute_output_shape(ins, outs, _param_conv_batchnorm_scale_relu_pooling); +template +Status ConvBatchnormScaleReluPoolHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_conv_batchnorm_scale_relu_pooling.compute_output_shape(ins, outs, \ + _param_conv_batchnorm_scale_relu_pooling)); return Status::OK(); } #ifdef USE_CUDA -template class ConvBatchnormScaleReluPoolHelper; -template class ConvBatchnormScaleReluPoolHelper; -template class ConvBatchnormScaleReluPoolHelper; -#endif +INSTANCE_CONVBATCHNORMSCALERELUPOOLING(NV, Precision::FP32); +template<> +Status ConvBatchnormScaleReluPoolHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + _funcs_conv_batchnorm_scale_relu_pooling.init(ins, outs, _param_conv_batchnorm_scale_relu_pooling, SPECIFY, VENDER_IMPL, ctx); -#ifdef USE_ARM_PLACE -template class ConvBatchnormScaleReluPoolHelper; -template class ConvBatchnormScaleReluPoolHelper; -template class ConvBatchnormScaleReluPoolHelper; + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + auto is_weights_transed_test = CHECK_PARAMETER(is_weights_transed); + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_batchnorm_scale_relu_pooling, 
_1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + } + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, NV, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, NV, AK_FLOAT, Precision::FP32); -#endif - #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_CONVBATCHNORMSCALERELUPOOLING(ARM, Precision::FP32); +template class ConvBatchnormScaleReluPoolHelper; +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(ConvBatchnormScaleReluPool) .Doc("ConvBatchnormScaleReluPool fusion operator") #ifdef USE_CUDA - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif #ifdef USE_ARM_PLACE - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h b/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h index 85d1903e9..832fb6a45 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h +++ b/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act_pooling.h" +#include "saber/funcs/conv_pooling.h" namespace anakin { namespace ops { -template +template class ConvBatchnormScaleReluPoolHelper; /// pooling op @@ -34,20 +34,20 @@ class ConvBatchnormScaleReluPoolHelper; * \brief ConvBatchnormScaleReluPool implementation class * public inherit Operator */ -template -class ConvBatchnormScaleReluPool : public Operator { +template +class ConvBatchnormScaleReluPool : public Operator { public: ConvBatchnormScaleReluPool() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ConvBatchnormScaleReluPool< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConvBatchnormScaleReluPoolHelper; + friend class ConvBatchnormScaleReluPoolHelper; }; /** @@ -55,8 +55,8 @@ class ConvBatchnormScaleReluPool : public Operator { * public inherit OperatorHelper * including init resource and shape size in ConvBatchnormScaleReluPool context */ -template -class ConvBatchnormScaleReluPoolHelper : public OperatorHelper { +template +class ConvBatchnormScaleReluPoolHelper : public OperatorHelper { public: ConvBatchnormScaleReluPoolHelper()=default; @@ -72,8 +72,8 @@ class ConvBatchnormScaleReluPoolHelper : public OperatorHelper &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class ConvBatchnormScaleReluPoolHelper : public OperatorHelper >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_batchnorm_scale_relu_pooling stand for ConvBatchnormScaleReluPool parameter - saber::ConvActivePoolingParam> _param_conv_batchnorm_scale_relu_pooling; + saber::ConvPoolingParam _param_conv_batchnorm_scale_relu_pooling; ///< _funcs_conv_batchnorm_scale_relu_pooling stand for ConvBatchnormScaleReluPool function - saber::ConvActPooling _funcs_conv_batchnorm_scale_relu_pooling; + saber::ConvPooling::saber_type> _funcs_conv_batchnorm_scale_relu_pooling; private: ///< _dims stand for ConvBatchnormScaleReluPool size diff --git a/framework/operators/fusion_ops/conv_relu.cpp b/framework/operators/fusion_ops/conv_relu.cpp index 95b503fb5..6d904cf42 100644 --- a/framework/operators/fusion_ops/conv_relu.cpp +++ b/framework/operators/fusion_ops/conv_relu.cpp @@ -4,31 +4,21 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void ConvRelu::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = impl->_param_conv_relu; - impl->_funcs_conv_relu(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -ConvReluHelper::~ConvReluHelper() { +#define INSTANCE_CONVRELU(Ttype, Ptype) \ +template<> \ +void ConvRelu::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl =\ + static_cast*>(this->_helper);\ + auto& param = impl->_param_conv_relu;\ + impl->_funcs_conv_relu(ins, outs, param, ctx);\ } -template -Status ConvReluHelper::InitParam() { +template +Status ConvReluHelper::InitParam() { DLOG(WARNING) << "Parsing ConvRelu op parameter."; - saber::ConvParam> _conv_param; // get conv param auto group = GET_PARAMETER(int, group); @@ -39,111 +29,180 @@ Status ConvReluHelper::InitParam() { auto filter_num = GET_PARAMETER(int, filter_num); auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - DLOG(INFO) << "conv group : " << group; - DLOG(INFO) << "conv bias_term: " << bias_term; - DLOG(INFO) << "conv padding : [" << padding[0] << " " << padding[1] << "]"; - DLOG(INFO) << "conv strides : [" << strides[0] << " " << strides[1] << "]"; - DLOG(INFO) << "conv dilation_rate : [" << dilation_rate[0] << " " << dilation_rate[1] << "]"; - DLOG(INFO) << "conv filter_num : " << filter_num; - DLOG(INFO) << "conv kernel_size : [" << kernel_size[0] << " " << kernel_size[1] << "]"; - DLOG(INFO) << "conv axis : " << axis; - - using pblock_type = PBlock::type, Ttype>; + + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); + + // get relu param + auto alpha = GET_PARAMETER(float, relu_0_alpha); + ActivationParam active_param(Active_relu);//, alpha); // TEMP if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; + &(weights.d_tensor()), 
&(bias.d_tensor()), + active_param); + _param_conv_relu = conv_param; } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], + Tensor4d* bias = new Tensor4d();; + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; + &(weights.d_tensor()), bias, + active_param); + _param_conv_relu = conv_param; } - - - // get relu param - auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // TEMP - - - ConvActiveParam> conv_act_param(_conv_param, active_param); - _param_conv_relu = conv_act_param; - return Status::OK(); - } -template -Status ConvReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { -#if 0 - - if (_param_conv_relu.conv_param.group == ins[0]->channel() && \ - _param_conv_relu.conv_param.group == outs[0]->channel()) { - _funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, SABER_IMPL, ctx); +template +Status ConvReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + SABER_CHECK(_funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, SABER_IMPL, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); } else { - _funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, VENDER_IMPL, ctx); + PBlock weight_empty; + graph::GraphGlobalMem::Global().template 
apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); } + return Status::OK(); +} -#else +template +Status ConvReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + _funcs_conv_relu.compute_output_shape(ins, outs, _param_conv_relu); + return Status::OK(); +} - if (_param_conv_relu.conv_param.group == 1) { +#ifdef USE_CUDA +INSTANCE_CONVRELU(NV, Precision::FP32); +template <> +Status ConvReluHelper::Init(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + if (_param_conv_relu.group == 1|| (_param_conv_relu.group == ins[0]->channel() && \ + _param_conv_relu.group == outs[0]->channel())) { _funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, SABER_IMPL, ctx); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } } else { _funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, VENDER_IMPL, ctx); - } -#endif - //_funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, VENDER_IMPL, ctx); + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + 
SET_PARAMETER(is_weights_transed, true, bool); + + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_relu, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + VENDER_IMPL); + } + } return Status::OK(); } +ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, NV, Precision::FP32); +#endif -template -Status ConvReluHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - _funcs_conv_relu.compute_output_shape(ins, outs, _param_conv_relu); - return Status::OK(); -} +//#ifdef USE_X86_PLACE +//INSTANCE_CONVRELU(X86, Precision::FP32); +//template class ConvReluHelper; +//ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, X86, Precision::FP32); +//#endif -#ifdef USE_CUDA -template class ConvReluHelper; -template class ConvReluHelper; -template class ConvReluHelper; -#endif #ifdef USE_ARM_PLACE -template class ConvReluHelper; -template class ConvReluHelper; -template class ConvReluHelper; +INSTANCE_CONVRELU(ARM, Precision::FP32); +template class ConvReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, ARM, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, NV, AK_FLOAT, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, ARM, AK_FLOAT, Precision::FP32); +#ifdef AMD_GPU +INSTANCE_CONVRELU(AMD, Precision::FP32); +template class ConvReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, AMD, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(ConvRelu) .Doc("ConvRelu operator") #ifdef USE_CUDA -.__alias__("power") +.__alias__("power") #endif #ifdef USE_ARM_PLACE -.__alias__("power") +.__alias__("power") +#endif +#ifdef AMD_GPU +.__alias__("power") #endif +//#ifdef USE_X86_PLACE +//.__alias__("power") +//#endif .num_in(1) .num_out(1) .Args("group", " group of conv ") diff --git a/framework/operators/fusion_ops/conv_relu.h b/framework/operators/fusion_ops/conv_relu.h index f06744d4c..0ca48a575 100644 --- a/framework/operators/fusion_ops/conv_relu.h +++ b/framework/operators/fusion_ops/conv_relu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act.h" +#include "saber/funcs/conv.h" namespace anakin { namespace ops { -template +template class ConvReluHelper; /// pooling op @@ -34,20 +34,20 @@ class ConvReluHelper; * \brief ConvRelu implementation class * public inherit Operator */ -template -class ConvRelu : public Operator { +template +class ConvRelu : public Operator { public: ConvRelu() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ConvRelu< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConvReluHelper; + friend class ConvReluHelper; }; /** @@ -55,12 +55,12 @@ class ConvRelu : public Operator { * public inherit OperatorHelper * including init resource and shape size in ConvRelu context */ -template -class ConvReluHelper : public 
OperatorHelper { +template +class ConvReluHelper : public OperatorHelper { public: ConvReluHelper()=default; - ~ConvReluHelper(); + ~ConvReluHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class ConvReluHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,22 +81,16 @@ class ConvReluHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_relu stand for ConvRelu parameter - saber::ConvActiveParam> _param_conv_relu; + saber::ConvParam _param_conv_relu; ///< _funcs_conv_relu stand for ConvRelu function - saber::ConvAct _funcs_conv_relu; - -private: - ///< _dims stand for ConvRelu size - PTuple _dims; + saber::Conv::saber_type> _funcs_conv_relu; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/fusion_ops/conv_relu_pool.cpp b/framework/operators/fusion_ops/conv_relu_pool.cpp index 1dea03180..a82db73c2 100644 --- a/framework/operators/fusion_ops/conv_relu_pool.cpp +++ b/framework/operators/fusion_ops/conv_relu_pool.cpp @@ -4,38 +4,28 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void ConvReluPool::operator() ( - OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - /*LOG(ERROR) << " compute of ConvReluPool "; - float * h_data = new float[outs[0]->size()];//valid_size()]; - LOG(ERROR) << " outs[0]->valid_size() : " << outs[0]->size(); - cudaMemcpy(h_data, outs[0]->mutable_data(), outs[0]->size()*sizeof(float), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); - LOG(ERROR) << "over "; */ - auto* impl = 
static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_conv_relu_pooling; - impl->_funcs_conv_relu_pooling(ins, outs, param, ctx); +#define INSTANCE_CONVRELUPOOLING(Ttype, Ptype) \ +template<> \ +void ConvReluPool::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>(this->_helper);\ + auto& param = static_cast*>(this->_helper)->_param_conv_relu_pooling;\ + SABER_CHECK(impl->_funcs_conv_relu_pooling(ins, outs, param, ctx));\ } -#endif - -/// TODO ... specialization other type of operator - /// set helper -template -ConvReluPoolHelper::~ConvReluPoolHelper() { +template +ConvReluPoolHelper::~ConvReluPoolHelper() { } -template -Status ConvReluPoolHelper::InitParam() { +template +Status ConvReluPoolHelper::InitParam() { DLOG(WARNING) << "Parsing ConvReluPool op parameter."; - saber::ConvParam> _conv_param; - PoolingParam> _pooling_param; + saber::ConvParam conv_param_temp; + PoolingParam pooling_param_temp; + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -46,29 +36,13 @@ Status ConvReluPoolHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weight_vec = weights.vector(); - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; - } else { - Tensor4d* bias = new Tensor4d(); - saber::ConvParam> conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; - } - // get relu param auto alpha = GET_PARAMETER(float, 
relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // Temp + ActivationParam active_param(Active_relu);//, alpha); // Temp // get pooling param auto global_pooling = GET_PARAMETER(bool, pooling_0_global_pooling); @@ -78,75 +52,150 @@ Status ConvReluPoolHelper::InitParam() { auto pool_method = GET_PARAMETER(std::string, pooling_0_method); auto cmp_out_shape_floor_as_conv = GET_PARAMETER(bool, pooling_0_cmp_out_shape_floor_as_conv); if (pool_method == "MAX") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], + PoolingParam pooling_param(pool_size[0], pool_size[1], pool_padding[0], pool_padding[1], pool_strides[0], pool_strides[1], Pooling_max, global_pooling, cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + pooling_param_temp = pooling_param; } else if (pool_method == "AVG") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], + PoolingParam pooling_param(pool_size[0], pool_size[1], pool_padding[0], pool_padding[1], pool_strides[0], pool_strides[1], Pooling_average_include_padding, global_pooling, cmp_out_shape_floor_as_conv); - _pooling_param = pooling_param; + pooling_param_temp = pooling_param; } else { LOG(FATAL) << " ConvReluPool fusion op doesn't support : " << pool_method << " pooling."; } - ConvActivePoolingParam> conv_act_pooling_param(_conv_param, - active_param, - _pooling_param); + if (bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + conv_param_temp = conv_param; + } else { + Tensor4d* bias = new Tensor4d(); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), bias, + active_param); + conv_param_temp = conv_param; + } + + + ConvPoolingParam conv_act_pooling_param(conv_param_temp, pooling_param_temp); 
_param_conv_relu_pooling = conv_act_pooling_param; + return Status::OK(); } -template -Status ConvReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(INFO)<<"IN THIS!!!!!"; - _funcs_conv_relu_pooling.init(ins, outs, _param_conv_relu_pooling, STATIC, SABER_IMPL/*VENDER_IMPL*/, ctx); +template +Status ConvReluPoolHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + SABER_CHECK(_funcs_conv_relu_pooling.init(ins, outs, _param_conv_relu_pooling, SPECIFY, SABER_IMPL, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } return Status::OK(); } -template -Status ConvReluPoolHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { +template +Status ConvReluPoolHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_conv_relu_pooling.compute_output_shape(ins, outs, _param_conv_relu_pooling)); return Status::OK(); } #ifdef USE_CUDA -template class ConvReluPoolHelper; -template class ConvReluPoolHelper; -template class ConvReluPoolHelper; +INSTANCE_CONVRELUPOOLING(NV, Precision::FP32); +template<> +Status ConvReluPoolHelper::Init(OpContext &ctx, + const std::vector >& ins, 
+ std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + _funcs_conv_relu_pooling.init(ins, outs, _param_conv_relu_pooling, STATIC, SABER_IMPL, ctx); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if(!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5), + weights.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + weights.map_to_host(); + } else { + PBlock weight_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&ConvPooling::saber_type>::trans_weights, + &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5), + weight_empty.d_tensor(), + strides[0], strides[1], + group, + SABER_IMPL); + } + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -template class ConvReluPoolHelper; -template class ConvReluPoolHelper; -template class ConvReluPoolHelper; +INSTANCE_CONVRELUPOOLING(ARM, Precision::FP32); +template class ConvReluPoolHelper; +ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, ARM, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, NV, AK_FLOAT, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, ARM, AK_FLOAT, Precision::FP32); +#ifdef AMD_GPU +INSTANCE_CONVRELUPOOLING(AMD, Precision::FP32); +template class ConvReluPoolHelper; +ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, AMD, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(ConvReluPool) .Doc("ConvReluPool fusion operator") #ifdef USE_CUDA - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif #ifdef USE_ARM_PLACE - .__alias__("convolution_batchnorm_scale_relu_pooling") + .__alias__("convolution_batchnorm_scale_relu_pooling") +#endif +#ifdef AMD_GPU + .__alias__("convolution_batchnorm_scale_relu_pooling") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_relu_pool.h b/framework/operators/fusion_ops/conv_relu_pool.h index 08956df52..6686cb518 100644 --- a/framework/operators/fusion_ops/conv_relu_pool.h +++ b/framework/operators/fusion_ops/conv_relu_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_act_pooling.h" +#include "saber/funcs/conv_pooling.h" namespace anakin { namespace ops { -template +template class ConvReluPoolHelper; /// pooling op @@ -34,20 +34,20 @@ class ConvReluPoolHelper; * \brief ConvReluPool implementation class * public inherit Operator */ -template -class ConvReluPool : public Operator { +template +class ConvReluPool : public Operator { public: ConvReluPool() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ConvReluPool< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConvReluPoolHelper; + friend class ConvReluPoolHelper; }; /** @@ -55,8 
+55,8 @@ class ConvReluPool : public Operator { * public inherit OperatorHelper * including init resource and shape size in ConvReluPool context */ -template -class ConvReluPoolHelper : public OperatorHelper { +template +class ConvReluPoolHelper : public OperatorHelper { public: ConvReluPoolHelper()=default; @@ -72,8 +72,8 @@ class ConvReluPoolHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,14 +81,14 @@ class ConvReluPoolHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_conv_relu_pooling stand for ConvReluPool parameter - saber::ConvActivePoolingParam> _param_conv_relu_pooling; + saber::ConvPoolingParam _param_conv_relu_pooling; ///< _funcs_conv_relu_pooling stand for ConvReluPool function - saber::ConvActPooling _funcs_conv_relu_pooling; + saber::ConvPooling::saber_type> _funcs_conv_relu_pooling; private: ///< _dims stand for ConvReluPool size diff --git a/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp new file mode 100644 index 000000000..cb5672ab0 --- /dev/null +++ b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp @@ -0,0 +1,188 @@ +#include "framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void DeconvBatchnormScaleRelu::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*> + (this->_helper); + auto& param = static_cast*> + (this->_helper)->_param_deconv_batchnorm_scale_relu; + 
impl->_funcs_deconv_batchnorm_scale_relu(ins, outs, param, ctx); +} +#endif + +/// TODO ... specialization other type of operator + + +/// set helper +template +DeconvBatchnormScaleReluHelper::~DeconvBatchnormScaleReluHelper() { +} + +template +Status DeconvBatchnormScaleReluHelper::InitParam() { + DLOG(WARNING) << "Parsing DeconvBatchnormScaleRelu op parameter."; + + // get conv param + auto group = GET_PARAMETER(int, group); + auto bias_term = GET_PARAMETER(bool, bias_term); + auto padding = GET_PARAMETER(PTuple, padding); + auto strides = GET_PARAMETER(PTuple, strides); + auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); + auto filter_num = GET_PARAMETER(int, filter_num); + auto kernel_size = GET_PARAMETER(PTuple, kernel_size); + auto axis = GET_PARAMETER(int, axis); + + + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + auto weights_shape = weights.shape(); + + // get batchnorm param + auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); + auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); + auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, batchnorm_0_weight_1); + auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); + auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, batchnorm_0_weight_2); + auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); + auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); + auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); + + // get scale param + auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); + auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); + auto scale_axis = GET_PARAMETER(int, scale_0_axis); + auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); + auto scale_weight_1_vector = scale_weight_1.vector(); + auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); + auto scale_weight_2_vector = scale_weight_2.vector(); + + // get relu param + auto alpha = 
GET_PARAMETER(float, relu_0_alpha); + ActivationParam active_param(Active_relu);//, alpha); // TEMP + + if(bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + graph::GraphGlobalMem::Global().template apply(update_weights, + weights,bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_deconv_batchnorm_scale_relu = conv_param; + } else { + pblock_type* bias = new pblock_type(); + graph::GraphGlobalMem::Global().template apply(update_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor()), + active_param); + _param_deconv_batchnorm_scale_relu = conv_param; + } + + return Status::OK(); +} + +template +Status DeconvBatchnormScaleReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + if (_param_deconv_batchnorm_scale_relu.group == ins[0]->channel() && \ + _param_deconv_batchnorm_scale_relu.group == outs[0]->channel()) { + _funcs_deconv_batchnorm_scale_relu.init(ins, outs, _param_deconv_batchnorm_scale_relu, SPECIFY, + SABER_IMPL, ctx); + } else { + _funcs_deconv_batchnorm_scale_relu.init(ins, outs, _param_deconv_batchnorm_scale_relu, SPECIFY, + VENDER_IMPL, ctx); + } + + //_funcs_deconv_batchnorm_scale_relu.init(ins, outs, 
_param_deconv_batchnorm_scale_relu, SPECIFY, VENDER_IMPL, ctx); + return Status::OK(); +} + +template +Status DeconvBatchnormScaleReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + _funcs_deconv_batchnorm_scale_relu.compute_output_shape(ins, outs, _param_deconv_batchnorm_scale_relu); + return Status::OK(); +} + +#ifdef USE_CUDA +template class DeconvBatchnormScaleReluHelper; +template class DeconvBatchnormScaleReluHelper; +template class DeconvBatchnormScaleReluHelper; +#endif + +#ifdef USE_ARM_PLACE +template class DeconvBatchnormScaleReluHelper; +template class DeconvBatchnormScaleReluHelper; +template class DeconvBatchnormScaleReluHelper; +#endif + +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, NV, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, ARM, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(DeconvBatchnormScaleRelu) +.Doc("DeconvBatchnormScaleRelu fusion operator") +#ifdef USE_CUDA +.__alias__("convolution_batchnorm_scale_relu") +#endif +#ifdef USE_ARM_PLACE +.__alias__("convolution_batchnorm_scale_relu") +#endif +.num_in(1) +.num_out(1) +.Args("group", " group of conv ") +.Args("bias_term", " whether conv weights have bias") +.Args>("padding", "padding of conv (x, y)") +.Args>("strides", "strides of conv (x)") +.Args>("dilation_rate", "dilation rate of conv (x)") +.Args("filter_num", "filter(kernel) number of weights") +.Args>("kernel_size", "kernel size of kernel (x, y)") +.Args("axis", "axis of conv") +.Args("relu_0_alpha", " alpha for relu") +.Args("scale_0_num_axes", " num axes for scale") +.Args("scale_0_bias_term", "whether scale has bias") +.Args("scale_0_axis", "axis for scale") +.Args("batchnorm_0_epsilon", "epsilon for batchnorm") +.Args("batchnorm_0_momentum", "momentum for batchnorm"); + +} /* namespace ops */ + +} /* namespace anakin */ + 
+ diff --git a/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h new file mode 100644 index 000000000..12ba4ec3c --- /dev/null +++ b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_DECONV_BATCHNORM_SCALE_RELU_H +#define ANAKIN_OPERATOR_DECONV_BATCHNORM_SCALE_RELU_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/deconv.h" + +namespace anakin { + +namespace ops { + +template +class DeconvBatchnormScaleReluHelper; + +/// pooling op +/** + * \brief DeconvBatchnormScaleRelu implementation class + * public inherit Operator + */ +template +class DeconvBatchnormScaleRelu : public Operator { +public: + DeconvBatchnormScaleRelu() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator DeconvBatchnormScaleRelu< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class DeconvBatchnormScaleReluHelper; +}; + +/** + * \brief DeconvBatchnormScaleRelu helper class to implement it + * public inherit OperatorHelper + * including init resource and shape size in DeconvBatchnormScaleRelu context + */ 
+template +class DeconvBatchnormScaleReluHelper : public OperatorHelper { +public: + DeconvBatchnormScaleReluHelper()=default; + + ~DeconvBatchnormScaleReluHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for DeconvBatchnormScaleRelu operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_deconv_batchnorm_scale_relu stand for DeconvBatchnormScaleRelu parameter + saber::ConvParam _param_deconv_batchnorm_scale_relu; + ///< _funcs_deconv_batchnorm_scale_relu stand for DeconvBatchnormScaleRelu function + saber::Deconv::saber_type> _funcs_deconv_batchnorm_scale_relu; + +private: + ///< _dims stand for DeconvBatchnormScaleRelu size + PTuple _dims; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/fusion_ops/deconv_relu.cpp b/framework/operators/fusion_ops/deconv_relu.cpp index 2e100ad04..6307d8f78 100644 --- a/framework/operators/fusion_ops/deconv_relu.cpp +++ b/framework/operators/fusion_ops/deconv_relu.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void DeconvRelu::operator()( +void DeconvRelu::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { + const std::vector >& ins, + std::vector >& outs) { auto* impl = - static_cast*>(this->_helper); + static_cast*>(this->_helper); auto& param = impl->_param_deconv_relu; impl->_funcs_deconv_relu(ins, outs, param, ctx); } @@ -21,14 +21,13 @@ void DeconvRelu::operator()( /// set helper -template 
-DeconvReluHelper::~DeconvReluHelper() { +template +DeconvReluHelper::~DeconvReluHelper() { } -template -Status DeconvReluHelper::InitParam() { +template +Status DeconvReluHelper::InitParam() { DLOG(WARNING) << "Parsing DeconvRelu op parameter."; - saber::ConvParam> _conv_param; // get conv param auto group = GET_PARAMETER(int, group); @@ -39,54 +38,49 @@ Status DeconvReluHelper::InitParam() { auto filter_num = GET_PARAMETER(int, filter_num); auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - - using pblock_type = PBlock::type, Ttype>; + + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); + + // get relu param + auto alpha = GET_PARAMETER(float, relu_0_alpha); + ActivationParam active_param(Active_relu);//, alpha); // TEMP if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam> conv_param(group, padding[0], padding[1], + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _conv_param = conv_param; + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_deconv_relu = conv_param; } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam> conv_param(group, padding[0], padding[1], + Tensor4d* bias = new Tensor4d();; + saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _conv_param = conv_param; + &(weights.d_tensor()), bias, + active_param); + _param_deconv_relu = conv_param; } - - - // get relu param - auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam> active_param(Active_relu);//, alpha); // TEMP - - - ConvActiveParam> conv_act_param(_conv_param, active_param); - _param_deconv_relu = conv_act_param; - return Status::OK(); - } -template -Status DeconvReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - 
std::vector >& outs) { +template +Status DeconvReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { bool p = true; - p = p && (_param_deconv_relu.conv_param.weight()->width() == 4); - p = p && (_param_deconv_relu.conv_param.weight()->height() == 4); - p = p && (_param_deconv_relu.conv_param.pad_h == 1); - p = p && (_param_deconv_relu.conv_param.pad_w == 1); - p = p && (_param_deconv_relu.conv_param.stride_h == 2); - p = p && (_param_deconv_relu.conv_param.stride_w == 2); + p = p && (_param_deconv_relu.weight()->width() == 4); + p = p && (_param_deconv_relu.weight()->height() == 4); + p = p && (_param_deconv_relu.pad_h == 1); + p = p && (_param_deconv_relu.pad_w == 1); + p = p && (_param_deconv_relu.stride_h == 2); + p = p && (_param_deconv_relu.stride_w == 2); p = p && (ins[0]->channel() <= 64); p = p && (ins[0]->width() % 32 == 0); - p = p || ((ins[0]->channel() == _param_deconv_relu.conv_param.group) + p = p || ((ins[0]->channel() == _param_deconv_relu.group) && (ins[0]->channel() == outs[0]->channel())); // LOG(ERROR)<<"DECONV RELU INIT"; @@ -100,42 +94,42 @@ Status DeconvReluHelper::Init(OpContext& ctx, return Status::OK(); } -template -Status DeconvReluHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status DeconvReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { _funcs_deconv_relu.compute_output_shape(ins, outs, _param_deconv_relu); return Status::OK(); } #ifdef USE_CUDA -template class DeconvReluHelper; -template class DeconvReluHelper; -template class DeconvReluHelper; +template class DeconvReluHelper; +template class DeconvReluHelper; +template class DeconvReluHelper; #endif #ifdef USE_ARM_PLACE -template class DeconvReluHelper; -template class DeconvReluHelper; -template class DeconvReluHelper; +template class DeconvReluHelper; +template class DeconvReluHelper; +template class DeconvReluHelper; #endif // register helper #ifdef USE_CUDA 
-ANAKIN_REGISTER_OP_HELPER(DeconvRelu, DeconvReluHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(DeconvRelu, DeconvReluHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(DeconvRelu, DeconvReluHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(DeconvRelu, DeconvReluHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(DeconvRelu) .Doc("DeconvRelu operator") #ifdef USE_CUDA -.__alias__("deconv_relu") +.__alias__("deconv_relu") #endif #ifdef USE_ARM_PLACE -.__alias__("deconv_relu") +.__alias__("deconv_relu") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/deconv_relu.h b/framework/operators/fusion_ops/deconv_relu.h index 5767a68d5..be6daedf4 100644 --- a/framework/operators/fusion_ops/deconv_relu.h +++ b/framework/operators/fusion_ops/deconv_relu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/deconv_act.h" +#include "saber/funcs/deconv.h" namespace anakin { namespace ops { -template +template class DeconvReluHelper; /// pooling op @@ -34,20 +34,20 @@ class DeconvReluHelper; * \brief DeconvRelu implementation class * public inherit Operator */ -template -class DeconvRelu : public Operator { +template +class DeconvRelu : public Operator { public: DeconvRelu() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator DeconvRelu< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class DeconvReluHelper; + friend class DeconvReluHelper; }; /** @@ -55,8 +55,8 @@ class DeconvRelu : public Operator { * public inherit OperatorHelper * including init resource and shape size in DeconvRelu context */ -template -class DeconvReluHelper : public OperatorHelper { +template +class DeconvReluHelper : public OperatorHelper { public: DeconvReluHelper()=default; @@ -72,8 +72,8 @@ class DeconvReluHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class DeconvReluHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_deconv_relu stand for DeconvRelu parameter - saber::ConvActiveParam> _param_deconv_relu; + saber::ConvParam _param_deconv_relu; ///< _funcs_deconv_relu stand for DeconvRelu function - saber::DeconvAct _funcs_deconv_relu; + saber::Deconv::saber_type> _funcs_deconv_relu; private: ///< _dims stand for DeconvRelu size diff --git a/framework/operators/fusion_ops/eltwise_relu.cpp b/framework/operators/fusion_ops/eltwise_relu.cpp index 29a75bb9f..9d5216446 100644 --- a/framework/operators/fusion_ops/eltwise_relu.cpp +++ b/framework/operators/fusion_ops/eltwise_relu.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void EltwiseRelu::operator()( +void EltwiseRelu::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> (this->_helper)->_param_eltwise_relu; impl->_funcs_eltwise_relu(ins, outs, param, ctx); } @@ -21,18 +21,18 @@ void EltwiseRelu::operator()( /// set helper -template -EltwiseReluHelper::~EltwiseReluHelper() { +template +EltwiseReluHelper::~EltwiseReluHelper() { } -template -Status EltwiseReluHelper::InitParam() { +template +Status EltwiseReluHelper::InitParam() { DLOG(WARNING) << "Parsing EltwiseRelu op parameter."; auto type = GET_PARAMETER(std::string, type); auto alpha = GET_PARAMETER(float, relu_0_alpha); auto coeff = GET_PARAMETER(PTuple, coeff); - ActivationParam> activation_param(Active_relu); + ActivationParam activation_param(Active_relu); EltwiseType elt_type; @@ -45,66 +45,66 @@ Status 
EltwiseReluHelper::InitParam() { } // Shape shape_coeff(1, 1, 1, coeff.size()); - // Tensor thcoeff(shape_coeff); + // Tensor thcoeff(shape_coeff); // for (int i = 0; i < thcoeff.size(); ++i) { // thcoeff.mutable_data()[i] = coeff[i]; // } - // Tensor4d * tdcoeff_p = new Tensor4d(); + // Tensor4d * tdcoeff_p = new Tensor4d(); // tdcoeff_p->re_alloc(shape_coeff); // tdcoeff_p->copy_from(thcoeff); // - // saber::EltwiseParam> eltwise_param(elt_type, tdcoeff_p); - saber::EltwiseParam> eltwise_param(elt_type, coeff.vector()); - EltwiseActiveParam> eltwise_relu_param(eltwise_param, activation_param); - _param_eltwise_relu = eltwise_relu_param; + // saber::EltwiseParam eltwise_param(elt_type, tdcoeff_p); + saber::EltwiseParam eltwise_param(elt_type, coeff.vector()); + //EltwiseActiveParam eltwise_relu_param(eltwise_param, activation_param); + _param_eltwise_relu =eltwise_param;// eltwise_relu_param; return Status::OK(); } -template -Status EltwiseReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status EltwiseReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { _funcs_eltwise_relu.init(ins, outs, _param_eltwise_relu, SPECIFY, SABER_IMPL, ctx); return Status::OK(); } -template -Status EltwiseReluHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status EltwiseReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { _funcs_eltwise_relu.compute_output_shape(ins, outs, _param_eltwise_relu); return Status::OK(); } #ifdef USE_CUDA -template class EltwiseReluHelper; -template class EltwiseReluHelper; -template class EltwiseReluHelper; +template class EltwiseReluHelper; +template class EltwiseReluHelper; +template class EltwiseReluHelper; #endif #ifdef USE_ARM_PLACE -template class EltwiseReluHelper; -template class EltwiseReluHelper; -template class EltwiseReluHelper; +template class EltwiseReluHelper; +template class EltwiseReluHelper; +template 
class EltwiseReluHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(EltwiseRelu) .Doc("EltwiseRelu operator") #ifdef USE_CUDA -.__alias__("eltwise") +.__alias__("eltwise") #endif #ifdef USE_ARM_PLACE -.__alias__("eltwise") +.__alias__("eltwise") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/eltwise_relu.h b/framework/operators/fusion_ops/eltwise_relu.h index e1a671759..6a5dee117 100644 --- a/framework/operators/fusion_ops/eltwise_relu.h +++ b/framework/operators/fusion_ops/eltwise_relu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,13 +20,13 @@ #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/eltwise_act.h" +#include "saber/funcs/eltwise.h" namespace anakin { namespace ops { -template +template class EltwiseReluHelper; /// pooling op @@ -34,20 +34,20 @@ class EltwiseReluHelper; * \brief EltwiseRelu implementation class * public inherit Operator */ -template -class EltwiseRelu : public Operator { +template +class EltwiseRelu : public Operator { public: EltwiseRelu() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator convolution::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator EltwiseRelu< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class EltwiseReluHelper; + friend class EltwiseReluHelper; }; /** @@ -55,8 +55,8 @@ class EltwiseRelu : public Operator { * public inherit OperatorHelper * including init resource and shape size in EltwiseRelu context */ -template -class EltwiseReluHelper : public OperatorHelper { +template +class EltwiseReluHelper : public OperatorHelper { public: EltwiseReluHelper()=default; @@ -72,8 +72,8 @@ class EltwiseReluHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class EltwiseReluHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_eltwise_relu stand for EltwiseRelu parameter - saber::EltwiseActiveParam> _param_eltwise_relu; + saber::EltwiseParam _param_eltwise_relu; ///< _funcs_eltwise_relu stand for EltwiseRelu function - saber::EltwiseActive _funcs_eltwise_relu; + saber::Eltwise::saber_type> _funcs_eltwise_relu; private: ///< _dims stand for EltwiseRelu size diff --git a/framework/operators/fusion_ops/permute_power.cpp b/framework/operators/fusion_ops/permute_power.cpp index 7c265a541..6fa2d9c31 100644 --- a/framework/operators/fusion_ops/permute_power.cpp +++ b/framework/operators/fusion_ops/permute_power.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void PermutePower::operator()( +void PermutePower::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> (this->_helper)->_param_permute_power; impl->_funcs_permute_power(ins, outs, param, ctx); } @@ -21,70 +21,70 @@ void PermutePower::operator()( /// set helper -template -PermutePowerHelper::~PermutePowerHelper() { +template +PermutePowerHelper::~PermutePowerHelper() { LOG(INFO) << "Decons permute_cpu_float"; } -template -Status PermutePowerHelper::InitParam() { +template +Status PermutePowerHelper::InitParam() { DLOG(WARNING) << "Parsing PermutePower op parameter."; auto dims = GET_PARAMETER(PTuple, dims); auto scale = GET_PARAMETER(float, power_0_scale); auto shift = GET_PARAMETER(float, power_0_shift); auto power = GET_PARAMETER(float, power_0_power); - saber::PermuteParam> 
permute_param(dims.vector()); - saber::PowerParam> power_param(power, scale, shift); - saber::PermutePowerParam> permute_power_param(permute_param, power_param); + saber::PermuteParam permute_param(dims.vector()); + saber::PowerParam power_param(power, scale, shift); + saber::PermutePowerParam permute_power_param(permute_param, power_param); _param_permute_power = permute_power_param; return Status::OK(); } -template -Status PermutePowerHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status PermutePowerHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { _funcs_permute_power.init(ins, outs, _param_permute_power, SPECIFY, SABER_IMPL, ctx); return Status::OK(); } -template -Status PermutePowerHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status PermutePowerHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { _funcs_permute_power.compute_output_shape(ins, outs, _param_permute_power); return Status::OK(); } #ifdef USE_CUDA -template class PermutePowerHelper; -template class PermutePowerHelper; -template class PermutePowerHelper; +template class PermutePowerHelper; +template class PermutePowerHelper; +template class PermutePowerHelper; #endif #ifdef USE_ARM_PLACE -template class PermutePowerHelper; -template class PermutePowerHelper; -template class PermutePowerHelper; +template class PermutePowerHelper; +template class PermutePowerHelper; +template class PermutePowerHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(PermutePower) .Doc("PermutePower fusion operator") #ifdef USE_CUDA -.__alias__("permute_power") +.__alias__("permute_power") #endif #ifdef USE_ARM_PLACE -.__alias__("permute_power") +.__alias__("permute_power") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/permute_power.h b/framework/operators/fusion_ops/permute_power.h index caa7ef070..3769cadb1 100644 --- a/framework/operators/fusion_ops/permute_power.h +++ b/framework/operators/fusion_ops/permute_power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class PermutePowerHelper; /// pooling op @@ -34,20 +34,20 @@ class PermutePowerHelper; * \brief PermutePower implementation class * public inherit Operator */ -template -class PermutePower : public Operator { +template +class PermutePower : public Operator { public: PermutePower() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator permute_power::type>().type_info(); + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator PermutePower< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class PermutePowerHelper; + friend class PermutePowerHelper; }; /** @@ -55,8 +55,8 @@ class PermutePower : public Operator { * public inherit OperatorHelper * including init resource and shape size in PermutePower context */ -template -class PermutePowerHelper : public OperatorHelper { +template +class PermutePowerHelper : public OperatorHelper { public: PermutePowerHelper()=default; @@ -72,8 +72,8 @@ class PermutePowerHelper : public OperatorHelper { * \return status */ Status 
Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,14 +81,14 @@ class PermutePowerHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_permute_power stand for PermutePower parameter - saber::PermutePowerParam> _param_permute_power; + saber::PermutePowerParam _param_permute_power; ///< _funcs_permute_power stand for PermutePower function - saber::PermutePower _funcs_permute_power; + saber::PermutePower::saber_type> _funcs_permute_power; private: ///< _dims stand for PermutePower size diff --git a/framework/operators/gather.cpp b/framework/operators/gather.cpp new file mode 100644 index 000000000..353fb6aa5 --- /dev/null +++ b/framework/operators/gather.cpp @@ -0,0 +1,105 @@ +#include "framework/operators/gather.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void Gather::operator()(OpContext& ctx, + const std::vector>& ins, + std::vector>& outs) { +} +#endif +#ifdef USE_X86_PLACE +template<> +void Gather::operator()(OpContext& ctx, + const std::vector>& ins, + std::vector>& outs) { +} +#endif + +/// TODO ... 
specialization other type of operator + + +/// set helper +template +GatherHelper::~GatherHelper() { +} + +template +Status GatherHelper::InitParam() { + DLOG(WARNING) << "Parsing Gather op parameter."; + return Status::OK(); +} + +template +Status GatherHelper::Init(OpContext& ctx, + const std::vector>& ins, + std::vector>& outs) { + return Status::OK(); +} + +template +Status GatherHelper::InferShape(const std::vector >& + ins, + std::vector >& outs) { + outs[0]->set_shape(ins[0]->valid_shape()); + outs[0]->set_seq_offset(ins[0]->get_seq_offset()); + return Status::OK(); +} + +#ifdef USE_CUDA +template class GatherHelper; +template class GatherHelper; +template class GatherHelper; +#endif + +#ifdef USE_ARM_PLACE +template class GatherHelper; +template class GatherHelper; +template class GatherHelper; +#endif +#ifdef USE_X86_PLACE +template class GatherHelper; +template class GatherHelper; +template class GatherHelper; +#endif + +// register help +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, NV, Precision::FP16); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, NV, Precision::INT8); +#endif + +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, ARM, Precision::FP16); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, ARM, Precision::INT8); +#endif + +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::FP16); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::INT8); +#endif + +//! 
register op +ANAKIN_REGISTER_OP(Gather) +#ifdef USE_CUDA + .__alias__("gather") +#endif +#ifdef USE_ARM_PLACE + .__alias__("gather") +#endif +#ifdef USE_X86_PLACE + .__alias__("gather") +#endif + .Doc("Gather operator [ only a middle data holder and reshape ] "); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/gather.h b/framework/operators/gather.h new file mode 100644 index 000000000..15246acb2 --- /dev/null +++ b/framework/operators/gather.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_GATHER_H +#define ANAKIN_OPERATOR_GATHER_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" + +namespace anakin { + +namespace ops { + +template +class GatherHelper; + +/// Gather op without any compute, this a holder for input +/** + * \brief Gather implementation class + * public inherit Operator + */ +template +class Gather : public Operator { +public: + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator Gather< Ttype(" + // << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class GatherHelper; +}; + +/** + * \brief Gather helper class to implement it + * public inherit OperatorHelper + * including init resource and shape size in input context + */ +template +class GatherHelper : public OperatorHelper { + typedef OperatorHelper Base; +public: + GatherHelper() {} + + ~GatherHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for Gather operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/gru.cpp b/framework/operators/gru.cpp deleted file mode 100644 index 530c9ee5e..000000000 --- a/framework/operators/gru.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include "framework/operators/gru.h" - -namespace anakin { - -namespace ops { - -#ifdef USE_CUDA -template<> -void Gru::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_gru; - impl->_funcs_gru(ins, outs, param, ctx); -} -#endif -#ifdef USE_X86_PLACE -template<> -void Gru::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_gru; - impl->_funcs_gru(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator -/// set helper -template -GruHelper::~GruHelper() { -} - -template -Status GruHelper::InitParam() { - DLOG(WARNING) << "Parsing Gru op parameter."; - auto is_reverse = GET_PARAMETER(bool, is_reverse); - auto gate_act = GET_PARAMETER(std::string, gate_activation); - auto hidden_act = GET_PARAMETER(std::string, activation); - auto formula = GET_PARAMETER(std::string, gru_formula); - -// auto weight_h2h = GET_PARAMETER(PBlock::type, Ttype>, weight_1); -// auto bias = GET_PARAMETER(PBlock::type, Ttype>, weight_3); -// auto weight_i2h = GET_PARAMETER(PBlock::type, Ttype>, weight_2); - - using pblock_type = PBlock::type, Ttype>; - auto weight_wu = GET_PARAMETER(pblock_type, weight_1); - auto bias = GET_PARAMETER(pblock_type, weight_2); - - CHECK((formula != "") && (formula == "gru_origin" - || formula == "gru_cudnn")) << "formula illegal"; - - std::unordered_map act_map = { - {"sigmoid_fluid", Active_sigmoid_fluid}, - {"relu_fluid", Active_relu}, - {"tanh_fluid", Active_tanh_fluid}, - {"identity_fluid", Active_identity} - }; - std::unordered_map formula_map = { - {"gru_origin", GRU_ORIGIN}, - {"gru_cudnn", GRU_CUDNN}, - }; - CHECK_GT(weight_wu.d_tensor().valid_size(),0)<<"weights size must > 0"; - CHECK_GT(bias.d_tensor().valid_size(),0)<<"bias size must > 0"; - - GruParam> gru_param(&(weight_wu.d_tensor()), &(bias.d_tensor()), - formula_map[formula], act_map[gate_act], - act_map[hidden_act], is_reverse); - - _param_gru = gru_param; - - return Status::OK(); -} - -template -Status GruHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_gru.init(ins, outs, _param_gru, SPECIFY, SABER_IMPL, ctx)); - return Status::OK(); -} - -template -Status GruHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_gru.compute_output_shape(ins, outs, _param_gru)); - return Status::OK(); -} - -#ifdef USE_CUDA -template class GruHelper; -template class GruHelper; 
-template class GruHelper; -#endif - -#ifdef USE_ARM_PLACE -template class GruHelper; -template class GruHelper; -template class GruHelper; -#endif - -#ifdef USE_X86_PLACE -template class GruHelper; -template class GruHelper; -template class GruHelper; -#endif - -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, NV, AK_FLOAT, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, ARM, AK_FLOAT, Precision::FP32); -#endif -#ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, X86, AK_FLOAT, Precision::FP32); -#endif -//! register op -ANAKIN_REGISTER_OP(Gru) - .Doc("Gru operator") -#ifdef USE_CUDA - .__alias__("gru") -#endif -#ifdef USE_ARM_PLACE - .__alias__("gru") -#endif -#ifdef USE_X86_PLACE - .__alias__("gru") -#endif - .num_in(1) - .num_out(1) - .Args("is_reverse", " is_reverse for gru.") - .Args("gate_activation", "gate_activation for gru.") - .Args("activation", "hidden_activation for gru."); - -} /* namespace ops */ - -} /* namespace anakin */ - - diff --git a/framework/operators/im2sequence.cpp b/framework/operators/im2sequence.cpp index 6feeb7e6e..67b69c263 100644 --- a/framework/operators/im2sequence.cpp +++ b/framework/operators/im2sequence.cpp @@ -6,11 +6,11 @@ namespace ops { #ifdef USE_CUDA template<> -void Im2Sequence::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_im2sequence; +void Im2Sequence::operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_im2sequence; impl->_funcs_im2sequence(ins, outs, param, ctx); } #endif @@ -19,19 +19,19 @@ void Im2Sequence::operator() (OpContext &ctx, /// set helper -template -Im2SequenceHelper::~Im2SequenceHelper() { +template +Im2SequenceHelper::~Im2SequenceHelper() { } -template -Status 
Im2SequenceHelper::InitParam() { +template +Status Im2SequenceHelper::InitParam() { DLOG(WARNING) << "Parsing Im2Sequence op parameter."; auto paddings = GET_PARAMETER(PTuple, paddings); auto strides = GET_PARAMETER(PTuple, strides); auto window_size = GET_PARAMETER(PTuple, window_size); auto dilations = GET_PARAMETER(PTuple, dilations); - Im2SequenceParam> im2sequence_param(window_size[0], window_size[1], + Im2SequenceParam im2sequence_param(window_size[0], window_size[1], paddings[0], paddings[1], paddings[2], paddings[3], strides[0], strides[1], @@ -42,52 +42,52 @@ Status Im2SequenceHelper::InitParam() { return Status::OK(); } -template -Status Im2SequenceHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status Im2SequenceHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_im2sequence.init(ins, outs, _param_im2sequence, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status Im2SequenceHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { +template +Status Im2SequenceHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_im2sequence.compute_output_shape(ins, outs, _param_im2sequence)); return Status::OK(); } #ifdef USE_CUDA -template class Im2SequenceHelper; -template class Im2SequenceHelper; -template class Im2SequenceHelper; +template class Im2SequenceHelper; +template class Im2SequenceHelper; +template class Im2SequenceHelper; #endif #ifdef USE_ARM_PLACE -template class Im2SequenceHelper; -template class Im2SequenceHelper; -template class Im2SequenceHelper; +template class Im2SequenceHelper; +template class Im2SequenceHelper; +template class Im2SequenceHelper; #endif -//template class Im2SequenceHelper; -//template class Im2SequenceHelper; -//template class Im2SequenceHelper; +//template class Im2SequenceHelper; +//template class Im2SequenceHelper; +//template class Im2SequenceHelper; // register helper 
#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(Im2Sequence) .Doc("Im2Sequence operator") #ifdef USE_CUDA - .__alias__("im2sequence") + .__alias__("im2sequence") #endif #ifdef USE_ARM_PLACE - .__alias__("im2sequence") + .__alias__("im2sequence") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/im2sequence.h b/framework/operators/im2sequence.h index df8ec979d..a051d77c4 100644 --- a/framework/operators/im2sequence.h +++ b/framework/operators/im2sequence.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class Im2SequenceHelper; @@ -35,20 +35,20 @@ class Im2SequenceHelper; * \brief Im2Sequence implementation class * public inherit Operator */ -template -class Im2Sequence : public Operator { +template +class Im2Sequence : public Operator { public: Im2Sequence() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Im2Sequence::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Im2Sequence< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class Im2SequenceHelper; + friend class Im2SequenceHelper; }; /** @@ -56,8 +56,8 @@ class Im2Sequence : public Operator { * public inherit OperatorHelper * including init resource and shape size in Im2Sequence context */ -template -class Im2SequenceHelper : public OperatorHelper { +template +class Im2SequenceHelper : public OperatorHelper { public: Im2SequenceHelper()=default; @@ -73,8 +73,8 @@ class Im2SequenceHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -82,14 +82,14 @@ class Im2SequenceHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_im2sequence stand for Im2Sequence parameter - saber::Im2SequenceParam> _param_im2sequence; + saber::Im2SequenceParam _param_im2sequence; ///< _funcs_im2sequence stand for Im2Sequence function - saber::Im2Sequence _funcs_im2sequence; + saber::Im2Sequence::saber_type> _funcs_im2sequence; }; } /* namespace ops */ diff --git a/framework/operators/input.cpp b/framework/operators/input.cpp index 351507323..1affd8549 100644 --- a/framework/operators/input.cpp +++ b/framework/operators/input.cpp @@ -4,34 +4,17 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Input::operator()(OpContext& ctx, - const std::vector>& ins, - std::vector>& outs) { -} -#endif -#ifdef USE_X86_PLACE -template<> -void Input::operator()(OpContext& ctx, - const std::vector>& ins, - std::vector>& outs) { -} -#endif +#define INSTANCE_INPUT(Ttype, Ptype) \ +template<> \ +void Input::operator()(OpContext& ctx, \ + const std::vector>& ins, \ + std::vector>& outs) {} -/// TODO ... 
specialization other type of operator - -/// set helper -template -InputHelper::~InputHelper() { -} - -template -Status InputHelper::InitParam() { - DLOG(WARNING) << "Parsing Input op parameter."; +template +Status InputHelper::InitParam() { + LOG(WARNING) << "Parsing Input op parameter."; input_shape = GET_PARAMETER(PTuple, input_shape); - for (int i = 0; i < input_shape.size(); i++) { LOG(INFO) << " |-- shape [" << i << "]: " << input_shape[i]; } @@ -39,19 +22,17 @@ Status InputHelper::InitParam() { return Status::OK(); } -template -Status InputHelper::Init(OpContext& ctx, - const std::vector>& ins, - std::vector>& outs) { +template +Status InputHelper::Init(OpContext &ctx, + const std::vector> &ins, + std::vector> &outs) { return Status::OK(); } -template -Status InputHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status InputHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { saber::Shape out_shape; - for (int i = 0; i < input_shape.size(); i++) { out_shape.push_back(input_shape[i]); } @@ -64,54 +45,45 @@ Status InputHelper::InferShape(const std::vector; -template class InputHelper; -template class InputHelper; +INSTANCE_INPUT(NV, Precision::FP32); +template class InputHelper; +ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -template class InputHelper; -template class InputHelper; -template class InputHelper; -#endif -#ifdef USE_X86_PLACE -template class InputHelper; -template class InputHelper; -template class InputHelper; -#endif - -// register help -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, NV, AK_FLOAT, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, NV, AK_FLOAT, Precision::FP16); -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, NV, AK_FLOAT, Precision::INT8); -#endif +INSTANCE_INPUT(ARM, Precision::FP32); +template class InputHelper; +ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, ARM, Precision::FP32); +#endif //arm 
-#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, ARM, AK_FLOAT, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, ARM, AK_FLOAT, Precision::FP16); -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, ARM, AK_FLOAT, Precision::INT8); +#ifdef USE_X86_PLACE +INSTANCE_INPUT(X86, Precision::FP32); +template class InputHelper; +ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, X86, Precision::FP32); #endif -#ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, X86, AK_FLOAT, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, X86, AK_FLOAT, Precision::FP16); -ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, X86, AK_FLOAT, Precision::INT8); +#ifdef AMD_GPU +INSTANCE_INPUT(AMD, Precision::FP32); +template class InputHelper; +ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, AMD, Precision::FP32); #endif //! register op -ANAKIN_REGISTER_OP(Input) - .Doc("Input operator [ only a input data holder and reshape ] ") +ANAKIN_REGISTER_OP(Input) +.Doc("Input operator [ only a input data holder and reshape ] ") #ifdef USE_CUDA - .__alias__("input") +.__alias__("input") +#endif +#ifdef AMD_GPU + .__alias__("input") #endif #ifdef USE_ARM_PLACE - .__alias__("input") +.__alias__("input") #endif #ifdef USE_X86_PLACE - .__alias__("input") +.__alias__("input") #endif - .Args>("input_shape", " shape of graph input."); +.Args>("input_shape", " shape of graph input."); } /* namespace ops */ diff --git a/framework/operators/input.h b/framework/operators/input.h index 65863445e..1156bdd09 100644 --- a/framework/operators/input.h +++ b/framework/operators/input.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,7 +25,7 @@ namespace anakin { namespace ops { -template +template class InputHelper; /// Input op without any compute, this a holder for input @@ -33,17 +33,15 @@ class InputHelper; * \brief Input implementation class * public inherit Operator */ -template -class Input : public Operator { +template +class Input : public Operator { public: virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Input::type>().type_info(); + const std::vector >& ins, + std::vector >& outs) { } - friend class InputHelper; + friend class InputHelper; }; /** @@ -51,13 +49,13 @@ class Input : public Operator { * public inherit OperatorHelper * including init resource and shape size in input context */ -template -class InputHelper : public OperatorHelper { - typedef OperatorHelper Base; +template +class InputHelper : public OperatorHelper { + typedef OperatorHelper Base; public: InputHelper() {} - ~InputHelper(); + ~InputHelper() {} Status InitParam() override; @@ -69,8 +67,8 @@ class InputHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -78,9 +76,8 @@ class InputHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; - + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; private: ///< input_shape :input op may hold motl-input diff --git a/framework/operators/layer_norm.cpp b/framework/operators/layer_norm.cpp index 04129e934..6faef4f5a 100644 --- a/framework/operators/layer_norm.cpp +++ b/framework/operators/layer_norm.cpp @@ -4,77 +4,77 @@ namespace anakin{ namespace ops{ -#define INSTANCE_LAYERNORM(Ttype, Dtype, Ptype) \ +#define INSTANCE_LAYERNORM(Ttype, Ptype) \ template<> \ -void LayerNorm::operator()(OpContext& ctx, \ - const std::vector >& ins, \ - std::vector >& outs) { \ - auto* impl = static_cast*>(this->_helper); \ - auto& param = static_cast*> \ +void LayerNorm::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ (this->_helper)->_param_layer_norm; \ impl->_funcs_layer_norm(ins, outs, param, ctx); \ } -template -Status LayerNormHelper::InitParam() { +template +Status LayerNormHelper::InitParam() { auto axis = GET_PARAMETER(int, begin_norm_axis); auto eps = GET_PARAMETER(float, eps); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto input_scale = GET_PARAMETER(pblock_type, weight_1); auto input_bias = GET_PARAMETER(pblock_type, weight_2); - saber::LayerNormParam> param(axis, eps, &(input_scale.d_tensor()), \ + saber::LayerNormParam param(axis, eps, &(input_scale.d_tensor()), \ &(input_bias.d_tensor())); _param_layer_norm = param; return Status::OK(); } -template -Status LayerNormHelper::Init(OpContext &ctx, -const std::vector >& ins, -std::vector >& outs) { +template +Status LayerNormHelper::Init(OpContext &ctx, +const std::vector >& ins, +std::vector >& outs) { 
SABER_CHECK(_funcs_layer_norm.init(ins, outs, _param_layer_norm, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status LayerNormHelper::InferShape(const std::vector >& ins, \ - std::vector >& outs) { +template +Status LayerNormHelper::InferShape(const std::vector >& ins, \ + std::vector >& outs) { SABER_CHECK(_funcs_layer_norm.compute_output_shape(ins, outs, _param_layer_norm)); return Status::OK(); } #ifdef USE_CUDA -INSTANCE_LAYERNORM(NV, AK_FLOAT, Precision::FP32); -template class LayerNormHelper; -ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, NV, AK_FLOAT, Precision::FP32); +INSTANCE_LAYERNORM(NV, Precision::FP32); +template class LayerNormHelper; +ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, NV, Precision::FP32); #endif #ifdef USE_X86_PLACE -INSTANCE_LAYERNORM(X86, AK_FLOAT, Precision::FP32); -template class LayerNormHelper; -ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, X86, AK_FLOAT, Precision::FP32); +INSTANCE_LAYERNORM(X86, Precision::FP32); +template class LayerNormHelper; +ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -INSTANCE_LAYERNORM(ARM, AK_FLOAT, Precision::FP32); -template class LayerNormHelper; -ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_LAYERNORM(ARM, Precision::FP32); +template class LayerNormHelper; +ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(LayerNorm) .Doc("LayerNorm operator") #ifdef USE_CUDA -.__alias__("layernorm") +.__alias__("layernorm") #endif #ifdef USE_ARM_PLACE -.__alias__("layernorm") +.__alias__("layernorm") #endif #ifdef USE_X86_PLACE -.__alias__("layernorm") +.__alias__("layernorm") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/layer_norm.h b/framework/operators/layer_norm.h index 977d406dd..636a4df5b 100644 --- a/framework/operators/layer_norm.h +++ b/framework/operators/layer_norm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class LayerNormHelper; /// pooling op @@ -34,20 +34,20 @@ class LayerNormHelper; * \brief Normalize operation class * public inheritance Operator */ -template -class LayerNorm : public Operator { +template +class LayerNorm : public Operator { public: LayerNorm() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Normalize::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator LayerNorm< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class LayerNormHelper; + friend class LayerNormHelper; }; /** @@ -55,8 +55,8 @@ class LayerNorm : public Operator { * public inherit OperatorHelper * including init resource and shape size in normalize context */ -template -class LayerNormHelper : public OperatorHelper { +template +class LayerNormHelper : public OperatorHelper { public: LayerNormHelper()=default; @@ -72,8 +72,8 @@ class LayerNormHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - 
std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,14 +81,14 @@ class LayerNormHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_normalize stand for Normalize parameter - saber::LayerNormParam> _param_layer_norm; + saber::LayerNormParam _param_layer_norm; ///< _funcs_normalize stand for Normalize function - saber::LayerNorm _funcs_layer_norm; + saber::LayerNorm::saber_type> _funcs_layer_norm; }; } /* namespace ops */ diff --git a/framework/operators/lrn.cpp b/framework/operators/lrn.cpp index 0db763004..ca3da0d88 100644 --- a/framework/operators/lrn.cpp +++ b/framework/operators/lrn.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void Lrn::operator()( +void Lrn::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { + const std::vector >& ins, + std::vector >& outs) { auto* impl = - static_cast*>(this->_helper); + static_cast*>(this->_helper); auto& param = impl->_param_lrn; impl->_funcs_lrn(ins, outs, param, ctx); } @@ -21,12 +21,12 @@ void Lrn::operator()( /// set helper -template -LrnHelper::~LrnHelper() { +template +LrnHelper::~LrnHelper() { } -template -Status LrnHelper::InitParam() { +template +Status LrnHelper::InitParam() { DLOG(WARNING) << "Parsing Lrn op parameter."; auto local_size_in = GET_PARAMETER(int, local_size); @@ -36,10 +36,10 @@ Status LrnHelper::InitParam() { auto k_in = GET_PARAMETER(float, k); if (norm_region_in == "ACROSS_CHANNELS") { - LrnParam> param_lrn(local_size_in, alpha_in, beta_in, k_in, ACROSS_CHANNELS); + LrnParam param_lrn(local_size_in, alpha_in, beta_in, k_in, ACROSS_CHANNELS); _param_lrn = param_lrn; } else if (norm_region_in == "WITHIN_CHANNEL") { - LrnParam> 
param_lrn(local_size_in, alpha_in, beta_in, k_in, WITHIN_CHANNEL); + LrnParam param_lrn(local_size_in, alpha_in, beta_in, k_in, WITHIN_CHANNEL); _param_lrn = param_lrn; } else { LOG(FATAL) << "Other Lrn norm_region" << norm_region_in << " should be replace by other ops."; @@ -48,50 +48,50 @@ Status LrnHelper::InitParam() { return Status::OK(); } -template -Status LrnHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status LrnHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_lrn.init(ins, outs, _param_lrn, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status LrnHelper::InferShape(const std::vector >& +template +Status LrnHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { + std::vector >& outs) { SABER_CHECK(_funcs_lrn.compute_output_shape(ins, outs, _param_lrn)); return Status::OK(); } #ifdef USE_CUDA -template class LrnHelper; -template class LrnHelper; -template class LrnHelper; +template class LrnHelper; +template class LrnHelper; +template class LrnHelper; #endif #ifdef USE_ARM_PLACE -template class LrnHelper; -template class LrnHelper; -template class LrnHelper; +template class LrnHelper; +template class LrnHelper; +template class LrnHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(Lrn) .Doc("LRN operator") #ifdef USE_CUDA -.__alias__("LRN") +.__alias__("LRN") #endif #ifdef USE_ARM_PLACE -.__alias__("LRN") +.__alias__("LRN") #endif .num_in(3) .num_out(1); diff --git a/framework/operators/lrn.h b/framework/operators/lrn.h index 2a1907345..d7edf5694 100644 --- a/framework/operators/lrn.h +++ b/framework/operators/lrn.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class LrnHelper; /// lrn op @@ -34,20 +34,21 @@ class LrnHelper; * \brief operation of Lrn class * public inheritance Operator */ -template -class Lrn : public Operator { +template +class Lrn : public Operator { public: Lrn() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator lrn::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Lrn< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } - friend class LrnHelper; + friend class LrnHelper; }; /** @@ -55,8 +56,8 @@ class Lrn : public Operator { * public inheritance OperatorHelper * including init operation context and the size of shape */ -template -class LrnHelper : public OperatorHelper { +template +class LrnHelper : public OperatorHelper { public: LrnHelper()=default; @@ -72,8 +73,8 @@ class LrnHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +82,14 @@ class LrnHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_lrn stand for lrn parameter - saber::LrnParam> _param_lrn; + saber::LrnParam _param_lrn; ///< _funcs_lrn stand for lrn function - saber::Lrn _funcs_lrn; + saber::Lrn::saber_type> _funcs_lrn; private: ///< _dims stand for lrn size diff --git a/framework/operators/lstm.cpp b/framework/operators/lstm.cpp new file mode 100644 index 000000000..71cb9cb19 --- /dev/null +++ b/framework/operators/lstm.cpp @@ -0,0 +1,152 @@ +#include "framework/operators/lstm.h" +#include +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void Lstm::operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_lstm; + impl->_funcs_lstm(ins, outs, param, ctx); +} +#endif +#ifdef USE_X86_PLACE +template<> +void Lstm::operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_lstm; + impl->_funcs_lstm(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator +/// set helper +template +LstmHelper::~LstmHelper() { +} + +template +Status LstmHelper::InitParam() { + DLOG(WARNING) << "Parsing Lstm op parameter."; + + auto num_direction = GET_PARAMETER(int, num_direction); + auto dropout_param = GET_PARAMETER(float, dropout_param); + auto num_layers = GET_PARAMETER(int, num_layers); + auto input_activation = GET_PARAMETER(std::string, input_activation); + auto gate_activation = GET_PARAMETER(std::string, gate_activation); + auto cell_activation = GET_PARAMETER(std::string, cell_activation); + auto candidate_activation = GET_PARAMETER(std::string, candidate_activation); + auto is_reverse = GET_PARAMETER(bool, is_reverse); + auto use_peepholes = GET_PARAMETER(bool, use_peepholes); + + //auto weight_wu = GET_PARAMETER(PBlock::type>, weight_1); + //auto bias = GET_PARAMETER(PBlock::type>, weight_2); + using pblock_type = PBlock; + auto weight_wu = GET_PARAMETER(pblock_type, weight_1); + auto bias = GET_PARAMETER(pblock_type, weight_2); + + + LOG(INFO)<<"lstm act = ["< enum_map = { + {"null",Active_unknow}, + {"sigmoid_fluid", Active_sigmoid}, + {"relu_fluid", Active_relu}, + {"tanh_fluid", Active_tanh}, + {"identity_fluid", Active_identity}, + {"sigmoid", Active_sigmoid}, + {"tanh", Active_tanh}, + }; + LstmParam lstm_param(&(weight_wu.d_tensor()), &(bias.d_tensor()), nullptr, + enum_map[input_activation], enum_map[gate_activation], + enum_map[cell_activation], enum_map[candidate_activation], + use_peepholes, false, is_reverse, dropout_param, + num_direction, num_layers); + _param_lstm = lstm_param; + + return Status::OK(); +} + +template +Status LstmHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + DLOG(INFO)<<"inti lstm in op.cpp"; + SABER_CHECK(_funcs_lstm.init(ins, outs, _param_lstm, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status LstmHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + 
SABER_CHECK(_funcs_lstm.compute_output_shape(ins, outs, _param_lstm)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class LstmHelper; +template class LstmHelper; +template class LstmHelper; +#endif + +#ifdef USE_ARM_PLACE +template class LstmHelper; +template class LstmHelper; +template class LstmHelper; +#endif + +#ifdef USE_X86_PLACE +template class LstmHelper; +template class LstmHelper; +template class LstmHelper; +#endif + +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, ARM, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, X86, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Lstm) + .Doc("Lstm operator") +#ifdef USE_CUDA + .__alias__("LSTM") + .__alias__("Lstm") +#endif +#ifdef USE_ARM_PLACE + .__alias__("LSTM") + .__alias__("Lstm") +#endif +#ifdef USE_X86_PLACE + .__alias__("LSTM") + .__alias__("Lstm") +#endif + .num_in(1) + .num_out(1) + .Args("is_reverse", " is_reverse for lstm.") + .Args("num_direction", "some descp") + .Args("dropout_param", "some descp") + .Args("num_layers", "some descp") + .Args("input_activation", "some descp") + .Args("gate_activation", "some descp") + .Args("cell_activation", "some descp") + .Args("candidate_activation", "some descp") + .Args("is_reverse", "some descp") + .Args("use_peephole", "some descp"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/lstm.h b/framework/operators/lstm.h new file mode 100644 index 000000000..98ae15149 --- /dev/null +++ b/framework/operators/lstm.h @@ -0,0 +1,104 @@ + + +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_LSTM_H +#define ANAKIN_OPERATOR_LSTM_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/lstm.h" + +namespace anakin { + +namespace ops { + +template +class LstmHelper; + + +/// lstm op +/** + * \brief Lstm implementation class + * public inherit Operator + */ +template +class Lstm : public Operator { +public: + Lstm() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Lstm< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + + } + + friend class LstmHelper; +}; + +/** + * \brief Lstm helper class to implement Lstm + * public inherit OperatorHelper + * including init resource and shape size in Lstm context + */ +template +class LstmHelper : public OperatorHelper { +public: + LstmHelper()=default; + + ~LstmHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by lstm + * \param ctx stand for Lstm operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_lstm stand for Lstm parameter + saber::LstmParam _param_lstm; + ///< _funcs_lstm stand for Lstm function + saber::Lstm::saber_type> _funcs_lstm; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif + + diff --git a/framework/operators/normalize.cpp b/framework/operators/normalize.cpp index 56cfa921c..011ee9034 100644 --- a/framework/operators/normalize.cpp +++ b/framework/operators/normalize.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void Normalize::operator() ( +void Normalize::operator() ( OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_normalize; + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_normalize; impl->_funcs_normalize(ins, outs, param, ctx); } #endif @@ -20,71 +20,71 @@ void Normalize::operator() ( /// set helper -template -NormalizeHelper::~NormalizeHelper() { +template +NormalizeHelper::~NormalizeHelper() { } -template -Status NormalizeHelper::InitParam() { +template +Status NormalizeHelper::InitParam() { //DLOG(WARNING) << "Parsing Normalize op parameter."; auto is_across_spatial = GET_PARAMETER(bool, is_across_spatial); auto is_shared_channel = GET_PARAMETER(bool, is_shared_channel); auto eps = GET_PARAMETER(float, eps); auto p = GET_PARAMETER(int, p); - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto input_scale = GET_PARAMETER(pblock_type, weight_1); - saber::NormalizeParam> normalize_param(is_across_spatial, is_shared_channel, \ + saber::NormalizeParam normalize_param(is_across_spatial, is_shared_channel, \ &(input_scale.d_tensor()), eps, p); 
_param_normalize = normalize_param; return Status::OK(); } -template -Status NormalizeHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status NormalizeHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_normalize.init(ins, outs, _param_normalize, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status NormalizeHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { +template +Status NormalizeHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_normalize.compute_output_shape(ins, outs, _param_normalize)); return Status::OK(); } #ifdef USE_CUDA -template class NormalizeHelper; -template class NormalizeHelper; -template class NormalizeHelper; +template class NormalizeHelper; +template class NormalizeHelper; +template class NormalizeHelper; #endif #ifdef USE_ARM_PLACE -template class NormalizeHelper; -template class NormalizeHelper; -template class NormalizeHelper; +template class NormalizeHelper; +template class NormalizeHelper; +template class NormalizeHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(Normalize) .Doc("Normalize operator") #ifdef USE_CUDA - .__alias__("normalize") + .__alias__("normalize") #endif #ifdef USE_ARM_PLACE - .__alias__("normalize") + .__alias__("normalize") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/normalize.h b/framework/operators/normalize.h index fc67702e5..33981d326 100644 --- a/framework/operators/normalize.h +++ b/framework/operators/normalize.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class NormalizeHelper; /// pooling op @@ -34,20 +34,20 @@ class NormalizeHelper; * \brief Normalize operation class * public inheritance Operator */ -template -class Normalize : public Operator { +template +class Normalize : public Operator { public: Normalize() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Normalize::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Normalize< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class NormalizeHelper; + friend class NormalizeHelper; }; /** @@ -55,8 +55,8 @@ class Normalize : public Operator { * public inherit OperatorHelper * including init resource and shape size in normalize context */ -template -class NormalizeHelper : public OperatorHelper { +template +class NormalizeHelper : public OperatorHelper { public: NormalizeHelper()=default; @@ -72,8 +72,8 @@ class NormalizeHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& 
outs) override; /** * \brief infer the shape of output and input. @@ -81,14 +81,14 @@ class NormalizeHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_normalize stand for Normalize parameter - saber::NormalizeParam> _param_normalize; + saber::NormalizeParam _param_normalize; ///< _funcs_normalize stand for Normalize function - saber::Normalize _funcs_normalize; + saber::Normalize::saber_type> _funcs_normalize; private: ///< _dims stand for Normalize size diff --git a/framework/operators/ops.h b/framework/operators/ops.h index 4b4d2a017..577ef72a9 100644 --- a/framework/operators/ops.h +++ b/framework/operators/ops.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -15,17 +15,67 @@ #ifndef ANAKIN_OPERATORS_H #define ANAKIN_OPERATORS_H - -#include "framework/operators/pooling.h" +#if 1 +//#include "framework/graph/llvm/fusion/graph_pattern.h" +#include "framework/operators/activation.h" +//#include "framework/operators/arg_max.h" +//#include "framework/operators/axpy.h" +#include "framework/operators/batch_norm.h" +#include "framework/operators/concat.h" +#include "framework/operators/conv_3x3.h" +#include "framework/operators/convolution.h" +//#include "framework/operators/crf_decoding.h" +#include "framework/operators/crop.h" +#include "framework/operators/ctc_align.h" +#include "framework/operators/deconvolution.h" +#include "framework/operators/deformconvolution.h" +//#include "framework/operators/dense.h" +#include "framework/operators/depwise_sep_convolution.h" +#include "framework/operators/detection_output.h" +#include "framework/operators/dot.h" +#include "framework/operators/dropout.h" +#include "framework/operators/eltwise_op.h" +#include "framework/operators/elu.h" +//#include "framework/operators/embedding.h" +#include "framework/operators/exp.h" +//#include "framework/operators/flatten.h" +//#include "framework/operators/gru.h" +//#include "framework/operators/im2sequence.h" #include "framework/operators/input.h" -//#include "framework/operators/pooling.h" -//#include "framework/operators/pooling.h" -//#include "framework/operators/pooling.h" -//#include "framework/operators/pooling.h" -//#include "framework/operators/pooling.h" -//#include "framework/operators/pooling.h" -//#include "framework/operators/pooling.h" +#include "framework/operators/log.h" +//#include "framework/operators/lrn.h" +#include "framework/operators/mvn.h" +#include "framework/operators/normalize.h" +#include "framework/operators/output.h" +//#include "framework/operators/permute.h" +#include "framework/operators/pooling.h" +#include "framework/operators/power.h" +#include "framework/operators/prelu.h" +//#include 
"framework/operators/priorbox.h" +#include "framework/operators/relu.h" +//#include "framework/operators/reshape.h" +#include "framework/operators/scale.h" +#include "framework/operators/sequence_pool.h" +//#include "framework/operators/slice.h" +//#include "framework/operators/softmax.h" +#include "framework/operators/spatial_pyramid_pooling.h" +#include "framework/operators/split.h" +#include "framework/operators/standard_rnn.h" + +//#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h" +//#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h" +//#include "framework/operators/fusion_ops/conv_3x3_relu.h" +//#include "framework/operators/fusion_ops/conv_3x3_relu_pool.h" +//#include "framework/operators/fusion_ops/conv_batchnorm_scale.h" +//#include "framework/operators/fusion_ops/conv_batchnorm_scale_relu.h" +//#include "framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h" +//#include "framework/operators/fusion_ops/conv_relu.h" +//#include "framework/operators/fusion_ops/conv_relu_pool.h" +//#include "framework/operators/fusion_ops/deconv_relu.h" +//#include "framework/operators/fusion_ops/eltwise_relu.h" +//#include "framework/operators/fusion_ops/permute_power.h" +#endif //0 namespace anakin { namespace ops { diff --git a/framework/operators/output.cpp b/framework/operators/output.cpp index 1a34513ab..119963600 100644 --- a/framework/operators/output.cpp +++ b/framework/operators/output.cpp @@ -4,62 +4,62 @@ namespace anakin { namespace ops { -template<> -void Output::operator()( - OpContext& ctx, - const std::vector>& ins, - std::vector>& outs) { -} - +#define INSTANCE_OUTPUT(Ttype, Ptype) \ +template<> \ +void Output::operator()( \ + OpContext& ctx, \ + const std::vector>& ins, \ + std::vector>& outs) {} -/// TODO ... 
specialization other type of operator - -/// set helper -template -OutputHelper::~OutputHelper() { -} - -template -Status OutputHelper::InitParam() { +template +Status OutputHelper::InitParam() { return Status::OK(); } -template -Status OutputHelper::Init(OpContext& ctx, - const std::vector>& ins, - std::vector>& outs) { +template +Status OutputHelper::Init(OpContext &ctx, + const std::vector> &ins, + std::vector> &outs) { + return Status::OK(); } -template -Status OutputHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status OutputHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { return Status::OK(); } -template class OutputHelper; -template class OutputHelper; -template class OutputHelper; - -template class OutputHelper; -template class OutputHelper; -template class OutputHelper; +#ifdef USE_CUDA +INSTANCE_OUTPUT(NV, Precision::FP32); +template class OutputHelper; +ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, NV, Precision::FP32); +#endif -// register help -ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, NV, AK_FLOAT, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, NV, AK_FLOAT, Precision::FP16); -ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, NV, AK_FLOAT, Precision::INT8); +#ifdef USE_X86_PLACE +INSTANCE_OUTPUT(X86, Precision::FP32); +template class OutputHelper; +ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, X86, Precision::FP32); +#endif -ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, ARM, AK_FLOAT, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, ARM, AK_FLOAT, Precision::FP16); -ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, ARM, AK_FLOAT, Precision::INT8); +#ifdef USE_ARM_PLACE +INSTANCE_OUTPUT(ARM, Precision::FP32); +template class OutputHelper; +ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, ARM, Precision::FP32); +#endif //arm //! 
register op ANAKIN_REGISTER_OP(Output) -.Doc("Output operator [ only a input data holder and reshape ] ") -.__alias__("output") -.__alias__("output"); +#ifdef USE_CUDA +.__alias__("output") +#endif +#ifdef USE_ARM_PLACE +.__alias__("output") +#endif +#ifdef USE_X86_PLACE +.__alias__("output") +#endif +.Doc("Output operator [ only a input data holder and reshape ] "); } /* namespace ops */ diff --git a/framework/operators/output.h b/framework/operators/output.h index 71bb39c02..b1ebd6e07 100644 --- a/framework/operators/output.h +++ b/framework/operators/output.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ namespace anakin { namespace ops { -template +template class OutputHelper; /// Output op without any compute, this a holder for input @@ -33,17 +33,15 @@ class OutputHelper; * \brief Output implementation class * public inherit Operator */ -template -class Output : public Operator { +template +class Output : public Operator { public: virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Output::type>().type_info(); + const std::vector >& ins, + std::vector >& outs) { } - friend class OutputHelper; + friend class OutputHelper; }; /** @@ -51,13 +49,13 @@ class Output : public Operator { * public inherit OperatorHelper * including init resource and shape size in output context */ -template -class OutputHelper : public OperatorHelper { - typedef OperatorHelper Base; +template +class OutputHelper : public OperatorHelper { + typedef OperatorHelper Base; public: OutputHelper() {} - ~OutputHelper(); + ~OutputHelper(){} Status InitParam() override; @@ -69,8 +67,8 @@ class OutputHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - 
const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -78,9 +76,8 @@ class OutputHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; - + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; }; diff --git a/framework/operators/permute.cpp b/framework/operators/permute.cpp index 178bdaae2..c4b5c19c2 100644 --- a/framework/operators/permute.cpp +++ b/framework/operators/permute.cpp @@ -4,84 +4,76 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Permute::operator()(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_permute; - impl->_funcs_permute(ins, outs, param, ctx); -} -#endif - -/// TODO ... specialization other type of operator - - -/// set helper -template -PermuteHelper::~PermuteHelper() { +#define INSTANCE_PERMUTE(Ttype, Ptype) \ +template<> \ +void Permute::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>\ + (this->_helper)->_param_permute; \ + impl->_funcs_permute(ins, outs, param, ctx); \ } -template -Status PermuteHelper::InitParam() { - LOG(WARNING) << "!!!!!!!! 
Parsing Permute op parameter."; +template +Status PermuteHelper::InitParam() { + LOG(WARNING) << " Parsing Permute op parameter."; auto dims = GET_PARAMETER(PTuple, dims); for (int i = 0; i < dims.size(); i++) { LOG(INFO) << " |-- dims [" << i << "]: " << dims[i]; } - saber::PermuteParam> permute_param(dims.vector()); + saber::PermuteParam permute_param(dims.vector()); _param_permute = permute_param; return Status::OK(); } -template -Status PermuteHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status PermuteHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_permute.init(ins, outs, _param_permute, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status PermuteHelper::InferShape(const std::vector >& +template +Status PermuteHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { + std::vector >& outs) { SABER_CHECK(_funcs_permute.compute_output_shape(ins, outs, _param_permute)); return Status::OK(); } #ifdef USE_CUDA -template class PermuteHelper; -template class PermuteHelper; -template class PermuteHelper; +INSTANCE_PERMUTE(NV, Precision::FP32); +template class PermuteHelper; +ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, NV, Precision::FP32); #endif -#ifdef USE_ARM_PLACE -template class PermuteHelper; -template class PermuteHelper; -template class PermuteHelper; +#ifdef USE_X86_PLACE +INSTANCE_PERMUTE(X86, Precision::FP32); +template class PermuteHelper; +ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, X86, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, NV, AK_FLOAT, Precision::FP32); -#endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_PERMUTE(ARM, Precision::FP32); +template class PermuteHelper; +ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, ARM, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(Permute) .Doc("Permute operator") #ifdef USE_CUDA -.__alias__("permute") +.__alias__("permute") #endif #ifdef USE_ARM_PLACE -.__alias__("permute") +.__alias__("permute") +#endif +#ifdef USE_X86_PLACE +.__alias__("permute") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/permute.h b/framework/operators/permute.h index b8e45e729..0263d2d47 100644 --- a/framework/operators/permute.h +++ b/framework/operators/permute.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class PermuteHelper; /// pooling op @@ -34,20 +34,21 @@ class PermuteHelper; * \brief Permute implementation class * public inherit Operator */ -template -class Permute : public Operator { +template +class Permute : public Operator { public: Permute() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator permute::type>().type_info(); + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Permute< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } - friend class PermuteHelper; + friend class PermuteHelper; }; /** @@ -55,12 +56,12 @@ class Permute : public Operator { * public inherit OperatorHelper * including init resource and shape size in Permut context */ -template -class PermuteHelper : public OperatorHelper { +template +class PermuteHelper : public OperatorHelper { public: PermuteHelper()=default; - ~PermuteHelper(); + ~PermuteHelper() {} Status InitParam() override; @@ -72,8 +73,8 @@ class PermuteHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector 
>& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,18 +82,16 @@ class PermuteHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_permute stand for permute parameter - saber::PermuteParam> _param_permute; + saber::PermuteParam _param_permute; ///< _funcs_permute stand for permute function - saber::Permute _funcs_permute; + saber::Permute::saber_type> _funcs_permute; }; - - - + } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/pooling.cpp b/framework/operators/pooling.cpp index ccc2c97dc..691e00db7 100644 --- a/framework/operators/pooling.cpp +++ b/framework/operators/pooling.cpp @@ -4,28 +4,20 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Pooling::operator()(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_pooling; - impl->_funcs_pooling(ins, outs, param, ctx); +#define INSTANCE_POOLING(Ttype, Ptype) \ +template<> \ +void Pooling::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>\ + (this->_helper)->_param_pooling; \ + impl->_funcs_pooling(ins, outs, param, ctx); \ } -#endif - -/// TODO ... 
specialization other type of operator - -/// set helper -template -PoolingHelper::~PoolingHelper() { -} -template -Status PoolingHelper::InitParam() { +template +Status PoolingHelper::InitParam() { DLOG(WARNING) << "Parsing Pooling op parameter."; auto cmp_out_shape_floor_as_conv = GET_PARAMETER(bool, cmp_out_shape_floor_as_conv); auto global_pooling = GET_PARAMETER(bool, global_pooling); @@ -35,83 +27,94 @@ Status PoolingHelper::InitParam() { auto pool_method = GET_PARAMETER(std::string, method); if (pool_method == "MAX") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_max, global_pooling, cmp_out_shape_floor_as_conv); + PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_max, global_pooling, cmp_out_shape_floor_as_conv); _param_pooling = pooling_param; } else if (pool_method == "AVG") { - PoolingParam> pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], - pool_strides[0], pool_strides[1], - Pooling_average_include_padding, global_pooling, cmp_out_shape_floor_as_conv); + PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_average_include_padding, global_pooling, cmp_out_shape_floor_as_conv); _param_pooling = pooling_param; } else { - LOG(FATAL) << " Pooling op doesn't support : " << pool_method << " pooling."; + LOG(FATAL) << " Pooling op doesn't support : " << pool_method << " pooling."; } return Status::OK(); } -template -Status PoolingHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_pooling.init(ins, outs, _param_pooling, SPECIFY, VENDER_IMPL, ctx)); +template +Status PoolingHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { + SABER_CHECK(_funcs_pooling.init(ins, outs, _param_pooling, SPECIFY, 
SABER_IMPL, ctx)); return Status::OK(); } -template -Status PoolingHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status PoolingHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_pooling.compute_output_shape(ins, outs, _param_pooling)); return Status::OK(); } #ifdef USE_CUDA -template class PoolingHelper; -template class PoolingHelper; -template class PoolingHelper; +INSTANCE_POOLING(NV, Precision::FP32); +template <> +Status PoolingHelper::Init(OpContext &ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_pooling.init(ins, outs, _param_pooling, SPECIFY, VENDER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -template class PoolingHelper; -template class PoolingHelper; -template class PoolingHelper; +INSTANCE_POOLING(ARM, Precision::FP32); +template class PoolingHelper; +ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, ARM, Precision::FP32); +#endif //arm + +#ifdef USE_X86_PLACE +INSTANCE_POOLING(X86, Precision::FP32); +template class PoolingHelper; +ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, X86, Precision::FP32); #endif -//template class PoolingHelper; -//template class PoolingHelper; -//template class PoolingHelper; -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, NV, AK_FLOAT, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, ARM, AK_FLOAT, Precision::FP32); +#ifdef AMD_GPU +INSTANCE_POOLING(AMD, Precision::FP32); +template class PoolingHelper; +ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, AMD, Precision::FP32); #endif - //! 
register op ANAKIN_REGISTER_OP(Pooling) .Doc("Pooling operator") #ifdef USE_CUDA -.__alias__("pooling") -.__alias__("pool") +.__alias__("pooling") +.__alias__("pool") #endif #ifdef USE_ARM_PLACE -.__alias__("pooling") -.__alias__("pool") +.__alias__("pooling") +.__alias__("pool") +#endif +#ifdef USE_X86_PLACE +.__alias__("pooling") +.__alias__("pool") +#endif +#ifdef AMD_GPU +.__alias__("pooling") +.__alias__("pool") #endif .num_in(1) .num_out(1) .Args("method", "Pooling type to be applied (MAX, SUM, AVG).") -.Args("cmp_out_shape_floor_as_conv", - "cmp_out_shape_floor_as_conv of pooling for adu novel approach") +.Args("cmp_out_shape_floor_as_conv cmp_out_shape_floor_as_conv of pooling for adu novel approach") .Args("global_pooling", "whether execute global pooling on input") .Args>("pool_size", " kernel size for pooling (x, y) or (x, y, z).") - .Args>("strides", "stride for pooling (x, y) or (x, y, z).") - .Args>("padding", "pad for pooling: (x, y) or (x, y, z)."); +.Args>("strides", "stride for pooling (x, y) or (x, y, z).") +.Args>("padding", "pad for pooling: (x, y) or (x, y, z)."); } /* namespace ops */ diff --git a/framework/operators/pooling.h b/framework/operators/pooling.h index b4f8ed430..fbaf30da7 100644 --- a/framework/operators/pooling.h +++ b/framework/operators/pooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class PoolingHelper; /** @@ -44,20 +44,21 @@ enum class PoolingType { * \brief Pooling implementation class * public inherit Operator */ -template -class Pooling : public Operator { +template +class Pooling : public Operator { public: Pooling() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator Pooling::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Pooling< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } - friend class PoolingHelper; + friend class PoolingHelper; }; /** @@ -65,12 +66,12 @@ class Pooling : public Operator { * public inherit OperatorHelper * including init resource and shape size in Pooling context */ -template -class PoolingHelper : public OperatorHelper { +template +class PoolingHelper : public OperatorHelper { public: PoolingHelper()=default; - ~PoolingHelper(); + ~PoolingHelper() {} Status InitParam() override; @@ -82,8 +83,8 @@ class PoolingHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -91,16 +92,17 @@ class PoolingHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_pooling stand for Pooling parameter - saber::PoolingParam> _param_pooling; + saber::PoolingParam _param_pooling; ///< _funcs_pooling stand for Pooling function - saber::Pooling _funcs_pooling; + saber::Pooling::saber_type> _funcs_pooling; }; + } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/power.cpp b/framework/operators/power.cpp index 151687aab..42d59c0ec 100644 --- a/framework/operators/power.cpp +++ b/framework/operators/power.cpp @@ -6,12 +6,12 @@ namespace ops { #ifdef USE_CUDA template<> -void Power::operator()( +void Power::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { + const std::vector >& ins, + std::vector >& outs) { auto* impl = - static_cast*>(this->_helper); + static_cast*>(this->_helper); auto& param = impl->_param_power; impl->_funcs_power(ins, outs, param, ctx); } @@ -21,66 +21,66 @@ void Power::operator()( /// set helper -template -PowerHelper::~PowerHelper() { +template +PowerHelper::~PowerHelper() { } -template -Status PowerHelper::InitParam() { +template +Status PowerHelper::InitParam() { DLOG(WARNING) << "Parsing Power op parameter."; auto scale = GET_PARAMETER(float, scale); auto shift = GET_PARAMETER(float, shift); auto power = GET_PARAMETER(float, power); - saber::PowerParam> power_param(power, scale, shift); + saber::PowerParam power_param(power, scale, shift); _param_power = power_param; return Status::OK(); } -template -Status PowerHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status PowerHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_power.init(ins, outs, 
_param_power, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status PowerHelper::InferShape(const std::vector >& +template +Status PowerHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { + std::vector >& outs) { SABER_CHECK(_funcs_power.compute_output_shape(ins, outs, _param_power)); return Status::OK(); } #ifdef USE_CUDA -template class PowerHelper; -template class PowerHelper; -template class PowerHelper; +template class PowerHelper; +template class PowerHelper; +template class PowerHelper; #endif #ifdef USE_ARM_PLACE -template class PowerHelper; -template class PowerHelper; -template class PowerHelper; +template class PowerHelper; +template class PowerHelper; +template class PowerHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Power, PowerHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Power, PowerHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Power, PowerHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Power, PowerHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(Power) .Doc("Power operator") #ifdef USE_CUDA -.__alias__("power") +.__alias__("power") #endif #ifdef USE_ARM_PLACE -.__alias__("power") +.__alias__("power") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/power.h b/framework/operators/power.h index 0492249af..81e6acd78 100644 --- a/framework/operators/power.h +++ b/framework/operators/power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class PowerHelper; /// pooling op @@ -34,20 +34,21 @@ class PowerHelper; * \brief Power implementation class * public inherit Operator */ -template -class Power : public Operator { +template +class Power : public Operator { public: Power() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Power< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } - friend class PowerHelper; + friend class PowerHelper; }; /** @@ -55,8 +56,8 @@ class Power : public Operator { * public inherit OperatorHelper * including init resource and shape size in Power context */ -template -class PowerHelper : public OperatorHelper { +template +class PowerHelper : public OperatorHelper { public: PowerHelper()=default; @@ -72,8 +73,8 @@ class PowerHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +82,14 @@ class PowerHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_power stand for Power parameter - saber::PowerParam> _param_power; + saber::PowerParam _param_power; ///< _funcs_power stand for Power function - saber::Power _funcs_power; + saber::Power::saber_type> _funcs_power; private: ///< _dims stand for Power size diff --git a/framework/operators/priorbox.cpp b/framework/operators/priorbox.cpp index 2e6a7698b..e5b1c049a 100644 --- a/framework/operators/priorbox.cpp +++ b/framework/operators/priorbox.cpp @@ -4,28 +4,18 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void PriorBox::operator()(OpContext& ctx, \ - const std::vector >& ins, \ - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_priorbox; - impl->_funcs_priorbox(ins, outs, param, ctx); +#define INSTANCE_PRIORBOX(Ttype, Ptype) \ +template<> \ +void PriorBox::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>(this->_helper)->_param_priorbox; \ + impl->_funcs_priorbox(ins, outs, param, ctx); \ } -#endif - -/// TODO ... 
specialization other type of operator - -/// set helper -template -PriorBoxHelper::~PriorBoxHelper() { -} - -template -Status PriorBoxHelper::InitParam() { +template +Status PriorBoxHelper::InitParam() { DLOG(WARNING) << "Parsing PriorBox op parameter."; auto min_size_ = GET_PARAMETER(PTuple, min_size); auto max_size_ = GET_PARAMETER(PTuple, max_size); @@ -38,55 +28,71 @@ Status PriorBoxHelper::InitParam() { auto step_h_ = GET_PARAMETER(float, step_h); auto step_w_ = GET_PARAMETER(float, step_w); auto offset_ = GET_PARAMETER(float, offset); + auto order = GET_PARAMETER(PTuple, order); + std::vector order_; + + for (int i = 0; i < order.size(); i++) { + if (order[i] == "MIN") { + order_.push_back(PRIOR_MIN); + } else if (order[i] == "MAX") { + order_.push_back(PRIOR_MAX); + } else if (order[i] == "COM") { + order_.push_back(PRIOR_COM); + } + } - saber::PriorBoxParam> param_priorbox(min_size_.vector(), max_size_.vector(), \ + saber::PriorBoxParam param_priorbox(min_size_.vector(), max_size_.vector(), \ as_ratio.vector(), var.vector(), flip_flag, clip_flag, \ - image_w, image_h, step_w_, step_h_, offset_); + image_w, image_h, step_w_, step_h_, offset_, order_); _param_priorbox = param_priorbox; return Status::OK(); } -template -Status PriorBoxHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status PriorBoxHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_priorbox.init(ins, outs, _param_priorbox, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status PriorBoxHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status PriorBoxHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_priorbox.compute_output_shape(ins, outs, _param_priorbox)); return Status::OK(); } #ifdef USE_CUDA -template class PriorBoxHelper; -template class PriorBoxHelper; -template class PriorBoxHelper; +INSTANCE_PRIORBOX(NV, 
Precision::FP32); +template class PriorBoxHelper; +ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, NV, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -template class PriorBoxHelper; -template class PriorBoxHelper; -template class PriorBoxHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, NV, AK_FLOAT, Precision::FP32); +INSTANCE_PRIORBOX(ARM, Precision::FP32); +template class PriorBoxHelper; +ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, ARM, Precision::FP32); #endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, ARM, AK_FLOAT, Precision::FP32); + +#ifdef USE_X86_PLACE +INSTANCE_PRIORBOX(X86, Precision::FP32); +template class PriorBoxHelper; +ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, X86, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(PriorBox) .Doc("PriorBox operator") #ifdef USE_CUDA -.__alias__("priorbox") +.__alias__("priorbox") #endif #ifdef USE_ARM_PLACE -.__alias__("priorbox") +.__alias__("priorbox") +#endif +#ifdef USE_X86_PLACE +.__alias__("priorbox") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/priorbox.h b/framework/operators/priorbox.h index adb56a594..56de76d29 100644 --- a/framework/operators/priorbox.h +++ b/framework/operators/priorbox.h @@ -22,51 +22,49 @@ namespace anakin { namespace ops { -template +template class PriorBoxHelper; //! PriorBox op -template -class PriorBox : public Operator { +template +class PriorBox : public Operator { public: PriorBox() {} //! 
forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - //LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator PriorBox< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class PriorBoxHelper; + friend class PriorBoxHelper; }; -template -class PriorBoxHelper : public OperatorHelper { +template +class PriorBoxHelper : public OperatorHelper { public: PriorBoxHelper()=default; - ~PriorBoxHelper(); + ~PriorBoxHelper() {} Status InitParam() override; //! initial all the resource needed by pooling Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; //! infer the shape of output and input. - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: - saber::PriorBoxParam> _param_priorbox; - saber::PriorBox _funcs_priorbox; + saber::PriorBoxParam _param_priorbox; + saber::PriorBox::saber_type> _funcs_priorbox; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/relu.cpp b/framework/operators/relu.cpp index 6164e8402..87299225c 100644 --- a/framework/operators/relu.cpp +++ b/framework/operators/relu.cpp @@ -4,82 +4,89 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void ReLU::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = impl->_param_relu; - impl->_funcs_relu(ins, outs, param, ctx); +#define INSTANCE_RELU(Ttype, Ptype) \ +template<> \ +void ReLU::operator()(OpContext& ctx, \ + const std::vector >& ins,\ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = impl->_param_relu; \ + 
impl->_funcs_relu(ins, outs, param, ctx); \ } -#endif -/// TODO ... specialization other type of operator - -/// set helper -template -ReLUHelper::~ReLUHelper() { -} - -template -Status ReLUHelper::InitParam() { +template +Status ReLUHelper::InitParam() { DLOG(WARNING) << "Parsing ReLU op parameter."; // get relu param auto alpha = GET_PARAMETER(float, alpha); - ActivationParam> active_param(Active_relu);//, alpha); // TEMP + ActivationParam active_param(Active_relu);//, alpha); // TEMP _param_relu = active_param; return Status::OK(); } -template -Status ReLUHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_relu.init(ins, outs, _param_relu, SPECIFY, VENDER_IMPL, ctx)); +template +Status ReLUHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { + SABER_CHECK(_funcs_relu.init(ins, outs, _param_relu, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status ReLUHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status ReLUHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_relu.compute_output_shape(ins, outs, _param_relu)); return Status::OK(); } #ifdef USE_CUDA -template class ReLUHelper; -template class ReLUHelper; -template class ReLUHelper; +INSTANCE_RELU(NV, Precision::FP32); +template <> +Status ReLUHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_relu.init(ins, outs, _param_relu, SPECIFY, VENDER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, NV, Precision::FP32); #endif -#ifdef USE_ARM_PLACE -template class ReLUHelper; -template class ReLUHelper; -template class ReLUHelper; +#ifdef USE_X86_PLACE +INSTANCE_RELU(X86, Precision::FP32); +template class ReLUHelper; +ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, X86, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, NV, 
AK_FLOAT, Precision::FP32); -#endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_RELU(ARM, Precision::FP32); +template class ReLUHelper; +ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_RELU(AMD, Precision::FP32); +template class ReLUHelper; +ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, AMD, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(ReLU) .Doc("ReLU operator") #ifdef USE_CUDA -.__alias__("Relu") +.__alias__("Relu") +#endif + +#ifdef AMD_GPU +.__alias__("Relu") #endif + #ifdef USE_ARM_PLACE -.__alias__("Relu") +.__alias__("Relu") +#endif +#ifdef USE_X86_PLACE +.__alias__("Relu") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/relu.h b/framework/operators/relu.h index c3220bb3d..60ffe298c 100644 --- a/framework/operators/relu.h +++ b/framework/operators/relu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class ReLUHelper; /// pooling op @@ -34,20 +34,20 @@ class ReLUHelper; * \brief ReLU implementation class * public inherit Operator */ -template -class ReLU : public Operator { +template +class ReLU : public Operator { public: ReLU() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ReLU< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ReLUHelper; + friend class ReLUHelper; }; /** @@ -55,12 +55,12 @@ class ReLU : public Operator { * public inherit OperatorHelper * including init resource and shape size in ReLU context */ -template -class ReLUHelper : public OperatorHelper { +template +class ReLUHelper : public OperatorHelper { public: ReLUHelper()=default; - ~ReLUHelper(); + ~ReLUHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class ReLUHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,22 +81,20 @@ class ReLUHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_relu stand for ReLU parameter - saber::ActivationParam> _param_relu; + saber::ActivationParam _param_relu; ///< _funcs_relu stand for ReLU function - saber::Activation _funcs_relu; + saber::Activation::saber_type> _funcs_relu; private: ///< _dims stand for ReLU size PTuple _dims; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/reshape.cpp b/framework/operators/reshape.cpp index f575d1427..bdb00e3a7 100644 --- a/framework/operators/reshape.cpp +++ b/framework/operators/reshape.cpp @@ -4,80 +4,75 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Reshape::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_reshape; - impl->_funcs_reshape(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -ReshapeHelper::~ReshapeHelper() { +#define INSTANCE_RESHAPE(Ttype, Ptype) \ +template<> \ +void Reshape::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_reshape; \ + impl->_funcs_reshape(ins, outs, param, ctx); \ } -template -Status ReshapeHelper::InitParam() { +template +Status ReshapeHelper::InitParam() { DLOG(WARNING) << "Parsing Reshape op parameter."; auto dims = GET_PARAMETER(PTuple, dims); - ReshapeParam> param_reshape(dims.vector()); + ReshapeParam param_reshape(dims.vector()); _param_reshape = param_reshape; return Status::OK(); } -template -Status ReshapeHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { + +template +Status ReshapeHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_reshape.init(ins, outs, _param_reshape, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status ReshapeHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { + +template +Status ReshapeHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_reshape.compute_output_shape(ins, outs, _param_reshape)); outs[0]->set_seq_offset(ins[0]->get_seq_offset()); return Status::OK(); } + #ifdef USE_CUDA -template class ReshapeHelper; -template class ReshapeHelper; -template class ReshapeHelper; -#endif -#ifdef USE_ARM_PLACE -template class ReshapeHelper; -template class ReshapeHelper; -template class ReshapeHelper; +INSTANCE_RESHAPE(NV, Precision::FP32); +template class ReshapeHelper; +ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, NV, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, NV, AK_FLOAT, Precision::FP32); + +#ifdef USE_X86_PLACE +INSTANCE_RESHAPE(X86, Precision::FP32); 
+template class ReshapeHelper; +ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, X86, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_RESHAPE(ARM, Precision::FP32); +template class ReshapeHelper; +ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, ARM, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(Reshape) .Doc("Reshape operator") #ifdef USE_CUDA -.__alias__("reshape") +.__alias__("reshape") #endif #ifdef USE_ARM_PLACE -.__alias__("reshape") +.__alias__("reshape") +#endif +#ifdef USE_X86_PLACE +.__alias__("reshape") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/reshape.h b/framework/operators/reshape.h index b8dbe0734..65789cf41 100644 --- a/framework/operators/reshape.h +++ b/framework/operators/reshape.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class ReshapeHelper; /// pooling op @@ -34,33 +34,34 @@ class ReshapeHelper; * \brief Reshape implementation class * public inherit Operator */ -template -class Reshape : public Operator { +template +class Reshape : public Operator { public: Reshape() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Reshape< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ReshapeHelper; + friend class ReshapeHelper; }; + /** * \brief Reshape helper class to implement reshape * public inherit OperatorHelper * including init resource and shape size in reshape context */ -template -class ReshapeHelper : public OperatorHelper { +template +class ReshapeHelper : public OperatorHelper { public: ReshapeHelper()=default; - ~ReshapeHelper(); + ~ReshapeHelper() {} Status InitParam() override; @@ -72,8 +73,8 @@ class ReshapeHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,18 +82,16 @@ class ReshapeHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_reshape stand for reshape parameter - saber::ReshapeParam> _param_reshape; + saber::ReshapeParam _param_reshape; ///< _funcs_reshape stand for reshape function - saber::Reshape _funcs_reshape; + saber::Reshape::saber_type> _funcs_reshape; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/scale.cpp b/framework/operators/scale.cpp index 4a6771a68..3e1254eb6 100644 --- a/framework/operators/scale.cpp +++ b/framework/operators/scale.cpp @@ -4,32 +4,23 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Scale::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_scale; - impl->_funcs_scale(ins, outs, param, ctx); +#define INSTANCE_SCALE(Ttype, Ptype) \ +template<> \ +void Scale::operator()( \ + OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_scale; \ + impl->_funcs_scale(ins, outs, param, ctx); \ } -#endif - -/// TODO ... 
specialization other type of operator - -/// set helper -template -ScaleHelper::~ScaleHelper() { -} - -template -Status ScaleHelper::InitParam() { +template +Status ScaleHelper::InitParam() { DLOG(WARNING) << "Parsing Scale op parameter."; - using pblock_type = PBlock::type, Ttype>; + using pblock_type = PBlock; auto axis = GET_PARAMETER(int, axis); auto num_axes = GET_PARAMETER(int, num_axes); @@ -38,56 +29,60 @@ Status ScaleHelper::InitParam() { if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - ScaleParam > param_scale(weights.vector(), bias.vector(), bias_term, axis, num_axes); + ScaleParam param_scale(weights.vector(), bias.vector(), bias_term, axis, num_axes); _param_scale = param_scale; } else { - ScaleParam > param_scale(weights.vector(), bias_term, axis, num_axes); + ScaleParam param_scale(weights.vector(), bias_term, axis, num_axes); _param_scale = param_scale; } return Status::OK(); } -template -Status ScaleHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status ScaleHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_scale.init(ins, outs, _param_scale, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status ScaleHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { +template +Status ScaleHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_scale.compute_output_shape(ins, outs, _param_scale)); return Status::OK(); } #ifdef USE_CUDA -template class ScaleHelper; -template class ScaleHelper; -template class ScaleHelper; -#endif -#ifdef USE_ARM_PLACE -template class ScaleHelper; -template class ScaleHelper; -template class ScaleHelper; +INSTANCE_SCALE(NV, Precision::FP32); +template class ScaleHelper; +ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, NV, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, NV, AK_FLOAT, 
Precision::FP32); + +#ifdef USE_X86_PLACE +INSTANCE_SCALE(X86, Precision::FP32); +template class ScaleHelper; +ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, X86, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, ARM, AK_FLOAT, Precision::FP32); -#endif +INSTANCE_SCALE(ARM, Precision::FP32); +template class ScaleHelper; +ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, ARM, Precision::FP32); +#endif//arm + //! register op ANAKIN_REGISTER_OP(Scale) .Doc("Scale operator") #ifdef USE_CUDA -.__alias__("Scale") +.__alias__("Scale") #endif #ifdef USE_ARM_PLACE -.__alias__("Scale") +.__alias__("Scale") +#endif +#ifdef USE_X86_PLACE +.__alias__("Scale") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/scale.h b/framework/operators/scale.h index d6518bc98..2d93e68fc 100644 --- a/framework/operators/scale.h +++ b/framework/operators/scale.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class ScaleHelper; /// pooling op @@ -34,20 +34,20 @@ class ScaleHelper; * \brief operation of ops class * public inheritance Operator */ -template -class Scale : public Operator { +template +class Scale : public Operator { public: Scale() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Scale< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ScaleHelper; + friend class ScaleHelper; }; /** @@ -55,12 +55,12 @@ class Scale : public Operator { * public inheritance OperatorHelper * including init operation context and the size of shape */ -template -class ScaleHelper : public OperatorHelper { +template +class ScaleHelper : public OperatorHelper { public: ScaleHelper()=default; - ~ScaleHelper(); + ~ScaleHelper(){} Status InitParam() override; @@ -72,8 +72,8 @@ class ScaleHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,18 +81,16 @@ class ScaleHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_scale stand for scale parameter - saber::ScaleParam> _param_scale; + saber::ScaleParam _param_scale; ///< _funcs_scale stand for scale function - saber::Scale _funcs_scale; + saber::Scale::saber_type> _funcs_scale; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/sequence_conv.cpp b/framework/operators/sequence_conv.cpp new file mode 100644 index 000000000..c57277a5c --- /dev/null +++ b/framework/operators/sequence_conv.cpp @@ -0,0 +1,157 @@ +#include "framework/operators/sequence_conv.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void SequenceConv::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> + (this->_helper)->_param; + impl->_funcs(ins, outs, param, ctx); +} +#endif + +#ifdef USE_X86_PLACE +template<> +void SequenceConv::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> + (this->_helper)->_param; + impl->_funcs(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator + + +/// set helper +template +SequenceConvHelper::~SequenceConvHelper() { +} + +template +Status SequenceConvHelper::InitParam() { + DLOG(WARNING) << "Parsing SequenceConv op parameter."; + + auto context_length=GET_PARAMETER(int, context_length); + auto context_start=GET_PARAMETER(int, context_start); + auto context_stride=GET_PARAMETER(int, context_stride); + auto padding_trainable=GET_PARAMETER(bool, padding_trainable); + //auto filter_tensor=GET_PARAMETER(PBlock::type> , filter_tensor); + //auto padding_tensor=GET_PARAMETER(PBlock::type> , padding_tensor); + using pblock_type = PBlock; + auto filter_tensor = GET_PARAMETER(pblock_type, filter_tensor); + auto padding_tensor = GET_PARAMETER(pblock_type, padding_tensor); + + + if(padding_tensor.d_tensor().valid_size()>0) { + SequenceConvParam param(&(filter_tensor.d_tensor()), context_length, context_start, + context_stride, padding_trainable, &(padding_tensor.d_tensor())); + _param = param; + }else{ + SequenceConvParam param(&(filter_tensor.d_tensor()), context_length, context_start, + context_stride, padding_trainable); + _param = param; + } + + return Status::OK(); +} + +template<> +Status SequenceConvHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(INFO) << "are you ok"; + SABER_CHECK(_funcs.init(ins, outs, _param, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +template<> +Status SequenceConvHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(INFO) << "are you ok"; + SABER_CHECK(_funcs.init(ins, outs, _param, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template<> +Status SequenceConvHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(INFO) << "are you ok"; + SABER_CHECK(_funcs.init(ins, outs, _param, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SequenceConvHelper::Init(OpContext& ctx, + const std::vector >& ins, + 
std::vector >& outs) { + SABER_CHECK(_funcs.init(ins, outs, _param, STATIC, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SequenceConvHelper::InferShape(const + std::vector >& + ins, + std::vector >& outs) { + SABER_CHECK(_funcs.compute_output_shape(ins, outs, _param)); + return Status::OK(); +} +#ifdef USE_X86_PLACE +template class SequenceConvHelper; +template class SequenceConvHelper; +template class SequenceConvHelper; +#endif +#ifdef USE_CUDA +template class SequenceConvHelper; +template class SequenceConvHelper; +template class SequenceConvHelper; +#endif +#ifdef USE_ARM_PLACE +template class SequenceConvHelper; +template class SequenceConvHelper; +template class SequenceConvHelper; +#endif +// register helper +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, X86, Precision::FP32); +#endif + +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, ARM, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(SequenceConv) +.Doc("SequenceConv operator") +#ifdef USE_X86_PLACE +.__alias__("SequenceConv") +#endif +#ifdef USE_CUDA +.__alias__("SequenceConv") +#endif +#ifdef USE_ARM_PLACE +.__alias__("SequenceConv") +#endif +.num_in(1) +.num_out(1) +.Args("axis", " axis "); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/sequence_conv.h b/framework/operators/sequence_conv.h new file mode 100644 index 000000000..36ada8db7 --- /dev/null +++ b/framework/operators/sequence_conv.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_SEQUENCE_CONV_H +#define ANAKIN_OPERATOR_SEQUENCE_CONV_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/sequence_conv.h" + +namespace anakin { + +namespace ops { + +template +class SequenceConvHelper; + +/// pooling op +/** + * \brief SequenceConv implementation class + * public inherit Operator + */ +template +class SequenceConv : public Operator { +public: + SequenceConv() {} + + /// forward impl + virtual void operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SequenceConv< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class SequenceConvHelper; +}; + +/** + * \brief SequenceConv helper class to implement SequenceConv + * public inherit OperatorHelper + * including init resource and shape size in SequenceConv context + */ +template +class SequenceConvHelper : public OperatorHelper { +public: + SequenceConvHelper() = default; + + ~SequenceConvHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for SequenceConv operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_softmax stand for softmax parameter + saber::SequenceConvParam _param; + ///< _funcs_SequenceConv stand for softmax function + saber::SequenceConv::saber_type> _funcs; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/sequence_pool.cpp b/framework/operators/sequence_pool.cpp index 9842c4590..7d25236f6 100644 --- a/framework/operators/sequence_pool.cpp +++ b/framework/operators/sequence_pool.cpp @@ -4,14 +4,14 @@ namespace anakin { namespace ops { -#ifdef USE_X86 +#ifdef USE_X86_PLACE template<> -void SequencePool::operator()( +void SequencePool::operator()( OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_sequence_pool; + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_sequence_pool; impl->_funcs_sequence_pool(ins, outs, param, ctx); } #endif @@ -20,78 +20,85 @@ void SequencePool::operator()( /// set helper -template -SequencePoolHelper::~SequencePoolHelper() { +template +SequencePoolHelper::~SequencePoolHelper() { } -template -Status SequencePoolHelper::InitParam() { +template +Status SequencePoolHelper::InitParam() { DLOG(WARNING) << "Parsing SequencePool op parameter."; auto pooltype = GET_PARAMETER(std::string, pooltype); - - saber::SequencePoolParam> sequence_pool_param; + std::unordered_map type_map; + type_map.insert(std::make_pair("null", anakin::saber::Sequence_pool_unknow)); + type_map.insert(std::make_pair("AVERAGE", anakin::saber::Sequence_pool_average)); + type_map.insert(std::make_pair("SUM", anakin::saber::Sequence_pool_sum)); + type_map.insert(std::make_pair("SQRT", 
anakin::saber::Sequence_pool_sqrt)); + type_map.insert(std::make_pair("LAST", anakin::saber::Sequence_pool_last)); + type_map.insert(std::make_pair("FIRST", anakin::saber::Sequence_pool_first)); + type_map.insert(std::make_pair("MAX", anakin::saber::Sequence_pool_max)); + saber::SequencePoolParam sequence_pool_param(type_map[pooltype]); _param_sequence_pool = sequence_pool_param; return Status::OK(); } -template -Status SequencePoolHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status SequencePoolHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_sequence_pool.init(ins, outs, _param_sequence_pool, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status SequencePoolHelper::InferShape(const std::vector >& +template +Status SequencePoolHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { + std::vector >& outs) { SABER_CHECK(_funcs_sequence_pool.compute_output_shape(ins, outs, _param_sequence_pool)); return Status::OK(); } #ifdef USE_CUDA -template class SequencePoolHelper; -template class SequencePoolHelper; -template class SequencePoolHelper; +template class SequencePoolHelper; +template class SequencePoolHelper; +template class SequencePoolHelper; #endif #ifdef USE_ARM_PLACE -template class SequencePoolHelper; -template class SequencePoolHelper; -template class SequencePoolHelper; +template class SequencePoolHelper; +template class SequencePoolHelper; +template class SequencePoolHelper; #endif #ifdef USE_X86_PLACE -template class SequencePoolHelper; -template class SequencePoolHelper; -template class SequencePoolHelper; +template class SequencePoolHelper; +template class SequencePoolHelper; +template class SequencePoolHelper; #endif // register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, NV, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, NV, Precision::FP32); 
#endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, ARM, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, X86, AK_FLOAT, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, X86, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(SequencePool) .Doc("SequencePool operator") #ifdef USE_CUDA -.__alias__("SequencePool") +.__alias__("SequencePool") #endif #ifdef USE_ARM_PLACE -.__alias__("SequencePool") +.__alias__("SequencePool") #endif #ifdef USE_X86_PLACE -.__alias__("SequencePool") +.__alias__("SequencePool") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/sequence_pool.h b/framework/operators/sequence_pool.h index c0856204b..efa301dab 100644 --- a/framework/operators/sequence_pool.h +++ b/framework/operators/sequence_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class SequencePoolHelper; /// pooling op @@ -34,20 +34,20 @@ class SequencePoolHelper; * \brief SequencePool operation class * public inheritance Operator */ -template -class SequencePool : public Operator { +template +class SequencePool : public Operator { public: SequencePool() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator SequencePool::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator SequencePool< Ttype(" + //<< target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SequencePoolHelper; + friend class SequencePoolHelper; }; /** @@ -55,8 +55,8 @@ class SequencePool : public Operator { * public inherit OperatorHelper * including init resource and shape size in sequence_pool context */ -template -class SequencePoolHelper : public OperatorHelper { +template +class SequencePoolHelper : public OperatorHelper { public: SequencePoolHelper()=default; @@ -72,8 +72,8 @@ class SequencePoolHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class SequencePoolHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_sequence_pool stand for SequencePool parameter - saber::SequencePoolParam> _param_sequence_pool; + saber::SequencePoolParam _param_sequence_pool; ///< _funcs_sequence_pool stand for SequencePool function - saber::SequencePool _funcs_sequence_pool; + saber::SequencePool::saber_type> _funcs_sequence_pool; private: ///< _dims stand for SequencePool size diff --git a/framework/operators/slice.cpp b/framework/operators/slice.cpp index e35250752..da925f192 100644 --- a/framework/operators/slice.cpp +++ b/framework/operators/slice.cpp @@ -4,30 +4,21 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Slice::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_slice; - impl->_funcs_slice(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -SliceHelper::~SliceHelper() { +#define INSTANCE_SLICE(Ttype, Ptype) \ +template<> \ +void Slice::operator()( \ + OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_slice; \ + impl->_funcs_slice(ins, outs, param, ctx); \ } -template -Status SliceHelper::InitParam() { +template +Status SliceHelper::InitParam() { DLOG(WARNING) << "Parsing Slice op parameter."; auto slice_dim = GET_PARAMETER(int, slice_dim); _slice_point = GET_PARAMETER(PTuple, slice_point); @@ -41,30 +32,30 @@ Status SliceHelper::InitParam() { LOG(INFO) << " axis " << _axis; - SliceParam> param_slice(_axis, _slice_point.vector()); + SliceParam param_slice(_axis, _slice_point.vector()); _param_slice = param_slice; return Status::OK(); } -template -Status SliceHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status SliceHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { SABER_CHECK(_funcs_slice.init(ins, outs, _param_slice, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } -template -Status SliceHelper::InferShape(const std::vector >& +template +Status SliceHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { + std::vector >& outs) { if (_slice_point.size() + 1 != outs.size()) { if (_slice_point.size() == 1) { for (int i = 0; i < outs.size() - 2; i++) { _slice_point.push_back(_slice_point[0] + _slice_point[_slice_point.size() - 1]); } - SliceParam> param_slice(_axis, _slice_point.vector()); + SliceParam param_slice(_axis, _slice_point.vector()); _param_slice = param_slice; } } @@ -74,36 +65,39 @@ Status SliceHelper::InferShape(const std::vector; -template class SliceHelper; -template class SliceHelper; +INSTANCE_SLICE(NV, Precision::FP32); +template class SliceHelper; +ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, 
NV, Precision::FP32); +template class SliceHelper; +template class SliceHelper; #endif -#ifdef USE_ARM_PLACE -template class SliceHelper; -template class SliceHelper; -template class SliceHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, NV, AK_FLOAT, Precision::FP32); + +#ifdef USE_X86_PLACE +INSTANCE_SLICE(X86, Precision::FP32); +template class SliceHelper; +ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, X86, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_SLICE(ARM, Precision::FP32); +template class SliceHelper; +ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, ARM, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(Slice) .Doc("Slice operator") #ifdef USE_CUDA -.__alias__("slice") +.__alias__("slice") #endif #ifdef USE_ARM_PLACE -.__alias__("slice") +.__alias__("slice") #endif .num_in(1) .num_out(1) .Args("slice_dim", " slice dim at input ") .Args>("slice_point", " slice point of op") - .Args("axis", " axis of input to slice"); +.Args("axis", " axis of input to slice"); } /* namespace ops */ diff --git a/framework/operators/slice.h b/framework/operators/slice.h index 75b00aa6c..7abbda31b 100644 --- a/framework/operators/slice.h +++ b/framework/operators/slice.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class SliceHelper; /// pooling op @@ -34,20 +34,20 @@ class SliceHelper; * \brief Slice implementation class * public inherit Operator */ -template -class Slice : public Operator { +template +class Slice : public Operator { public: Slice() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Slice< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SliceHelper; + friend class SliceHelper; }; /** @@ -55,12 +55,12 @@ class Slice : public Operator { * public inherit OperatorHelper * including init resource and shape size in Slice context */ -template -class SliceHelper : public OperatorHelper { +template +class SliceHelper : public OperatorHelper { public: SliceHelper()=default; - ~SliceHelper(); + ~SliceHelper() {} Status InitParam() override; @@ -72,8 +72,8 @@ class SliceHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. 
@@ -81,14 +81,14 @@ class SliceHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_slice stand for slice parameter - saber::SliceParam> _param_slice; + saber::SliceParam _param_slice; ///< _funcs_slice stand for slice function - saber::Slice _funcs_slice; + saber::Slice::saber_type> _funcs_slice; private: ///< _slice_point stand for op slice @@ -97,8 +97,6 @@ class SliceHelper : public OperatorHelper { int _axis; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/softmax.cpp b/framework/operators/softmax.cpp index 509a8445e..3264470eb 100644 --- a/framework/operators/softmax.cpp +++ b/framework/operators/softmax.cpp @@ -4,78 +4,117 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Softmax::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_softmax; - impl->_funcs_softmax(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -SoftmaxHelper::~SoftmaxHelper() { +#define INSTANCE_SOFTMAX(Ttype, Ptype) \ +template<> \ +void Softmax::operator()( \ + OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>\ + (this->_helper)->_param_softmax; \ + impl->_funcs_softmax(ins, outs, param, ctx); \ } -template -Status SoftmaxHelper::InitParam() { +template +Status SoftmaxHelper::InitParam() { DLOG(WARNING) << "Parsing Softmax op parameter."; auto axis = GET_PARAMETER(int, axis); - SoftmaxParam> param_softmax(axis); + SoftmaxParam param_softmax(axis); _param_softmax = param_softmax; return Status::OK(); } -template -Status SoftmaxHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template<> +Status SoftmaxHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +template<> +Status SoftmaxHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template<> +Status SoftmaxHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SoftmaxHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, STATIC, SABER_IMPL, ctx)); return Status::OK(); } -template -Status SoftmaxHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status SoftmaxHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { SABER_CHECK(_funcs_softmax.compute_output_shape(ins, outs, 
_param_softmax)); return Status::OK(); } #ifdef USE_CUDA -template class SoftmaxHelper; -template class SoftmaxHelper; -template class SoftmaxHelper; -#endif -#ifdef USE_ARM_PLACE -template class SoftmaxHelper; -template class SoftmaxHelper; -template class SoftmaxHelper; +INSTANCE_SOFTMAX(NV, Precision::FP32); +template class SoftmaxHelper; +template class SoftmaxHelper; +template class SoftmaxHelper; +ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, NV, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, NV, AK_FLOAT, Precision::FP32); + +#ifdef USE_X86_PLACE +INSTANCE_SOFTMAX(X86, Precision::FP32); +template class SoftmaxHelper; +ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, X86, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_SOFTMAX(ARM, Precision::FP32); +template <> +Status SoftmaxHelper::Init(OpContext &ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, ARM, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_SOFTMAX(AMD, Precision::FP32); +template <> +Status SoftmaxHelper::Init(OpContext &ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, AMD, Precision::FP32); #endif + //! 
register op ANAKIN_REGISTER_OP(Softmax) .Doc("Softmax operator") #ifdef USE_CUDA -.__alias__("softmax") +.__alias__("softmax") #endif #ifdef USE_ARM_PLACE -.__alias__("softmax") +.__alias__("softmax") +#endif +#ifdef USE_X86_PLACE +.__alias__("softmax") +#endif +#ifdef AMD_GPU +.__alias__("softmax") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/softmax.h b/framework/operators/softmax.h index aa8b8ef5f..ad91b1fec 100644 --- a/framework/operators/softmax.h +++ b/framework/operators/softmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace anakin { namespace ops { -template +template class SoftmaxHelper; /// pooling op @@ -34,33 +34,35 @@ class SoftmaxHelper; * \brief softmax implementation class * public inherit Operator */ -template -class Softmax : public Operator { +template +class Softmax : public Operator { public: Softmax() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - //LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Softmax< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SoftmaxHelper; + friend class SoftmaxHelper; }; + + /** * \brief softmax helper class to implement softmax * public inherit OperatorHelper * including init resource and shape size in softmax context */ -template -class SoftmaxHelper : public OperatorHelper { +template +class SoftmaxHelper : public OperatorHelper { public: SoftmaxHelper()=default; - ~SoftmaxHelper(); + ~SoftmaxHelper() {} Status InitParam() override; @@ -72,8 +74,8 @@ class SoftmaxHelper : public OperatorHelper { * \return status */ Status 
Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -81,18 +83,16 @@ class SoftmaxHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< _param_softmax stand for softmax parameter - saber::SoftmaxParam> _param_softmax; + saber::SoftmaxParam _param_softmax; ///< _funcs_softmax stand for softmax function - saber::Softmax _funcs_softmax; + saber::Softmax::saber_type> _funcs_softmax; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/split.cpp b/framework/operators/split.cpp index 09de18e83..41b1a3dd3 100644 --- a/framework/operators/split.cpp +++ b/framework/operators/split.cpp @@ -3,102 +3,73 @@ namespace anakin { namespace ops { +#define INSTANCE_SPLIT(Ttype, Ptype) \ +template<> \ +void Split::operator()( \ + OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) {} -#ifdef USE_CUDA -template<> -void Split::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { -} -#endif - -#ifdef USE_X86_PLACE -template<> -void Split::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { -} -#endif - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -SplitHelper::~SplitHelper() { -} -template -Status SplitHelper::InitParam() { +template +Status SplitHelper::InitParam() { DLOG(WARNING) << "Parsing Split op parameter."; split_num = GET_PARAMETER(int, split_num); return Status::OK(); } -template -Status SplitHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { +template +Status SplitHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { return Status::OK(); } -template -Status SplitHelper::InferShape(const std::vector >& - ins, - std::vector >& outs) { +template +Status SplitHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { for (int i = 0; i < split_num; i++) { outs[i]->set_shape(ins[0]->valid_shape()); outs[i]->set_seq_offset(ins[0]->get_seq_offset()); } - return Status::OK(); } -#ifdef USE_CUDA -template class SplitHelper; -template class SplitHelper; -template class SplitHelper; -#endif -#ifdef USE_ARM_PLACE -template class SplitHelper; -template class SplitHelper; -template class SplitHelper; -#endif -#ifdef USE_X86_PLACE -template class SplitHelper; -template class SplitHelper; -template class SplitHelper; -#endif -// register helper #ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, NV, AK_FLOAT, Precision::FP32); +INSTANCE_SPLIT(NV, Precision::FP32); +template class SplitHelper; +ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, NV, Precision::FP32); #endif + #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, ARM, AK_FLOAT, Precision::FP32); +INSTANCE_SPLIT(ARM, Precision::FP32); +template class SplitHelper; +ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, ARM, Precision::FP32); #endif + #ifdef USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, X86, AK_FLOAT, Precision::FP32); +INSTANCE_SPLIT(X86, Precision::FP32); +template class SplitHelper; +ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, X86, Precision::FP32); #endif //! 
register op ANAKIN_REGISTER_OP(Split) .Doc("Split operator") #ifdef USE_CUDA -.__alias__("split") +.__alias__("split") #endif #ifdef USE_ARM_PLACE -.__alias__("split") +.__alias__("split") #endif #ifdef USE_X86_PLACE -.__alias__("split") +.__alias__("split") #endif .num_in(1) .num_out(1) .Args("split_num", " split output number. "); + } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/split.h b/framework/operators/split.h index 09b5dd1bd..a52f24bfa 100644 --- a/framework/operators/split.h +++ b/framework/operators/split.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ namespace anakin { namespace ops { -template +template class SplitHelper; /// pooling op @@ -33,20 +33,18 @@ class SplitHelper; * \brief Split implementation class * public inherit Operator */ -template -class Split : public Operator { +template +class Split : public Operator { public: Split() {} /// forward impl virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - //LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + const std::vector >& ins, + std::vector >& outs) { } - friend class SplitHelper; + friend class SplitHelper; }; /** @@ -54,12 +52,12 @@ class Split : public Operator { * public inherit OperatorHelper * including init resource and shape size in Split context */ -template -class SplitHelper : public OperatorHelper { +template +class SplitHelper : public OperatorHelper { public: SplitHelper()=default; - ~SplitHelper(); + ~SplitHelper(){} Status InitParam() override; @@ -71,8 +69,8 @@ class SplitHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; + const std::vector >& ins, + 
std::vector >& outs) override; /** * \brief infer the shape of output and input. @@ -80,8 +78,8 @@ class SplitHelper : public OperatorHelper { * \param outs stand for output tensor vector * \return status */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; public: ///< split_num stand for split-numbers @@ -89,8 +87,6 @@ class SplitHelper : public OperatorHelper { }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/service/CMakeLists.txt b/framework/service/CMakeLists.txt new file mode 100644 index 000000000..7e7657f8f --- /dev/null +++ b/framework/service/CMakeLists.txt @@ -0,0 +1,14 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved +# ---------------------------------------------------------------------------- + +# used for temporary +anakin_fetch_include_recursively(${ANAKIN_SERVICE}) +set(ANAKIN_BASE_SRC "") + +anakin_fetch_files_with_suffix(${ANAKIN_SERVICE}/api "cpp" ANAKIN_BASE_SRC) +anakin_fetch_files_with_suffix(${ANAKIN_SERVICE} "cpp" ANAKIN_BASE_SRC) + +list(APPEND ANAKIN_SRC ${ANAKIN_BASE_SRC}) +set(ANAKIN_SRC ${ANAKIN_SRC} PARENT_SCOPE) +unset(ANAKIN_BASE_SRC) diff --git a/framework/service/anakin_service.cpp b/framework/service/anakin_service.cpp new file mode 100644 index 000000000..4f7fb9b68 --- /dev/null +++ b/framework/service/anakin_service.cpp @@ -0,0 +1,165 @@ +#include "framework/service/anakin_service.h" + +namespace anakin { + +namespace rpc { + +template +void AnakinService::set_device_id(int dev_id) { + _dev_id = dev_id; + saber::TargetWrapper::set_device(_dev_id); +} + +template +void AnakinService::initial(std::string model_name, + std::string model_path, + int thread_num) { + _worker_map[model_name] = std::make_shared >(model_path, + thread_num); +} + +template +void AnakinService::launch() { + for(auto it = 
_worker_map.begin(); it != _worker_map.end();) { + it->second->launch(); + it++; + } +} + +template +void AnakinService::register_inputs(std::string model_name, + std::vector in_names) { + _worker_map[model_name]->register_inputs(in_names); +} + +template +void AnakinService::register_outputs(std::string model_name, + std::vector out_names) { + _worker_map[model_name]->register_outputs(out_names); +} + +template +void AnakinService::Reshape(std::string model_name, + std::string in_name, + std::vector in_shape) { + _worker_map[model_name]->Reshape(in_name, in_shape); +} + +template +void AnakinService::register_interior_edges(std::string model_name, + std::string edge_start, + std::string edge_end) { + _worker_map[model_name]->register_interior_edges(edge_start, edge_end); +} + +template +inline void AnakinService::extract_request( + const RPCRequest* request, + std::vector::type> >& inputs) { + for(int i = 0; i < request->inputs_size(); i++) { + LOG(INFO) << "Get " << i << "input"; + auto& io = request->inputs(i); + auto& data = io.tensor(); + auto& shape = data.shape(); + saber::Shape tensor_shape({shape[0],shape[1],shape[2],shape[3]}); + Tensor4d::type> h_tensor; + h_tensor.re_alloc(tensor_shape); + float* h_data = (float*)(h_tensor.mutable_data()); + DLOG(INFO) <<"Check shape: " << shape[0] << " " << shape[1] << " " << shape[2] << " " < +inline void AnakinService::fill_response_data( + int request_id, + std::string model_name, + RPCResponse* response, + std::vector::type> >& outputs) { + response->set_model(model_name); + response->set_request_id(request_id); + int count =0; + for(auto& h_out : outputs) { + LOG(INFO) << "Get " << count << " output"; + count++; + // copy to host + auto shape = h_out.valid_shape(); + // fill response + IO* output = response->add_outputs(); + Data* data = output->mutable_tensor(); + data->add_shape(shape[0]); + data->add_shape(shape[1]); + data->add_shape(shape[2]); + data->add_shape(shape[3]); + 
data->mutable_data()->Reserve(shape[0]*shape[1]*shape[2]*shape[3]); + for(int j=0; jadd_data(((float*)(h_out.mutable_data()))[j]); + } + LOG(INFO) << " output size: " <data_size(); + } +} + +template +inline void AnakinService::fill_response_exec_info(RPCResponse* response) { + auto* info = response->mutable_info(); + info->set_msg("SUC"); + DeviceStatus* status_p = info->mutable_device_status(); + status_p->set_id(_monitor.get_id()); + status_p->set_name(_monitor.get_name()); + status_p->set_temp(_monitor.get_temp()); + status_p->set_mem_free(_monitor.get_mem_free()); + status_p->set_mem_used(_monitor.get_mem_used()); + info->set_duration_in_nano_seconds(-1); +} + +#ifdef USE_CUDA +template class AnakinService; +template class AnakinService; +template class AnakinService; + +template class AnakinService; +template class AnakinService; +template class AnakinService; +#endif + +#ifdef USE_X86_PLACE +template class AnakinService; +template class AnakinService; +template class AnakinService; + +template class AnakinService; +template class AnakinService; +template class AnakinService; +#endif + +#ifdef USE_ARM_PLACE +#ifdef ANAKIN_TYPE_FP32 +template class AnakinService; +template class AnakinService; +#endif + +#ifdef ANAKIN_TYPE_FP16 +template class AnakinService; +template class AnakinService; +#endif + +#ifdef ANAKIN_TYPE_INT8 +template class AnakinService; +template class AnakinService; +#endif //int8 + +#endif //arm + + +} /* namespace rpc */ + +} /* namespace anakin */ + diff --git a/framework/service/anakin_service.h b/framework/service/anakin_service.h new file mode 100644 index 000000000..1a0a136e5 --- /dev/null +++ b/framework/service/anakin_service.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SERVICE_H +#define ANAKIN_SERVICE_H + +#include + +#include "framework/service/monitor.h" +#include "framework/core/net/worker.h" +#include "framework/service/api/service.pb.h" + +namespace anakin { + +namespace rpc { + +template +class AnakinService : public RPCService { +public: + void evaluate(::google::protobuf::RpcController* controller_base, + const RPCRequest* request, + RPCResponse* response, + ::google::protobuf::Closure* done) { + _evaluate(controller_base, request, response, done, ServiceRunPatternToType()); + } + +public: + void set_device_id(int dev_id); + + void initial(std::string model_name, std::string model_path, int thread_num); + + void launch(); + + void Reshape(std::string model_name, std::string in_name, std::vector in_shape); + + void register_inputs(std::string model_name, std::vector in_names); + + void register_outputs(std::string model_name, std::vector); + + void register_interior_edges(std::string model_name, std::string edge_start, std::string edge_end); + + template + void register_aux_function(std::string model_name, functor function, ParamTypes ...args) { + _worker_map[model_name].register_aux_function(function, std::forward(args)...); + } + + template + void create_monitor(int interval_time_in_sec) { + _monitor.template create_instance(_dev_id, interval_time_in_sec); + } + +private: + void extract_request(const RPCRequest* request, + std::vector::type> >& inputs); + void fill_response_data(int request_id, std::string model_name, + RPCResponse* response, + std::vector::type> >& outputs); + void 
fill_response_exec_info(RPCResponse* response); + +private: + void _evaluate(::google::protobuf::RpcController* controller_base, + const RPCRequest* request, + RPCResponse* response, + ::google::protobuf::Closure* done, + ServiceRunPatternToType) { + // make sure that done will be invoked + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(controller_base); + // receive remote call from client. + LOG(INFO) << "Received request[log_id=" << cntl->log_id() << "] from " << cntl->remote_side(); + if (!cntl->request_attachment().empty()) { + LOG(INFO) << " |-- (attached=" << cntl->request_attachment() << ")"; + } + std::string model_name = request->model(); + int request_id = request->request_id(); + LOG(INFO) <<" |-- Get model: "<::type> > inputs; + extract_request(request, inputs); + auto ret = _worker_map[model_name]->sync_prediction(inputs); + auto results = ret.get(); + LOG(ERROR) << "do infer over! thread id: " << std::this_thread::get_id(); + fill_response_data(request_id, model_name, response, results); + fill_response_exec_info(response); + } + + void _evaluate(::google::protobuf::RpcController* controller_base, + const RPCRequest* request, + RPCResponse* response, + ::google::protobuf::Closure* done, + ServiceRunPatternToType) { + // make sure that done will be invoked + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(controller_base); + } + + + +private: + std::unordered_map > > _worker_map; + Monitor _monitor; + int _dev_id; +}; + +} /* namespace rpc */ + +} /* namespace anakin */ + +#endif diff --git a/framework/service/api/.gitignore b/framework/service/api/.gitignore new file mode 100644 index 000000000..0d9561969 --- /dev/null +++ b/framework/service/api/.gitignore @@ -0,0 +1,42 @@ +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled 
Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# python +*.pyc + +# c++ +*.h +*.cpp +*.cc +*.pb.h +*.pb.cpp diff --git a/framework/service/api/service.proto b/framework/service/api/service.proto new file mode 100644 index 000000000..baa61f03f --- /dev/null +++ b/framework/service/api/service.proto @@ -0,0 +1,51 @@ +syntax = "proto3"; + +package anakin.rpc; + +option cc_generic_services = true; + +message Data { + repeated int32 shape = 1; + repeated float data = 2; +}; + +message IO { + Data tensor = 1; // input tensor +}; + +// RPC request +message RPCRequest { + bytes model = 1; + repeated IO inputs = 2; + int64 request_id = 3; // you need to set request ID,then to get async retults by request_id +}; + +message DeviceStatus { + int32 id = 1; // device id (represent as device num id) + bytes name = 2; // device name + int32 temp = 3; // device temperature Celsius degree + int32 mem_free = 4; // device memory free bytes + int32 mem_used = 5; // device memory used bytes +}; + +// RPC service execution information +message ExecutionInfo { + // additional exception message of the execution + bytes msg = 1; + // duration of this execution in nano seconds + int32 duration_in_nano_seconds = 2; + // device status + DeviceStatus device_status = 3; +}; + +// RPC response +message RPCResponse { + bytes model = 1; // model name + repeated IO outputs = 2; // evaluation output of a batch + ExecutionInfo info = 3; // the additional information of this execution + int64 request_id = 4; +}; + +service RPCService { + rpc evaluate (RPCRequest) returns (RPCResponse); +}; diff --git a/framework/service/device_info.cpp b/framework/service/device_info.cpp new file mode 100644 index 000000000..9639c5699 --- /dev/null +++ b/framework/service/device_info.cpp @@ -0,0 +1,99 @@ +#include "framework/service/device_info.h" + +namespace anakin { + +namespace rpc { + +#ifdef USE_CUDA +template<> +struct Inquiry { + ~Inquiry() { + result = nvmlShutdown(); + if 
(NVML_SUCCESS != result) { + LOG(FATAL) << "Failed to shutdown the nvml of device: " << nvmlErrorString(result); + } + } + + void init(int dev_id = 0) { + _dev_id = dev_id; + memory_has_inspected = false; + result = nvmlInit(); + if (NVML_SUCCESS != result) { + LOG(FATAL) <<" Failed to initialize NVML: " << nvmlErrorString(result); + } + result = nvmlDeviceGetHandleByIndex(dev_id, &device); + if (NVML_SUCCESS != result) { + LOG(FATAL) << " Failed to get handle for device: " << nvmlErrorString(result); + } + } + + template + typename InfoTraits::data_type get() { + LOG(WARNING) << "Target not support! "; + return InfoTraits::data_type(); + } + +private: + int _dev_id; + nvmlReturn_t result; + nvmlDevice_t device; + nvmlMemory_t memory; + bool memory_has_inspected; +}; + +template<> +typename InfoTraits::data_type Inquiry::get() { + return _dev_id; +} + +template<> +typename InfoTraits::data_type Inquiry::get() { + char name[NVML_DEVICE_NAME_BUFFER_SIZE]; + result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); + if (NVML_SUCCESS != result) { + LOG(FATAL) << "Failed to get name of device: " << nvmlErrorString(result); + } + return std::string(name); +} + +template<> +typename InfoTraits::data_type Inquiry::get() { + nvmlTemperatureSensors_t sensorType = NVML_TEMPERATURE_GPU; + unsigned int temp; + result = nvmlDeviceGetTemperature(device, sensorType, &temp); + if (NVML_SUCCESS != result) { + LOG(FATAL) << "Failed to get temperature of device: " << nvmlErrorString(result); + } + return temp; +} + +template<> +typename InfoTraits::data_type Inquiry::get() { + if(!memory_has_inspected) { + result = nvmlDeviceGetMemoryInfo(device, &memory); + if (NVML_SUCCESS != result) { + LOG(FATAL) << "Failed to get device memory info of device: " << nvmlErrorString(result); + } + memory_has_inspected = true; + } + return memory.free; +} + +template<> +typename InfoTraits::data_type Inquiry::get() { + if(!memory_has_inspected) { + result = 
nvmlDeviceGetMemoryInfo(device, &memory); + if (NVML_SUCCESS != result) { + LOG(FATAL) << "Failed to get device memory info of device: " << nvmlErrorString(result); + } + memory_has_inspected = true; + } + return memory.used; +} + +#endif + +} /* namespace rpc */ + +} /* namespace anakin */ + diff --git a/framework/service/device_info.h b/framework/service/device_info.h new file mode 100644 index 000000000..409999f26 --- /dev/null +++ b/framework/service/device_info.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_DEVICE_INFO_H +#define ANAKIN_DEVICE_INFO_H + +#include "anakin_config.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_CUDA +#include +#include +#include +#include // cuda driver types +#endif + +#include "utils/logger/logger.h" +#include "saber/saber_types.h" + +namespace anakin { + +namespace rpc { + +enum Info { + DEV_ID, + DEV_NAME, + DEV_TMP, + DEV_MEM_FREE, + DEV_MEM_USED, +}; + +template +struct check_same { + const static bool value = false; +}; + +template<> +struct check_same { + const static bool value = true; +}; + +template<> +struct check_same { + const static bool value = true; +}; + +template<> +struct check_same { + const static bool value = true; +}; + +template<> +struct check_same { + const static bool value = true; +}; + +template<> +struct check_same { + const static bool value = true; +}; + + +template +struct InfoTraits { + typedef float data_type; + float _val; +}; + +template<> +struct InfoTraits { + typedef std::string data_type; +}; + +template<> +struct InfoTraits { + typedef int data_type; +}; + +template +struct InfoStruct { + void _set(typename InfoTraits::data_type value) { + _val = value; + } + typename InfoTraits::data_type _get() { + return _val; + } +private: + typename InfoTraits::data_type _val; +}; + +template +struct Inquiry { + ~Inquiry() {} + + void init(int dev_id = 0) {} + + template + typename InfoTraits::data_type get() { + LOG(WARNING) << "Target not support! "; + return typename InfoTraits::data_type(); + } +private: + int _dev_id; +}; + +template +struct HasTarget { + const static bool value = check_same::value || HasTarget::value; +}; + +template +struct HasTarget { + const static bool value = check_same::value; +}; + +template +class DevInfo : public InfoStruct... 
{ +public: + template + void set(typename InfoTraits::data_type value) { + std::unique_lock lock(this->_mut); + if(HasTarget::value) { + LOG(FATAL)<<" DevInfo parameter pack doesn't have target info type " << I; + } + InfoStruct::_set(value); + } + + template + typename InfoTraits::data_type get() { + if(HasTarget::value) { + LOG(ERROR)<<" DevInfo parameter pack doesn't have target info type " << I; + return typename InfoTraits::data_type(); + } + return InfoStruct::_get(); + } + + template + void inquiry(int dev_id) { + Inquiry instance; + instance.init(dev_id); + std::vector info_vec = {infos...}; + for(auto& info : info_vec) { + switch(info) { + case DEV_ID: { + set(instance.get()); + } break; + case DEV_NAME: { + set(instance.get()); + } break; + case DEV_TMP: { + set(instance.get()); + } break; + case DEV_MEM_FREE: { + set(instance.get()); + } break; + case DEV_MEM_USED: { + set(instance.get()); + } break; + default: break; + } + } + } +private: + std::mutex _mut; +}; + +} /* namespace rpc */ + +} /* namespace anakin */ + +#endif diff --git a/framework/service/monitor.cpp b/framework/service/monitor.cpp new file mode 100644 index 000000000..c059c85e6 --- /dev/null +++ b/framework/service/monitor.cpp @@ -0,0 +1,10 @@ +#include "framework/service/monitor.h" + +namespace anakin { + +namespace rpc { + +} /* namespace rpc */ + +} /* namespace anakin */ + diff --git a/framework/service/monitor.h b/framework/service/monitor.h new file mode 100644 index 000000000..3ef423c1f --- /dev/null +++ b/framework/service/monitor.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_MONITOR_H +#define ANAKIN_MONITOR_H + +#include "framework/service/device_info.h" + +namespace anakin { + +namespace rpc { + +/// monitor thread pool +template +class Monitor { +public: + Monitor(){} + ~Monitor(){} + + template + void create_instance(int dev_id, int interval_time_in_sec) { + _id = dev_id; + _monitor_thread = new std::thread([this](int dev_id, int time) { + DevInfo dev_info_pack; + std::chrono::time_point start = sys_clock::now(); + for(;;) { + double elapsed_time_ms =\ + std::chrono::duration_cast(sys_clock::now()-start).count(); + if(elapsed_time_ms > time * 1000) { + start = sys_clock::now(); + dev_info_pack.template inquiry(dev_id); + _name = dev_info_pack.template get(); + _temp = dev_info_pack.template get(); + _mem_free = dev_info_pack.template get(); + _mem_used = dev_info_pack.template get(); + } + } + }, dev_id, interval_time_in_sec); + } + + int get_id() { return _id; } + + std::string get_name() { return _name; } + + float get_temp() { return _temp; } + + float get_mem_free() { return _mem_free; } + + float get_mem_used() { return _mem_used; } + +private: + typedef std::chrono::system_clock sys_clock; + int _id{-1}; // device id (represent as device num id) + std::string _name{"unknown"}; // device name + float _temp{-1000}; // device temperature Celsius degree + float _mem_free{-1}; // device memory free bytes + float _mem_used{-1}; + std::thread* _monitor_thread; +}; + +} /* namespace rpc */ + +} /* namespace anakin */ + +#endif diff --git a/framework/service/service_daemon.cpp 
b/framework/service/service_daemon.cpp new file mode 100644 index 000000000..697c43b14 --- /dev/null +++ b/framework/service/service_daemon.cpp @@ -0,0 +1,99 @@ +#include "framework/service/service_daemon.h" + +namespace anakin { + +namespace rpc { + +void ServiceDaemon::operator()(std::function server_start, + std::vector device_list, + int server_port) { + // Our process ID and Session ID + pid_t pid, sid; + + // Fork off the parent process + pid = fork(); + if (pid < 0) { + exit(EXIT_FAILURE); + } + // exit the parent process. + if (pid > 0) { + exit(EXIT_SUCCESS); + } + + // Change the file mode mask, so we can use the files created by daemon. + umask(0); + + // Create a new SID(a new session) for the child process + sid = setsid(); + if (sid < 0) { + // Log the failure + exit(EXIT_FAILURE); + } + + // Change the current working directory + if ((chdir("/")) < 0) { + exit(EXIT_FAILURE); + } + + // Close out the standard file descriptors + //close(STDIN_FILENO); // 0 + //close(STDOUT_FILENO); // 1 + //close(STDERR_FILENO); // 2 + + // Daemon-specific initialization goes here */ + pid_t *pid_news = new pid_t[device_list.size()]; + for(;;) { + for(auto dev_id : device_list) { + if(!check_port_occupied(server_port) || !check_process_exist(pid_news[dev_id])) { + LOG(WARNING) <<" Create daemon process on device : " << dev_id; + // reaped zombie process + if(pid_news[dev_id]) waitpid(pid_news[dev_id], NULL, 0); + + pid_news[dev_id] = fork(); + // fork new process + if(pid_news[dev_id] == 0) { + prctl(PR_SET_NAME, "anakin_child_rpc_service"); + int ret = server_start(server_port, dev_id); + if(ret == 0) exit(EXIT_SUCCESS); + else exit(EXIT_FAILURE); + } + } + } + + sleep(30); // wait 30 seconds + } + exit(EXIT_SUCCESS); +} + +bool ServiceDaemon::check_port_occupied(int port) { + struct sockaddr_in client; + int sk; + + client.sin_family = AF_INET; + client.sin_port = htons(port); + client.sin_addr.s_addr = inet_addr("0.0.0.0"); + + sk = (int) socket(AF_INET, 
SOCK_STREAM, 0); + + int result = connect(sk, (struct sockaddr *) &client, sizeof(client)); + + if (result == 0) { + return true; // port is ocuupied. + } else { + return false; + } +} + +bool ServiceDaemon::check_process_exist(pid_t pid) { + if(kill(pid, 0) == -1) { + return false; + } else { + // process still exists + return true; + } +} + +} /* namespace rpc */ + +} /* namespace anakin */ + diff --git a/framework/service/service_daemon.h b/framework/service/service_daemon.h new file mode 100644 index 000000000..6adbccce8 --- /dev/null +++ b/framework/service/service_daemon.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SERVICE_DAEMON_H +#define ANAKIN_SERVICE_DAEMON_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "framework/service/anakin_service.h" + +namespace anakin { + +namespace rpc { + +class ServiceDaemon { +public: + ServiceDaemon() {} + ~ServiceDaemon() {} + + void operator()(std::function server_start, + std::vector device_list, + int server_port); + +private: + bool check_port_occupied(int port); + + bool check_process_exist(pid_t pid); + +private: +}; + +} /* namespace rpc */ + +} /* namespace anakin */ + +#endif diff --git a/framework/utils/parameter_fusion.h b/framework/utils/parameter_fusion.h new file mode 100644 index 000000000..9edf97ef0 --- /dev/null +++ b/framework/utils/parameter_fusion.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_FRAMEWORK_UTILS_PARAMETER_FUSION_H +#define ANAKIN_FRAMEWORK_UTILS_PARAMETER_FUSION_H + +#include +#include +#include "framework/core/parameter.h" + +namespace anakin { + +/** + * \brief update conv weights with batchnorm and scale parameters. 
+ */ +template +void update_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term) { + D* weights_p = (D* )(weights.h_tensor().mutable_data()); + if(!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(D) * bias.h_tensor().size()); + } + D* bias_p = (D* )(bias.h_tensor().mutable_data()); + + batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; + int chw = c * h * w; + for (int i = 0; i < n; i++) { + D alpha = 1.f; + D beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + + // insert scale parameters + alpha = scale_w[i] * alpha; + if(scale_bias_term) { + beta = beta * scale_w[i] + scale_b[i]; + } else { + beta = beta * scale_w[i]; + } + for(int j=0; j < chw; j++) { + weights_p[i * chw + j] *= alpha; + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.d_tensor().copy_from(weights.h_tensor()); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +} /* namespace anakin */ + +#endif diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index 82d9bcdab..52b61ce72 100644 --- a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -1,14 +1,22 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file CMakeLists files in the saber directory of project -# @auther cuichaowen -# @date 2017-10-24 -# ---------------------------------------------------------------------------- +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. set(ANAKIN_SABER_STATIC_RELAY "" ) set(ANAKIN_SABER_BASE_SRC "") anakin_fetch_include_recursively(${ANAKIN_SABER}) +anakin_fetch_include_recursively(${ANAKIN_UTILS}) # add ak_base_source files anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core "cpp" ANAKIN_SABER_BASE_SRC) @@ -17,6 +25,14 @@ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs "cpp" ANAKIN_SABER_BASE_SRC if(USE_ARM_PLACE) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/arm "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/arm "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/arm/impl "cpp" ANAKIN_SABER_BASE_SRC) +endif() + +if(USE_BM_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/bm "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/bm "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/bm/impl "cpp" ANAKIN_SABER_BASE_SRC) endif() if(USE_GPU_PLACE) @@ -24,12 +40,16 @@ if(USE_GPU_PLACE) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/cuda "cpp" ANAKIN_SABER_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/cuda "cpp" ANAKIN_SABER_BASE_SRC) else() - message(ERROR "Use GPU place but not use cuda, not support amd yet.") + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/amd "cpp" ANAKIN_SABER_BASE_SRC) + 
anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/amd "cpp" ANAKIN_SABER_BASE_SRC) endif() endif() -anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/x86 "cpp" ANAKIN_SABER_BASE_SRC) +if(USE_X86_PLACE OR USE_BM_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/x86 "cpp" ANAKIN_SABER_BASE_SRC) +endif() + if(USE_X86_PLACE) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/x86 "cpp" ANAKIN_SABER_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/x86/kernel "cpp" ANAKIN_SABER_BASE_SRC) @@ -56,63 +76,59 @@ if(USE_CUDA) # set select arch for cuda add_subdirectory(${ANAKIN_SABER}/funcs/impl/cuda/base) - set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) + set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS "") if(BUILD_SHARED) - CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) + CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) endif() if(BUILD_STATIC) CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS STATIC ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) endif() set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP}) - set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} - ${BEGIN_WHOLE_ARCHIVE} - ${ANAKIN_SABER_SASS_STATIC_LIB} - ${WHOLE_ARCHIVE_END}) -endif() - -if(USE_BM) - anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/bm "cpp" ANAKIN_SABER_BASE_SRC) - anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/bm "cpp" ANAKIN_SABER_BASE_SRC) - - # set root - set(BM_BASE_CODE_ROOT ${ANAKIN_SABER}/funcs/impl/bm/base) - # set select arch for cuda - add_subdirectory(${ANAKIN_SABER}/funcs/impl/bm/base) - - set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) - set(CMAKE_CXX_FLAGS "") - #if(BUILD_SHARED) - #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS SHARED ${ANAKIN_SABER_BM_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) - #endif() - #if(BUILD_STATIC) - #CUDA_COMPILE(ANAKIN_SABER_BM_C_SRC_OBJS STATIC ${ANAKIN_SABER_BM_C_SRC} OPTIONS 
${ANAKIN_NVCC_FLAG}) - #endif() - set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP}) - - set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} - ${BEGIN_WHOLE_ARCHIVE} - ${ANAKIN_SABER_BM_STATIC_LIB} - ${WHOLE_ARCHIVE_END}) + set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} + ${BEGIN_WHOLE_ARCHIVE} + ${ANAKIN_SABER_SASS_STATIC_LIB} + ${WHOLE_ARCHIVE_END}) endif() +#if(USE_BM_PLACE) +# add_subdirectory(${ANAKIN_SABER}/funcs/impl/bm) +#endif() # add saber library to static -if(UNIX OR APPLE) - ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) - #$) - if(USE_X86_PLACE) - message(STATUS ${ANAKIN_SABER_DEPENDENCIES}) - add_dependencies(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_DEPENDENCIES}) +if(UNIX OR APPLE) + if (USE_ARM_PLACE) + ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) + set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${ANAKIN_ROOT}/output/) + else() + if (BUILD_SHARED) + # 2018/11/13 try to use static lib for debug + ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) + #ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) + #$) + if(USE_X86_PLACE) + message(STATUS ${ANAKIN_SABER_DEPENDENCIES}) + add_dependencies(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_DEPENDENCIES}) + endif() + set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES VERSION ${VERSION}) + message("add_link1:${BIN_NAME}") + #target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} -Wl,--whole-archive ${BIN_NAME} -Wl,--no-whole-archive) + #target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} -Wl,--whole-archive ${BM_ROOT}/lib/cmodel/bmlib.a -Wl,--no-whole-archive) + target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_LINKER_LIBS}) + target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} 
${ANAKIN_SABER_STATIC_RELAY}) + set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LINK_FLAGS "") + set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${ANAKIN_ROOT}/output/) + else() + ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) + message("add_link2:${BIN_NAME}") + # target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${BIN_NAME}) + target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_LINKER_LIBS}) + set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${ANAKIN_ROOT}/output/) + endif () endif() - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES VERSION ${VERSION}) - target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_LINKER_LIBS}) - target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_STATIC_RELAY}) - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LINK_FLAGS "") - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY - ${ANAKIN_ROOT}/output/) endif() - - set(ANAKIN_SABER_LIB_TARGET ${ANAKIN_SABER_TEMP_COMMMON_LIB} PARENT_SCOPE) diff --git a/saber/core/buffer.h b/saber/core/buffer.h index 13903b68e..2991ac81f 100644 --- a/saber/core/buffer.h +++ b/saber/core/buffer.h @@ -1,31 +1,30 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_BUFFER_H #define ANAKIN_SABER_CORE_BUFFER_H -#include "core/target_wrapper.h" +#include "saber/core/target_wrapper.h" +#include "saber/core/data_traits.h" namespace anakin{ namespace saber{ -//struct TargetWrapper; -#define INSTANTIATE_BUFFER(TargetType) \ - template class Buffer; - template class Buffer { public: + typedef typename DataTraitBase::PtrDtype TPtr; typedef TargetWrapper API; //typedef typename TargetTypeTraits::target_type target_type; @@ -42,7 +41,7 @@ class Buffer { _id = API::get_device_id(); } - explicit Buffer(void* data, size_t size, int id) + explicit Buffer(TPtr data, size_t size, int id) : _own_data(false), _count(size), _capacity(size){ _data = data; _id = API::get_device_id(); @@ -53,7 +52,7 @@ class Buffer { * \brief copy constructor */ Buffer(Buffer& buf){ - CHECK_EQ(buf._data != nullptr, true) << "input buffer is empty"; + CHECK_GT(buf._count, 0) << "input buffer is empty"; _count = buf._count; _id = API::get_device_id(); if (buf._id == _id){ @@ -63,7 +62,7 @@ class Buffer { } else{ _own_data = true; SABER_CHECK(re_alloc(buf._count)); - API::sync_memcpy_p2p(_data, _id, buf.get_data(), buf._id, buf._count); + API::sync_memcpy_p2p(_data, 0, _id, buf.get_data(), 0, buf._id, buf._count); } } @@ -80,7 +79,7 @@ class Buffer { } else{ this->_own_data = true; SABER_CHECK(this->re_alloc(buf._count)); - API::sync_memcpy_p2p(this->_data, this->_id, buf.get_data(), buf._id, \ + API::sync_memcpy_p2p(this->_data, 0, this->_id, buf.get_data(), 0, buf._id, \ buf._count); } return *this; @@ -97,7 +96,7 @@ class Buffer { } else{ _own_data = true; SABER_CHECK(re_alloc(buf._count)); - API::sync_memcpy_p2p(_data, _id, buf.get_data(), buf._id, buf._count); + API::sync_memcpy_p2p(_data, 0, _id, buf.get_data(), 0, buf._id, buf._count); return 0; } } @@ -124,7 +123,7 @@ class Buffer { * \brief 
re-alloc memory, only if hold the data, can be relloc */ SaberStatus re_alloc(size_t size){ - if (size > _capacity || _data == nullptr){ + if (size > _capacity){ if (_own_data) { CHECK_EQ(_id, API::get_device_id()) << \ "buffer is not declared in current device, could not re_alloc buffer"; @@ -176,8 +175,8 @@ class Buffer { LOG(INFO) << "sync memcpy h2h, size: " << buf.get_count(); - process_API::sync_memcpy(_data, _id, buf.get_data(), \ - buf.get_id(), buf.get_count(), flag_type()); + process_API::sync_memcpy(_data, 0, _id, buf.get_data(), \ + 0, buf.get_id(), buf.get_count(), flag_type()); return SaberSuccess; } @@ -185,12 +184,16 @@ class Buffer { /** * \brief return const data pointer */ - const void* get_data(){return _data;} + const TPtr get_data(){ + return _data; + } /** * \brief return mutable data pointer */ - void* get_data_mutable(){return _data;} + TPtr get_data_mutable(){ + return _data; + } /** * \brief return current size of memory, in size @@ -205,7 +208,7 @@ class Buffer { private: //! 
\brief device id where data allocated int _id; - void* _data; + TPtr _data; bool _own_data; size_t _count; size_t _capacity; @@ -225,6 +228,65 @@ class Buffer { } }; +template +static inline int MemShare(std::shared_ptr>& dst, \ + const std::shared_ptr>& src, __DtoD) { + //LOG(INFO) << "shared D2D"; + if(dst->get_id() == src->get_id()){ + dst = src; + return 1; + } + //LOG(INFO) << "copied D2D"; + SABER_CHECK(dst->re_alloc(src->get_count())); + SABER_CHECK(dst->sync_copy_from(*src)); + return 0; +} + +template +static inline int MemShare(std::shared_ptr>& dst, \ + const std::shared_ptr>& src, __HtoD) { + //LOG(INFO) << "copied H2D"; + SABER_CHECK(dst->re_alloc(src->get_count())); + SABER_CHECK(dst->sync_copy_from(*src)); + return 0; +} + +template +static inline int MemShare(std::shared_ptr>& dst, \ + const std::shared_ptr>& src, __HtoH) { + //LOG(INFO) << "shared H2H"; + dst = src; + return 1; +} + +template +static inline int MemShare(std::shared_ptr>& dst, \ + const std::shared_ptr>& src, __DtoH) { + //LOG(INFO) << "copied D2H"; + SABER_CHECK(dst->re_alloc(src->get_count())); + SABER_CHECK(dst->sync_copy_from(*src)); + return 0; +} + +template +static inline int BufferMemShare(std::shared_ptr>& dst, \ + const std::shared_ptr>& src){ + + typedef typename TargetTypeTraits::target_type target_type_dst; + typedef typename TargetTypeTraits::target_type target_type_src; + typedef typename TargetTypeTraits::target_category target_category_dst; + + typedef typename IF::value, __HtoH, __DtoH>::Type then_type; + typedef typename IF::value, __DtoD, __HtoD>::Type else_type; + typedef typename IF::value, then_type, else_type>::Type flag_type; + CHECK_EQ(src == nullptr, false) << "input buffer is null!"; + if (!dst){ + dst = std::make_shared>(src->get_count()); + } + return MemShare(dst, src, flag_type()); +} + + } //namespace saber } //namespace anakin diff --git a/saber/core/common.h b/saber/core/common.h index 54d6c56dd..3a64b0d5d 100644 --- a/saber/core/common.h +++ 
b/saber/core/common.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "utils/logger/logger.h" #include "anakin_config.h" @@ -54,8 +56,11 @@ inline const char* saber_get_error_string(SaberStatus error_code){ return "ANAKIN_SABER_STATUS_OUT_OF_MEMORY"; case SaberUnImplError: return "ANAKIN_SABER_STATUS_UNIMPL_ERROR"; + case SaberWrongDevice: + return "ANAKIN_SABER_STATUS_WRONG_DEVICE"; + default: + return "ANAKIN SABER UNKOWN ERRORS"; } - return "ANAKIN SABER UNKOWN ERRORS"; } template @@ -135,29 +140,51 @@ const char* cublas_get_errorstring(cublasStatus_t error); const char* cudnn_get_errorstring(cudnnStatus_t status); #endif //USE_CUDNN -#ifdef USE_AMD -#include +#ifdef AMD_GPU + +#ifdef __APPLE__ +#include +#include +#else +#include +#include #endif +#define AMD_CHECK_MSG(condition, msg) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << msg << " (err=" << opencl_get_error_string(error) << ")"; \ + } while (0) -#ifdef USE_ARM_PLACE +#define AMD_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << opencl_get_error_string(error); \ + } while (0) #endif + +#ifdef USE_ARM_PLACE +#ifdef USE_OPENMP +#include +#include +#endif //openmp +#endif //ARM + #endif //ANAKIN_SABER_CORE_COMMON_H -#ifdef USE_BM +#ifdef USE_BM_PLACE #include "bmlib_runtime.h" -#include "bmdnn_api.h" -#include "bmdnn_ext_api.h" #include "bmlib_utils.h" -#define BMDNN_CHECK(condition) \ +#define BM_CHECK(condition) \ do { \ bm_status_t error = condition; \ CHECK_EQ(error, BM_SUCCESS) << " Failed with error code:" << 
error; \ } while (0) -#endif // USE_BM - +#endif // USE_BM_PLACE diff --git a/saber/core/context.h b/saber/core/context.h index a661cce46..fc21bc755 100644 --- a/saber/core/context.h +++ b/saber/core/context.h @@ -1,29 +1,23 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_CONTEXT_H #define ANAKIN_SABER_CORE_CONTEXT_H #include "core/env.h" #include "saber/saber_types.h" -#include - -#ifdef USE_BM -#include "bmlib_runtime.h" -#include "bmdnn_api.h" -#include "bmlib_utils.h" -#endif namespace anakin{ @@ -41,10 +35,16 @@ class Context final{ * @param compute_stream_id */ Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){ +#ifdef USE_BM_PLACE if(std::is_same::value){ LOG(INFO) << "context init for BM"; + int dev_count = 0; + TargetWrapper::get_device_count(dev_count); + CHECK_GE(dev_count, 1) << "Env is not initialized or current target is not exit!"; + _bm_handle = TargetWrapper::get_handle(); return; } +#endif CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!"; if (device_id >= devs.size()){ @@ -69,16 +69,23 @@ class Context final{ } Context(const Context& ctx){ +#ifdef USE_BM_PLACE if(std::is_same::value){ LOG(INFO) << "context init for BM"; + _bm_handle = ctx._bm_handle; return; } - +#endif _device_id = 
ctx._device_id; _data_stream_id = ctx._data_stream_id; _compute_stream_id = ctx._compute_stream_id; _stream_compute = ctx._stream_compute; _stream_data = ctx._stream_data; +#ifdef USE_ARM_PLACE + _act_ids = ctx._act_ids; + _mode = ctx._mode; +#endif + } Context& operator=(const Context& ctx){ @@ -87,6 +94,13 @@ class Context final{ this->_compute_stream_id = ctx._compute_stream_id; this->_stream_data = ctx._stream_data; this->_stream_compute = ctx._stream_compute; +#ifdef USE_ARM_PLACE + this->_act_ids = ctx._act_ids; + this->_mode = ctx._mode; +#endif +#ifdef USE_BM_PLACE + this->_bm_handle = ctx._bm_handle; +#endif return *this; } @@ -95,6 +109,9 @@ class Context final{ comp_eq = comp_eq && (_device_id == right._device_id); comp_eq = comp_eq && (_data_stream_id == right._data_stream_id); comp_eq = comp_eq && (_compute_stream_id == right._compute_stream_id); +#ifdef USE_BM_PLACE + comp_eq = comp_eq && (_bm_handle == right._bm_handle); +#endif return comp_eq; } @@ -124,11 +141,20 @@ class Context final{ #ifdef USE_ARM_PLACE - void set_power_mode(PowerMode mode); - void set_act_cores(std::vector ids); + //void set_act_cores(std::vector ids); + //void set_power_mode(PowerMode mode); + void set_run_mode(PowerMode mode, int threads); + //void set_cache(size_t l1size, size_t l2size, size_t l3size); void bind_dev(); - PowerMode get_mode(); - std::vector get_act_ids(); + PowerMode get_mode(int& threads); + //PowerMode get_mode(); + //std::vector get_act_ids(); +#endif + +#ifdef USE_BM_PLACE + bm_handle_t get_handle() { + return _bm_handle; + } #endif @@ -141,8 +167,11 @@ class Context final{ int _data_stream_id; int _compute_stream_id; #ifdef USE_ARM_PLACE - PowerMode _mode; - std::vector _act_ids; + PowerMode _mode{SABER_POWER_HIGH}; + std::vector _act_ids{0}; +#endif +#ifdef USE_BM_PLACE + bm_handle_t _bm_handle; #endif }; diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h index 64de4af9f..342331caf 100644 --- a/saber/core/data_traits.h +++ 
b/saber/core/data_traits.h @@ -1,92 +1,295 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_DATA_TRAITS_H #define ANAKIN_SABER_CORE_DATA_TRAITS_H #include "saber_types.h" -#ifdef USE_BM +#ifdef USE_BM_PLACE #include "bmlib_runtime.h" -#include "bmdnn_api.h" #include "bmlib_utils.h" #endif -namespace anakin{ +namespace anakin { -namespace saber{ +namespace saber { -template -struct DataTrait{ - typedef __invalid_type dtype; +template +struct DataTraitLp { + typedef void* PtrDtype; }; -template <> -struct DataTrait { - typedef short dtype; +template +struct DataTraitBase { + typedef void* PtrDtype; }; +#ifdef USE_OPENCL template <> -struct DataTrait { - typedef float dtype; +struct DataTraitLp { + typedef cl_mem PtrDtype; }; template <> -struct DataTrait { - typedef double dtype; +struct DataTraitBase { + typedef cl_mem PtrDtype; }; +#endif -template <> -struct DataTrait { - typedef char dtype; +static size_t type_length(DataType type) { + switch (type) { + case AK_INT8: + return 1; + case AK_UINT8: + return 1; + case AK_INT16: + return 2; + case AK_UINT16: + return 2; + case AK_INT32: + return 4; + case AK_UINT32: + return 4; + case AK_INT64: + return 8; + case AK_HALF: + return 2; + case AK_FLOAT: + return 4; + case AK_DOUBLE: + return 8; + default: + return 4; + } +} 
+ +template +struct DataTrait { + typedef __invalid_type Dtype; + typedef __invalid_type PtrDtype; }; -template <> -struct DataTrait { - typedef short dtype; +template +struct DataTrait { + typedef short Dtype; + typedef short* PtrDtype; }; -template <> -struct DataTrait { - typedef int dtype; +template +struct DataTrait { + typedef float Dtype; + typedef float* PtrDtype; +}; + +template +struct DataTrait { + typedef double Dtype; + typedef double* PtrDtype; +}; + +template +struct DataTrait { + typedef char Dtype; + typedef char* PtrDtype; +}; + +template +struct DataTrait { + typedef short Dtype; + typedef short* PtrDtype; +}; + +template +struct DataTrait { + typedef int Dtype; + typedef int* PtrDtype; +}; + +template +struct DataTrait { + typedef long Dtype; + typedef long* PtrDtype; +}; + +template +struct DataTrait { + typedef unsigned char Dtype; + typedef unsigned char* PtrDtype; +}; + +template +struct DataTrait { + typedef unsigned short Dtype; + typedef unsigned short* PtrDtype; }; +template +struct DataTrait { + typedef unsigned int Dtype; + typedef unsigned int* PtrDtype; +}; + +#ifdef USE_BM_PLACE + +struct BM_mem_addr: bm_mem_desc { + + BM_mem_addr() {}; + + BM_mem_addr(void* k) { + if (k == nullptr) { + *this = BM_MEM_NULL; + } else { + CHECK(false) << "not suport construct not null ptr"; + } + } + + inline bool compare_char_array(const unsigned char* a, const unsigned char* b, int size)const { + for (int i = 0; i < size; ++i) { + if (a[i] != b[i]) { + return false; + } + } + + return true; + } + + bool operator==(const bm_mem_desc& right) { + return compare_char_array(desc, right.desc, sizeof(desc)); + } + bool operator!=(const bm_mem_desc& right) { + return !compare_char_array(desc, right.desc, sizeof(desc)); + } + + bool operator==(const void* right) { + if (right == nullptr) { + return *this == BM_MEM_NULL; + } else { + CHECK(false) << "not suport compare not null BM_mem_addr with nullptr"; + return false; + } + } + + bool operator!=(const void* 
right) { + return !(*this == right); + } + + BM_mem_addr(struct bm_mem_desc init_desc): bm_mem_desc(init_desc) { + + ; + } + + BM_mem_addr& operator+(int offset) { + if (offset != 0) { + unsigned long long target_addr = bm_mem_get_device_addr(*this); + bm_mem_set_device_addr(*this, target_addr + offset); + DLOG(INFO)<<"offset = "< -struct DataTrait { - typedef long dtype; +struct DataTraitLp { + typedef BM_mem_addr PtrDtype; }; template <> -struct DataTrait { - typedef unsigned char dtype; +struct DataTraitBase { + typedef BM_mem_addr PtrDtype; +}; + + +#endif + + +#ifdef USE_OPENCL +struct ClMem { + ClMem() { + dmem = nullptr; + offset = 0; + } + + ClMem(cl_mem mem_in, size_t offset_in = 0) { + dmem = mem_in; + offset = offset_in; + } + + ClMem(const ClMem& right) { + dmem = right.dmem; + offset = right.offset; + } + + ClMem& operator=(const ClMem& right) { + this->dmem = right.dmem; + this->offset = right.offset; + return *this; + } + + ClMem& operator+(const size_t offset_in) { + this->offset += offset_in; + return *this; + } + + ClMem& operator ++() { + this->offset += 1; + return *this; + } + + ClMem& operator ++(int) { + this->offset += 1; + return *this; + } + + size_t offset{0}; + cl_mem dmem{nullptr}; }; template <> -struct DataTrait { - typedef unsigned short dtype; +struct DataTrait { + typedef float Dtype; + typedef cl_mem PtrDtype; }; template <> -struct DataTrait { - typedef unsigned int dtype; +struct DataTrait { + typedef double Dtype; + typedef cl_mem PtrDtype; }; template <> -struct DataTrait { - typedef bm_device_mem_t dtype; +struct DataTrait { + typedef char Dtype; + typedef cl_mem PtrDtype; }; +template <> +struct DataTrait { + typedef short Dtype; + typedef cl_mem PtrDtype; +}; +#endif //USE_OPENCL } //namespace saber } //namespace anakin diff --git a/saber/core/device.h b/saber/core/device.h index 12e929e0f..d979e3e2d 100644 --- a/saber/core/device.h +++ b/saber/core/device.h @@ -1,16 +1,17 @@ -/* Copyright (c) 2016 Anakin Authors All Rights 
Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_DEVICE_H #define ANAKIN_SABER_CORE_DEVICE_H @@ -55,6 +56,32 @@ struct Device { std::vector _compute_stream; }; +#ifdef AMD_GPU +template <> +struct Device { + + typedef TargetWrapper API; + + Device(int max_stream = 1); + + void get_info(); + void create_stream(); + DeviceInfo _info; + int _max_stream; + + std::vector _data_stream; + std::vector _compute_stream; + + cl_device_id get_device() {return id;}; + cl_context get_context() {return context;}; + + typename API::stream_t get_available_stream(typename API::stream_t default_stream=nullptr); + +private: + cl_device_id id; + cl_context context; +}; +#endif } //namespace saber } //namespace anakin diff --git a/saber/core/env.h b/saber/core/env.h index 3ae42165b..e1465d4c3 100644 --- a/saber/core/env.h +++ b/saber/core/env.h @@ -1,16 +1,17 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_ENV_H #define ANAKIN_SABER_CORE_ENV_H @@ -38,21 +39,115 @@ class Env { int count = 0; API::get_device_count(count); if (count == 0) { - LOG(WARNING) << "no device found!"; + CHECK(false) << "no device found!"; } else { LOG(INFO) << "found " << count << " device(s)"; } int cur_id = API::get_device_id(); for (int i = 0; i < count; i++) { API::set_device(i); + LOG(INFO) << "init device "<(max_stream)); } API::set_device(cur_id); + LOG(INFO)<<"dev size = "<> env_holder; + +template <> +class Env { +public: + typedef TargetWrapper API; + typedef std::vector> Devs; + static Devs& cur_env() { + static Devs* _g_env = new Devs(); + return *_g_env; + } + static void env_init(int max_stream = 4){ + Devs& devs = cur_env(); + if (devs.size() > 0){ + return; + } + int count = 0; +// API::init_handle(); + API::get_device_count(count); + if (count == 0) { + CHECK(false) << "no device found!"; + } else { + LOG(INFO) << "found " << count << " device(s)"; + } + int cur_id = API::get_device_id(); + for (int i = 0; i < count; i++) { + API::set_device(i); + devs.push_back(Device(max_stream)); + } +// API::set_device(cur_id); + LOG(INFO)<<"dev size = "< cl_event_list; + +template <> +class Env { +public: + typedef TargetWrapper API; + typedef std::vector> Devs; + static Devs& cur_env() { + static Devs* _g_env = new Devs(); + return *_g_env; + } + + static void env_init(int max_stream = 4); + static bool is_init(); + static cl_platform_id get_platform_id(); + + static void add_event(const char 
*tag, cl_event_list event); + static void add_event(cl_event_list event) { + add_event(mTag.c_str(), event); + } + + static void pop(); + static void set_tag(const char *tag){ + mTag = std::string(tag); + } + + static const std::string& get_tag(){ + return mTag; + } + + static void start_record(){ + record = true; + } + static void stop_record(){ + record = false; + } +private: + Env(){} + + static cl_platform_id platform_id; + static std::map> eMap; + static std::list tList; + static bool record; + static std::string mTag; + +}; +#endif } //namespace saber diff --git a/saber/core/events.h b/saber/core/events.h index dd6094bea..e83f3a767 100644 --- a/saber/core/events.h +++ b/saber/core/events.h @@ -1,16 +1,17 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_EVENTS_H #define ANAKIN_SABER_CORE_EVENTS_H @@ -29,7 +30,7 @@ class Events{ * \brief create target specific event */ explicit Events(){ - API::create_event(_event); + API::create_event(&_event); } /** diff --git a/saber/core/impl/amd/amd_device.cpp b/saber/core/impl/amd/amd_device.cpp new file mode 100644 index 000000000..b7e0953d4 --- /dev/null +++ b/saber/core/impl/amd/amd_device.cpp @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "core/device.h" +#include "core/env.h" + +namespace anakin{ + +namespace saber{ + +#ifdef AMD_GPU + +typedef TargetWrapper AMD_API; + +size_t split(const std::string &txt, std::vector &strs, char ch) +{ + size_t pos = txt.find( ch ); + size_t initialPos = 0; + strs.clear(); + + // Decompose statement + while( pos != std::string::npos ) { + strs.push_back( txt.substr( initialPos, pos - initialPos ) ); + initialPos = pos + 1; + + pos = txt.find( ch, initialPos ); + } + + // Add the last one + strs.push_back( txt.substr( initialPos, std::min( pos, txt.size() ) - initialPos + 1 ) ); + return strs.size(); +} + +template +static void get_param(cl_device_id dev, cl_device_info param_name, T **param_value){ + size_t valueSize; + clGetDeviceInfo(dev, param_name, 0, NULL, &valueSize); + T *value = (T *)malloc(valueSize); + clGetDeviceInfo(dev, param_name, valueSize, value, NULL); + *param_value = value; +} + + +Device::Device(int max_stream) : _max_stream(max_stream){ + if(!Env::is_init()) + return; + + //get cl device id; + int nums = 0; + AMD_API::get_device_count(nums); + cl_device_id *device_ids = new cl_device_id[nums]; + cl_uint device_nums; + clGetDeviceIDs(Env::get_platform_id(), CL_DEVICE_TYPE_GPU, (cl_uint)nums, device_ids, &device_nums); + id = device_ids[AMD_API::get_device_id()]; + free(device_ids); + + //init context, one by one mapping to device. 
+ cl_int errNum; + const cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)Env::get_platform_id(), 0}; + context = clCreateContext(prop, 1, &id, NULL, NULL, &errNum); + CHECK(errNum == CL_SUCCESS); + + get_info(); + create_stream(); +} + +void Device::create_stream() { + _data_stream.clear(); + _compute_stream.clear(); + for(int i = 0; i < _max_stream; i++) { + typename AMD_API::stream_t stream_data; + typename AMD_API::stream_t stream_compute; + + API::_create_stream_with_flag(stream_data, context, id, CL_QUEUE_PROFILING_ENABLE); + API::_create_stream_with_flag(stream_compute, context, id, CL_QUEUE_PROFILING_ENABLE); + _data_stream.push_back(stream_data); + _compute_stream.push_back(stream_compute); + } +} + +void Device::get_info() { + + _info._idx = AMD_API::get_device_id(); + + char *name; + get_param(id, CL_DEVICE_NAME, &name); + _info._device_name = std::string(name); + free(name); + + cl_uint *num; + get_param(id,CL_DEVICE_MAX_COMPUTE_UNITS, &num); + _info._compute_core_num = *num; + free(num); + + get_param(id, CL_DEVICE_MAX_CLOCK_FREQUENCY, &num); + _info._max_frequence = *num; + _info._min_frequence = *num; + free(num); + + get_param(id, CL_DEVICE_VERSION, &name); + std::string version = std::string(name); + std::vector strs; + split(version, strs,' '); + _info._generate_arch = (int)(stof(strs[1]) * 10); + free(name); + + cl_ulong *size; + get_param(id, CL_DEVICE_GLOBAL_MEM_SIZE, &size); + _info._max_memory = (int)(*size / 1048576); + free(size); + + LOG(INFO) << "Device id: " << _info._idx << " , name: " << _info._device_name; + LOG(INFO) << "Multiprocessors: " << _info._compute_core_num; + LOG(INFO) << "frequency:" << _info._max_frequence << " MHz"; + LOG(INFO) << "AMD OpenCL Capability : "<< _info._generate_arch; + LOG(INFO) << "total global memory: " << _info._max_memory << " MBytes."; +}; + +typename AMD_API::stream_t Device::get_available_stream(typename AMD_API::stream_t stream) { + if(stream == nullptr) + return 
_data_stream[0]; + + cl_device_id t_device_id; + if(clGetCommandQueueInfo(stream, CL_QUEUE_DEVICE, sizeof(cl_device_id), &t_device_id, NULL) == CL_SUCCESS){ + if(t_device_id == id) + return stream; + + } + LOG(INFO) << "Can't find this stream use default data stream to instead"; + return _data_stream[0]; + +} + +//template void Device::create_stream(); +//template void Device::get_info(); + +#endif // AMD_GPU + +} //namespace saber +} //namespace anakin + diff --git a/saber/core/impl/amd/amd_env.cpp b/saber/core/impl/amd/amd_env.cpp new file mode 100644 index 000000000..412459e0e --- /dev/null +++ b/saber/core/impl/amd/amd_env.cpp @@ -0,0 +1,198 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "core/env.h" +namespace anakin{ + +namespace saber{ + +#ifdef AMD_GPU + + +typedef TargetWrapper AMD_API; +typedef Env AMD_ENV; + +cl_platform_id AMD_ENV::platform_id = NULL; + +void AMD_ENV::env_init(int max_stream){ + Devs& devs = cur_env(); + if (devs.size() > 0){ + return; + } + + platform_id = API::get_platform_id(); + + int count = 0; + API::get_device_count(count); + if (count == 0) { + LOG(WARNING) << "no device found!"; + } else { + LOG(INFO) << "found " << count << " device(s)"; + } + + int cur_id = API::get_device_id(); + for (int i = 0; i < count; i++) { + API::set_device(i); + devs.push_back(Device(max_stream)); + } + API::set_device(cur_id); +} + +bool AMD_ENV::is_init(){ + CHECK(platform_id != NULL); + return true; +} + +cl_platform_id AMD_ENV::get_platform_id(){ + if(!is_init()) { + return NULL; + } + return platform_id; +} + +bool AMD_ENV::record = false; +std::string AMD_ENV::mTag; +std::list AMD_ENV::tList; +std::map> AMD_ENV::eMap; + +void AMD_ENV::add_event(const char *tag, cl_event_list event){ + if(!record) return; + + std::map>::iterator it; + it = eMap.find(std::string(tag)); + if(it != eMap.end()) { + it->second.push_back(event); + } else { + LOG(INFO) << "record [" << tList.size() << "]=" << tag; + tList.push_back(std::string(tag)); + std::list list; + list.push_back(event); + eMap[std::string(tag)]=list; + } +} +void AMD_ENV::pop(){ + std::map>::iterator it; + size_t t_size, e_size, s_size, size; + float executionTime = 0, waitTime = 0, g_execute=0, g_wait=0; + cl_ulong submit, start, end, wait, execute; + CHECK(tList.size() == eMap.size()); + t_size = tList.size(); + std::string log; + log.append("\n"); + std::string tmp; + + for(int i = 0 ; i < t_size; i++) { + waitTime = executionTime = 0; + std::string tag = tList.front(); + it = eMap.find(tag); + std::list list = it->second; + e_size = list.size(); + + cl_ulong *s_waits=NULL, *s_executes=NULL; + for(int j = 0 ; j< e_size; j++) { + cl_event_list eList = list.front(); 
+ + s_size = eList.size(); + + if (s_size > 1) { + + if(s_waits == NULL) { + s_waits = new cl_ulong[s_size]; + s_executes = new cl_ulong[s_size]; + memset(s_waits, 0, s_size * sizeof(cl_ulong)); + memset(s_executes, 0, s_size * sizeof(cl_ulong)); + } + + int tmps = 0; + for(cl_event_list::iterator ite = eList.begin(); ite != eList.end(); ite++) { + + cl_event event = *ite; + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit,NULL); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start,NULL); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); + + s_waits[tmps] += (start - submit); + s_executes[tmps] += (end - start); + tmps++; + } + + cl_event eventS = eList.front(); + cl_event eventE = eList.back(); + clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start,NULL); + clGetEventProfilingInfo(eventE, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); + execute = end - start; + + clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit,NULL); + wait = start - submit; + + clGetEventProfilingInfo(eventS, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end,NULL); + clGetEventProfilingInfo(eventE, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL); + wait += start - end; + } else { + + cl_event event = eList.front(); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit,NULL); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start,NULL); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); + + wait = start - submit; + execute = end - start; + } + eList.clear(); + + executionTime += (execute) * 1e-6; + waitTime += (wait) * 1e-6; + list.pop_front(); + //LOG(INFO) << tag << ":" << "["<< wait <<", " << execute <<"]"; + } + //LOG(INFO) << "[" << i << "]" << tag << " avg - wait :"<< waitTime/e_size << " 
ms, execute " << executionTime/e_size <<" ms"; + tmp = std::string("[") + std::to_string(i) + std::string("]\t") + \ + tag + std::string("\t") + std::to_string(executionTime/e_size) + std::string(" ms\n"); + + if(s_size > 1) { + for(int s = 0; s < s_size; s++) { + tmp.append(std::string("--[") + std::to_string(i)+ std::string("-") + std::to_string(s) + std::string("]\t\t")); + tmp.append(std::to_string((float)s_executes[s]*1e-6/e_size) + std::string(" ms\n")); + } + + delete s_waits; + delete s_executes; + s_waits = s_executes = NULL; + } + + log.append(tmp); + + g_wait += (waitTime/e_size); + g_execute += (executionTime/e_size); + //LOG(INFO) << "[" << i << "]" << tag << '\t' << " avg - execute " << '\t'<< executionTime/e_size <<" ms"; + + tList.pop_front(); + } + + tmp = std::string("[Total]\t\t") + \ + std::to_string(g_execute) + std::string(" ms\n"); + log.append(tmp); + LOG(INFO) << log; +} + + + +//template void AMD_ENV::evn_init(); + +#endif // AMD_GPU + +} //namespace saber +} //namespace anakin + diff --git a/saber/core/impl/amd/amd_impl.cpp b/saber/core/impl/amd/amd_impl.cpp new file mode 100644 index 000000000..57e9cefae --- /dev/null +++ b/saber/core/impl/amd/amd_impl.cpp @@ -0,0 +1,706 @@ +#include "core/tensor.h" +#include "core/env.h" + +namespace anakin{ + +namespace saber{ + +#ifdef AMD_GPU + +#define AMD_GPU_EXTENSION + +const char* opencl_get_error_string(cl_int err){ + switch (err) { + case CL_INVALID_PLATFORM: + return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return 
"CL_OUT_OF_HOST_MEMORY"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + } + return "Unknown cl error"; + +} + + +/** + * \brief for AMD device target only, device target is AMD gpu + * use opencl api to manage memory + * support device to device, device to host, host to device memcpy +*/ +typedef TargetWrapper AMD_API; +typedef Env AMD_ENV; + +int AMD_API::current_device_id_index = 0; +std::map AMD_API::buffers; + +void AMD_API::get_device_count(int &count) { + cl_platform_id id = AMD_ENV::get_platform_id(); + cl_uint nums; + AMD_CHECK(clGetDeviceIDs(id, CL_DEVICE_TYPE_GPU, 0, NULL, &nums)); + count = (int)nums; +} + +void AMD_API::set_device(int id){ + LOG(INFO) << "set device id = " << id; + current_device_id_index = id; +} + +void AMD_API::mem_alloc(TPtr* ptr, size_t n){ + AMD_ENV::is_init(); + +#ifdef AMD_GPU_EXTENSION + //LOG(INFO) << "use CL_MEM_USE_PERSISTENT_MEM_AMD to create buffer."; +#else + //LOG(INFO) << "use CL_MEM_ALLOC_HOST_PTR to create buffer."; +#endif + + int index = get_device_id(); + + cl_context context = AMD_ENV::cur_env()[index].get_context(); + + cl_int err; + cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE +#ifdef AMD_GPU_EXTENSION + | CL_MEM_USE_PERSISTENT_MEM_AMD +#else + | CL_MEM_ALLOC_HOST_PTR +#endif + , n, NULL, &err); + + AMD_CHECK(err); + + //ClMem* clbuf = (ClMem*)malloc(sizeof(ClMem)); + //clbuf->dmem = buf; + //clbuf->offset = 0; + ClMem clbuf(buf); + + //*ptr = (void *) buf; + *ptr = clbuf; + + LOG(INFO) << __func__ << "device =" << index << " get context :" << context << " buffer :" << buf <<" size :" << n; +} + +void AMD_API::mem_free(TPtr ptr){ + + cl_mem mem = ptr.dmem; + if (mem != nullptr) { + clReleaseMemObject(mem); + } +} + +#if 1 +void AMD_API::mem_set(TPtr ptr, int value, size_t n){ + + cl_mem mem = ptr.dmem; + + if(mem == nullptr) + return ; + + AMD_ENV::is_init(); + + Device dev = 
AMD_ENV::cur_env()[current_device_id_index]; + stream_t cm = dev.get_available_stream(); + + clEnqueueFillBuffer(cm, mem, &value, sizeof(int), 0, n, 0, NULL, NULL); +} + +#else + +template +void AMD_API::mem_set(TPtr ptr, U value, size_t n){ + if(ptr == nullptr) + return ; + + AMD_ENV::is_init(); + + Device dev = AMD_ENV::cur_env()[current_device_id_index]; + stream_t cm = dev.get_available_stream(stream); + + cl_mem mem = ptr.dmem; + + clEnqueueFillBuffer(cm, mem, &value, sizeof(U), 0, n, 0, NULL, NULL); +} +#endif + +void AMD_API::create_event(event_t& event, bool flag) { + + LOG(INFO) << "ceate_event break opencl call sequence. Is baidu expect clCreateUserEvent?"; + //do nothing for this. + event = nullptr; + + //Env::is_init(); + //cl_int err = CL_SUCCESS; + //event = clCreaeUserEvent(AMD_ENV::cur_env()[current_device_id_index].context, &err); + //AMD_CHECK(err); +} + +void AMD_API::create_stream(stream_t& stream) { + create_stream_with_flag(stream, 0); +} + +/** + * \brief create cuda stream with flag + * @param stream input stream + * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking + */ +void AMD_API::create_stream_with_flag(stream_t& stream, unsigned int flag) { + Env::is_init(); + cl_int err = CL_SUCCESS; + stream = clCreateCommandQueue(Env::cur_env()[current_device_id_index].get_context(), Env::cur_env()[current_device_id_index].get_device(), (cl_command_queue_properties) flag, &err); + AMD_CHECK(err); +} + +void AMD_API::_create_stream_with_flag(stream_t& stream, cl_context context, cl_device_id dev, unsigned int flag){ + cl_int err = CL_SUCCESS; + stream = clCreateCommandQueue(context, dev, (cl_command_queue_properties) flag, &err); + AMD_CHECK(err); +} + +void AMD_API::create_stream_with_priority(stream_t& stream, unsigned int flag, int priority) { + // TODO + LOG(ERROR) << "not support, use create_stream_with_flag to instead"; + create_stream_with_flag(stream, flag); +} + +void AMD_API::destroy_stream(stream_t& stream) { + 
AMD_CHECK(clReleaseCommandQueue(stream)); +} + +void AMD_API::destroy_event(event_t& event) { +// LOG(INFO) << __func__ <<" :Does baidu expect this event is an User Event?"; + + if(event == nullptr){ +// LOG(INFO) << "event is empty, do nothing"; + return; + } + + cl_command_type t; + AMD_CHECK(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &t, NULL)); + if ( t == CL_COMMAND_USER) { + cl_int refs; + AMD_CHECK(clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(cl_int), &refs, NULL)); + if(refs == 1) + AMD_CHECK(clSetUserEventStatus(event, CL_COMPLETE)); + + AMD_CHECK(clReleaseEvent(event)); + } else { +// LOG(INFO) << "NOT User Event, do nothing"; + } + +} + +void AMD_API::record_event(event_t& event, stream_t stream) { + //LOG(WARNING) << "OpenCL record event when calling clEnqueueXXX, so we use marker to simulate this behavior"; + AMD_CHECK(clEnqueueMarkerWithWaitList(stream, 0, NULL, &event)); + //LOG(INFO) << "marker event "<< event; +} + +void AMD_API::query_event(event_t& event) { + // TODO + LOG(ERROR) << "OpenCL us clGetEventInfo to retrive event's specific info. 
so we need to know what info user want to know"; +} + +void AMD_API::sync_event(event_t& event) { +// LOG(INFO) << __func__ ; + + if(event == nullptr){ + LOG(INFO) << "event is empty, do nothing"; + return; + } + +// LOG(INFO) << "sync_event E " << event; + AMD_CHECK(clWaitForEvents( 1, &event)); +// LOG(INFO) << "sync_event X " << event; +} + +void AMD_API::sync_stream(event_t& event, stream_t& stream) { + LOG(INFO) << __func__ ; + if(event != nullptr) { + LOG(INFO) << "event is null"; + return; + } + + LOG(INFO) << "sync_stream E "; + AMD_CHECK(clEnqueueBarrierWithWaitList(stream, 1, &event, NULL)); + clFlush(stream); + LOG(INFO) << "sync_stream D "; +} + + +void AMD_API::sync_memcpy(TPtr dst, int dst_id, const TPtr src, int src_id, \ + size_t count, __DtoD) { + + cl_mem dst_mem = dst.dmem; + cl_mem src_mem = src.dmem; + + size_t dst_offset = dst.offset; + size_t src_offset = src.offset; + + LOG(INFO) << __func__<< " D2D dst=" << (void*)dst_mem << " dst_id=" << dst_id << " dst_office=" << dst_offset << " src=" <<(void*)src_mem << " src_id=" << src_id << " src_offset=" << src_offset << " count=" << count; + //sync_memcpy_with_offset(dst, dst_id, 0, src, src_id, 0, count, __DtoD()); + + if(dst_id == src_id){ + cl_command_queue cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); + cl_event event; + AMD_CHECK(clEnqueueCopyBuffer(cm, src_mem, dst_mem, src_offset, dst_offset, count, 0, NULL, &event)); + clFlush(cm); + clWaitForEvents(1, &event); + LOG(INFO) << "OpenCL, sync, D2D, size: " << count; + } else{ + cl_command_queue dst_cm = AMD_ENV::cur_env()[dst_id].get_available_stream(); + cl_command_queue src_cm = AMD_ENV::cur_env()[src_id].get_available_stream(); + + cl_int err; + cl_event event; + void *host_ptr = clEnqueueMapBuffer(src_cm, src_mem, CL_TRUE, CL_MAP_READ, src_offset, count, 0, NULL, NULL, &err); + AMD_CHECK(err); + AMD_CHECK(clEnqueueWriteBuffer(dst_cm, dst_mem, CL_TRUE, dst_offset, count, host_ptr, 0, NULL, NULL)); + 
AMD_CHECK(clEnqueueUnmapMemObject(src_cm, src_mem, host_ptr, 0, NULL, &event)); + clFlush(src_cm); + clFlush(dst_cm); + clWaitForEvents(1, &event); + LOG(INFO) << "OpenCL, sync, P2P, size: " << count; + } +} +#if 0 +void AMD_API::sync_memcpy_with_offset(void* dst, int dst_id, size_t dst_offset, const void* src, int src_id, size_t src_offset, \ + // TODO + size_t count, __DtoD) { + + LOG(INFO) << __func__<< " D2D dst=" << dst << " dst_id=" << dst_id << " dst_office=" << dst_offset << " src=" <; + +//! AMD Buffer +INSTANTIATE_BUFFER(AMD); + +//! AMD Tensor + +INSTANTIATE_TENSOR(AMD, AK_FLOAT, NCHW); +INSTANTIATE_TENSOR(AMD, AK_FLOAT, NHWC); +INSTANTIATE_TENSOR(AMD, AK_FLOAT, HW); +INSTANTIATE_TENSOR(AMD, AK_FLOAT, NHW); +INSTANTIATE_TENSOR(AMD, AK_FLOAT, NW); + +INSTANTIATE_TENSOR(AMD, AK_INT8, NCHW); +INSTANTIATE_TENSOR(AMD, AK_INT8, NHWC); +INSTANTIATE_TENSOR(AMD, AK_INT8, HW); +INSTANTIATE_TENSOR(AMD, AK_INT8, NHW); +INSTANTIATE_TENSOR(AMD, AK_INT8, NW); + +INSTANTIATE_TENSOR(AMD, AK_HALF, NCHW); +INSTANTIATE_TENSOR(AMD, AK_HALF, NHWC); +INSTANTIATE_TENSOR(AMD, AK_HALF, HW); +INSTANTIATE_TENSOR(AMD, AK_HALF, NHW); +INSTANTIATE_TENSOR(AMD, AK_HALF, NW); +//! +template struct Env; + +#endif // AMD_GPU + +} //namespace saber + +} //namespace anakin diff --git a/saber/core/impl/amd/tensor_op_amd.cpp b/saber/core/impl/amd/tensor_op_amd.cpp new file mode 100644 index 000000000..15f1225fd --- /dev/null +++ b/saber/core/impl/amd/tensor_op_amd.cpp @@ -0,0 +1,202 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "saber/core/tensor_op.h" +#include + +#ifdef AMD_GPU + +#include + +namespace anakin{ + +namespace saber{ + + +typedef TargetWrapper AMD_API; + +#if 1 +template +void fill_tensor_device_const(Tensor_t& tensor, \ + typename Tensor_t::Dtype value, \ + typename Tensor_t::API::stream_t stream){ + + typedef typename Tensor_t::Dtype Dtype; + typedef typename Tensor_t::PtrDtype PtrDtype; + + PtrDtype data_ptr = (PtrDtype)tensor.get_buf()->get_data_mutable(); + int size = tensor.size(); + + Device dev = Env::cur_env()[tensor.device_id()]; + + if(stream == nullptr){ + LOG(INFO) << "stream is empty, use default stream"; + stream = dev._data_stream[0]; + } + + + cl_mem mem = data_ptr.dmem; + cl_event event; + clEnqueueFillBuffer(stream, mem, &value, sizeof(Dtype), 0, size * sizeof(Dtype), 0, NULL, &event); + clFlush(stream); + clWaitForEvents(1, &event); + +}; + + +template +void fill_tensor_device_rand(Tensor_t& tensor, typename Tensor_t::API::stream_t stream) { + + typedef typename Tensor_t::Dtype Dtype; + typedef typename Tensor_t::PtrDtype PtrDtype; + + PtrDtype ptr = (PtrDtype)tensor.get_buf()->get_data_mutable(); + cl_mem mem = ptr.dmem; + int size = tensor.size(); + + + Device dev = Env::cur_env()[tensor.device_id()]; + if(stream == nullptr){ + LOG(INFO) << "stream is empty, use default stream"; + stream = dev._data_stream[0]; + } + + cl_int err; + Dtype* data_ptr = (Dtype *)clEnqueueMapBuffer(stream, mem, CL_TRUE, CL_MAP_WRITE, 0, size * sizeof(Dtype), 0, NULL, NULL, &err); + if (err != CL_SUCCESS){ + LOG(ERROR) << "Can't map buffer to host, err=" << err; + return; + } + + for (int i = 0; i < size; ++i) { + data_ptr[i] = static_cast(rand()); + } + + cl_event event; + clEnqueueUnmapMemObject(stream, mem, data_ptr, 0, NULL, &event); + clFlush(stream); + clWaitForEvents(1, &event); + +}; + +template +void fill_tensor_device_rand(Tensor_t& tensor, typename 
Tensor_t::Dtype vstart, \ + typename Tensor_t::Dtype vend, typename Tensor_t::API::stream_t stream) { + + typedef typename Tensor_t::Dtype Dtype; + typedef typename Tensor_t::PtrDtype PtrDtype; + + PtrDtype ptr = (PtrDtype)tensor.get_buf()->get_data_mutable(); + cl_mem mem = ptr.dmem; + + int size = tensor.size(); + + Device dev = Env::cur_env()[tensor.device_id()]; + if(stream == nullptr){ + LOG(INFO) << "stream is empty, use default stream"; + stream = dev._data_stream[0]; + } + + cl_int err; + Dtype* data_ptr = (Dtype *)clEnqueueMapBuffer(stream, mem, CL_TRUE, CL_MAP_WRITE, 0, size * sizeof(Dtype), 0, NULL, NULL, &err); + if (err != CL_SUCCESS){ + LOG(ERROR) << "Can't map buffer to host, err=" << err; + return; + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(0, 1.f); + for (int i = 0; i < size; ++i) { + Dtype random_num = vstart + (vend - vstart) * dis(gen); + data_ptr[i] = random_num; + } + + cl_event event; + clEnqueueUnmapMemObject(stream, mem, data_ptr, 0, NULL, &event); + clFlush(stream); + clWaitForEvents(1, &event); +}; + +template +void print_tensor_device(Tensor_t& tensor, typename Tensor_t::API::stream_t stream){ + + typedef typename Tensor_t::Dtype Dtype; + typedef typename Tensor_t::PtrDtype PtrDtype; + + PtrDtype ptr = (PtrDtype)tensor.get_buf()->get_data_mutable(); + cl_mem mem = ptr.dmem; + + LOG(INFO) << "device tensor size: " << tensor.size() << " type size: " << sizeof(Dtype); + int size = tensor.size(); + + Device dev = Env::cur_env()[tensor.device_id()]; + + if(stream == nullptr){ + LOG(INFO) << "stream is empty, use default stream"; + stream = dev._data_stream[0]; + } + + cl_int err; + Dtype * data_ptr = (Dtype *)clEnqueueMapBuffer(stream, mem, CL_TRUE, CL_MAP_READ, 0, size * sizeof(Dtype), 0, NULL, NULL, &err); + if (err != CL_SUCCESS){ + LOG(ERROR) << "Can't map buffer to host, err=" << err; + return; + } + + for (int i = 0; i < size; ++i) { + printf("%.5f ", static_cast(data_ptr[i])); + if ((i 
+ 1) % tensor.width() == 0) { + printf("\n"); + } + } + printf("\n"); + + clEnqueueUnmapMemObject(stream, mem, data_ptr, 0, NULL, NULL); + //clFinish(stream); + +}; + +#define FILL_TENSOR_AMD(type, layout) \ + template void fill_tensor_device_const>\ + (Tensor& tensor, DataTrait::Dtype value, \ + typename TargetWrapper::stream_t stream); \ + template void fill_tensor_device_rand>\ + (Tensor& tensor, typename TargetWrapper::stream_t stream); \ + template void fill_tensor_device_rand>\ + (Tensor& tensor, DataTrait::Dtype vstart, \ + DataTrait::Dtype vend, typename TargetWrapper::stream_t stream); \ + template void print_tensor_device>\ + (Tensor& tensor, typename TargetWrapper::stream_t stream); + +FILL_TENSOR_AMD(AK_FLOAT, NCHW); +FILL_TENSOR_AMD(AK_FLOAT, NHWC); +FILL_TENSOR_AMD(AK_FLOAT, NHW); +FILL_TENSOR_AMD(AK_FLOAT, NW); +FILL_TENSOR_AMD(AK_FLOAT, HW); +FILL_TENSOR_AMD(AK_FLOAT, W); + +FILL_TENSOR_AMD(AK_INT8, NCHW); +FILL_TENSOR_AMD(AK_INT8, NHWC); +FILL_TENSOR_AMD(AK_INT8, NHW); +FILL_TENSOR_AMD(AK_INT8, NW); +FILL_TENSOR_AMD(AK_INT8, HW); +FILL_TENSOR_AMD(AK_INT8, W); + +#endif +} //namespace saber + +} //namespace anakin + +#endif //AMD_GPU diff --git a/saber/core/impl/arm/arm_device.cpp b/saber/core/impl/arm/arm_device.cpp index f214dfa96..f8b3ea9bf 100644 --- a/saber/core/impl/arm/arm_device.cpp +++ b/saber/core/impl/arm/arm_device.cpp @@ -1,12 +1,261 @@ #include "device.h" #include "context.h" + #ifdef USE_ARM_PLACE -#include "arm_device.h" + +#ifdef PLATFORM_ANDROID +#include +#include +#define __NCPUBITS__ (8 * sizeof (unsigned long)) + +#define __CPU_SET(cpu, cpusetp) \ + ((cpusetp)->mask_bits[(cpu) / __NCPUBITS__] |= (1UL << ((cpu) % __NCPUBITS__))) + +#define __CPU_ZERO(cpusetp) \ + memset((cpusetp), 0, sizeof(cpu_set_t)) + +#endif //PLATFORM_ANDROID + +#if __APPLE__ +#include "TargetConditionals.h" +#if TARGET_OS_IPHONE +#include +#include +#include +#endif //TARGET_OS_IPHONE +#endif //__APPLE__ namespace anakin{ namespace saber{ +int 
arm_get_cpucount() { +#ifdef PLATFORM_ANDROID + // get cpu count from /proc/cpuinfo + FILE* fp = fopen("/proc/cpuinfo", "rb"); + if (!fp) { + return 1; + } + int count = 0; + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + + if (memcmp(line, "processor", 9) == 0) { + count++; + } + } + + fclose(fp); + + if (count < 1) { + count = 1; + } + return count; + +#elif TARGET_IOS + int count = 0; + size_t len = sizeof(count); + sysctlbyname("hw.ncpu", &count, &len, NULL, 0); + if (count < 1) { + count = 1; + } + return count; +#else + return 1; +#endif +} + +size_t arm_get_meminfo() { +#ifdef PLATFORM_ANDROID + // get cpu count from /proc/cpuinfo + FILE* fp = fopen("/proc/meminfo", "rb"); + if (!fp) { + return 1; + } + + size_t memsize = 0; + char line[1024]; + while (!feof(fp)) + { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + sscanf(s, "MemTotal: %d kB", &memsize); + } + + fclose(fp); + + return memsize; +#elif TARGET_IOS + // to be implemented + LOG(ERROR) << "not implemented"; + return 0; +#endif +} + +#ifdef PLATFORM_ANDROID +static int get_max_freq_khz(int cpuid) +{ + // first try, for all possible cpu + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state",\ + cpuid); + + FILE* fp = fopen(path, "rb"); + + if (!fp) + { + // second try, for online cpu + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",\ + cpuid); + fp = fopen(path, "rb"); + + if (!fp) + { + // third try, for online cpu + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",\ + cpuid); + fp = fopen(path, "rb"); + + if (!fp) { + return -1; + } + + int max_freq_khz = -1; + fscanf(fp, "%d", &max_freq_khz); + + fclose(fp); + + return max_freq_khz; + } + } + + int max_freq_khz = 0; + while (!feof(fp)) + { + int freq_khz = 0; + int nscan = fscanf(fp, "%d %*d", &freq_khz); + if (nscan != 1) { + break; + } + + if 
(freq_khz > max_freq_khz) { + max_freq_khz = freq_khz; + } + } + + fclose(fp); + + return max_freq_khz; +} + +int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector& cpuids, \ + std::vector& cpu_freq, std::vector& cluster_ids) { + //const int cpu_count = cpuids.size(); + + if (cpu_count == 0) { + return 0; + } + + //std::vector cpu_max_freq_khz; + cpuids.resize(cpu_count); + cpu_freq.resize(cpu_count); + cluster_ids.resize(cpu_count); + + for (int i = 0; i < cpu_count; i++) + { + int max_freq_khz = get_max_freq_khz(i); + //printf("%d max freq = %d khz\n", i, max_freq_khz); + cpuids[i] = i; + cpu_freq[i] = max_freq_khz / 1000; + } + + // SMP + int mid_max_freq_khz = (cpu_freq.front() + cpu_freq.back()) / 2; + + for (int i = 0; i < cpu_count; i++) { + if (cpu_freq[i] >= mid_max_freq_khz) { + cluster_ids[i] = 0; + } + else{ + cluster_ids[i] = 1; + } + } + + return 0; +} + +int set_sched_affinity(const std::vector& cpuids) { + // cpu_set_t definition + // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity + + typedef struct { + unsigned long mask_bits[1024 / __NCPUBITS__]; + }cpu_set_t; + + // set affinity for thread + pid_t pid = gettid(); + + cpu_set_t mask; + __CPU_ZERO(&mask); + for (int i = 0; i < (int)cpuids.size(); i++) + { + __CPU_SET(cpuids[i], &mask); + } + + int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); + if (syscallret) + { + LOG(ERROR) << "syscall error " << syscallret; + return -1; + } + + return 0; +} + +int set_cpu_affinity(const std::vector& cpuids) { +#ifdef USE_OPENMP + int num_threads = cpuids.size(); + omp_set_num_threads(num_threads); + std::vector ssarets(num_threads, 0); +#pragma omp parallel for + for (int i = 0; i < num_threads; i++) { + ssarets[i] = set_sched_affinity(cpuids); + } + for (int i = 0; i < num_threads; i++) { + if (ssarets[i] != 0) { + LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[i]; + return -1; + } + } +#else + std::vector cpuid1; + 
cpuid1.push_back(cpuids[0]); + int ssaret = set_sched_affinity(cpuid1); + if (ssaret != 0) { + LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[0]; + return -1; + } +#endif + return 0; +} +#endif //PLATFORM_ANDROID + +#ifdef TARGET_IOS +int set_cpu_affinity(const std::vector& cpuids) { +#ifdef USE_OPENMP + int num_threads = cpuids.size(); + omp_set_num_threads(num_threads); +#endif + return 0; +} +#endif + template <> void Device::create_stream() { _compute_stream.resize(_max_stream); @@ -53,75 +302,105 @@ void Device::get_info() { } } -template void Device::get_info(); -template void Device::create_stream(); - template <> void Context::bind_dev() { set_cpu_affinity(_act_ids); } template <> -void Context::set_power_mode(PowerMode mode) { - _mode = mode; - Device dev = devs[_device_id]; - if (mode == SABER_POWER_FULL){ - _act_ids = dev._info._core_ids; - } - else if (mode == SABER_POWER_LOW) { - _act_ids.clear(); - for (int i = 0; i < dev._info._cluster_ids.size(); ++i) { - if (dev._info._cluster_ids[i] == 1) { - _act_ids.push_back(dev._info._core_ids[i]); - } - } - if (_act_ids.size() == 0){ - LOG(WARNING) << "LOW POWER MODE is not support"; - _act_ids.push_back(dev._info._core_ids[0]); +void Context::set_run_mode(PowerMode mode, int threads) { + std::vector big_cores; + std::vector small_cores; + for (int i = 0; i < devs[0]._info._cluster_ids.size(); ++i) { + if (devs[0]._info._cluster_ids[i] == 0) { + big_cores.push_back(devs[0]._info._core_ids[i]); + } else { + small_cores.push_back(devs[0]._info._core_ids[i]); } } - else if (mode == SABER_POWER_HIGH){ - _act_ids.clear(); - for (int i = 0; i < dev._info._cluster_ids.size(); ++i) { - if (dev._info._cluster_ids[i] == 0) { - _act_ids.push_back(dev._info._core_ids[i]); - } - } - if (_act_ids.size() == 0){ - LOG(WARNING) << "HIGH POWER MODE is not support"; - _act_ids.push_back(dev._info._core_ids[0]); - } + int big_core_size = big_cores.size(); + int small_core_size = small_cores.size(); + if (threads > 
big_core_size + small_core_size) { + threads = big_core_size + small_core_size; } - bind_dev(); -} + switch (mode) { + case SABER_POWER_FULL: + _mode = mode; + _act_ids.clear(); + for (int i = 0; i < threads; ++i) { + if (i < big_core_size) { + _act_ids.push_back(big_cores[i]); + } else { + _act_ids.push_back(small_cores[i - big_core_size]); + } + } + break; + case SABER_POWER_HIGH: + _act_ids.clear(); + if (big_core_size > 0) { + _mode = SABER_POWER_HIGH; + if (threads > big_core_size) { + LOG(ERROR) << "threads: " << threads << " exceed the big cores size: " << big_core_size; + _act_ids = big_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(big_cores[i]); + } + } + } else { + _mode = SABER_POWER_LOW; + LOG(ERROR) << "HIGH POWER MODE is not support, switch to small cores"; + if(threads > small_core_size) { + _act_ids = small_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } -template <> -void Context::set_act_cores(std::vector ids) { - Device dev = devs[_device_id]; - if (ids.size() == 0){ - _act_ids.resize(1); - _act_ids[0] = dev._info._core_ids[0]; - }else { - _act_ids.clear(); - for (int i = 0; i < ids.size(); ++i) { - if (ids[i] < dev._info._core_ids.size()){ - _act_ids.push_back(ids[i]); } - } + break; + case SABER_POWER_LOW: + _act_ids.clear(); + if (small_core_size > 0) { + _mode = SABER_POWER_LOW; + if (threads > small_core_size) { + LOG(ERROR) << "threads: " << threads << " exceed the small cores size: " << small_core_size; + _act_ids = small_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + } else { + _mode = SABER_POWER_HIGH; + LOG(ERROR) << "LOW POWER MODE is not support, switch to big cores"; + if(threads > big_core_size) { + _act_ids = big_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + + } + break; + } + LOG(INFO) << "mode: \n0: big cores only;\n1: small cores 
only;\n2: all cores"; + LOG(INFO) << "|----run mode: " << 0; + LOG(INFO) << "|----thread num: " << _act_ids.size(); + for (int j = 0; j < _act_ids.size(); ++j) { + LOG(INFO) << "|----active id: " << _act_ids[j]; } bind_dev(); } template <> -PowerMode Context::get_mode() { +PowerMode Context::get_mode(int& threads) { + threads = _act_ids.size(); return _mode; } -template <> -std::vector Context::get_act_ids() { - return _act_ids; -} - } //namespace saber } //namespace anakin diff --git a/saber/core/impl/arm/arm_impl.cpp b/saber/core/impl/arm/arm_impl.cpp index a51b48d6d..03c85c65b 100644 --- a/saber/core/impl/arm/arm_impl.cpp +++ b/saber/core/impl/arm/arm_impl.cpp @@ -13,18 +13,9 @@ template struct TargetWrapper; template class Buffer; //! ARM Tensor -template class Tensor; -template class Tensor; -template class Tensor; - -template class Tensor; -template class Tensor; -template class Tensor; - -template class Tensor; -template class Tensor; -template class Tensor; +template class Tensor; +//! 
ARM Env template class Env; #endif //USE_ARM_PLACE diff --git a/saber/core/impl/bm/bm_device.cpp b/saber/core/impl/bm/bm_device.cpp index c89045dcf..c0119e477 100644 --- a/saber/core/impl/bm/bm_device.cpp +++ b/saber/core/impl/bm/bm_device.cpp @@ -1,12 +1,21 @@ #include "core/device.h" -namespace anakin{ +namespace anakin { -namespace saber{ +namespace saber { template <> void Device::create_stream() { - // todo - LOG(WARNING) << "BM create_stream is not implemented"; + _data_stream.clear(); + _compute_stream.clear(); + for (int i = 0; i < _max_stream; i++) { + typedef TargetWrapper API; + typename API::stream_t stream_data; + typename API::stream_t stream_compute; + API::create_stream_with_flag(&stream_data, 1); + API::create_stream_with_flag(&stream_compute, 1); + _data_stream.push_back(stream_data); + _compute_stream.push_back(stream_compute); + } } template <> @@ -15,8 +24,8 @@ void Device::get_info() { LOG(WARNING) << "BM get_info is not implemented"; } -template void Device::get_info(); -template void Device::create_stream(); +//template void Device::get_info(); +//template void Device::create_stream(); } //namespace saber diff --git a/saber/core/impl/bm/bm_impl.cpp b/saber/core/impl/bm/bm_impl.cpp index e73e355b7..791549341 100644 --- a/saber/core/impl/bm/bm_impl.cpp +++ b/saber/core/impl/bm/bm_impl.cpp @@ -1,136 +1,187 @@ #include "core/tensor.h" +#include "core/common.h" +#include "core/data_traits.h" #include "env.h" -#include "bmlib_runtime.h" -#include "bmdnn_api.h" -#include "bmlib_utils.h" - -#ifdef USE_BM const char* bmdnn_get_errorstring(bm_status_t error) { switch (error) { - case BM_SUCCESS: - return "BM API call correct"; - case BM_ERR_FAILURE: - return "BM API fail to return"; - case BM_ERR_TIMEOUT: - return "BM API time out"; - case BM_ERR_PARAM: - return "BM API invalid parameter"; - case BM_ERR_NOMEM: - return "BM API insufficient memory"; - case BM_ERR_DATA: - return "BM API invalid data"; - case BM_ERR_BUSY: - return "BM device is busy"; - 
case BM_NOT_SUPPORTED: - return "BM unsupported operate"; + case BM_SUCCESS: + return "BM API call correct"; + + case BM_ERR_FAILURE: + return "BM API fail to return"; + + case BM_ERR_TIMEOUT: + return "BM API time out"; + + case BM_ERR_PARAM: + return "BM API invalid parameter"; + + case BM_ERR_NOMEM: + return "BM API insufficient memory"; + + case BM_ERR_DATA: + return "BM API invalid data"; + + case BM_ERR_BUSY: + return "BM device is busy"; + + case BM_NOT_SUPPORTED: + return "BM unsupported operate"; } + return "Unknown bmdnn status"; } -#endif -namespace anakin{ -namespace saber{ +namespace anakin { + +namespace saber { + -#ifdef USE_BM typedef TargetWrapper BM_API; -// Init handle only once in the lifetime static bm_handle_t handle; -static bm_status_t init_handle{bmdnn_init(&handle)}; -void BM_API::get_device_count(int &count) { - BMDNN_CHECK(bm_dev_getcount(&count)); +void BM_API::init_handle(){ + LOG(INFO) << "BM init handle"; + bmlib_kernel_init(&handle); +}; + +void BM_API::deinit_handle(){ + LOG(INFO) << "BM deinit handle"; + bmlib_kernel_deinit(handle); +}; + +bm_handle_t BM_API::get_handle() { + return handle; +}; + +void BM_API::get_device_count(int& count) { + BM_CHECK(bm_dev_getcount(&count)); } -void BM_API::set_device(int id){ +void BM_API::set_device(int id) { //(bm_handle_t &handle, bool bmkernel_used, int id){ - //BMDNN_CHECK(bm_dev_request(&handle, 0, id)); + LOG(INFO) << "BM set_device id" << id; + BM_CHECK(bm_dev_request(&handle, 0, id)); } -//TODO: Do we have this functionality? 
-int BM_API::get_device_id(){ +int BM_API::get_device_id() { + LOG(INFO) << "BM get_device_id " ; return 0; } - -void BM_API::mem_alloc(void** ptr, size_t n){ - handle = get_bm_handle(); + +void BM_API::mem_alloc(TPtr* ptr, size_t n) { /* bm_device_mem_t *mem = reinterpret_cast(*ptr); */ - bm_device_mem_t *mem = new bm_device_mem_t(); - BMDNN_CHECK(bm_malloc_device_byte(handle, mem, n)); - *ptr = mem; + // bm_device_mem_t *mem = new bm_device_mem_t(); + bm_device_mem_t mem; + BM_CHECK(bm_malloc_device_byte(handle, &mem, n)); + *ptr = TPtr(mem); } - -void BM_API::mem_free(void* ptr){ - if(ptr != nullptr){ - handle = get_bm_handle(); - bm_free_device(handle, *(struct bm_mem_desc*)(ptr)); - delete ptr; + +void BM_API::mem_free(TPtr ptr) { + if (bm_mem_get_type(ptr) == BM_MEM_TYPE_SYSTEM) { + bm_free_device(handle, ptr); + // delete ptr; } } - -void BM_API::mem_set(void* ptr, int value, size_t n){ + +void BM_API::mem_set(TPtr ptr, int value, size_t n) { //(bm_handle_t handle, const int value, bm_device_mem_t mem){ - BMDNN_CHECK(bm_memset_device(handle, value, bm_mem_from_system(ptr))); + BM_CHECK(bm_memset_device(handle, value, ptr)); //bm_device_mem_t* pmem = (struct bm_mem_desc *)(ptr); - //BMDNN_CHECK(bm_memset_device(handle, value, *pmem)); + //BM_CHECK(bm_memset_device(handle, value, *pmem)); } -void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoD) { - handle = get_bm_handle(); - //BMDNN_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, bm_mem_from_device(src), src_id, count)); - BMDNN_CHECK(bm_memcpy_d2d(handle, *(bm_device_mem_t *)(dst), dst_id, *(bm_device_mem_t *)(src), src_id, count)); - LOG(INFO) << "BM sync_memcpy: device to device, finished"; +void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count, __DtoD) { + if(count==0) + return; + //BM_CHECK(bm_memcpy_d2d(handle, bm_mem_from_device(dst), dst_id, 
bm_mem_from_device(src), src_id, count)); + BM_CHECK(bm_memcpy_d2d(handle, dst, dst_offset, src, src_offset, count)); }; -void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoD) { - handle = get_bm_handle(); - BMDNN_CHECK(bm_memcpy_s2d(handle, *(bm_device_mem_t *)(dst), bm_mem_from_system(src))); - - #ifdef DEBUG - for(int i=0; i<10; i++) - LOG(INFO) << "HtoD src: " << *((float *)(src)+i); - #endif - - LOG(INFO) << "BM sync_memcpy: host to device, finished"; -}; +void BM_API::sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __HtoD) { + if(count==0) + return; + BM_CHECK(bm_memcpy_s2d(handle, dst+dst_offset, bm_mem_from_system(const_cast(src)+src_offset))); -void BM_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoH) { - handle = get_bm_handle(); - BMDNN_CHECK(bm_memcpy_d2s(handle, bm_mem_from_system(dst), *(bm_device_mem_t *)(src))); +#ifdef DEBUG - #ifdef DEBUG - for(int i=0; i<10; i++) - LOG(INFO) << "DtoH dst: " << *((float *)(dst)+i); - #endif + for (int i = 0; i < 10; i++) { + LOG(INFO) << "HtoD src: " << *((float*)(src) + i); + } - LOG(INFO) << "BM sync_memcpy: device to host, finished"; +#endif }; -void BM_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count) { +void BM_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count, __DtoH) { + if(count==0) + return; +// LOG(INFO)<<"host ptr = "<<(dst)<<",dst_offset = "<; //! BM Buffer template class Buffer; //! BM Tensor -INSTANTIATE_TENSOR(BM, AK_BM, NCHW); -template struct Env; -#endif //USE_BM + +/** + * \brief Constructor with allocated data ptr and entire memory shape. 
only for BM +*/ +template <> +template +Tensor::Tensor(typename DataTraitBase::PtrDtype data_ptr, TargetType_t target, int id, Shape shape,DataType type = AK_FLOAT) { + + _shape = shape; + _valid_shape = shape; + _offset = Shape::zero(shape); + _dtype = type; + _type_len = type_length(type); + std::shared_ptr> buf_from_date = \ + std::make_shared>(&bm_mem_from_system(const_cast(data_ptr)), + shape.count() * _type_len, id); + + BufferMemShare(_buf, buf_from_date); + _is_shared = true; + _is_subbuf = false; +} +template class Tensor; + + +template class Env; + + } //namespace saber diff --git a/saber/core/impl/bm/tensor_op_bm.cpp b/saber/core/impl/bm/tensor_op_bm.cpp new file mode 100644 index 000000000..7c14c21da --- /dev/null +++ b/saber/core/impl/bm/tensor_op_bm.cpp @@ -0,0 +1,66 @@ +#include "core/tensor_op.h" +namespace anakin { + +namespace saber { + +template<> +void fill_tensor_const(Tensor& tensor, float value, + typename Tensor::API::stream_t stream = NULL) { + Tensor temp_tensor(tensor.shape(), tensor.get_dtype()); + temp_tensor.set_shape(tensor.valid_shape()); + fill_tensor_const(temp_tensor, value); + tensor.copy_from(temp_tensor); +} +template<> +void fill_tensor_rand(Tensor& tensor, typename Tensor::API::stream_t stream = NULL) { + Tensor temp_tensor(tensor.shape(), tensor.get_dtype()); + temp_tensor.set_shape(tensor.valid_shape()); + fill_tensor_rand(temp_tensor); + tensor.copy_from(temp_tensor); +} + +template<> +void fill_tensor_rand(Tensor& tensor, float vstart, float vend, + typename Tensor::API::stream_t stream = NULL) { + Tensor temp_tensor(tensor.shape(), tensor.get_dtype()); + temp_tensor.set_shape(tensor.valid_shape()); + fill_tensor_rand(temp_tensor, vstart, vend); + tensor.copy_from(temp_tensor); +} + +template<> +void print_tensor(Tensor& tensor, typename Tensor::API::stream_t stream = NULL) { + LOG(INFO) << "BM device tensor data:"; + Tensor temp_tensor(tensor.shape(), tensor.get_dtype()); + temp_tensor.set_shape(tensor.valid_shape()); + 
temp_tensor.copy_from(tensor); + print_tensor(temp_tensor); +} + +template<> +void print_tensor_valid(Tensor& tensor, typename Tensor::API::stream_t stream = NULL) { + LOG(INFO) << "device tensor data"; + print_tensor(tensor); +} + +template<> +double tensor_mean_value(Tensor& tensor, typename Tensor::API::stream_t stream = NULL) { + Tensor temp_tensor(tensor.shape(), tensor.get_dtype()); + temp_tensor.set_shape(tensor.valid_shape()); + temp_tensor.copy_from(tensor); + return tensor_mean_value(temp_tensor); +} + +template<> +double tensor_mean_value_valid(Tensor& tensor, + typename Tensor::API::stream_t stream = NULL) { + Tensor temp_tensor(tensor.shape(), tensor.get_dtype()); + temp_tensor.set_shape(tensor.valid_shape()); + temp_tensor.copy_from(tensor); + return tensor_mean_value(temp_tensor); +} + + + +} +} \ No newline at end of file diff --git a/saber/core/impl/cuda/cuda_device.cpp b/saber/core/impl/cuda/cuda_device.cpp index 58e832304..d26564899 100644 --- a/saber/core/impl/cuda/cuda_device.cpp +++ b/saber/core/impl/cuda/cuda_device.cpp @@ -27,8 +27,8 @@ void Device::create_stream() { typename API::stream_t stream_data; typename API::stream_t stream_compute; //cudaStreamNonBlocking - API::create_stream_with_flag(stream_data, 1); - API::create_stream_with_flag(stream_compute, 1); + API::create_stream_with_flag(&stream_data, 1); + API::create_stream_with_flag(&stream_compute, 1); _data_stream.push_back(stream_data); _compute_stream.push_back(stream_compute); } @@ -48,7 +48,7 @@ void Device::get_info() { _info._max_frequence = deviceProp.clockRate / 1000; _info._min_frequence = deviceProp.clockRate / 1000; LOG(INFO) << "frequency:" << deviceProp.clockRate / 1000 << "MHz"; - _info._generate_arch = deviceProp.major*10+deviceProp.minor; + _info._generate_arch = deviceProp.major * 10 + deviceProp.minor; LOG(INFO) << "CUDA Capability : "<< deviceProp.major << "." 
<< deviceProp.minor; _info._max_memory = deviceProp.totalGlobalMem / 1048576; LOG(INFO) << "total global memory: " << deviceProp.totalGlobalMem / 1048576 << "MBytes."; @@ -57,7 +57,19 @@ void Device::get_info() { template <> void Device::create_stream() { //todo - LOG(ERROR) << "NVHX86 create_stream is not implemented"; + //LOG(ERROR) << "NVHX86 create_stream is not implemented"; + _data_stream.clear(); + _compute_stream.clear(); + for(int i = 0; i < _max_stream; i++) { + typedef TargetWrapper API; + typename API::stream_t stream_data; + typename API::stream_t stream_compute; + //cudaStreamNonBlocking + API::create_stream_with_flag(&stream_data, 1); + API::create_stream_with_flag(&stream_compute, 1); + _data_stream.push_back(stream_data); + _compute_stream.push_back(stream_compute); + } } template <> diff --git a/saber/core/impl/cuda/cuda_impl.cpp b/saber/core/impl/cuda/cuda_impl.cpp index 4ecb27725..2c2351654 100644 --- a/saber/core/impl/cuda/cuda_impl.cpp +++ b/saber/core/impl/cuda/cuda_impl.cpp @@ -80,10 +80,13 @@ typedef TargetWrapper NVH_API; void NVH_API::get_device_count(int &count) { //todo + LOG(WARNING) << "host target NVHX86 \" get_device_count\" is not implemented"; + count = 1; } void NVH_API::set_device(int id) { //todo + LOG(WARNING) << "host target NVHX86 \" set_device\" is not implemented"; } void NVH_API::mem_alloc(void** ptr, size_t n) { @@ -100,47 +103,84 @@ void NVH_API::mem_set(void* ptr, int value, size_t n){ memset(ptr, value, n); } -void NVH_API::create_event(event_t& event, bool flag) {} +void NVH_API::create_event(event_t* event, bool flag) { + if(flag) { + CUDA_CHECK(cudaEventCreateWithFlags(event, cudaEventDefault)); + }else{ + CUDA_CHECK(cudaEventCreateWithFlags(event, cudaEventDisableTiming)); + } +} -void NVH_API::destroy_event(event_t& event) {} +void NVH_API::destroy_event(event_t event) { + CUDA_CHECK(cudaEventDestroy(event)); +} -void NVH_API::record_event(event_t& event, stream_t stream) {} +void NVH_API::record_event(event_t 
event, stream_t stream) { + CUDA_CHECK(cudaEventRecord(event, stream)); +} -void NVH_API::create_stream(stream_t& stream) {} +void NVH_API::create_stream(stream_t* stream) { + CUDA_CHECK(cudaStreamCreate(stream)); +} -void NVH_API::create_stream_with_flag(stream_t& stream, unsigned int flag) {} +void NVH_API::create_stream_with_flag(stream_t* stream, unsigned int flag) { + CUDA_CHECK(cudaStreamCreateWithFlags(stream, flag)); +} -void NVH_API::create_stream_with_priority(stream_t& stream, unsigned int flag, int priority) {} +void NVH_API::create_stream_with_priority(stream_t* stream, unsigned int flag, int priority) { + CUDA_CHECK(cudaStreamCreateWithPriority(stream, flag, priority)); +} -void NVH_API::destroy_stream(stream_t& stream) {} +void NVH_API::destroy_stream(stream_t stream) { + CUDA_CHECK(cudaStreamDestroy(stream)); +} -void NVH_API::query_event(event_t& event) {} +void NVH_API::query_event(event_t event) { + CUDA_CHECK(cudaEventQuery(event)); +} -void NVH_API::sync_event(event_t& event) {} +void NVH_API::sync_event(event_t event) { + CUDA_CHECK(cudaEventSynchronize(event)); +} + +void NVH_API::sync_stream(event_t event, stream_t stream) { + CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); +} -void NVH_API::sync_stream(event_t& event, stream_t& stream) {} +void NVH_API::sync_stream(stream_t stream) { + CUDA_CHECK(cudaStreamSynchronize(stream)); +} -void NVH_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoH) { - CUDA_CHECK(cudaMemcpy(dst, src, count, cudaMemcpyHostToHost)); - //LOG(INFO) << "NVH, sync, H2H, size: " << count; +void NVH_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __HtoH) { + CUDA_CHECK(cudaMemcpy((char*)dst + dst_offset, (char*)src + src_offset, count, cudaMemcpyHostToHost)); + CUDA_CHECK(cudaStreamSynchronize(0)); + //LOG(INFO) << "NVH, sync, H2H, size: " << count << ", src_offset: " \ + << src_offset << ", 
data:" << ((const float*)((char*)src + src_offset))[0]; } -void NVH_API::async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __HtoH) { - CUDA_CHECK(cudaMemcpy(dst, src, count, cudaMemcpyHostToHost)); +void NVH_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __HtoH) { + CUDA_CHECK(cudaMemcpy((char*)dst + dst_offset, (char*)src + src_offset, count, cudaMemcpyHostToHost)); //LOG(INFO) << "NVH, sync, H2H, size: " << count; } -void NVH_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count) {} +void NVH_API::sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, size_t count) {} -void NVH_API::async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count, stream_t& stream) {} +void NVH_API::async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream) {} int NVH_API::get_device_id(){ return 0; } + +void NVH_API::device_sync() { + CUDA_CHECK(cudaDeviceSynchronize()); +} /** * \brief for NV device target only, device target is NV gpu * use cuda api to manage memory @@ -170,16 +210,16 @@ void NV_API::mem_set(void* ptr, int value, size_t n){ CUDA_CHECK(cudaMemset(ptr, value, n)); } -void NV_API::create_event(event_t& event, bool flag) { +void NV_API::create_event(event_t* event, bool flag) { if(flag) { - CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDefault)); + CUDA_CHECK(cudaEventCreateWithFlags(event, cudaEventDefault)); }else{ - CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + CUDA_CHECK(cudaEventCreateWithFlags(event, cudaEventDisableTiming)); } } -void NV_API::create_stream(stream_t& stream) { - CUDA_CHECK(cudaStreamCreate(&stream)); +void NV_API::create_stream(stream_t* stream) { + 
CUDA_CHECK(cudaStreamCreate(stream)); } /** @@ -187,98 +227,115 @@ void NV_API::create_stream(stream_t& stream) { * @param stream input stream * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking */ -void NV_API::create_stream_with_flag(stream_t& stream, unsigned int flag) { - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, flag)); +void NV_API::create_stream_with_flag(stream_t* stream, unsigned int flag) { + CUDA_CHECK(cudaStreamCreateWithFlags(stream, flag)); } -void NV_API::create_stream_with_priority(stream_t& stream, unsigned int flag, int priority) { - CUDA_CHECK(cudaStreamCreateWithPriority(&stream, flag, priority)); +void NV_API::create_stream_with_priority(stream_t* stream, unsigned int flag, int priority) { + CUDA_CHECK(cudaStreamCreateWithPriority(stream, flag, priority)); } -void NV_API::destroy_stream(stream_t& stream) { +void NV_API::destroy_stream(stream_t stream) { CUDA_CHECK(cudaStreamDestroy(stream)); } -void NV_API::destroy_event(event_t& event) { - cudaEventDestroy(event); +void NV_API::destroy_event(event_t event) { + CUDA_CHECK(cudaEventDestroy(event)); } -void NV_API::record_event(event_t& event, stream_t stream) { - cudaEventRecord(event, stream); +void NV_API::record_event(event_t event, stream_t stream) { + CUDA_CHECK(cudaEventRecord(event, stream)); } -void NV_API::query_event(event_t& event) { - cudaEventQuery(event); +void NV_API::query_event(event_t event) { + CUDA_CHECK(cudaEventQuery(event)); } -void NV_API::sync_event(event_t& event) { - cudaEventSynchronize(event); +void NV_API::sync_event(event_t event) { + CUDA_CHECK(cudaEventSynchronize(event)); } -void NV_API::sync_stream(event_t& event, stream_t& stream) { - cudaStreamWaitEvent(stream, event, 0); +void NV_API::sync_stream(event_t event, stream_t stream) { + CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); +} + +void NV_API::sync_stream(stream_t stream) { + CUDA_CHECK(cudaStreamSynchronize(stream)); } -void NV_API::sync_memcpy(void* dst, int dst_id, const 
void* src, int src_id, \ - size_t count, __DtoD) { +void NV_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __DtoD) { + if(dst_id == src_id){ - CUDA_CHECK(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy((char*)dst + dst_offset, (char*)src + src_offset, count, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaStreamSynchronize(0)); //LOG(INFO) << "cuda, sync, D2D, size: " << count; } else{ - CUDA_CHECK(cudaMemcpyPeer(dst, dst_id, src, src_id, count)); + CUDA_CHECK(cudaMemcpyPeer((char*)dst + dst_offset, dst_id, (char*)src + src_offset, src_id, count)); //LOG(INFO) << "cuda, async, P2P, size: " << count; } } -void NV_API::async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __DtoD) { +void NV_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __DtoD) { + if(dst_id == src_id){ - CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync((char*)dst + dst_offset, (char*)src + src_offset, count, cudaMemcpyDeviceToDevice, stream)); //record_event(event, stream); //LOG(INFO) << "cuda, async, D2D, size: " << count; } else{ - CUDA_CHECK(cudaMemcpyPeerAsync(dst, dst_id, src, src_id, count, stream)); + CUDA_CHECK(cudaMemcpyPeerAsync((char*)dst + dst_offset, dst_id, (char*)src + src_offset, src_id, count, stream)); //record_event(event, stream); //LOG(INFO) << "cuda, async P2P, size: " << count; } } -void NV_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ +void NV_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ size_t count, __HtoD) { - CUDA_CHECK(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy((char*)dst + dst_offset, (char*)src + src_offset, count, 
cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaStreamSynchronize(0)); //LOG(INFO) << "cuda, sync, H2D, size: " << count; } -void NV_API::async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __HtoD) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyHostToDevice, stream)); +void NV_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __HtoD) { + CUDA_CHECK(cudaMemcpyAsync((char*)dst + dst_offset, (char*)src + src_offset, count, cudaMemcpyHostToDevice, stream)); //record_event(event, stream); //LOG(INFO) << "cuda, async, H2D, size: " << count; } -void NV_API::sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ +void NV_API::sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ size_t count, __DtoH) { - CUDA_CHECK(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy((char*)dst + dst_offset, (char*)src + src_offset, count, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaStreamSynchronize(0)); //LOG(INFO) << "cuda, sync, D2H, size: " << count; } -void NV_API::async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __DtoH) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToHost, stream)); +void NV_API::async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __DtoH) { + CUDA_CHECK(cudaMemcpyAsync((char*)dst + dst_offset, (char*)src + src_offset, count, cudaMemcpyDeviceToHost, stream)); //record_event(event, stream); //LOG(INFO) << "cuda, async, D2H, size: " << count; } -void NV_API::sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count) { - CUDA_CHECK(cudaMemcpyPeer(dst, dst_dev, src, src_dev, count)); +void NV_API::sync_memcpy_p2p(void* dst, size_t dst_offset, int 
dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count) { + CUDA_CHECK(cudaMemcpyPeer((char*)dst + dst_offset, dst_id, (char*)src + src_offset, src_id, count)); //LOG(INFO) << "cuda, sync, P2P, size: " << count; } -void NV_API::async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count, stream_t& stream) { - CUDA_CHECK(cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, count, stream)); +void NV_API::async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream) { + CUDA_CHECK(cudaMemcpyPeerAsync((char*)dst + dst_offset, dst_id, (char*)src + src_offset, src_id, count, stream)); //record_event(event, stream); //LOG(INFO) << "cuda, async, P2P, size: " << count; } @@ -293,41 +350,17 @@ int NV_API::get_device_id(){ return device_id; } -//! NV TargetWrapper -template struct TargetWrapper; - -//! NVH Buffer -INSTANTIATE_BUFFER(NVHX86); +void NV_API::device_sync() { + CUDA_CHECK(cudaDeviceSynchronize()); +} //! NV Buffer -INSTANTIATE_BUFFER(NV); - -//! NVH Tensor -INSTANTIATE_TENSOR(NVHX86, AK_FLOAT, NCHW); -INSTANTIATE_TENSOR(NVHX86, AK_FLOAT, NHWC); -INSTANTIATE_TENSOR(NVHX86, AK_FLOAT, HW); - -INSTANTIATE_TENSOR(NVHX86, AK_INT8, NCHW); -INSTANTIATE_TENSOR(NVHX86, AK_INT8, NHWC); -INSTANTIATE_TENSOR(NVHX86, AK_INT8, HW); - -INSTANTIATE_TENSOR(NVHX86, AK_HALF, NCHW); -INSTANTIATE_TENSOR(NVHX86, AK_HALF, NHWC); -INSTANTIATE_TENSOR(NVHX86, AK_HALF, HW); - -//! NV Tensor - -INSTANTIATE_TENSOR(NV, AK_FLOAT, NCHW); -INSTANTIATE_TENSOR(NV, AK_FLOAT, NHWC); -INSTANTIATE_TENSOR(NV, AK_FLOAT, HW); +template class Buffer; +template class Buffer; -INSTANTIATE_TENSOR(NV, AK_INT8, NCHW); -INSTANTIATE_TENSOR(NV, AK_INT8, NHWC); -INSTANTIATE_TENSOR(NV, AK_INT8, HW); +template class Tensor; +template class Tensor; -INSTANTIATE_TENSOR(NV, AK_HALF, NCHW); -INSTANTIATE_TENSOR(NV, AK_HALF, NHWC); -INSTANTIATE_TENSOR(NV, AK_HALF, HW); //! 
template struct Env; diff --git a/saber/core/impl/x86/x86_device.cpp b/saber/core/impl/x86/x86_device.cpp index b4d039c82..323d5d76f 100644 --- a/saber/core/impl/x86/x86_device.cpp +++ b/saber/core/impl/x86/x86_device.cpp @@ -14,8 +14,8 @@ void Device::create_stream() { typename API::stream_t stream_data; typename API::stream_t stream_compute; //cudaStreamNonBlocking - API::create_stream_with_flag(stream_data, 1); - API::create_stream_with_flag(stream_compute, 1); + API::create_stream_with_flag(&stream_data, 1); + API::create_stream_with_flag(&stream_compute, 1); _data_stream.push_back(stream_data); _compute_stream.push_back(stream_compute); } diff --git a/saber/core/impl/x86/x86_impl.cpp b/saber/core/impl/x86/x86_impl.cpp index a867a3e13..c888d9234 100644 --- a/saber/core/impl/x86/x86_impl.cpp +++ b/saber/core/impl/x86/x86_impl.cpp @@ -4,7 +4,7 @@ namespace anakin{ namespace saber{ -using namespace anakin::saber; + //! target wrapper template struct TargetWrapper; @@ -12,18 +12,7 @@ template struct TargetWrapper; template class Buffer; //! X86 Tensor -template class Tensor; -template class Tensor; -template class Tensor; -template class Tensor; - -template class Tensor; -template class Tensor; -template class Tensor; - -template class Tensor; -template class Tensor; -template class Tensor; +template class Tensor; template struct Env; diff --git a/saber/core/shape.h b/saber/core/shape.h index 658aeb6b3..d7cf3e503 100644 --- a/saber/core/shape.h +++ b/saber/core/shape.h @@ -1,16 +1,17 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_SHAPE_H #define ANAKIN_SABER_CORE_SHAPE_H @@ -26,12 +27,46 @@ class Shape : public std::vector { public: using vector = std::vector; - Shape():vector(){} - template - Shape(First first, Args... res) { - init_dims(first, res...); + Shape() : vector(), _layout(nullptr) { + create_layout(Layout_NCHW); } + Shape(vector data, LayoutType layout_type = Layout_NCHW) { + create_layout(layout_type); + CHECK_EQ(_layout->dims(), data.size()); + for (int i = 0; i < _layout->dims(); ++i) { + this->push_back(data[i]); + } + if (_layout->inner_c() != -1) { + CHECK_EQ(data[4], _layout->inner_c()) \ + << " Layout must be an integer multiple of " + << _layout->inner_c(); + } + } + ~Shape() { + delete _layout; + _layout = nullptr; + } + + Shape(const Shape& right) + : std::vector(right) { + this->clear(); + for (int i = 0; i < right.size(); ++i) { + this->push_back(right[i]); + } + create_layout(right.get_layout()); + } + + Shape &operator=(const Shape& right) { + this->clear(); + for (int i = 0; i < right.size(); ++i) { + this->push_back(right[i]); + } + delete _layout; + _layout = nullptr; + create_layout(right.get_layout()); + return *this; + } Shape operator+(const Shape& shape) { Shape tmp_shape(*this); @@ -49,7 +84,7 @@ class Shape : public std::vector { for (size_t i = 0; i < size(); i++) { tmp_shape[i] = p[i] - shape[i]; } - return tmp_shape; + return tmp_shape; } bool operator<(const Shape& shape) const { @@ -61,7 +96,7 @@ class Shape : public std::vector { const int* p = data(); 
for (size_t i = 0; i < size(); i++) { - flag &= (p[i] < shape[i]); + flag = flag && (p[i] < shape[i]); } return flag; } @@ -74,7 +109,34 @@ class Shape : public std::vector { } const int* p = data(); for (size_t i = 0; i < size(); i++) { - flag &= (p[i] <= shape[i]); + flag = flag && (p[i] <= shape[i]); + } + return flag; + } + + bool operator>(const Shape& shape) const { + + bool flag = size() == shape.size(); + if (!flag) { + return false; + } + + const int* p = data(); + for (size_t i = 0; i > size(); i++) { + flag = flag && (p[i] > shape[i]); + } + return flag; + } + + bool operator>=(const Shape& shape) const{ + + bool flag = size() == shape.size(); + if (!flag) { + return false; + } + const int* p = data(); + for (size_t i = 0; i > size(); i++) { + flag = flag && (p[i] >= shape[i]); } return flag; } @@ -87,20 +149,102 @@ class Shape : public std::vector { } const int* p = data(); for (size_t i = 0; i < size(); i++) { - flag &= (p[i] == shape[i]); + flag = flag && (p[i] == shape[i]); } return flag; } - - int count(int start = 0) const { + int num_index() const { + if (_layout) { + return _layout->num_index(); + } else { + return -1; + } + } + int channel_index() const { + if (_layout) { + return _layout->channel_index(); + } else { + return -1; + } + } + int height_index() const { + if (_layout) { + return _layout->height_index(); + } else { + return -1; + } + } + int width_index() const { + if (_layout) { + return _layout->width_index(); + } else { + return -1; + } + } + int depth_index() const { + if (_layout) { + return _layout->depth_index(); + } else { + return -1; + } + } + int num() const { + int shape_num = this->num_index() == -1 ? 1 : this->data()[this->num_index()]; + return shape_num; + } + int channel() const { + int shape_channel = this->channel_index() == -1 ? 
1 : this->data()[this->channel_index()]; + if (_layout->inner_c() != -1) { + shape_channel *= _layout->inner_c(); + } + return shape_channel; + } + int height() const { + int shape_height = this->height_index() == -1 ? 1 : this->data()[this->height_index()]; + return shape_height; + } + int width() const { + int shape_width = this->width_index() == -1 ? 1 : this->data()[this->width_index()]; + return shape_width; + } + int depth() const { + int shape_depth = this->depth_index() == -1 ? 1 : this->data()[this->depth_index()]; + return shape_depth; + } + long long count(int start = 0) const { + if (start > dims()) { + start = dims(); + } if (this->size() == 0) { return 0; } - int sum = 1; - for_each(this->begin()+start, this->end(), [&](int n){sum *= n;}); + long long sum = 1; + for_each(this->begin() + start, this->end(), [&](int n){sum *= n;}); return sum; } - + long long count(int start, int end) const { + if (start < 0) { + start = 0; + } + if (end > dims()) { + end = dims(); + } + if (end < start) { + end = start; + } + long long sum = 1; + for (int i = start; i < end; ++i) { + sum *= data()[i]; + } + return sum; + } + Shape get_stride() const { + Shape data_stride = Shape::zero(*this); + for (int i = 0; i < dims(); ++i) { + data_stride[i] = count(i + 1); + } + return data_stride; + } int dims() const { return this->size(); } @@ -119,30 +263,129 @@ class Shape : public std::vector { } return true; } - - static Shape zero(int dims){ - Shape sh; - for (int i = 0; i < dims; ++i) { - sh.push_back(0); + LayoutType get_layout() const { + if (_layout) { + return _layout->type(); + } else { + return Layout_invalid; + } + } + void set_num (const int num) { + CHECK_GT(num, 0); + if (_layout->num_index() != -1) { + this->data()[_layout->num_index()] = num; + } + } + void set_channel (const int channel) { + CHECK_GT(channel, 0); + if (_layout->channel_index() != -1) { + int shape_channel = channel; + if (_layout->inner_c() != -1) { + CHECK_EQ(channel % _layout->inner_c(), 
0); + shape_channel /= _layout->inner_c(); + } + this->data()[_layout->channel_index()] = shape_channel; + } + } + void set_height (const int height) { + CHECK_GT(height, 0); + if (_layout->height_index() != -1) { + this->data()[_layout->height_index()] = height; + } + } + void set_width (const int width) { + CHECK_GT(width, 0); + if (_layout->width_index() != -1) { + this->data()[_layout->width_index()] = width; + } + } + void set_depth (const int depth) { + CHECK_GT(depth, 0); + if (_layout->depth_index() != -1) { + this->data()[_layout->depth_index()] = depth; + } + } + void set_layout(LayoutType layout_type, std::vector new_shape = {}) { + Shape sh = *this; + Layout* layout = this->_layout; + create_layout(layout_type); + if (sh._layout== nullptr) { + return; + } + this->clear(); + if (new_shape.size() != 0) { + CHECK_EQ(_layout->dims(), new_shape.size()) << "new_shape dims miss match with layout dims"; + for (auto i : new_shape) { + this->push_back(i); + } + return; + } + this->resize(_layout->dims()); + if (_layout->num_index() != -1) { + this->data()[_layout->num_index()] = sh.num(); + } + if (_layout->channel_index() != -1) { + this->data()[_layout->channel_index()] = sh.channel(); + if (_layout->inner_c() != -1) { + CHECK_EQ(sh.channel() % _layout->inner_c(), 0); + this->data()[_layout->channel_index()] /= _layout->inner_c(); + this->data()[4] = _layout->inner_c(); + } + } + if (_layout->height_index() != -1) { + this->data()[_layout->height_index()] = sh.height(); + } + if (_layout->width_index() != -1) { + this->data()[_layout->width_index()] = sh.width(); + } + if (_layout->depth_index() != -1) { + this->data()[_layout->depth_index()] = sh.depth(); + } + delete layout; + } + static Shape zero(const Shape &right){ + Shape sh = right; + for (int i = 0; i < right.size(); ++i) { + sh[i] = 0; } return sh; } - static Shape minusone(int dims){ - Shape sh; - for (int i = 0; i < dims; ++i) { - sh.push_back(-1); + static Shape minusone(const Shape &right){ + 
Shape sh = right; + for (int i = 0; i < right.size(); ++i) { + sh[i] = -1; } return sh; } + friend std::ostream& operator<<(std::ostream& out, const Shape& s) { + out << " [AK Shape : ("; + for (int i = 0; i < s.dims(); i++) { + out << s.data()[i] << ","; + } + out << ") ]"; + return out; + } + +protected: + Layout* _layout{nullptr}; private: - template - void init_dims(First head, Args...args){ - push_back(head); - init_dims(args...); + void create_layout(LayoutType layout_type) { + switch(layout_type) { + case Layout_invalid: this->_layout = nullptr; break; + case Layout_W: this->_layout = new W(); break; + case Layout_HW: this->_layout = new HW(); break; + case Layout_WH: this->_layout = new WH(); break; + case Layout_NW: this->_layout = new NW(); break; + case Layout_NHW: this->_layout = new NHW(); break; + case Layout_NCHW: this->_layout = new NCHW(); break; + case Layout_NHWC: this->_layout = new NHWC(); break; + case Layout_NCHW_C4: this->_layout = new NCHW_C4(); break; + case Layout_NCHW_C8: this->_layout = new NCHW_C8(); break; + case Layout_NCHW_C16: this->_layout = new NCHW_C16(); break; + } } - void init_dims(){}; }; } //namespace saber diff --git a/saber/core/target_traits.h b/saber/core/target_traits.h index b4eb38ff0..9a059313a 100644 --- a/saber/core/target_traits.h +++ b/saber/core/target_traits.h @@ -1,16 +1,17 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_CORE_TARGET_TRAITS_H #define ANAKIN_SABER_CORE_TARGET_TRAITS_H diff --git a/saber/core/target_wrapper.h b/saber/core/target_wrapper.h index aafbf3648..d13eda671 100644 --- a/saber/core/target_wrapper.h +++ b/saber/core/target_wrapper.h @@ -1,23 +1,25 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. 
+*/ #ifndef ANAKIN_SABER_CORE_TARGET_WRAPPER_H #define ANAKIN_SABER_CORE_TARGET_WRAPPER_H -#include "core/target_traits.h" +#include "saber/core/target_traits.h" +#include "saber/core/data_traits.h" #include -namespace anakin { +namespace anakin{ namespace saber { @@ -66,11 +68,13 @@ struct TargetWrapper { */ static void get_device_count(int& count) { // todo - LOG(WARNING) << "host target \" get_device_count\" is not implemented"; + LOG(WARNING) << "host target x \" get_device_count\" is not implemented"; count = 1; } static void set_device(int id) { + LOG(INFO) << "set_device"; + // todo } @@ -105,70 +109,73 @@ struct TargetWrapper { * \brief create event, empty function for host target * */ - static void create_event(event_t& event, bool flag = false) {} + static void create_event(event_t* event, bool flag = false) {} /** * \brief destroy event, empty function for host target * */ - static void destroy_event(event_t& event) {} + static void destroy_event(event_t event) {} /** * \brief create stream, empty function for host target * */ - static void create_stream(stream_t& stream) {} + static void create_stream(stream_t* stream) {} /** * \brief create stream with flag, empty function for host target * */ - static void create_stream_with_flag(stream_t& stream, unsigned int flag) {} + static void create_stream_with_flag(stream_t* stream, unsigned int flag) {} /** * \brief create stream with priority, empty function for host target * */ - static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority) {} + static void create_stream_with_priority(stream_t* stream, unsigned int flag, int priority) {} /** * \brief destroy event, empty function for host target * */ - static void destroy_stream(stream_t& stream) {} + static void destroy_stream(stream_t stream) {} /** * \brief record event, empty function for host target * */ - static void record_event(event_t& event, stream_t stream) {} + static void record_event(event_t event, stream_t stream) 
{} /** * \brief query event, empty function for host target * */ - static void query_event(event_t& event) {} + static void query_event(event_t event) {} /** * \brief synchronize event, empty function for host target * */ - static void sync_event(event_t& event) {} + static void sync_event(event_t event) {} /** * \brief crreate event, empty function for host target * */ - static void sync_stream(event_t& event, stream_t& stream) {} + static void sync_stream(event_t event, stream_t stream) {} + + static void sync_stream(stream_t stream) {} /** * \brief memory copy function, use memcopy from host to host * */ - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoH) { - memcpy(dst, src, count); + static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __HtoH) { + memcpy((char*)dst + dst_offset, (char*)src + src_offset, count); //LOG(INFO) << "host, sync, H2H, size: " << count; } @@ -181,9 +188,10 @@ struct TargetWrapper { * @param src_id * @param count */ - static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __HtoH) { - memcpy(dst, src, count); + static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __HtoH) { + memcpy((char*)dst + dst_offset, (char*)src + src_offset, count); //LOG(INFO) << "host, sync, H2H, size: " << count; } @@ -196,8 +204,8 @@ struct TargetWrapper { * @param src_dev * @param count */ - static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count) {} + static void sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, size_t count) {} /** * \brief asynchronize memcpy peer to peer, for device memory copy between different devices @@ -208,8 +216,8 @@ struct TargetWrapper { * 
@param src_dev * @param count */ - static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count, stream_t& stream) {} + static void async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, size_t count, stream_t stream) {} /** * \brief host target return 0 @@ -218,6 +226,8 @@ struct TargetWrapper { static int get_device_id() { return 0; } + + static void device_sync() {} }; @@ -230,8 +240,8 @@ struct TargetWrapper { */ template <> struct TargetWrapper { - typedef __invalid_type event_t; - typedef __invalid_type stream_t; + typedef cudaEvent_t event_t; + typedef cudaStream_t stream_t; static void get_device_count(int& count); @@ -243,39 +253,45 @@ struct TargetWrapper { static void mem_set(void* ptr, int value, size_t n); - static void create_event(event_t& event, bool flag = false); + static void create_event(event_t* event, bool flag = false); + + static void destroy_event(event_t event); - static void destroy_event(event_t& event); + static void record_event(event_t event, stream_t stream); - static void record_event(event_t& event, stream_t stream); + static void create_stream(stream_t* stream); - static void create_stream(stream_t& stream); + static void create_stream_with_flag(stream_t* stream, unsigned int flag); - static void create_stream_with_flag(stream_t& stream, unsigned int flag); + static void create_stream_with_priority(stream_t* stream, unsigned int flag, int priority); - static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority); + static void destroy_stream(stream_t stream); - static void destroy_stream(stream_t& stream); + static void query_event(event_t event); - static void query_event(event_t& event); + static void sync_event(event_t event); - static void sync_event(event_t& event); + static void sync_stream(event_t event, stream_t stream); - static void sync_stream(event_t& event, stream_t& stream); + static void 
sync_stream(stream_t stream); - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoH); + static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __HtoH); - static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __HtoH); + static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __HtoH); - static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count); + static void sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, size_t count); - static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count, stream_t& stream); + static void async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream); static int get_device_id(); + static void device_sync(); }; /** @@ -302,80 +318,85 @@ struct TargetWrapper { //template static void mem_set(void* ptr, int value, size_t n); - static void create_event(event_t& event, bool flag = false); + static void create_event(event_t* event, bool flag = false); - static void create_stream(stream_t& stream); + static void create_stream(stream_t* stream); /** * \brief create cuda stream with flag * @param stream input stream * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking */ - static void create_stream_with_flag(stream_t& stream, unsigned int flag); + static void create_stream_with_flag(stream_t* stream, unsigned int flag); - static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority); + static void create_stream_with_priority(stream_t* stream, unsigned int flag, int priority); - static void 
destroy_stream(stream_t& stream); + static void destroy_stream(stream_t stream); - static void destroy_event(event_t& event); + static void destroy_event(event_t event); - static void record_event(event_t& event, stream_t stream); + static void record_event(event_t event, stream_t stream); - static void query_event(event_t& event); + static void query_event(event_t event); - static void sync_event(event_t& event); + static void sync_event(event_t event); - static void sync_stream(event_t& event, stream_t& stream); + static void sync_stream(event_t event, stream_t stream); + static void sync_stream(stream_t stream); - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoD); + static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __DtoD); - static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __DtoD); + static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __DtoD); - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoD); + static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __HtoD); - static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __HtoD); + static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __HtoD); - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoH); + static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __DtoH); - static void async_memcpy(void* dst, int 
dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __DtoH); + static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __DtoH); - static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count); + static void sync_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count); - static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count, stream_t& stream); + static void async_memcpy_p2p(void* dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream); /** * \brief device target return currently used device id * @return currently activated device id */ static int get_device_id(); + static void device_sync(); }; #endif //USE_CUDA -#ifdef USE_BM - /** + +#ifdef USE_BM_PLACE +/** * \brief for Bitmain sophon device target only, device target is BM tpu * use bitmain api to manage memory * support device to device, device to host, host to device memcpy */ template <> struct TargetWrapper { -// TargetWrapper () -// { -// CHECK_EQ(bmdnn_init(&handle),BM_SUCCESS) << "Error:bmdnn_init failed"; -// } -// ~TargetWrapper () -// { -// CHECK_EQ(bmdnn_deinit(handle),BM_SUCCESS) << "Error:bmdnn_deinit failed"; -// } + + typedef typename DataTraitBase::PtrDtype TPtr; typedef void* event_t; typedef void* stream_t; @@ -385,51 +406,173 @@ struct TargetWrapper { static void set_device(int id); //We should add strategy to avoid malloc directly - static void mem_alloc(void** ptr, size_t n); + static void mem_alloc(TPtr* ptr, size_t n); //template - static void mem_free(void * ptr); - + static void mem_free(TPtr ptr); + //template - static void mem_set(void* ptr, int value, size_t n); + static void mem_set(TPtr ptr, int value, size_t n); // brief create event, empty function 
for bitmain target - static void create_event(event_t& event, bool flag = false) {} + static void create_event(event_t* event, bool flag = false) {} static void destroy_event(event_t& event) {} - static void create_stream(stream_t& stream) {} - static void create_stream_with_flag(stream_t& stream, unsigned int flag) {} - static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority) {} - static void destroy_stream(stream_t& stream) {} - static void record_event(event_t& event, stream_t stream) {} - static void query_event(event_t& event) {} - static void sync_event(event_t& event) {} - static void sync_stream(event_t& event, stream_t& stream) {} + static void create_stream(stream_t* stream) {} + static void create_stream_with_flag(stream_t* stream, unsigned int flag) {} + static void create_stream_with_priority(stream_t* stream, unsigned int flag, int priority) {} + static void destroy_stream(stream_t stream) {} + static void record_event(event_t event, stream_t stream) {} + static void query_event(event_t event) {} + static void sync_event(event_t event) {} + static void sync_stream(event_t event, stream_t stream) {} // brief create event, empty function for bitmain target - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + static void sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ size_t count, __DtoD); - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + static void sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ size_t count, __HtoD); - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ size_t count, __DtoH); - static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count); + 
static void sync_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count); /** * \brief device target return currently used device id * @return currently activated device id */ static int get_device_id(); + static void device_sync(){}; + static bm_handle_t get_handle(); + static void init_handle(); + static void deinit_handle(); + +}; +#endif //USE_BM_PLACE + +#ifdef AMD_GPU + +/** + * \brief for AMD device target only, device target is AMD gpu + * use cuda api to manage memory + * support device to device, device to host, host to device memcpy +*/ +template <> +struct TargetWrapper { + + typedef typename DataTraitBase::PtrDtype TPtr; + + typedef cl_event event_t; + typedef cl_command_queue stream_t; + + static void get_device_count(int& count); + + static void set_device(int id); + + //We should add strategy to avoid malloc directly + static void mem_alloc(TPtr* ptr, size_t n); + + //template + static void mem_free(TPtr ptr); + + static void mem_set(TPtr ptr, int value, size_t n); + + static void create_event(event_t* event, bool flag = false); + + static void create_stream(stream_t* stream); + + /** + * \brief create cuda stream with flag + * @param stream input stream + * @param flag input flag, 0: default stream flag, 1: cudaStreamNonBlocking + */ + static void create_stream_with_flag(stream_t* stream, unsigned int flag); + + static void create_stream_with_priority(stream_t* stream, unsigned int flag, int priority); + + static void destroy_stream(stream_t stream); + + static void destroy_event(event_t event); + + static void record_event(event_t event, stream_t stream); + + static void query_event(event_t event); + + static void sync_event(event_t event); + + static void sync_stream(event_t event, stream_t stream); + static void sync_stream(stream_t stream); + + static void sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count, 
__DtoD); + + static void async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __DtoD); + + static void sync_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, __HtoD); + + static void async_memcpy(TPtr dst, size_t dst_offset, int dst_id, \ + const void* src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __HtoD); + + static void sync_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count, __DtoH); + + static void async_memcpy(void* dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count, stream_t stream, __DtoH); + + static void sync_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, size_t count); + + static void async_memcpy_p2p(TPtr dst, size_t dst_offset, int dst_id, \ + const TPtr src, size_t src_offset, int src_id, \ + size_t count, stream_t stream); + + /** + * \brief device target return currently used device id + * @return currently activated device id + */ + static int get_device_id(); + + static void device_sync(); + + //static cl_platform_id platform_id; + //static cl_device_id current_device_id; + + static cl_platform_id get_platform_id(); + + /** + * \brief create cuda stream with flag + * @param stream input stream + * @param flag input flag + */ + static void _create_stream_with_flag(stream_t* stream, cl_context context, cl_device_id dev, unsigned int flag); + + //static void init(); + + //static cl_int enable_amd; + //static cl_device_id* device_ids; + //static cl_platform_id platform_id; + //static cl_uint device_nums; + static int current_device_id_index; + static std::map buffers; + //static cl_context* contexts; -// static bm_handle_t get_handler(); - -// bm_handle_t handle; }; -#endif //USE_BM +#endif //AMD_GPU } 
//namespace saber diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 860749981..20ceaf770 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,66 +16,30 @@ #ifndef ANAKIN_SABER_CORE_TENSOR_H #define ANAKIN_SABER_CORE_TENSOR_H -#include "core/shape.h" -#include "core/events.h" -#include "core/tensor_traits.h" -#include +#include "saber/core/shape.h" +#include "saber/core/events.h" +#include "saber/core/buffer.h" + namespace anakin{ namespace saber{ -#define INSTANTIATE_TENSOR(TargetType, datatype, LayOutType) \ - template class Tensor; - -class TensorBase { +template +class Tensor { public: - TensorBase() {} - virtual ~TensorBase() {} - virtual SaberStatus set_shape(Shape valid_shape, Shape shape = Shape(), \ - Shape offset = Shape()) = 0; - virtual SaberStatus reshape(Shape valid_shape, Shape shape = Shape(), \ - Shape offset = Shape()) = 0; - virtual SaberStatus re_alloc(Shape shape) = 0; - virtual bool is_continue_mem() const = 0; - virtual int size() const = 0; - virtual int valid_size() const = 0; - virtual int count(int start, int end) const = 0; - virtual int count_valid(int start, int end) const = 0; - virtual int dims() const = 0; - virtual Shape shape() const = 0; - virtual Shape valid_shape() const = 0; - virtual Shape get_stride() const = 0; - virtual Shape offset() const = 0; - virtual int device_id() const = 0; - virtual int num() const = 0; - virtual int num_index() const = 0; - virtual int channel() const = 0; - virtual int channel_index() const = 0; - virtual int height() const = 0; - virtual int height_index() const = 0; - virtual int width() const = 0; - virtual int width_index() const = 0; -}; -template -class Tensor : public TensorBase { -public: - typedef 
TargetType targetType_t; - typedef typename DataTrait::dtype Dtype; + typedef typename DataTraitBase::PtrDtype BaseDtype; + typedef typename TargetTypeTraits::target_category target_category; typedef typename TargetTypeTraits::target_type target_type; typedef TargetWrapper API; - typedef TensorTraits> TensorAPI; - typedef typename TensorAPI::layout_category layout_category; - typedef typename TensorAPI::layout_type layout_type; /** * \brief Default constructor */ - Tensor() { - _shape = Shape::zero(TensorAPI::layout_dims::value); - _valid_shape = Shape::zero(TensorAPI::layout_dims::value); - _offset = Shape::zero(TensorAPI::layout_dims::value); + Tensor(DataType type = AK_FLOAT) : _valid_shape(), _shape(), _offset() { + _dtype = type; + _type_len = type_length(type); _buf = std::make_shared>(); _is_subbuf = false; } @@ -83,106 +47,163 @@ class Tensor : public TensorBase { /** * \brief Constructor with shape, memory is alloced according to shape. */ - Tensor(Shape shape) { - - CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; + Tensor(Shape shape, DataType type = AK_FLOAT) { _shape = shape; _valid_shape = shape; - _offset = Shape::zero(shape.dims()); - _buf = std::make_shared>(shape.count() * _type_len()); - _is_subbuf = false; - } -#if 0 - /** - * \brief constructor with currently used shape, offset and entire memory shape, - * memory is alloced according to the shape - */ - Tensor(Shape shape, Shape valid_shape, Shape offset) { - CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; - CHECK_EQ(valid_shape.dims(), TensorAPI::layout_dims::value) << \ - "valid shape dims is not matched to layout type"; - CHECK_EQ(offset.dims(), TensorAPI::layout_dims::value) << \ - "offset dims is not matched to layout type"; - CHECK_EQ(true, (offset + valid_shape) <= shape) << \ - "valid shape + offset should <= shape"; - _shape = shape; - _valid_shape = valid_shape; - _offset = 
offset; - _buf = std::make_shared>(_shape.count()); + _offset = Shape::zero(shape); + _dtype = type; + _type_len = type_length(type); + _buf = std::make_shared>(shape.count() * _type_len); + _is_shared = false; _is_subbuf = false; } -#endif + /** * \brief Constructor with allocated data ptr and entire memory shape. */ -// template -// Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape) { -// -// CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ -// "shape dims is not matched to layout type"; -// _shape = shape; -// _valid_shape = shape; -// _offset = Shape::zero(shape.dims()); -// std::shared_ptr> buf_from_date = \ -// std::make_shared>(data_ptr, shape.count() * _type_len(), id); -// BufferMemShare(_buf, buf_from_date); -// _is_subbuf = false; -// } + //! now only support fp32 data pointer + template + Tensor(typename DataTraitBase::PtrDtype data_ptr, TargetType_t target, int id, Shape shape, DataType type = AK_FLOAT) { -#ifdef USE_BM - /** - * \brief Constructor with allocated data ptr and entire memory shape. only for BM - */ - template - Tensor(Dtype_s* data_ptr, TargetType_t target, int id, Shape shape) { - CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; _shape = shape; _valid_shape = shape; - _offset = Shape::zero(shape.dims()); - - if(typeid(Dtype_s) == typeid(AK_FLOAT)) - { - std::shared_ptr> buf_from_date = \ - std::make_shared>(&bm_mem_from_system(const_cast(data_ptr)), shape.count() * _type_len(), id); - - BufferMemShare(_buf, buf_from_date); - } - else - { + _offset = Shape::zero(shape); + _dtype = type; + _type_len = type_length(type); std::shared_ptr> buf_from_date = \ - std::make_shared>(data_ptr, shape.count() * _type_len(), id); - + std::make_shared>(data_ptr, shape.count() * _type_len, id); BufferMemShare(_buf, buf_from_date); - } + _is_shared = true; _is_subbuf = false; } -#endif + /** * \brief Copy constructor, shallow copy. 
*/ - Tensor(const Tensor& tensor){ + Tensor(const Tensor& tensor) { _shape = tensor._shape; _valid_shape = tensor._valid_shape; _offset = tensor._offset; + _dtype = tensor._dtype; + _type_len = tensor._type_len; _buf = tensor._buf; _is_subbuf = tensor._is_subbuf; + _is_shared = tensor._is_shared; _seq_offset = tensor._seq_offset; + _scale = tensor._scale; } /** * \brief Copy constructor without events control. */ - Tensor(Tensor& tensor){ + Tensor(Tensor& tensor) { _shape = tensor._shape; _valid_shape = tensor._valid_shape; _offset = tensor._offset; + _dtype = tensor._dtype; + _type_len = tensor._type_len; _buf = tensor._buf; tensor.add_events(&_events_tree); _is_subbuf = tensor._is_subbuf; + _is_shared = tensor._is_shared; _seq_offset = tensor._seq_offset; + _scale = tensor._scale; + } +#if 0 + /** + * \brief create tensor with buffer + * @param shape + * @param type_len + * @param flag_create_lp + */ + void create(Shape shape, DataType type = AK_FLOAT) { + _dtype = type; + _type_len = type_length(type); + _shape = shape; + _valid_shape = shape; + _offset = Shape::zero(shape); + _dtype = type; + _type_len = type_length(type); + _buf = std::make_shared>(shape.count() * _type_len); + _is_shared = false; + _is_subbuf = false; + } +#endif // 0 + /** + * \brief set scale for different precision data convert + * @param scale + */ + void set_scale(std::vector scale) { + _scale = scale; + } + + /** + * \brief get scale + * @param scale + */ + std::vector get_scale() const { + return _scale; + } + + SaberStatus set_dtype(DataType type) { + _dtype = type; + _type_len = type_length(type); + if (_buf->get_capacity() < _shape.count() * _type_len) { + if (_is_shared || _is_subbuf) { + LOG(FATAL) << "tensor is shared, memory can not be re-alloced"; + return SaberOutOfAuthority; + } + _buf->re_alloc(_shape.count() * _type_len); + } + return SaberSuccess; + } + + /** + * \brief get tensor's DataType, AK_INT8 / AK_FLOAT ... 
+ * @return + */ + DataType get_dtype() const { + return _dtype; + } + + size_t get_dtype_size() const { + switch(_dtype) { + case AK_HALF: { + return sizeof(unsigned short); + } break; + case AK_FLOAT: { + return sizeof(float); + } break; + case AK_DOUBLE: { + return sizeof(double); + } break; + case AK_INT8: { + return sizeof(int8_t); + } break; + case AK_INT32: { + return sizeof(int); + } break; + default: { + LOG(ERROR) << "tensor's data type is not supported. "; + return -1; + } break; + } + return -1; + } + + + /** + * \brief change tensor's layout and type + * @param layout + * @param data + * @return + */ + SaberStatus set_layout(LayoutType layout, std::vector data = {}) { + _valid_shape.set_layout(layout, data); + return SaberSuccess; + } + LayoutType get_layout() const { + return _shape.get_layout(); } /** @@ -191,40 +212,24 @@ class Tensor : public TensorBase { * \param valid_shape * \param offset */ - SaberStatus set_shape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape() \ - /*Shape shape = Shape::zero(TensorAPI::layout_dims::value), \ - Shape offset = Shape::minusone(TensorAPI::layout_dims::value)*/) { + SaberStatus set_shape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape()) { - //if (shape.dims() != TensorAPI::layout_dims::value || \ - valid_shape.dims() != TensorAPI::layout_dims::value \ - || offset.dims() != TensorAPI::layout_dims::value || \ - !(valid_shape > Shape::zero(TensorAPI::layout_dims::value))) { \ - return SaberInvalidValue; \ - } - CHECK_EQ(valid_shape.dims(), TensorAPI::layout_dims::value) << \ - "valid shape dims is not matched to layout type"; if (shape.dims() > 0) { - CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; _shape = shape; } if (offset.dims() > 0 && _is_subbuf) { - CHECK_EQ(offset.dims(), TensorAPI::layout_dims::value) << \ - "offset dims is not matched to layout type"; _offset = offset; } - CHECK_EQ(valid_shape > 
Shape::zero(TensorAPI::layout_dims::value), true) << \ - "valid_shape size should > 0"; + CHECK_EQ(valid_shape > Shape::zero(valid_shape), true) << "valid_shape size should > 0"; _valid_shape = valid_shape; if (!_is_subbuf) { if (_shape.count() <= _valid_shape.count()) { _shape = _valid_shape; } - _offset = Shape::zero(TensorAPI::layout_dims::value); + _offset = Shape::zero(valid_shape); } else { - auto shape_zero = Shape::zero(TensorAPI::layout_dims::value); - if (_shape == shape_zero) { + if (_shape == Shape::zero(_valid_shape)) { _shape = valid_shape; } //if (!(_valid_shape + _offset <= _shape)) { \ @@ -239,64 +244,36 @@ class Tensor : public TensorBase { /** * \brief Free old buffer and alloc a new tensor buffer. */ - SaberStatus re_alloc(Shape shape){ + SaberStatus re_alloc(Shape shape, DataType type = AK_FLOAT) { //if (!shape.dims() == TensorAPI::layout_dims::value) { // return SaberInvalidValue; //} //if (_is_subbuf || _is_shared) { // return SaberOutOfAuthority; //} - CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; - CHECK_EQ(_is_shared || _is_subbuf, false) << \ - "shared tensor could not re_alloc"; // by ccw 2018/4/3 + CHECK_EQ(_is_shared || _is_subbuf, false) << "shared tensor could not re_alloc"; + _dtype = type; + _type_len = type_length(type); _shape = shape; _valid_shape = _shape; - _offset =Shape::zero(_shape.dims()); - _buf->alloc(_shape.count() * _type_len()); + _offset =Shape::zero(_shape); + _buf->alloc(_shape.count() * _type_len); return SaberSuccess; } - void try_expand_size(Shape& shape) { - // LOG(INFO)<<"in try expand "< (valid_size())) { - re_alloc(shape); - } - - } - void try_expand_size(int size) { - Shape shape(1, 1, 1, size); - try_expand_size(shape); - } - - /** * \brief Change tensor shape, * if input shape's count is bigger than the capacity of buffer, alloc a new buffer. 
*/ - SaberStatus reshape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape()\ - /*Shape::zero(TensorAPI::layout_dims::value), \ - Shape offset = Shape::minusone(TensorAPI::layout_dims::value)*/) { + SaberStatus reshape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape()) { - //if (shape.dims() != TensorAPI::layout_dims::value || \ - valid_shape.dims() != TensorAPI::layout_dims::value \ - || offset.dims() != TensorAPI::layout_dims::value || \ - !(valid_shape > Shape::zero(TensorAPI::layout_dims::value))) { \ - return SaberInvalidValue; \ - } - CHECK_EQ(valid_shape.dims(), TensorAPI::layout_dims::value) << \ - "valid shape dims is not matched to layout type"; if (shape.dims() > 0) { - CHECK_EQ(shape.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; _shape = shape; } if (offset.dims() > 0 && _is_subbuf) { - CHECK_EQ(offset.dims(), TensorAPI::layout_dims::value) << \ - "offset dims is not matched to layout type"; _offset = offset; } - CHECK_EQ(valid_shape > Shape::zero(TensorAPI::layout_dims::value), true) << \ + CHECK_EQ(valid_shape > Shape::zero(valid_shape), true) << \ "valid_shape size should > 0"; _valid_shape = valid_shape; @@ -304,9 +281,9 @@ class Tensor : public TensorBase { if (_shape.count() < _valid_shape.count()) { _shape = _valid_shape; } - _offset = Shape::zero(TensorAPI::layout_dims::value); + _offset = Shape::zero(_valid_shape); } else { - if (_shape == Shape::zero(TensorAPI::layout_dims::value)) { + if (_shape == Shape::zero(valid_shape)) { _shape = valid_shape; } //if (!(_valid_shape + _offset <= _shape)) { \ @@ -315,13 +292,13 @@ class Tensor : public TensorBase { CHECK_EQ(_valid_shape + _offset <= _shape, true) << \ "valid_shape + offet should <= shape"; } - bool exceed_flag = _shape.count() * _type_len() > _buf->get_capacity() \ + bool exceed_flag = _shape.count() * _type_len > _buf->get_capacity() \ && (_is_subbuf || _is_shared); //if (exceed_flag) { // return SaberOutOfAuthority; 
//} CHECK_EQ(exceed_flag, false) << "shared tensor shape exceed origin data buffer size"; - SABER_CHECK(_buf->re_alloc(_shape.count() * _type_len())); + SABER_CHECK(_buf->re_alloc(_shape.count() * _type_len)); return SaberSuccess; } @@ -338,21 +315,8 @@ class Tensor : public TensorBase { * \param end Input end index (exclude in calculation). * \return the size from start index to end index. */ - int count(int start, int end) const { - //if (start < 0) { \ - start = 0; \ - } - //if (end > dims()) { \ - end = dims(); \ - } - CHECK_GE(start, 0) << "start index shold >= 0!"; - CHECK_LE(end, _shape.size()) << "end index shold <= shape dims!"; - CHECK_LE(start, end) << "start index should < end index!"; - int sum = 1; - for (int i = start; i < end; ++i) { - sum *= _shape[i]; - } - return sum; + long long count(int start, int end) const { + return _shape.count(start, end); } /** @@ -361,27 +325,14 @@ class Tensor : public TensorBase { * \param end input end index (exclude in calculation). * \return the size from start index to end index. */ - int count_valid(int start, int end) const { - //if (start < 0) { \ - start = 0; \ - } - //if (end > dims()) { \ - end = dims(); \ - } - CHECK_GE(start, 0) << "start index shold >= 0!"; - CHECK_LE(end, _valid_shape.size()) << "end index shold <= shape dims!"; - CHECK_LE(start, end) << "start index should < end index!"; - int sum = 1; - for (int i = start; i < end; ++i) { - sum *= _valid_shape[i]; - } - return sum; + long long count_valid(int start, int end) const { + return _valid_shape.count(start, end); } /** * \brief Return tensor shape size, not the valid shape size. */ - int size() const { + long long size() const { return _shape.count(); } @@ -389,7 +340,7 @@ class Tensor : public TensorBase { * \brief Return the valid shape size. * \return Return the valid shape size. 
*/ - int valid_size() const{ + long long valid_size() const{ return _valid_shape.count(); } @@ -397,7 +348,7 @@ class Tensor : public TensorBase { * \brief Return tensor shape dims. */ int dims() const { - return TensorAPI::layout_dims::value; + return _valid_shape.dims(); } /** @@ -418,18 +369,10 @@ class Tensor : public TensorBase { * \brief compute data stride. */ Shape get_stride() const { - Shape data_stride = Shape::zero(dims()); if (_is_subbuf) { - for (int i = 0; i < dims(); ++i) { - data_stride[i] = _shape.count(i + 1); - } - } else { - for (int i = 0; i < dims(); ++i) { - data_stride[i] = _valid_shape.count(i + 1); - } + return _shape.get_stride(); } - - return data_stride; + return _valid_shape.get_stride(); } /** @@ -440,11 +383,44 @@ class Tensor : public TensorBase { } /** - * \brief Return reference shared_ptr of tensor. + * \brief Return valid shape of tensor */ - const std::shared_ptr>& get_buf() const { - return _buf; - } + int data_offset() const { + return start_index(); + } + + + /** + * \brief get sequence offset, lot tensor + * @return + */ + std::vector> get_seq_offset() const { + return _seq_offset; + } + + /** + * \brief set sequence offset, lot tensor + * @param seq_offset + * @return + */ + SaberStatus set_seq_offset(std::vector> seq_offset) { + _seq_offset = seq_offset; + return SaberSuccess; + } + +// /** +// * \brief Return reference shared_ptr of tensor. +// */ +// const std::shared_ptr>& get_buf() const { +// return _fbuf; +// } +// +// /** +// * \brief Return reference shared_ptr of tensor. +// */ +// const std::shared_ptr>& get_lpbuf() const { +// return _lpbuf; +// } /** * \brief Return tensor device id. @@ -457,21 +433,31 @@ class Tensor : public TensorBase { * \brief Return number */ int num() const { - return TensorAPI::num(_valid_shape); + return _valid_shape.num(); } /** * \brief Return number index in shape. 
*/ int num_index() const { - return TensorAPI::num_idx::value; - }; + return _valid_shape.num_index(); + } + + /** + * \brief set number to valid shape. + */ + void set_num(int num) { + _valid_shape.set_num(num); + if (_shape.count() < _valid_shape.count()) { + _shape = _valid_shape; + } + } /** * \brief Return channel. */ int channel() const { - return TensorAPI::channel(_valid_shape); + return _valid_shape.channel(); } /** @@ -479,7 +465,17 @@ class Tensor : public TensorBase { * \return */ int channel_index() const { - return TensorAPI::channel_idx::value; + return _valid_shape.channel_index(); + } + + /** + * \brief set channel to valid shape. + */ + void set_channel(int channel) { + _valid_shape.set_channel(channel); + if (_shape.count() < _valid_shape.count()) { + _shape = _valid_shape; + } } /** @@ -487,7 +483,7 @@ class Tensor : public TensorBase { * \return */ int height() const { - return TensorAPI::height(_valid_shape); + return _valid_shape.height(); } /** @@ -495,7 +491,17 @@ class Tensor : public TensorBase { * \return */ int height_index() const { - return TensorAPI::height_idx::value; + return _valid_shape.height_index(); + } + + /** + * \brief set height to valid shape. + */ + void set_height(int h) { + _valid_shape.set_height(h); + if (_shape.count() < _valid_shape.count()) { + _shape = _valid_shape; + } } /** @@ -503,7 +509,7 @@ class Tensor : public TensorBase { * \return */ int width() const { - return TensorAPI::width(_valid_shape); + return _valid_shape.width(); } /** @@ -511,13 +517,23 @@ class Tensor : public TensorBase { * \return */ int width_index() const { - return TensorAPI::width_idx::value; + return _valid_shape.width_index(); } /** - * \brief Return tensor mutable data pointer, with data type of current tensor (Dtype*). + * \brief set width to valid shape. 
*/ - Dtype* mutable_data(int index = 0) { + void set_width(int w) { + _valid_shape.set_width(w); + if (_shape.count() < _valid_shape.count()) { + _shape = _valid_shape; + } + } + + /** + * \brief Return tensor mutable data pointer void*. + */ + BaseDtype mutable_data() { // synchronize the events tree //sync(); CHECK_EQ(device_id(), API::get_device_id()) << \ @@ -525,13 +541,14 @@ class Tensor : public TensorBase { if (_buf->get_capacity() == 0){ return nullptr; } - return static_cast(_buf->get_data_mutable()) + start_index() + index; + + return static_cast(_buf->get_data_mutable()); } /** * \brief Return tensor data pointer, with data type of current tensor (Dtype*). */ - const Dtype * data(int index = 0) const { + const BaseDtype data() const { // synchronize the events tree //sync(); CHECK_EQ(device_id(), API::get_device_id()) << \ @@ -539,7 +556,7 @@ class Tensor : public TensorBase { if (_buf->get_capacity() == 0){ return nullptr; } - return static_cast(_buf->get_data()) + start_index() + index; + return static_cast(_buf->get_data_mutable()); } /** @@ -549,21 +566,19 @@ class Tensor : public TensorBase { * only shared buffer ptr, current tensor will have continuous memory, * only if current shape and valid shape are the same, and offset is all set to 0. 
*/ - //template ::layout_type>::value>::type> - //class = typename std::enable_if::layout_type>::value>::type > - template - SaberStatus share_from(const Tensor_t& tensor) { - - CHECK_EQ(_shape > Shape::zero(TensorAPI::layout_dims::value), true) << \ - "current tensor is not initialized (no shape info, use set_shape)"; - typedef typename Tensor_t::Dtype dtype_t; - CHECK_LE(size() * _type_len(), tensor.size() * sizeof(dtype_t)) << \ - "current tensor size should <= input tensor size"; - - _is_shared = BufferMemShare(_buf, tensor.get_buf()) > 0; + SaberStatus share_from(const Tensor& tensor) { + + CHECK_LE(size(), tensor.size()) << "current tensor size should <= input tensor size"; + + //_is_shared = BufferMemShare(_buf, tensor.get_buf()) > 0; + + CHECK_GE(tensor._buf->get_capacity(), _shape.count() * _type_len) << "capacity of input tensor should > current tensor"; + + _buf = tensor._buf; _is_subbuf = false; - _seq_offset = tensor.get_seq_offset(); + _seq_offset = tensor._seq_offset; + _is_shared = true; + //if(shared){ // _is_root = false; // tensor.add_events((EventsTree*)(&_events_tree)); @@ -572,44 +587,44 @@ class Tensor : public TensorBase { //} return SaberSuccess; } - std::vector get_seq_offset() const {return _seq_offset;} - SaberStatus set_seq_offset(std::vector seq_offset) {_seq_offset = seq_offset; return SaberSuccess;} - SaberStatus share_sub_buffer(const Tensor& tensor, \ - Shape valid_shape, Shape offset) { + + SaberStatus share_sub_buffer(const Tensor& tensor, Shape valid_shape, Shape offset) { //if (valid_shape.dims() != TensorAPI::layout_dims::value \ || offset.dims() != TensorAPI::layout_dims::value || \ !((offset + valid_shape) <= tensor.shape())) { \ return SaberInvalidValue; \ } - CHECK_EQ(valid_shape.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; - CHECK_EQ(offset.dims(), TensorAPI::layout_dims::value) << \ - "shape dims is not matched to layout type"; CHECK_EQ(true, (offset + valid_shape) <= 
tensor.shape()) << \ "offset + valid_shape <= shape"; _valid_shape = valid_shape; _offset = offset; _shape = tensor.shape(); - _buf = tensor.get_buf(); + _buf = tensor._buf; _is_subbuf = true; _is_shared = true; - _seq_offset = tensor.get_seq_offset(); + _seq_offset = tensor._seq_offset; return SaberSuccess; } /** * \brief Deep copy data within region of interest from input tensor. */ - template - SaberStatus copy_from(const Tensor& tensor) { + template + SaberStatus copy_from(const Tensor& tensor) { + //if (valid_size() != tensor.valid_size()) { \ return SaberInvalidValue; \ } + CHECK_EQ(tensor.get_dtype(), _dtype) << "data type should be the same"; CHECK_EQ(valid_size(), tensor.valid_size()) \ << "sizes of two valid shapes must be the same"; - + + if (_buf->get_capacity() == 0) { + reshape(_valid_shape); + } + /// get the proper process target wrapper typedef TargetWrapper API_t; typedef typename TargetTypeTraits::target_type target_type_t; @@ -618,17 +633,29 @@ class Tensor : public TensorBase { typedef typename IF::value, then_type, else_type>::Type flag_type; typedef typename IF::value, API_t, API>::Type process_API; + typedef typename DataTraitBase::PtrDtype BaseDtype_src; + + /// return if src and dst data ptrs are the same - if (data() == tensor.data()){ - return SaberSuccess; - } + /// FIXME weather true or false compare will be compiled +// if (std::is_same::value){ +// if (data() == tensor.data()) { +// return SaberSuccess; +// } +// } /// both tensors are continuous, copy entire buffer if (is_continue_mem() && tensor.is_continue_mem()) { - Dtype* ptr_dst = mutable_data(); - const Dtype* ptr_src = tensor.data(); - process_API::sync_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len() * valid_size(), flag_type()); + int dst_data_offset = data_offset(); + int src_data_offset = tensor.data_offset(); + + BaseDtype ptr_dst = _buf->get_data_mutable(); + const BaseDtype_src ptr_src = tensor.data(); + + process_API::sync_memcpy(ptr_dst, 
_type_len * dst_data_offset, device_id(), \ + ptr_src, _type_len * src_data_offset, tensor.device_id(), \ + _type_len * valid_size(), flag_type()); + return SaberSuccess; } @@ -722,8 +749,12 @@ class Tensor : public TensorBase { int ratio_dst = cpy_len_dst / cpy_len; int ratio_src = cpy_len_src / cpy_len; - Dtype* dst = mutable_data(); - const Dtype* src = tensor.data(); + + int dst_data_offset = data_offset(); + int src_data_offset = tensor.data_offset(); + + BaseDtype ptr_dst = _buf->get_data_mutable(); + const BaseDtype_src ptr_src = tensor.data(); for (int i = 0; i < cpy_num; ++i) { int idx_dst = (i % ratio_dst) * cpy_len;//off_dst[abs(axis_discontinue_dst)] * \ @@ -743,33 +774,32 @@ class Tensor : public TensorBase { res_src = res_src % count_src[j]; } //printf("i: %d, idx_src: %d, idx_dst: %d\n", i, idx_src, idx_dst); - Dtype* ptr_dst = dst + idx_dst;//_buf->get_data_mutable() + idx_dst; - const Dtype* ptr_src = src + idx_src;//tensor.get_buf()->get_data() + idx_src; - process_API::sync_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len() * cpy_len, flag_type()); + + int cpy_dst_offset = dst_data_offset + idx_dst; + int cpy_src_offset = src_data_offset + idx_src; + + process_API::sync_memcpy(ptr_dst, _type_len * cpy_dst_offset, device_id(), \ + ptr_src, _type_len * cpy_src_offset, tensor.device_id(), \ + _type_len * cpy_len, flag_type()); } return SaberSuccess; } -#ifdef USE_BM - template - SaberStatus copy_from(const Tensor& tensor) { - LOG(WARNING) << "Invalid: copy_from is not allowed for current type."; - return SaberInvalidValue; - } - -#endif - /** * \brief Asynchronously copy entire buffer from source tensor. 
*/ - template ::target_category, __host_target>::value, \ typename TargetWrapper::stream_t, typename TargetWrapper::stream_t>::Type> - SaberStatus async_copy_from(const Tensor& tensor, \ - stream_type stream) { - CHECK_EQ(valid_size() == tensor.valid_size(), true) \ - << "input tensor size should equal to this tensor size"; + SaberStatus async_copy_from(const Tensor& tensor, stream_type stream) { + + CHECK_EQ(tensor.get_dtype(), _dtype) << "data type should be the same"; + CHECK_EQ(valid_size(), tensor.valid_size()) \ + << "sizes of two valid shapes must be the same"; + + if (_buf->get_capacity() == 0) { + reshape(_valid_shape); + } /// get the proper process target wrapper typedef TargetWrapper API_t; @@ -779,17 +809,27 @@ class Tensor : public TensorBase { typedef typename IF::value, then_type, else_type>::Type flag_type; typedef typename IF::value, API_t, API>::Type process_API; + typedef typename DataTraitBase::PtrDtype BaseDtype_src; + /// return if src and dst data ptrs are the same - if (data() == tensor.data()){ - return SaberSuccess; + if (std::is_same::value){ + if ((const void*)data() == (const void*)(tensor.data())) { + return SaberSuccess; + } } /// both tensors are continuous, copy entire buffer if (is_continue_mem() && tensor.is_continue_mem()) { - Dtype* ptr_dst = mutable_data(); - const Dtype* ptr_src = tensor.data(); - process_API::async_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len() * valid_size(), stream, flag_type()); + int dst_data_offset = data_offset(); + int src_data_offset = tensor.data_offset(); + + BaseDtype ptr_dst = _buf->get_data_mutable(); + const BaseDtype_src ptr_src = tensor.data(); + + process_API::async_memcpy(ptr_dst, _type_len * dst_data_offset, device_id(), \ + ptr_src, _type_len * src_data_offset, tensor.device_id(), \ + _type_len * valid_size(), stream, flag_type()); + return SaberSuccess; } @@ -797,8 +837,8 @@ class Tensor : public TensorBase { Shape val_sh_dst = _valid_shape; Shape sh_src = 
tensor.shape(); Shape val_sh_src = tensor.valid_shape(); - Shape off_dst = _offset; - Shape off_src = tensor.offset(); + //Shape off_dst = _offset; + //Shape off_src = tensor.offset(); if (is_continue_mem()) { sh_dst = _valid_shape; @@ -831,8 +871,8 @@ class Tensor : public TensorBase { } //printf("dst axis=%d, src axis=%d\n", axis_discontinue_dst, axis_discontinue_src); - /// Only copy the region of interest. - /// Compute the copy length of each memcpy. + /// only copy the region of interest + /// compute the copy length of each memcpy int cpy_len_dst = 1; int cpy_len_src = 1; if (axis_discontinue_dst < 0){ @@ -852,11 +892,11 @@ class Tensor : public TensorBase { //printf("cpy_len_dst=%d, %d, cpy_len_src=%d, %d\n", cpy_len_dst, valid_size(), cpy_len_src, tensor.valid_size()); int cpy_len = cpy_len_dst < cpy_len_src? cpy_len_dst : cpy_len_src; - /// Compute the total copy times. + /// compute the total copy times int cpy_num = valid_size() / cpy_len; //printf("cpy_len=%d, cpy_num=%d\n", cpy_len, cpy_num); - /// Compute the stride and start index of dst buffer and src buffer. + /// compute the stride and start index of dst buffer and src buffer std::vector count_dst(abs(axis_discontinue_dst) + 1); std::vector count_src(abs(axis_discontinue_src) + 1); @@ -879,12 +919,16 @@ class Tensor : public TensorBase { } } - /// Compute the start position of each buffer, memcpy from src to dst. 
+ /// compute the start position of each buffer, memcpy from src to dst int ratio_dst = cpy_len_dst / cpy_len; int ratio_src = cpy_len_src / cpy_len; - Dtype* dst = mutable_data(); - const Dtype* src = tensor.data(); + + int dst_data_offset = data_offset(); + int src_data_offset = tensor.data_offset(); + + BaseDtype ptr_dst = _buf->get_data_mutable(); + const BaseDtype_src ptr_src = tensor.data(); for (int i = 0; i < cpy_num; ++i) { int idx_dst = (i % ratio_dst) * cpy_len;//off_dst[abs(axis_discontinue_dst)] * \ @@ -904,10 +948,13 @@ class Tensor : public TensorBase { res_src = res_src % count_src[j]; } //printf("i: %d, idx_src: %d, idx_dst: %d\n", i, idx_src, idx_dst); - Dtype* ptr_dst = dst + idx_dst;//_buf->get_data_mutable() + idx_dst; - const Dtype* ptr_src = src + idx_src;//tensor.get_buf()->get_data() + idx_src; - process_API::async_memcpy(ptr_dst, device_id(), ptr_src, tensor.device_id(), \ - _type_len() * cpy_len, stream, flag_type()); + + int cpy_dst_offset = dst_data_offset + idx_dst; + int cpy_src_offset = src_data_offset + idx_src; + + process_API::async_memcpy(ptr_dst, _type_len * cpy_dst_offset, device_id(), \ + ptr_src, _type_len * cpy_src_offset, tensor.device_id(), \ + _type_len * cpy_len, stream, flag_type()); } return SaberSuccess; } @@ -936,10 +983,13 @@ class Tensor : public TensorBase { private: + //! scale for quantization + std::vector _scale; + ///< Length of datatype. - size_t _type_len(){ - return sizeof(Dtype); - } + DataType _dtype; + size_t _type_len; + ///< Represent the raw mem shape. Shape _shape; ///< Represent the mem you have right to access shape. @@ -954,6 +1004,9 @@ class Tensor : public TensorBase { bool _is_subbuf{false}; bool _is_shared{false}; + //! lot tensor + std::vector> _seq_offset; + /// Get data real start index. 
int start_index() const { if (!_is_subbuf) { @@ -966,91 +1019,8 @@ class Tensor : public TensorBase { } return idx; } - - std::vector _seq_offset; }; -#ifdef USE_BM - -#ifndef BM_TENSOR_COPY -#define BM_TENSOR_COPY - - -template<> inline -size_t Tensor::_type_len(){ - return 4; -} - -template<> -template<> inline -SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "BM copy_from X86"; - CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; - - auto* device_data_ptr = mutable_data(); - BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); - return SaberSuccess; -} - -template<> -template<> inline -SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "X86 copy_from BM"; - CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; - - auto* device_data_ptr = const_cast(tensor.data()); - BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); - return SaberSuccess; -} - -/* - template<> inline - size_t Tensor::_type_len(){ - return 4; - } - - template<> - template<> inline - SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "BM copy_from X86"; - CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; - - auto* device_data_ptr = mutable_data(); - BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(const_cast(tensor.data())))); - //BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *(bm_device_mem_t *)(mutable_data()), bm_mem_from_system(tensor.data()))); - return SaberSuccess; - } - - template<> - template<> inline - SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "X86 copy_from BM"; - CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; - - auto* device_data_ptr = const_cast(tensor.data()); - 
BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); - //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data()))); - return SaberSuccess; - } - - template<> - template<> inline - SaberStatus Tensor::copy_from(const Tensor& tensor) { - LOG(INFO) << "BM copy_from BM"; - CHECK_EQ(valid_size(), tensor.valid_size()) << "sizes of two valid shapes must be the same"; - - auto* device_data_ptr = const_cast(tensor.data()); - //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *device_data_ptr)); - //BMDNN_CHECK(bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(mutable_data()), *(bm_device_mem_t *)(tensor.data()))); - return SaberSuccess; - } -*/ - -#endif - -#endif - - } //namespace saber } //namespace anakin diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index d7ee91231..7ba1832be 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -1,456 +1,315 @@ #include "tensor_op.h" -#include "anakin_config.h" -#include +#include namespace anakin { namespace saber { -template -void fill_tensor_host_const(Tensor_t& tensor, typename Tensor_t::Dtype value) { - - typedef typename Tensor_t::Dtype Dtype; - Dtype* data_ptr = static_cast(tensor.get_buf()->get_data_mutable()); - int size = tensor.size(); - - for (int i = 0; i < size; ++i) { - data_ptr[i] = value; +template +void fill_tensor_host_const_impl(Dtype* dio, Dtype value, long long size) { + for (long long i = 0; i < size; ++i) { + dio[i] = value; } } -template -void fill_tensor_host_rand(Tensor_t& tensor) { - typedef typename Tensor_t::Dtype Dtype; - Dtype* data_ptr = static_cast(tensor.get_buf()->get_data_mutable()); +template +void fill_tensor_const(Tensor& tensor, float value, typename Tensor::API::stream_t stream) { + + long long size = tensor.size(); + void* dio = tensor.mutable_data(); + DataType type = tensor.get_dtype(); + switch (type){ + case AK_UINT8: 
fill_tensor_host_const_impl((unsigned char*)dio, static_cast(value), size); break; + case AK_INT8: fill_tensor_host_const_impl((char*)dio, static_cast(value), size); break; + case AK_INT16: fill_tensor_host_const_impl((short*)dio, static_cast(value), size); break; + case AK_UINT16: fill_tensor_host_const_impl((unsigned short*)dio, static_cast(value), size); break; + case AK_HALF: fill_tensor_host_const_impl((short*)dio, static_cast(value), size); break; + case AK_UINT32: fill_tensor_host_const_impl((unsigned int*)dio, static_cast(value), size); break; + case AK_INT32: fill_tensor_host_const_impl((int*)dio, static_cast(value), size); break; + case AK_FLOAT: fill_tensor_host_const_impl((float*)dio, static_cast(value), size); break; + case AK_DOUBLE: fill_tensor_host_const_impl((double*)dio, static_cast(value), size); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; + } +} - for (int i = 0; i < tensor.size(); ++i) { - data_ptr[i] = static_cast(rand()); +template +void fill_tensor_host_rand_impl(Dtype* dio, long long size) { + for (long long i = 0; i < size; ++i) { + Dtype rand_x=static_cast(rand()%256); + dio[i] = (rand_x-128)/128; + } +} +template <> +void fill_tensor_host_rand_impl(char* dio, long long size) { + for (long long i = 0; i < size; ++i) { + dio[i] = rand()%256-128; + } +} +template <> +void fill_tensor_host_rand_impl(unsigned char* dio, long long size) { + for (long long i = 0; i < size; ++i) { + dio[i] = rand()%256; + } +} +template +void fill_tensor_host_seq_impl(Dtype* dio, long long size) { + for (long long i = 0; i < size; ++i) { + dio[i] = static_cast(i); } } -template -void fill_tensor_host_seq(Tensor_t& tensor) { - typedef typename Tensor_t::Dtype Dtype; - Dtype* data_ptr = static_cast(tensor.get_buf()->get_data_mutable()); +template +void fill_tensor_rand(Tensor& tensor, typename Tensor::API::stream_t stream) { + long long size = tensor.size(); + void* dio = tensor.mutable_data(); + DataType type = 
tensor.get_dtype(); + switch (type){ + case AK_UINT8: fill_tensor_host_rand_impl((unsigned char*)dio, size); break; + case AK_INT8: fill_tensor_host_rand_impl((char*)dio, size); break; + case AK_INT16: fill_tensor_host_rand_impl((short*)dio, size); break; + case AK_UINT16: fill_tensor_host_rand_impl((unsigned short*)dio, size); break; + case AK_UINT32: fill_tensor_host_rand_impl((unsigned int*)dio, size); break; + case AK_INT32: fill_tensor_host_rand_impl((int*)dio, size); break; + case AK_HALF: fill_tensor_host_rand_impl((short*)dio, size); break; + case AK_FLOAT: fill_tensor_host_rand_impl((float*)dio, size); break; + case AK_DOUBLE: fill_tensor_host_rand_impl((double*)dio, size); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; + } +} - for (int i = 0; i < tensor.size(); ++i) { - data_ptr[i] = static_cast(i); +template +void fill_tensor_seq(Tensor& tensor, typename Tensor::API::stream_t stream) { + long long size = tensor.size(); + void* dio = tensor.mutable_data(); + DataType type = tensor.get_dtype(); + switch (type){ + case AK_UINT8: fill_tensor_host_seq_impl((unsigned char*)dio, size); break; + case AK_INT8: fill_tensor_host_seq_impl((char*)dio, size); break; + case AK_INT16: fill_tensor_host_seq_impl((short*)dio, size); break; + case AK_UINT16: fill_tensor_host_seq_impl((unsigned short*)dio, size); break; + case AK_UINT32: fill_tensor_host_seq_impl((unsigned int*)dio, size); break; + case AK_INT32: fill_tensor_host_seq_impl((int*)dio, size); break; + case AK_HALF: fill_tensor_host_seq_impl((short*)dio, size); break; + case AK_FLOAT: fill_tensor_host_seq_impl((float*)dio, size); break; + case AK_DOUBLE: fill_tensor_host_seq_impl((double*)dio, size); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; } } -template -void fill_tensor_host_rand(Tensor_t& tensor, typename Tensor_t::Dtype vstart, \ - typename Tensor_t::Dtype vend) { - typedef typename Tensor_t::Dtype Dtype; - Dtype* data_ptr = 
static_cast(tensor.get_buf()->get_data_mutable()); +template +void fill_tensor_host_rand_impl2(Dtype* dio, Dtype vstart, Dtype vend, long long size) { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution dis(0, 1.f); - int size = tensor.size(); - for (int i = 0; i < size; ++i) { - Dtype random_num = vstart + (vend - vstart) * dis(gen); - data_ptr[i] = random_num; + for (long long i = 0; i < size; ++i) { + Dtype random_num = static_cast(vstart + (vend - vstart) * dis(gen)); + dio[i] = random_num; } } -template -void print_tensor_host(Tensor_t& tensor) { - - typedef typename Tensor_t::Dtype Dtype; - LOG(INFO) << "host tensor data:" << tensor.size(); - const Dtype* data_ptr = static_cast(tensor.get_buf()->get_data()); - int size = tensor.size(); - - for (int i = 0; i < size; ++i) { - printf("%.2f ", static_cast(data_ptr[i])); - - if ((i + 1) % tensor.width() == 0) { - printf("\n"); - } +template +void fill_tensor_rand(Tensor& tensor, float vstart, float vend, \ + typename Tensor::API::stream_t stream) { + + long long size = tensor.size(); + void* dio = tensor.mutable_data(); + DataType type = tensor.get_dtype(); + switch (type){ + case AK_UINT8: fill_tensor_host_rand_impl2((unsigned char*)dio, static_cast(vstart), + static_cast(vend), size); break; + case AK_INT8: fill_tensor_host_rand_impl2((char*)dio, static_cast(vstart), static_cast(vend), size); break; + case AK_INT16: fill_tensor_host_rand_impl2((short*)dio, static_cast(vstart), static_cast(vend), size); break; + case AK_UINT16: fill_tensor_host_rand_impl2((unsigned short*)dio, static_cast(vstart), + static_cast(vend), size); break; + case AK_UINT32: fill_tensor_host_rand_impl2((unsigned int*)dio, static_cast(vstart), + static_cast(vend), size); break; + case AK_INT32: fill_tensor_host_rand_impl2((int*)dio, static_cast(vstart), static_cast(vend), size); break; + case AK_HALF: fill_tensor_host_rand_impl2((short*)dio, static_cast(vstart), static_cast(vend), size); break; + case AK_FLOAT: 
fill_tensor_host_rand_impl2((float*)dio, static_cast(vstart), static_cast(vend), size); break; + case AK_DOUBLE: fill_tensor_host_rand_impl2((double*)dio, static_cast(vstart), static_cast(vend), size); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; } - - printf("\n"); } template -void tensor_cmp_host(const Dtype* src1, const Dtype* src2, \ - int size, double& max_ratio, double& max_diff) { - - const double eps = 1e-6f; - max_diff = fabs(src1[0] - src2[0]); - max_ratio = 2.0 * max_diff / (src1[0] + src2[0] + eps); - - for (int i = 1; i < size; ++i) { - double diff = fabs(src1[i] - src2[i]); - - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (src1[i] + src2[i] + eps); +void print_tensor_host_impl(const Dtype* din, long long size, int width) { + for (int i = 0; i < size; ++i) { + printf("%.6f ", static_cast(din[i])); + if ((i + 1) % width == 0) { + printf("\n"); } } + printf("\n"); } -#define FILL_TENSOR_HOST(target, type, layout) \ - template void fill_tensor_host_const>\ - (Tensor& tensor, DataTrait::dtype value); \ - template void fill_tensor_host_rand>\ - (Tensor& tensor); \ - template void fill_tensor_host_rand>\ - (Tensor& tensor, DataTrait::dtype vstart, \ - DataTrait::dtype vend); \ - template void print_tensor_host>\ - (Tensor& tensor);\ - template void fill_tensor_host_seq>\ - (Tensor& tensor); - - -FILL_TENSOR_HOST(X86, AK_FLOAT, NCHW); -FILL_TENSOR_HOST(X86, AK_FLOAT, NCHW_C16); -FILL_TENSOR_HOST(X86, AK_FLOAT, NCHW_C8); -FILL_TENSOR_HOST(X86, AK_FLOAT, NHWC); -FILL_TENSOR_HOST(X86, AK_FLOAT, NHW); -FILL_TENSOR_HOST(X86, AK_FLOAT, NW); -FILL_TENSOR_HOST(X86, AK_FLOAT, HW); -FILL_TENSOR_HOST(X86, AK_FLOAT, W); - -FILL_TENSOR_HOST(X86, AK_INT8, NCHW); -FILL_TENSOR_HOST(X86, AK_INT8, NHWC); -FILL_TENSOR_HOST(X86, AK_INT8, NHW); -FILL_TENSOR_HOST(X86, AK_INT8, NW); -FILL_TENSOR_HOST(X86, AK_INT8, HW); -FILL_TENSOR_HOST(X86, AK_INT8, W); - - -template void tensor_cmp_host(const float* src1, const float* 
src2, \ - int size, double& max_ratio, double& max_diff); -template void tensor_cmp_host(const char* src1, const char* src2, int size, \ - double& max_ratio, double& max_diff); - -template void fill_tensor_host_const>(Tensor& - tensor, char value); -template void fill_tensor_host_rand>(Tensor& - tensor); +template +void print_tensor(Tensor& tensor, typename Tensor::API::stream_t stream) { -template <> -void print_tensor_host>(Tensor& tensor) { - typedef typename Tensor::Dtype Dtype; LOG(INFO) << "host tensor data:" << tensor.size(); - const Dtype* data_ptr = tensor.get_buf()->get_data(); - int size = tensor.size(); - - for (int i = 0; i < size; ++i) { - printf("%.2f ", static_cast(data_ptr[i])); - - if ((i + 1) % (4 * tensor.width()) == 0) { - printf("\n"); - } + const void* data_ptr = tensor.data(); + long long size = tensor.size(); + int width = tensor.width(); + DataType type = tensor.get_dtype(); + switch(type) { + case AK_UINT8: print_tensor_host_impl((const unsigned char*)data_ptr, size, width); break; + case AK_INT8: print_tensor_host_impl((const char*)data_ptr, size, width); break; + case AK_UINT16: print_tensor_host_impl((const unsigned short*)data_ptr, size, width); break; + case AK_INT16: print_tensor_host_impl((const short*)data_ptr, size, width); break; + case AK_UINT32: print_tensor_host_impl((const unsigned int*)data_ptr, size, width); break; + case AK_INT32: print_tensor_host_impl((const int*)data_ptr, size, width); break; + case AK_FLOAT: print_tensor_host_impl((const float*)data_ptr, size, width); break; + case AK_DOUBLE: print_tensor_host_impl((const double*)data_ptr, size, width); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; } - printf("\n"); } -#ifdef USE_X86_PLACE -template <> -void reorder, Tensor>(Tensor& src, Tensor& dst) { - typedef typename Tensor::Dtype Dtype; - int blksize = 16; - const Dtype *src_data = src.data(); - Dtype *dst_data = dst.mutable_data(); - int width = src.width(); - int height = 
src.height(); - const int spatial_size = height * width; - auto ker = [&](const Dtype *i, Dtype *o) { - for (int w = 0; w < src.width(); ++w) { - for (int c = 0; c < blksize; ++c) { - const size_t nchw_off = c * spatial_size + w; - o[w * blksize + c] = i[nchw_off]; - } - } - }; - int num = dst.num(); - int channel = src.channel(); - int channel_blk = channel / blksize; -#pragma omp parallel for collapse(3) schedule(static) - for (int n = 0; n < num; ++n) { - for (int C = 0; C < channel_blk; ++C) { - for (int h = 0; h < height; ++h) { - int input_offset = ((n * channel + blksize * C) * height + h) * width; - int output_offset = ((n * channel_blk + C) * height + h) * blksize * width; - auto i = &src_data[input_offset]; - auto o = &dst_data[output_offset]; - ker(i, o); - } - } - } - return; -} -template <> -void reorder, Tensor>(Tensor& src, Tensor& dst) { - typedef typename Tensor::Dtype Dtype; - int blksize = 16; - const Dtype *src_data = src.data(); - Dtype *dst_data = dst.mutable_data(); - int width = dst.width(); - int height = dst.height(); - const int spatial_size = height * width; - auto ker = [&](const Dtype *i, Dtype *o) { - for (int w = 0; w < width; ++w) { - for (int c = 0; c < blksize; ++c) { - const size_t nchw_off = c * spatial_size + w; - o[nchw_off] = i[w * blksize + c]; - } - } - }; - int num = dst.num(); - int channel = dst.channel(); - int channel_blk = channel / blksize; -#pragma omp parallel for collapse(3) schedule(static) - for (int n = 0; n < num; ++n) { - for (int C = 0; C < channel_blk; ++C) { - for (int h = 0; h < height; ++h) { - int input_offset = ((n * channel_blk + C) * height + h) * blksize * width; - int output_offset = ((n * channel + blksize * C) * height + h) * width; - auto i = &src_data[input_offset]; - auto o = &dst_data[output_offset]; - ker(i, o); - } - } - } - return; -} -template <> -void reorder, Tensor>(Tensor& src, Tensor& dst) { - typedef typename Tensor::Dtype Dtype; - int blksize = 8; - const Dtype *src_data = 
src.data(); - Dtype *dst_data = dst.mutable_data(); - int width = dst.width(); - int height = dst.height(); - const int spatial_size = height * width; - auto ker = [&](const Dtype *i, Dtype *o) { - for (int w = 0; w < width; ++w) { - for (int c = 0; c < blksize; ++c) { - const size_t nchw_off = c * spatial_size + w; - o[nchw_off] = i[w * blksize + c]; - } - } - }; - int num = dst.num(); - int channel = dst.channel(); - int channel_blk = channel / blksize; -#pragma omp parallel for collapse(3) schedule(static) - for (int n = 0; n < num; ++n) { - for (int C = 0; C < channel_blk; ++C) { - for (int h = 0; h < height; ++h) { - int input_offset = ((n * channel_blk + C) * height + h) * blksize * width; - int output_offset = ((n * channel + blksize * C) * height + h) * width; - auto i = &src_data[input_offset]; - auto o = &dst_data[output_offset]; - ker(i, o); - } - } - } - return; + +template +void print_tensor_device(Tensor& tensor, typename Tensor::API::stream_t stream){ + CHECK(false)<<"not imply print_tensor_device"; } -#endif -#ifdef USE_ARM_PLACE -FILL_TENSOR_HOST(ARM, AK_FLOAT, NCHW); -FILL_TENSOR_HOST(ARM, AK_FLOAT, NHWC); -FILL_TENSOR_HOST(ARM, AK_FLOAT, NHW); -FILL_TENSOR_HOST(ARM, AK_FLOAT, NW); -FILL_TENSOR_HOST(ARM, AK_FLOAT, HW); -FILL_TENSOR_HOST(ARM, AK_FLOAT, W); - -FILL_TENSOR_HOST(ARM, AK_INT8, NCHW); -FILL_TENSOR_HOST(ARM, AK_INT8, NHWC); -FILL_TENSOR_HOST(ARM, AK_INT8, NHW); -FILL_TENSOR_HOST(ARM, AK_INT8, NW); -FILL_TENSOR_HOST(ARM, AK_INT8, HW); -FILL_TENSOR_HOST(ARM, AK_INT8, W); -#endif -#ifdef USE_CUDA -template<> -SaberStatus -DataTensorTransformHelper::convert_weights, - Tensor >(Tensor& out_tensor, - const Tensor& in_tensor, -Context ctx) { - int input_channel = in_tensor.channel(); - int output_channel = out_tensor.shape()[1]; - // LOG(INFO)<<"input_channel = "< tvalid(tensor.valid_shape()); + tvalid.copy_from(tensor); + print_tensor(tvalid, stream); } - return SaberSuccess; } -template<> -SaberStatus -DataTensorTransformHelper::convert_bias, 
- Tensor >(Tensor& out_tensor, - const Tensor& in_tensor, -Context ctx) { - unsigned long weight_size = _vector_weight_scale.size(); - unsigned long bias_size = in_tensor.size(); - CHECK_GT(_in_scale, 0); - CHECK_GT(weight_size, 0); - CHECK_EQ(bias_size, weight_size); - - const float* in_data = in_tensor.data(); - float* out_data = out_tensor.mutable_data(); - - for (int i = 0; i < bias_size; ++i) { - out_data[i] = in_data[i] / _in_scale / _vector_weight_scale[i]; - } - - return SaberSuccess; -} -#endif +template +void tensor_cmp_host(const Dtype* src1, const Dtype* src2, \ + int size, double& max_ratio, double& max_diff) { -#ifdef USE_BM + const double eps = 1e-6f; + max_diff = fabs(src1[0] - src2[0]); + max_ratio = fabs(2.0 * max_diff / (src1[0] + src2[0] + eps)); -template<> -void fill_tensor_device_rand>(Tensor& tensor, \ - typename Tensor::API::stream_t stream) { + for (int i = 1; i < size; ++i) { + double diff = fabs(src1[i] - src2[i]); - float *host_mem_input = new float[tensor.size()]; - for (int i = 0; i < tensor.size(); ++i) { - host_mem_input[i] = static_cast(rand()); + if (max_diff < diff) { + max_diff = diff; + max_ratio = fabs(2.0 * max_diff / (src1[i] + src2[i] + eps)); + //LOG(INFO) << "compare two src1: "<< src1[i] << " src2: "<< src2[i] << "i = "<< i << " max_ratio: " << max_ratio ; + } } - - bm_device_mem_t* device_data_ptr = tensor.mutable_data(); - BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input))); - - delete [] host_mem_input; } -void fill_tensor_device_rand(Tensor& tensor, float vstart, \ - float vend, typename Tensor::API::stream_t stream = NULL){ - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(0, 1.f); - - float *host_mem_input = new float[tensor.size()]; - for (int i = 0; i < tensor.size(); ++i) { - float random_num = vstart + (vend - vstart) * dis(gen); - host_mem_input[i] = random_num; +template +double tensor_mean_value_host_impl(const Dtype* din, 
long long size) { + double sum = 0.0; + for (long long i = 0; i < size; ++i) { + sum += din[i]; } - - bm_device_mem_t* device_data_ptr = tensor.mutable_data(); - BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input))); - - delete [] host_mem_input; + return sum / size; } -void fill_tensor_device_const(Tensor& tensor, float value, \ - typename Tensor::API::stream_t stream = NULL){ - - float *host_mem_input = new float[tensor.size()]; - for (int i = 0; i < tensor.size(); ++i) { - host_mem_input[i] = value; +template +double tensor_mean_value(Tensor& tensor, typename Tensor::API::stream_t stream) { + + const void* data_ptr = tensor.data(); + long long size = tensor.size(); + DataType type = tensor.get_dtype(); + switch (type) { + case AK_UINT8: return tensor_mean_value_host_impl((const unsigned char*)data_ptr, size); + case AK_INT8: return tensor_mean_value_host_impl((const char*)data_ptr, size); + case AK_UINT16: return tensor_mean_value_host_impl((const unsigned short*)data_ptr, size); + case AK_INT16: return tensor_mean_value_host_impl((const short*)data_ptr, size); + case AK_UINT32: return tensor_mean_value_host_impl((const unsigned int*)data_ptr, size); + case AK_INT32: return tensor_mean_value_host_impl((const int*)data_ptr, size); + case AK_FLOAT: return tensor_mean_value_host_impl((const float*)data_ptr, size); + case AK_DOUBLE: return tensor_mean_value_host_impl((const double*)data_ptr, size); + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; } - - bm_device_mem_t* device_data_ptr = tensor.mutable_data(); - BMDNN_CHECK(bm_memcpy_s2d(get_bm_handle(), *device_data_ptr, bm_mem_from_system(host_mem_input))); - - delete [] host_mem_input; + return 0.0; } -template <> -void print_tensor_device>(Tensor& tensor, \ - typename Tensor::API::stream_t stream) { - - LOG(INFO) << "BM device tensor data:" << tensor.size(); +template +double tensor_mean_value_valid(Tensor& tensor, typename Tensor::API::stream_t 
stream) { + + const void* data_ptr = (const void*)((const char*)tensor.data() + tensor.data_offset() * type_length(tensor.get_dtype())); + long long size = tensor.valid_size(); + DataType type = tensor.get_dtype(); + + if (tensor.is_continue_mem()) { + switch (type) { + case AK_UINT8: return tensor_mean_value_host_impl((const unsigned char*)data_ptr, size); + case AK_INT8: return tensor_mean_value_host_impl((const char*)data_ptr, size); + case AK_UINT16: return tensor_mean_value_host_impl((const unsigned short*)data_ptr, size); + case AK_INT16: return tensor_mean_value_host_impl((const short*)data_ptr, size); + case AK_UINT32: return tensor_mean_value_host_impl((const unsigned int*)data_ptr, size); + case AK_INT32: return tensor_mean_value_host_impl((const int*)data_ptr, size); + case AK_FLOAT: return tensor_mean_value_host_impl((const float*)data_ptr, size); + case AK_DOUBLE: return tensor_mean_value_host_impl((const double*)data_ptr, size); + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; + } + } else { + Tensor tvalid(tensor.valid_shape()); + tvalid.copy_from(tensor); + return tensor_mean_value(tvalid, stream); + } - /* - const bm_device_mem_t* device_data_ptr = tensor.data(); - unsigned long long gaddr = bm_mem_get_device_addr(*device_data_ptr); - bm_flush(get_bm_handle()); - float* device_data = (float*)bm_get_global_addr(gaddr); + return 0.0; +} - for (int i = 0; i < tensor.size(); ++i) { - printf("%.2f ", device_data[i]); - if ((i + 1) % (4 * tensor.width()) == 0) { - printf("\n"); - } - }*/ +#define FILL_TENSOR_HOST(target) \ + template void fill_tensor_const(Tensor& tensor, float value, typename Tensor::API::stream_t stream); \ + template void fill_tensor_seq(Tensor& tensor, typename Tensor::API::stream_t stream); \ + template void fill_tensor_rand(Tensor& tensor, typename Tensor::API::stream_t stream); \ + template void fill_tensor_rand(Tensor& tensor, float vstart, float vend, typename Tensor::API::stream_t stream); \ + template 
void print_tensor(Tensor& tensor, typename Tensor::API::stream_t stream); \ + template void print_tensor_valid(Tensor& tensor, typename Tensor::API::stream_t stream); \ + template double tensor_mean_value(Tensor& tensor, typename Tensor::API::stream_t stream); \ + template double tensor_mean_value_valid(Tensor& tensor, typename Tensor::API::stream_t stream); - float *host_mem = new float[tensor.size()]; - auto* device_data_ptr = const_cast(tensor.data()); - bm_memcpy_d2s(get_bm_handle(), bm_mem_from_system(host_mem), *device_data_ptr); +#if defined(BUILD_LITE) || defined(USE_X86_PLACE) || defined(AMD_GPU) || defined(USE_CUDA) ||defined(USE_BM_PLACE) +FILL_TENSOR_HOST(X86) +#endif - for (int i = 0; i < tensor.size(); ++i) { - printf("%.2f\t", host_mem[i]); +#ifdef USE_CUDA +FILL_TENSOR_HOST(NVHX86) +#endif - if ((i + 1) % tensor.width() == 0){ - printf("\n"); - } - } - printf("\n"); +#ifdef USE_ARM_PLACE +FILL_TENSOR_HOST(ARM) +#endif - delete [] host_mem; -} +#ifdef USE_BM_PLACE #endif +template void tensor_cmp_host(const float* src1, const float* src2, \ + int size, double& max_ratio, double& max_diff); +template void tensor_cmp_host(const int* src1, const int* src2, \ + int size, double& max_ratio, double& max_diff); +template void tensor_cmp_host(const char* src1, const char* src2, int size, \ + double& max_ratio, double& max_diff); + } //namespace saber } //namespace anakin diff --git a/saber/core/tensor_op.h b/saber/core/tensor_op.h index c4d7a7661..ba76ed501 100644 --- a/saber/core/tensor_op.h +++ b/saber/core/tensor_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,72 +35,61 @@ template void reorder(Tensor_s& src, Tensor_d& dst); /** - * \brief Fill the host tensor buffer with rand value. 
+ * \brief Fill the tensor buffer with rand value. * \param tensor The reference of input tensor. */ -template -void fill_tensor_host_const(Tensor_t& tensor, \ - typename Tensor_t::Dtype value); +template +void fill_tensor_const(Tensor& tensor, float value, typename Tensor::API::stream_t stream = NULL); +/** + * \brief Fill the tensor buffer with rand value. + * \param The reference of input tensor. + */ +template +void fill_tensor_seq(Tensor& tensor, typename Tensor::API::stream_t stream = NULL); /** - * \brief Fill the host tensor buffer with rand value. + * \brief Fill the tensor buffer with rand value. * \param The reference of input tensor. */ -template -void fill_tensor_host_rand(Tensor_t& tensor); +template +void fill_tensor_rand(Tensor& tensor, typename Tensor::API::stream_t stream = NULL); /** - * \brief Fill the host tensor buffer with rand value from vstart to vend. + * \brief Fill the tensor buffer with rand value from vstart to vend. * \param tensor The reference of input tensor. */ -template -void fill_tensor_host_rand(Tensor_t& tensor, typename Tensor_t::Dtype vstart, \ - typename Tensor_t::Dtype vend); +template +void fill_tensor_rand(Tensor& tensor, float vstart, float vend, typename Tensor::API::stream_t stream = NULL); /** -* \brief fill_tensor_host_seq fill the host tensor buffer with sequence value -* \param tensor input tensor reference -*/ -template -void fill_tensor_host_seq(Tensor_t& tensor); - -/** - * \brief Fill the device tensor buffer with value. - * \param tensor The reference of input tensor. - * \param value Input value. + * \brief Print the data in host tensor. + * \param tensor The reference of input tensor. */ -template -void fill_tensor_device_const(Tensor_t& tensor, \ - typename Tensor_t::Dtype value, \ - typename Tensor_t::API::stream_t stream = NULL); +template +void print_tensor(Tensor& tensor, typename Tensor::API::stream_t stream = NULL); /** - * \brief Fill the device tensor buffer with rand value. 
- * \param tensor The reference of input tensor. + * \brief Print the valid data in host tensor. + * \param tensor The reference of input tensor. */ -template -void fill_tensor_device_rand(Tensor_t& tensor, \ - typename Tensor_t::API::stream_t stream = NULL); +template +void print_tensor_valid(Tensor& tensor, typename Tensor::API::stream_t stream = NULL); -template -void fill_tensor_device_rand(Tensor_t& tensor, typename Tensor_t::Dtype vstart, \ - typename Tensor_t::Dtype vend, typename Tensor_t::API::stream_t stream = NULL); /** - * \brief Print the data in host tensor. + * \brief compute mean value of the valid data in device tensor. * \param tensor The reference of input tensor. */ -template -void print_tensor_host(Tensor_t& tensor); +template +double tensor_mean_value(Tensor& tensor, typename Tensor::API::stream_t stream = NULL); /** - * \brief Print the data in device tensor. + * \brief compute mean value of the valid data in device tensor. * \param tensor The reference of input tensor. */ -template -void print_tensor_device(Tensor_t& tensor, \ - typename Tensor_t::API::stream_t stream = NULL); +template +double tensor_mean_value_valid(Tensor& tensor, typename Tensor::API::stream_t stream = NULL); template void tensor_cmp_host(const Dtype* src1, const Dtype* src2, int size, double& max_ratio, double& max_diff); @@ -171,16 +160,6 @@ class DataTensorTransformHelper{ #endif -#ifdef USE_BM - -void fill_tensor_device_const(Tensor& tensor, float value, \ - typename Tensor::API::stream_t stream = NULL); - -void fill_tensor_device_rand(Tensor& tensor, float vstart, \ - float vend, typename Tensor::API::stream_t stream = NULL); - -#endif - } // namespace saber } // namespace anakin diff --git a/saber/core/tensor_traits.h b/saber/core/tensor_traits.h deleted file mode 100644 index 33aaf526e..000000000 --- a/saber/core/tensor_traits.h +++ /dev/null @@ -1,351 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_CORE_TENSOR_TRAITS_H -#define ANAKIN_SABER_CORE_TENSOR_TRAITS_H -#include "core/buffer.h" -#include "core/data_traits.h" - -namespace anakin{ - -namespace saber{ - -template -class Tensor; - -template -struct TensorTraits { - typedef typename TensorT::target_category target_category; - typedef typename TensorT::target_type target_type; - typedef typename TensorT::layout_category layout_category; - typedef typename TensorT::layout_type layout_type; - using layout_dims = std::integral_constant; -}; - -// NCHW_C16, the last dim is always 16 -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef typename DataTrait::dtype Dtype; - typedef _5D layout_category; - typedef NCHW_C16 layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - using k_idx = std::integral_constant; - static int num(const Shape& shape) { - return shape[0]; - } - static int channel(const Shape& shape) { - return shape[1] * 16; - } - static int height(const Shape& shape) { - return shape[2]; - } - static int width(const Shape& shape) { - return shape[3]; - } - static int depth(const Shape& shape) { - return shape[4]; - } -}; - -// NCHW_C8, the last dim is always 8 
-template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef typename DataTrait::dtype Dtype; - typedef _5D layout_category; - typedef NCHW_C8 layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - using k_idx = std::integral_constant; - static int num(const Shape& shape) { - return shape[0]; - } - static int channel(const Shape& shape) { - return shape[1] * 8; - } - static int height(const Shape& shape) { - return shape[2]; - } - static int width(const Shape& shape) { - return shape[3]; - } - static int depth(const Shape& shape) { - return shape[4]; - } -}; - -// NCHW_C4, the last dim is always 4 -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef typename DataTrait::dtype Dtype; - typedef _5D layout_category; - typedef NCHW_C4 layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - using k_idx = std::integral_constant; - static int num(const Shape& shape) { - return shape[0]; - } - static int channel(const Shape& shape) { - return shape[1] * 4; - } - static int height(const Shape& shape) { - return shape[2]; - } - static int width(const Shape& shape) { - return shape[3]; - } - static int depth(const Shape& shape) { - return shape[4]; - } -}; - -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef typename DataTrait::dtype Dtype; - typedef _4D layout_category; - typedef NCHW layout_type; - using layout_dims 
= std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - static int num(const Shape& shape){ - return shape[0]; - } - static int channel(const Shape& shape){ - return shape[1]; - } - static int height(const Shape& shape){ - return shape[2]; - } - static int width(const Shape& shape){ - return shape[3]; - } -}; - -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef _4D layout_category; - typedef NHWC layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - static int num(const Shape& shape){ - return shape[0]; - } - static int channel(const Shape& shape){ - return shape[3]; - } - static int height(const Shape& shape){ - return shape[1]; - } - static int width(const Shape& shape){ - return shape[2]; - } -}; - -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef _3D layout_category; - typedef NHW layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - static int num(const Shape& shape){ - return shape[0]; - } - static int channel(const Shape& shape){ - return 1; - } - static int height(const Shape& shape){ - return shape[1]; - } - static int width(const Shape& shape){ - return shape[2]; - } -}; - -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef _2D 
layout_category; - typedef NW layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - static int num(const Shape& shape){ - return shape[0]; - } - static int channel(const Shape& shape){ - return 1; - } - static int height(const Shape& shape){ - return 1; - } - static int width(const Shape& shape){ - return shape[2]; - } -}; - -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef _2D layout_category; - typedef HW layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - static int num(const Shape& shape){ - return 1; - } - static int channel(const Shape& shape){ - return 1; - } - static int height(const Shape& shape){ - return shape[0]; - } - static int width(const Shape& shape){ - return shape[1]; - } -}; - -template -struct TensorTraits> -{ - typedef typename Tensor::target_category target_category; - typedef typename Tensor::target_type target_type; - typedef _1D layout_category; - typedef HW layout_type; - using layout_dims = std::integral_constant; - using num_idx = std::integral_constant; - using channel_idx = std::integral_constant; - using height_idx = std::integral_constant; - using width_idx = std::integral_constant; - static int num(const Shape& shape){ - return 1; - } - static int channel(const Shape& shape){ - return 1; - } - static int height(const Shape& shape){ - return 1; - } - static int width(const Shape& shape){ - return shape[0]; - } -}; - -template -static inline int MemShare(std::shared_ptr>& dst, \ - const std::shared_ptr>& src, __DtoD) { - //LOG(INFO) << "shared D2D"; - 
if(dst->get_id() == src->get_id()){ - dst = src; - return 1; - } - //LOG(INFO) << "copied D2D"; - SABER_CHECK(dst->re_alloc(src->get_count())); - SABER_CHECK(dst->sync_copy_from(*src)); - return 0; -} - -template -static inline int MemShare(std::shared_ptr>& dst, \ - const std::shared_ptr>& src, __HtoD) { - //LOG(INFO) << "copied H2D"; - SABER_CHECK(dst->re_alloc(src->get_count())); - SABER_CHECK(dst->sync_copy_from(*src)); - return 0; -} - -template -static inline int MemShare(std::shared_ptr>& dst, \ - const std::shared_ptr>& src, __HtoH) { - //LOG(INFO) << "shared H2H"; - dst = src; - return 1; -} - -template -static inline int MemShare(std::shared_ptr>& dst, \ - const std::shared_ptr>& src, __DtoH) { - //LOG(INFO) << "copied D2H"; - SABER_CHECK(dst->re_alloc(src->get_count())); - SABER_CHECK(dst->sync_copy_from(*src)); - return 0; -} - -template -static inline int BufferMemShare(std::shared_ptr>& dst, \ - const std::shared_ptr>& src){ - - typedef typename TargetTypeTraits::target_type target_type_dst; - typedef typename TargetTypeTraits::target_type target_type_src; - typedef typename TargetTypeTraits::target_category target_category_dst; - - typedef typename IF::value, __HtoH, __DtoH>::Type then_type; - typedef typename IF::value, __DtoD, __HtoD>::Type else_type; - typedef typename IF::value, then_type, else_type>::Type flag_type; - CHECK_EQ(src == nullptr, false) << "input buffer is null!"; - if (!dst){ - dst = std::make_shared>(src->get_count()); - } - return MemShare(dst, src, flag_type()); -} - -} //names - -} //namespace anakin - -#endif //ANAKIN_SABER_CORE_TENSOR_TRAITS_H diff --git a/saber/funcs/CMakeLists.txt b/saber/funcs/CMakeLists.txt index bdd319f13..18dbe49ea 100644 --- a/saber/funcs/CMakeLists.txt +++ b/saber/funcs/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + aux_source_directory(. DIR_BASE_SRCS) aux_source_directory(impl DIR_BASE_SRCS_IMPL) if(USE_ARM) @@ -7,18 +21,16 @@ if(USE_X86) aux_source_directory(impl/x86 X86_BASE_SRCS) endif() if(USE_CUDA) - #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu") aux_source_directory(impl/cuda CUDA_BASE_SRCS) endif() -if(USE_BM) - #FILE(GLOB BM_BASE_SRCS "cuda/*.cpp" "cuda/*.cu") - aux_source_directory(impl/bm BM_BASE_SRCS) -endif() -if(USE_AMD) - #FILE(GLOB CUDA_BASE_SRCS "cuda/*.cpp" "cuda/*.cu") +if(AMD_GPU) aux_source_directory(impl/amd AMD_BASE_SRCS) endif() +if(USE_BM_PLACE) + aux_source_directory(impl/bm BM_BASE_SRCS) +endif() + set(DIR_SRCS_CUR "") foreach(SRC_NAME ${DIR_BASE_SRCS}) #unpack the dir "/" @@ -52,7 +64,7 @@ foreach(SRC_NAME ${CUDA_BASE_SRCS}) list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}") endforeach() -foreach(SRC_NAME ${BM_BASE_SRCS}) +foreach(SRC_NAME ${X86_BASE_SRCS}) #unpack the dir "/" string(REPLACE "./" "" FILE_NAME ${SRC_NAME}) string(REPLACE " " "" FILE_NAME ${FILE_NAME}) @@ -60,7 +72,8 @@ foreach(SRC_NAME ${BM_BASE_SRCS}) list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}") endforeach() -foreach(SRC_NAME ${X86_BASE_SRCS}) + +foreach(SRC_NAME ${AMD_BASE_SRCS}) #unpack the dir "/" string(REPLACE "./" "" FILE_NAME ${SRC_NAME}) string(REPLACE " " "" FILE_NAME ${FILE_NAME}) @@ -68,7 +81,7 @@ foreach(SRC_NAME ${X86_BASE_SRCS}) list(APPEND DIR_SRCS_CUR "${CMAKE_CURRENT_SOURCE_DIR}/${FILE_NAME}") endforeach() -foreach(SRC_NAME ${AMD_BASE_SRCS}) +foreach(SRC_NAME ${BM_BASE_SRCS}) #unpack the dir "/" 
string(REPLACE "./" "" FILE_NAME ${SRC_NAME}) string(REPLACE " " "" FILE_NAME ${FILE_NAME}) diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h index 7af7a6f80..f37a04bac 100644 --- a/saber/funcs/activation.h +++ b/saber/funcs/activation.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,42 +29,41 @@ #include "saber/funcs/impl/x86/saber_activation.h" #endif -#ifdef USE_BM -#include "saber/funcs/impl/bm/vender_activation.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/saber_activation.h" +#endif + +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_activation.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_activation.h" #endif namespace anakin { namespace saber { template + DataType OpDtype> class Activation : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - ActivationParam -> { + ActivationParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ActivationParam>::BaseFunc; Activation() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ActivationParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ActivationParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -73,6 +72,7 @@ class Activation : public BaseFunc< Output_v &output, Param_t ¶m) override { Shape output_shape = (input[0]->valid_shape()); + output[0]->set_seq_offset(input[0]->get_seq_offset()); return output[0]->set_shape(output_shape); } @@ -80,15 +80,11 @@ class Activation : public BaseFunc< switch (implenum) { case VENDER_IMPL: //this->_impl.push_back(new VenderActivation 
_impl.push_back(new VenderActivation ); + this->_impl.push_back(new VenderActivation ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberActivation ); + this->_impl.push_back(new SaberActivation ); return SaberSuccess; default: @@ -99,8 +95,11 @@ class Activation : public BaseFunc< private: virtual void pick_best_static() override { - if (true) // some condition? + if (this->_param.active == Active_prelu) { + this->_best_impl = this->_impl[1]; + } else { this->_best_impl = this->_impl[0]; + } } virtual void pick_best_specify(ImplEnum implenum) override { diff --git a/saber/funcs/argmax.h b/saber/funcs/argmax.h index b258011f9..221046989 100644 --- a/saber/funcs/argmax.h +++ b/saber/funcs/argmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,12 +17,18 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_argmax.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_argmax.h" #endif #ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_argmax.h" +#endif + +#ifdef USE_ARM_PLACE +//todo #include "saber/funcs/impl/impl_argmax.h" #endif @@ -30,34 +36,25 @@ namespace anakin { namespace saber { template + DataType OpDtype> class Argmax : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - ArgmaxParam -> { + ArgmaxParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ArgmaxParam>::BaseFunc; Argmax() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ArgmaxParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ArgmaxParam Param_t; typedef std::vector Input_v; typedef 
std::vector Output_v; typedef std::vector Shape_v; @@ -67,11 +64,22 @@ class Argmax : public BaseFunc< //! support inplace computation, output shape = input shape - int num_top_axes = input[0]->dims(); - Shape output_shape = Shape::zero(num_top_axes); - for (int i = 0; i < num_top_axes; ++i) { - output_shape[i] = 1; + int top_k = param.top_k; + bool out_max_val = param.out_max_val; + bool has_axis = param.has_axis; + int axis = param.axis; + CHECK_GE(top_k, 1) << "top k must not less than 1."; + if(has_axis){ + CHECK_GE(axis, 0) << "axis must not less than 0."; + CHECK_LE(axis, input[0]->dims()) << "axis must be less than or equal to the number od dims."; + CHECK_LE(top_k, input[0]->valid_shape()[axis]) << "top_k must be less than or equal to the dimension of the axis."; + } else{ + CHECK_LE(top_k, input[0]->count(1, input[0]->dims())) << "top_k must be less than or equal to the dimension of input."; } + //int num_top_axes = input[0]->dims(); + // if(num_top_axes < 3) num_top_axes = 3; + Shape output_shape({1, 1, 1, 1}, Layout_NCHW); + //Shape output_shape = Shape::zero(num_top_axes); if (param.has_axis) { output_shape = input[0]->valid_shape(); output_shape[param.axis] = param.top_k; @@ -90,14 +98,12 @@ class Argmax : public BaseFunc< switch (implenum) { case VENDER_IMPL: this->_impl.push_back(new VenderArgmax ); + OpDtype>); return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberArgmax ); + OpDtype>); return SaberSuccess; default: @@ -122,4 +128,4 @@ class Argmax : public BaseFunc< } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/axpy.h b/saber/funcs/axpy.h index 610b8b8ad..126f7b41c 100644 --- a/saber/funcs/axpy.h +++ b/saber/funcs/axpy.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,12 +17,17 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" - +#include "saber/funcs/impl/impl_axpy.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_axpy.h" #endif #ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_axpy.h" +#endif + +#ifdef USE_ARM_PLACE +//todo #include "saber/funcs/impl/impl_axpy.h" #endif @@ -30,34 +35,25 @@ namespace anakin { namespace saber { template + DataType OpDtype> class Axpy : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - AxpyParam -> { + AxpyParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, AxpyParam>::BaseFunc; Axpy() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef AxpyParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef AxpyParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -73,14 +69,12 @@ class Axpy : public BaseFunc< switch (implenum) { case VENDER_IMPL: this->_impl.push_back(new VenderAxpy ); + OpDtype>); return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberAxpy ); + OpDtype>); return SaberSuccess; default: @@ -89,7 +83,7 @@ class Axpy : public BaseFunc< } private: - + virtual void pick_best_static() override { if (true) // some condition? this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/base.h b/saber/funcs/base.h index 144de4baf..13238304c 100644 --- a/saber/funcs/base.h +++ b/saber/funcs/base.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,17 +26,16 @@ namespace anakin { namespace saber { -template class Impl, - template class Param - > +template class Impl, + template class Param > class BaseFunc { public: - typedef typename inTensor::targetType_t targetType_t; - typedef Param Param_t; - typedef Impl Impl_t; - typedef std::vector Input_v; - typedef std::vector Output_v; + typedef Param Param_t; + typedef Impl Impl_t; + typedef std::vector*> Input_v; + typedef std::vector*> Output_v; typedef std::vector Shape_v; BaseFunc() {} @@ -58,15 +57,17 @@ class BaseFunc { Param_t& param) = 0; //TODO:create may lead to leak virtual SaberStatus reset_output_shape(const Input_v& input, Output_v& output, \ - Param_t& param, Context &ctx) { + Param_t& param, Context &ctx) { compute_output_shape(input, output, param); for (int i = 0; i < output.size(); ++i) { output[i]->reshape(output[i]->valid_shape()); } for (auto imp : this->_impl) { - SaberStatus status = imp->create(input, output, param, ctx); - if (status != SaberSuccess) { - return status; + if (imp) { + SaberStatus status = imp->create(input, output, param, ctx); + if (status != SaberSuccess) { + return status; + } } } return SaberSuccess; @@ -75,7 +76,7 @@ class BaseFunc { virtual SaberStatus init_impl(ImplEnum implenum) = 0; virtual SaberStatus init(const Input_v& input, Output_v& output, Param_t& param, - SaberImplStrategy strategy, ImplEnum implenum, Context &ctx) { + SaberImplStrategy strategy, ImplEnum implenum, Context &ctx) { this->_param = param; this->_last_input_shape = input[0]->valid_shape(); @@ -123,7 +124,7 @@ class BaseFunc { } virtual SaberStatus operator()(const Input_v& input, Output_v& output, Param_t& param, \ - Context &ctx) { + Context &ctx) { if ((_param == param) && (input[0]->valid_shape() == this->_last_input_shape)) { return _best_impl->dispatch(input, output, param); @@ -147,7 +148,7 @@ class BaseFunc { void pick_best(const Input_v input, Output_v output, \ Param_t& param, SaberImplStrategy strategy, ImplEnum implenum, \ - 
Context &ctx) { + Context &ctx) { switch(_strategy) { case STATIC: pick_best_static(); @@ -169,8 +170,8 @@ class BaseFunc { //typedef std::unordered_map static_map; virtual void pick_best_static() = 0; - virtual void pick_best_runtime(const Input_v input, Output_v output, Param_t& param, \ - Context &ctx) { + virtual void pick_best_runtime(const Input_v& input, Output_v& output, Param_t& param, \ + Context &ctx) { float time_cost = 99999.f; int idx = 0; @@ -183,14 +184,23 @@ class BaseFunc { } for(auto iter : _impl) { - SaberTimer timer; - timer.start(ctx); + SaberTimer timer; + SaberStatus status = SaberUnImplError; for(int i = 0; i < _runtime_ts; ++i) { - iter->dispatch(input, output, param); + timer.start(ctx); + status = SaberStatus(status | iter->dispatch(input, output, param)); + typename Tensor::API::stream_t stream = ctx.get_compute_stream(); + for (auto out : output) { + out->record_event(stream); + out->sync(); + } + timer.end(ctx); + } + if (status == SaberSuccess) { + times.push_back(timer.get_average_ms()); + } else { + times.push_back(time_cost); } - output[0]->sync(); - timer.end(ctx); - times.push_back(timer.get_average_ms()); } for (int i = 0; i < _impl.size(); ++i) { diff --git a/saber/funcs/batch_norm.h b/saber/funcs/batch_norm.h deleted file mode 100644 index 2e817c734..000000000 --- a/saber/funcs/batch_norm.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_BATCH_NORM_H -#define ANAKIN_SABER_FUNCS_BATCH_NORM_H - -#include "saber/core/tensor.h" -#include "saber/funcs/base.h" -#include "saber/saber_funcs_param.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/impl_batch_norm.h" - -#ifdef NVIDIA_GPU -//todo -#include "saber/funcs/impl/impl_batch_norm.h" -#endif - -#ifdef USE_X86_PLACE -//todo -#include "saber/funcs/impl/impl_batch_norm.h" -#endif - -#ifdef USE_ARM_PLACE -//todo -#include "saber/funcs/impl/impl_batch_norm.h" -#endif - -#ifdef USE_BM -#include "saber/funcs/impl/bm/vender_batch_norm.h" -#endif - 
-namespace anakin { -namespace saber { - -template -class BatchNorm : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - BatchnormParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - BatchnormParam>::BaseFunc; - - BatchNorm() = default; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef BatchnormParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v &input, - Output_v &output, Param_t ¶m) override { - - Shape output_shape = (input[0]->valid_shape()); - return output[0]->set_shape(output_shape); - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderBatchNorm ); - return SaberSuccess; - - case SABER_IMPL: - return SaberUnImplError; - - default: - return SaberUnImplError; - } - } - -private: - - virtual void pick_best_static() override { - if (true) // some condition? - this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_specify(ImplEnum implenum) override { - this->_best_impl = this->_impl[0]; - } - -}; - -} // namespace saber -} // namespace anakin - -#endif diff --git a/saber/funcs/box_coder.h b/saber/funcs/box_coder.h deleted file mode 100644 index c96e0a4a1..000000000 --- a/saber/funcs/box_coder.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ -#ifndef ANAKIN_SABER_FUNCS_BOX_CODER_H -#define ANAKIN_SABER_FUNCS_BOX_CODER_H - -#include "saber/funcs/base.h" -#include "saber/funcs/impl/impl_base.h" - -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_box_coder.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_box_coder.h" -#endif - -namespace anakin { -namespace saber { - -template -class BoxCoder : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - BoxCoderParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - BoxCoderParam>::BaseFunc; - - BoxCoder() = default; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef BoxCoderParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v &input, - Output_v &output, Param_t ¶m) override { - - Shape shape_out = output[0]->valid_shape(); - CHECK_EQ(shape_out.dims(), 2) << "only support 3d (NHW) output layout"; - shape_out[0] = 2; - - int win1 = input[0]->width(); - int hin1 = input[0]->height(); - - return output[0]->set_shape(shape_out); - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderBoxCoder ); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberBoxCoder ); - return SaberSuccess; - - default: - return SaberUnImplError; - } - } - -private: - - virtual void pick_best_static() override { - if (true) // some condition? 
- this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_specify(ImplEnum implenum) override { - this->_best_impl = this->_impl[0]; - } - -}; - -} // namespace saber -} // namespace anakin - - -#endif \ No newline at end of file diff --git a/saber/funcs/calibrate.h b/saber/funcs/calibrate.h new file mode 100644 index 000000000..10da32dfc --- /dev/null +++ b/saber/funcs/calibrate.h @@ -0,0 +1,134 @@ +#ifndef ANAKIN_SABER_FUNCS_CALIBRATE_H +#define ANAKIN_SABER_FUNCS_CALIBRATE_H + +#include "saber/core/tensor.h" +#include "saber/core/context.h" +#include +namespace anakin { +namespace saber { + +template +SaberStatus conv_calibrate_fp32_int8_c4( + Tensor &out_tensor, + const Tensor &in_tensor, + float in_scale, Context ctx); + +template +SaberStatus conv_calibrate_int32_fp32( + Tensor &out_tensor, + const Tensor &in_tensor, + float in_scale, float* weight_scale, + Context ctx); + +template +SaberStatus conv_calibrate_int8_c4_fp32( + Tensor &out_tensor, + const Tensor &in_tensor, + float* weight_scale, + Context ctx); + +template +void float2char(bool col_direct, signed char* dst, const float* src, + float *scale, int height, int width, + Context ctx); + +template +void fix2float(float * dst, + const float *sA, const float *sB, + const float alpha, const float beta, int height, int width, + Context ctx); + +template +SaberStatus convert_weights_to_nchw_c4_host(Tensor& out_tensor, + const Tensor& in_tensor, + Context ctx) { + + int input_channel = in_tensor.channel(); + int output_channel = out_tensor.num(); + std::vector vector_weight_scale; + vector_weight_scale.resize(output_channel); + + int weight_inner_dim = in_tensor.channel() + * in_tensor.height() + * in_tensor.width(); + const float* in_weight_data = (const float*)(in_tensor.data()); + + for (int c = 0; c < output_channel; ++c) { + float max_val = -1.f; + + for (int i = 0; i < weight_inner_dim; ++i) { + float read_data = fabs(in_weight_data[i]); + max_val = (read_data > max_val) ? 
read_data : max_val; + } + + vector_weight_scale[c] = max_val / 127.f; + in_weight_data += weight_inner_dim; + // LOG(INFO)<<"max_val = "< +SaberStatus convert_bias_host(Tensor& out_tensor, + const Tensor& in_tensor, + float in_scale, std::vector vector_weight_scale, + Context ctx) { + unsigned long weight_size = vector_weight_scale.size(); + unsigned long bias_size = in_tensor.size(); + CHECK_GT(in_scale, 0); + CHECK_GT(weight_size, 0); + CHECK_EQ(bias_size, weight_size); + + const float* in_data = (const float*)in_tensor.data(); + float* out_data = (float*)out_tensor.mutable_data(); + + for (int i = 0; i < bias_size; ++i) { + out_data[i] = in_data[i] / in_scale / vector_weight_scale[i]; + } + + return SaberSuccess; +} + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/cast.h b/saber/funcs/cast.h index 7d8f48c15..265783f70 100644 --- a/saber/funcs/cast.h +++ b/saber/funcs/cast.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,12 +17,17 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" - +#include "saber/funcs/impl/impl_cast.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_cast.h" #endif #ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_cast.h" +#endif + +#ifdef USE_ARM_PLACE +//todo #include "saber/funcs/impl/impl_cast.h" #endif @@ -30,34 +35,27 @@ namespace anakin { namespace saber { template + DataType OpDtype> + class Cast : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, CastParam > { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, CastParam>::BaseFunc; Cast() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef CastParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef CastParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -71,13 +69,11 @@ class Cast : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderCast ); + this->_impl.push_back(new VenderCast ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberCast ); + this->_impl.push_back(new SaberCast ); return SaberSuccess; default: diff --git a/saber/funcs/concat.h b/saber/funcs/concat.h index 3a630f7d2..ba45d5ee7 100644 --- a/saber/funcs/concat.h +++ b/saber/funcs/concat.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_concat.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_concat.h" @@ -26,38 +27,35 @@ #include "saber/funcs/impl/x86/saber_concat.h" #endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_concat.h" +#endif + namespace anakin { namespace saber { template + DataType OpDtype> + class Concat : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ConcatParam > { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - ConcatParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + ConcatParam>::BaseFunc; Concat() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ConcatParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ConcatParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -90,19 +88,18 @@ class Concat : public BaseFunc< } shape_out[param.axis] += sh[param.axis]; } + output[0]->set_seq_offset(input[0]->get_seq_offset()); return output[0]->set_shape(shape_out); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderConcat ); + this->_impl.push_back(new VenderConcat ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberConcat ); + this->_impl.push_back(new SaberConcat ); return SaberSuccess; default: diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h index e527f3d6f..939ce87d1 100644 --- a/saber/funcs/conv.h +++ b/saber/funcs/conv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/funcs_utils.h" +#include "saber/funcs/impl/impl_conv.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_conv.h" @@ -24,101 +26,60 @@ #endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_conv.h" +#include "saber/funcs/impl/x86/saber_conv.h" #endif -#ifdef USE_BM -#include "saber/funcs/impl/bm/vender_conv.h" +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_conv.h" #endif +#ifdef USE_BM_PLACE +#include "saber/funcs/impl/bm/vender_conv.h" +#endif namespace anakin { namespace saber { template + DataType OpDtype> class Conv : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - ConvParam -> { + ConvParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ConvParam>::BaseFunc; Conv() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ConvParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ConvParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v &input, Output_v &output, Param_t ¶m) override { - - Shape output_shape = (input[0]->valid_shape()); - CHECK_EQ(input[0]->shape().size(), 4) << "using reshape2d to reshape a 1d conv?"; - - // append the $n and $c/$k, output: N * K * P * Q - int num_idx = input[0]->num_index(); - int channel_idx = input[0]->channel_index(); - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - output_shape[num_idx] = input[0]->num(); // N - output_shape[channel_idx] = param.weight()->num(); // K - - if (std::is_same::value) { - output_shape[channel_idx] /= 4; - if (std::is_same::value && (output_shape.size() == 5)) { - output_shape[channel_idx] *= 4; - output_shape.pop_back(); - } 
- } - - int input_dim = input[0]->height(); // P - int kernel_exten = param.dilation_h * (param.weight()->height() - 1) + 1; - int output_dim = (input_dim + 2 * param.pad_h - kernel_exten) - / param.stride_h + 1; - - output_shape[height_idx] = output_dim; - - input_dim = input[0]->width(); // Q - kernel_exten = param.dilation_w * (param.weight()->width() - 1) + 1; - output_dim = (input_dim + 2 * param.pad_w - kernel_exten) - / param.stride_w + 1; - - output_shape[width_idx] = output_dim; - - return output[0]->set_shape(output_shape); + Shape conv_shape = conv_compute_shape(input[0]->valid_shape(), param); + conv_shape.set_layout(Layout_NCHW); + return output[0]->set_shape(conv_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: this->_impl.push_back(new VenderConv2D ); + OpDtype>); return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberConv2D ); + OpDtype>); return SaberSuccess; default: @@ -126,6 +87,18 @@ class Conv : public BaseFunc< } } + SaberStatus trans_weights(Tensor &target_weights, int stride_h, int stride_w, int group, + ImplEnum implenum) { + if (implenum == VENDER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, stride_h, stride_w, group); + } else if (implenum == SABER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, stride_h, stride_w, group); + } else { + return SaberUnImplError; + } + } private: virtual void pick_best_static() override { @@ -143,4 +116,4 @@ class Conv : public BaseFunc< } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/conv_act.h b/saber/funcs/conv_act.h deleted file mode 100644 index 441dd9918..000000000 --- a/saber/funcs/conv_act.h +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#ifndef ANAKIN_SABER_FUNCS_CONV_ACT_H -#define ANAKIN_SABER_FUNCS_CONV_ACT_H - -#include "saber/funcs/funcs_utils.h" -#include "saber/funcs/base.h" -#include "saber/funcs/impl/impl_base.h" - -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_conv_act.h" -#include "saber/funcs/impl/cuda/vender_conv_act.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/x86/saber_conv_act.h" -#endif - -namespace anakin { -namespace saber { - -template -class ConvAct : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - ConvActiveParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - ConvActiveParam>::BaseFunc; - - ConvAct() = default; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ConvActiveParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v &input, - Output_v &output, Param_t ¶m) override { - Shape output_shape = (input[0]->valid_shape()); - - //CHECK_LT(input[0]->shape().size(), 4) << "using reshape2d to reshape a 1d conv?"; - - // append the $n and $c/$k, output: N * K * P * Q - int num_idx = input[0]->num_index(); - int channel_idx = input[0]->channel_index(); - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - output_shape[num_idx] = input[0]->num(); // N - output_shape[channel_idx] = param.conv_param.weight()->num(); // K - - if (std::is_same::value) { - output_shape[channel_idx] /= 4; - if (std::is_same::value 
&& (output_shape.size() == 5)) { - output_shape[channel_idx] *= 4; - output_shape.pop_back(); - } - } - - int input_dim = input[0]->height(); // P - int kernel_exten = param.conv_param.dilation_h * - (param.conv_param.weight()->height() - 1) + 1; - int output_dim = (input_dim + 2 * param.conv_param.pad_h - kernel_exten) - / param.conv_param.stride_h + 1; - - output_shape[height_idx] = output_dim; - - input_dim = input[0]->width(); // Q - kernel_exten = param.conv_param.dilation_w * - (param.conv_param.weight()->width() - 1) + 1; - output_dim = (input_dim + 2 * param.conv_param.pad_w - kernel_exten) - / param.conv_param.stride_w + 1; - - output_shape[width_idx] = output_dim; - return output[0]->set_shape(output_shape); - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderConv2DAct ); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberConv2DAct ); - return SaberSuccess; - - default: - return SaberUnImplError; - } - } - - virtual SaberStatus init(const Input_v& input, Output_v& output, Param_t& param, - SaberImplStrategy strategy, ImplEnum implenum, - Context &ctx) override { - - update_weights(param); - - return BaseFunc, - Tensor, - Tensor, - ImplBase, - ConvActiveParam>::init(input, output, param, strategy, implenum, ctx); - } - - //should move this funcs to utils - void update_weights(ConvActiveParam ¶m) { - update_conv_weights(param); - } - -private: - - virtual void pick_best_static() override { - if (true) // some condition? 
- this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_specify(ImplEnum implenum) override { - this->_best_impl = this->_impl[0]; - } - -}; - -} // namespace saber -} // namespace anakin - - -#endif \ No newline at end of file diff --git a/saber/funcs/conv_act_pooling.h b/saber/funcs/conv_act_pooling.h deleted file mode 100644 index fb95a7546..000000000 --- a/saber/funcs/conv_act_pooling.h +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -#ifndef ANAKIN_SABER_FUNCS_CONV_ACT_POOLING_H -#define ANAKIN_SABER_FUNCS_CONV_ACT_POOLING_H - -#include "saber/funcs/base.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/funcs_utils.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_conv_act_pooling.h" -#include "saber/funcs/impl/cuda/vender_conv_act_pooling.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/x86/saber_conv_act_pooling.h" -#endif - -namespace anakin { -namespace saber { - -template -class ConvActPooling : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - ConvActivePoolingParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - ConvActivePoolingParam>::BaseFunc; - - ConvActPooling() = default; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ConvActivePoolingParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v &input, - Output_v &output, Param_t ¶m) override { - - - Shape conv_shape = (input[0]->valid_shape()); - - if (input[0]->valid_shape().size() < 4) { - return SaberInvalidValue; - } - - // append the $n and $c/$k, output: N * K * P * Q - int num_idx = input[0]->num_index(); - int channel_idx = input[0]->channel_index(); - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - conv_shape[num_idx] = input[0]->num(); // N - conv_shape[channel_idx] = param.conv_param.weight()->num(); // K - - int input_dim = input[0]->height(); // P - int kernel_exten = param.conv_param.dilation_h * - (param.conv_param.weight()->height() - 1) + 1; - int output_dim = (input_dim + 2 * param.conv_param.pad_h - kernel_exten) - / param.conv_param.stride_h + 1; - - conv_shape[height_idx] = output_dim; - - input_dim = input[0]->width(); // Q - kernel_exten = param.conv_param.dilation_w * (param.conv_param.weight()->width() - 1) + 1; - 
output_dim = (input_dim + 2 * param.conv_param.pad_w - kernel_exten) - / param.conv_param.stride_w + 1; - - conv_shape[width_idx] = output_dim; - - _conv_shape = conv_shape; - Shape output_shape = conv_shape; - - int in_height = conv_shape[height_idx]; - int in_width = conv_shape[width_idx]; - - int window_h = param.pooling_param.window_h; - int window_w = param.pooling_param.window_w; - int pad_h = param.pooling_param.pad_h; - int pad_w = param.pooling_param.pad_w; - int stride_h = param.pooling_param.stride_h; - int stride_w = param.pooling_param.stride_w; - int out_height; - int out_width; - if (param.pooling_param.global_pooling) { - out_height = 1; - out_width = 1; - param.pooling_param.stride_h = in_height; - param.pooling_param.stride_w = in_width; - window_h = in_height; - window_w = in_width; - param.pooling_param.window_h = in_height; - param.pooling_param.window_w = in_width; - } else { - if (param.pooling_param.cmp_out_shape_floor_as_conv) { - out_height = static_cast((static_cast( - in_height + 2 * pad_h - window_h) / stride_h)) + 1; - - out_width = static_cast((static_cast( - in_width + 2 * pad_w - window_w) / stride_w)) + 1; - } else { - out_height = static_cast(ceilf(static_cast( - in_height + 2 * pad_h - window_h) / stride_h)) + 1; - - out_width = static_cast(ceilf(static_cast( - in_width + 2 * pad_w - window_w) / stride_w)) + 1; - } - } - - if (param.pooling_param.pooling_padded()) { - if ((out_height - 1) * stride_h >= in_height + pad_h) { - -- out_height; - } - if ((out_width - 1) * stride_w >= in_width + pad_w) { - -- out_width; - } - } - - output_shape[height_idx] = out_height; - output_shape[width_idx] = out_width; - - return output[0]->set_shape(output_shape); - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderConv2DActPooling ); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberConv2DActPooling ); - return SaberSuccess; - 
- default: - return SaberUnImplError; - } - } - - virtual SaberStatus init(const Input_v& input, Output_v& output, Param_t& param, - SaberImplStrategy strategy, ImplEnum implenum, - Context &ctx) override { - - update_weights(param); - - return BaseFunc, - Tensor, - Tensor, - ImplBase, - ConvActivePoolingParam>::init(input, output, param, strategy, implenum, ctx); - } - - //should move this funcs to utils - void update_weights(ConvActivePoolingParam ¶m) { - update_conv_weights(param); - } -private: - - virtual void pick_best_static() override { - - bool _use_saber_conv_pooling = true; - _use_saber_conv_pooling &= (this->_param).pooling_param.pad_h == 0; - _use_saber_conv_pooling &= (this->_param).pooling_param.pad_w == 0; - _use_saber_conv_pooling &= (this->_param).pooling_param.stride_h == 2; - _use_saber_conv_pooling &= (this->_param).pooling_param.stride_w == 2; - _use_saber_conv_pooling &= (this->_param).pooling_param.window_h == 2; - _use_saber_conv_pooling &= (this->_param).pooling_param.window_w == 2; - _use_saber_conv_pooling &= !(this->_param).pooling_param.global_pooling; - _use_saber_conv_pooling &= (this->_param).pooling_param.pooling_type == Pooling_max; - - if (!_use_saber_conv_pooling) { - this->_best_impl = this->_impl[0]; - } else { - this->_best_impl = this->_impl[1]; - } - } - - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - - virtual void pick_best_specify(ImplEnum implenum) override { - this->_best_impl = this->_impl[0]; - } - Shape _conv_shape; -}; - -} // namespace saber -} // namespace anakin - - -#endif \ No newline at end of file diff --git a/saber/funcs/conv_eltwise.h b/saber/funcs/conv_eltwise.h new file mode 100644 index 000000000..bb5fcce90 --- /dev/null +++ b/saber/funcs/conv_eltwise.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_CONV_ELTWISE_H +#define ANAKIN_SABER_FUNCS_CONV_ELTWISE_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/funcs_utils.h" +#include "saber/funcs/impl/impl_conv_eltwise.h" +#include "saber/saber_funcs_param.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_conv_eltwise.h" +#include "saber/funcs/impl/cuda/vender_conv_eltwise.h" +#endif + + +namespace anakin { +namespace saber { + +template +class ConvEltwise : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + ConvEltwiseParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + ConvEltwiseParam>::BaseFunc; + + ConvEltwise() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ConvEltwiseParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + Shape conv_shape = conv_compute_shape(input[0]->valid_shape(), param.conv_param); + conv_shape.set_layout(Layout_NCHW); + return output[0]->set_shape(conv_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderConvEltwise ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberConvEltwise ); + 
return SaberSuccess; + + default: + return SaberUnImplError; + } + } + SaberStatus trans_weights(Tensor &target_weights, int stride_h, int stride_w, int group, + ImplEnum implenum) { + if (implenum == VENDER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, stride_h, stride_w, group); + } else if (implenum == SABER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, stride_h, stride_w, group); + } else { + return SaberUnImplError; + } + } +private: + + virtual void pick_best_static() override { + if (true) // some condition? + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + + +#endif diff --git a/saber/funcs/conv_pooling.h b/saber/funcs/conv_pooling.h new file mode 100644 index 000000000..735dee612 --- /dev/null +++ b/saber/funcs/conv_pooling.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_FUNCS_CONV_POOLING_H +#define ANAKIN_SABER_FUNCS_CONV_POOLING_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/funcs_utils.h" +#include "saber/funcs/impl/impl_conv_pooling.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/vender_conv_pooling.h" +#include "saber/funcs/impl/cuda/saber_conv_pooling.h" +#endif + +namespace anakin { +namespace saber { + +template +class ConvPooling : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + ConvPoolingParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + ConvPoolingParam>::BaseFunc; + + ConvPooling() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ConvPoolingParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + Shape conv_shape = conv_compute_shape(input[0]->valid_shape(), param.conv_param); + Shape pool_shape = pool_compute_shape(conv_shape, param.pooling_param); + pool_shape.set_layout(Layout_NCHW); + return output[0]->set_shape(pool_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderConv2DPooling ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberConv2DPooling ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + SaberStatus trans_weights(Tensor &target_weights, int stride_h, int stride_w, int group, + ImplEnum implenum) { + if (implenum == VENDER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, stride_h, stride_w, group); + } else if (implenum == SABER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, stride_h, stride_w, group); + } else { + return SaberUnImplError; + } + 
} +private: + + virtual void pick_best_static() override { + if (true) // some condition? + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + + +#endif diff --git a/saber/funcs/conv_unpadding_padding.h b/saber/funcs/conv_unpadding_padding.h new file mode 100644 index 000000000..13bc0adf2 --- /dev/null +++ b/saber/funcs/conv_unpadding_padding.h @@ -0,0 +1,84 @@ +#ifndef ANAKIN_SABER_FUNCS_CONV_UNPADDING_PADDING_H +#define ANAKIN_SABER_FUNCS_CONV_UNPADDING_PADDING_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/funcs_utils.h" +#include "saber/funcs/impl/impl_conv_unpadding_padding.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_conv_upadding_padding.h" +#endif + +namespace anakin { +namespace saber { + +template +class ConvUnpaddingPadding : public BaseFunc < + TargetType, + OpDtype, + ImplBase, + ConvUnpaddingPaddingParam > { +public: + using BaseFunc < + TargetType, + OpDtype, + ImplBase, + ConvUnpaddingPaddingParam >::BaseFunc; + + ConvUnpaddingPadding() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ConvUnpaddingPaddingParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, + Output_v& output, Param_t& param) override { + + Shape in_shape=input[0]->valid_shape(); +// in_shape[2]= + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape(in_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderConvUnpaddingPadding); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberConvUnpaddingPadding); + return SaberSuccess; + + 
default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) { // some condition? + this->_best_impl = this->_impl[0]; + } + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + +} +} + +#endif //ANAKIN_CONV_UNPADDING_PADDING_H diff --git a/saber/funcs/crf_decoding.h b/saber/funcs/crf_decoding.h index 14e6b53c4..9bfb88a73 100644 --- a/saber/funcs/crf_decoding.h +++ b/saber/funcs/crf_decoding.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,46 +19,45 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_crf_decoding.h" #ifdef NVIDIA_GPU -#include "saber/funcs/impl/impl_crf_decoding.h" +#include "saber/funcs/impl/cuda/saber_crf_decoding.h" #endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_crf_decoding.h" #endif +#ifdef USE_ARM_PLACE +//todo +#include "saber/funcs/impl/impl_crf_decoding.h" +#endif + namespace anakin { namespace saber { template + DataType OpDtype> class CrfDecoding : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, CrfDecodingParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - CrfDecodingParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + CrfDecodingParam>::BaseFunc; CrfDecoding() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef CrfDecodingParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef CrfDecodingParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -78,12 +77,10 @@ 
class CrfDecoding : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderCrfDecoding ); + this->_impl.push_back(new VenderCrfDecoding ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberCrfDecoding ); + this->_impl.push_back(new SaberCrfDecoding ); return SaberSuccess; default: return SaberUnImplError; diff --git a/saber/funcs/crop.h b/saber/funcs/crop.h index a847945ce..2fbc930de 100644 --- a/saber/funcs/crop.h +++ b/saber/funcs/crop.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,11 +17,17 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_crop.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_crop.h" #endif #ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_crop.h" +#endif + +#ifdef USE_ARM_PLACE +//todo #include "saber/funcs/impl/impl_crop.h" #endif @@ -29,34 +35,25 @@ namespace anakin { namespace saber { template + DataType OpDtype> class Crop : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - CropParam -> { + CropParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, CropParam>::BaseFunc; Crop() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef CropParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef CropParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -88,15 +85,10 @@ class Crop : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - 
this->_impl.push_back(new VenderCrop ); + this->_impl.push_back(new VenderCrop ); return SaberSuccess; - case SABER_IMPL: - this->_impl.push_back(new SaberCrop ); + this->_impl.push_back(new SaberCrop ); return SaberSuccess; default: @@ -121,4 +113,4 @@ class Crop : public BaseFunc< } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/ctc_align.h b/saber/funcs/ctc_align.h index 772aa3e4e..7435d03d9 100644 --- a/saber/funcs/ctc_align.h +++ b/saber/funcs/ctc_align.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,46 +17,35 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_ctc_align.h" -#endif - -#ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_ctc_align.h" + +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_ctc_align.h" #endif namespace anakin { namespace saber { template + DataType OpDtype> class CtcAlign : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - CtcAlignParam -> { + CtcAlignParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, CtcAlignParam>::BaseFunc; CtcAlign() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef CtcAlignParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef CtcAlignParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -71,14 +60,12 @@ class CtcAlign : public BaseFunc< switch (implenum) { case VENDER_IMPL: this->_impl.push_back(new VenderCtcAlign ); + OpDtype>); return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberCtcAlign ); 
+ OpDtype>); return SaberSuccess; default: diff --git a/saber/funcs/debug.h b/saber/funcs/debug.h index 82eff7098..661276c3b 100644 --- a/saber/funcs/debug.h +++ b/saber/funcs/debug.h @@ -1,6 +1,17 @@ -// -// Created by Liu,Junjie(SYS) on 2018/5/28. -// +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #ifndef ANAKIN_SABER_FUNCS_DEBUG_H #define ANAKIN_SABER_FUNCS_DEBUG_H @@ -9,21 +20,45 @@ namespace anakin { namespace saber { -#if defined(USE_X86_PLACE) || defined(USE_CUDA) -static void write_tensorfile(Tensor tensor, const char* locate) { - typedef typename Tensor::Dtype Dtype; - LOG(INFO) << "host tensor data:" << tensor.size(); +template +struct DefaultHostType { + typedef X86 Host_type; +}; + +template <> +struct DefaultHostType { + typedef NVHX86 Host_type; +}; + +template <> +struct DefaultHostType { + typedef ARM Host_type; +}; + + +template +static void write_tensorfile(Tensor& tensor, const char* locate) { + + typedef typename DefaultHostType::Host_type HOST_TYPE; + Tensor host_tensor; + host_tensor.re_alloc(tensor.valid_shape(), tensor.get_dtype()); + host_tensor.copy_from(tensor); + LOG(INFO) << "target tensor data:" << tensor.size(); FILE* fp = fopen(locate, "w+"); if (fp == 0) { LOG(ERROR) << "file open field " << locate; } else { - const Dtype* data_ptr = static_cast(tensor.data()); - int size = tensor.valid_size(); - - for (int i = 0; i < size; ++i) { - fprintf(fp, "[%d] %g \n", i, (data_ptr[i])); + 
if (tensor.get_dtype() == AK_FLOAT) { + const float* data_ptr = (const float*)host_tensor.data(); + int size = host_tensor.valid_size(); + + for (int i = 0; i < size; ++i) { + fprintf(fp, "[%d] %g \n", i, (data_ptr[i])); + } + } else { + LOG(FATAL) << "not supported write type"; } fclose(fp); @@ -31,12 +66,15 @@ static void write_tensorfile(Tensor tensor, const char* lo LOG(INFO) << "!!! write success: " << locate; } -#endif + +template +static void record_dev_tensorfile(const float* dev_tensor, int size, const char* locate) {}; #ifdef USE_CUDA -static void record_dev_tensorfile(const float* dev_tensor, int size, const char* locate) { - Tensor host_temp; - host_temp.re_alloc(Shape(1, 1, 1, size)); +template <> +void record_dev_tensorfile(const float* dev_tensor, int size, const char* locate) { + Tensor host_temp; + host_temp.re_alloc(Shape({1, 1, 1, size}, Layout_NCHW), AK_FLOAT); CUDA_CHECK(cudaMemcpy(host_temp.mutable_data(), dev_tensor, sizeof(float) * size, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); @@ -46,8 +84,35 @@ static void record_dev_tensorfile(const float* dev_tensor, int size, const char* LOG(ERROR) << "file open failed " << locate; } else { + const float* data = (const float*)host_temp.data(); + + for (int i = 0; i < size; ++i) { + fprintf(fp, "[%d] %g \n", i, (data[i])); + } + + fclose(fp); + } + + LOG(INFO) << "!!! 
write success: " << locate; +} +static void record_dev_tensorfile(Tensor * dev_tensor, const char* locate) { + Tensor host_temp; + + int size = dev_tensor->valid_size(); + host_temp.re_alloc(Shape({1, 1, 1, size}, Layout_NCHW), dev_tensor->get_dtype()); + CUDA_CHECK(cudaMemcpy(host_temp.mutable_data(), dev_tensor->data(), sizeof(float) * size, + cudaMemcpyDeviceToHost)); + cudaDeviceSynchronize(); + FILE* fp = fopen(locate, "w+"); + + if (fp == 0) { + LOG(ERROR) << "file open failed " << locate; + + } else { + const float* data = (const float*)host_temp.data(); + for (int i = 0; i < size; ++i) { - fprintf(fp, "[%d] %g \n", i, (host_temp.data()[i])); + fprintf(fp, "[%d] %g \n", i, (data[i])); } fclose(fp); @@ -57,36 +122,60 @@ static void record_dev_tensorfile(const float* dev_tensor, int size, const char* } #endif -#if defined(USE_X86_PLACE) || defined(USE_CUDA) -static void readTensorData(Tensor tensor, const char* locate) { - typedef typename Tensor::Dtype Dtype; - FILE* fp = fopen(locate, "rb"); +#ifdef USE_X86_PLACE +template<> +void record_dev_tensorfile(const float* dev_tensor, int size, const char* locate) { + FILE* fp = fopen(locate, "w+"); if (fp == 0) { LOG(ERROR) << "file open failed " << locate; } else { - LOG(INFO) << "file open success [" << locate << " ],read " << tensor.valid_shape().count(); - size_t size=fread(tensor.mutable_data(), sizeof(Dtype), tensor.valid_size(), fp); - CHECK_EQ(size,tensor.valid_shape().count())<<"read data file ["<* dev_tensor, const char* locate) { + int size = dev_tensor->valid_size(); + FILE* fp = fopen(locate, "w+"); + + if (fp == 0) { + LOG(ERROR) << "file open failed " << locate; + + } else { + + for (int i = 0; i < size; ++i) { + fprintf(fp, "[%d] %g \n", i, (((float*)dev_tensor->data())[i])); + } + fclose(fp); } + + LOG(INFO) << "!!! 
write success: " << locate; } +#endif -static void readTensorData(Tensor tensor, const char* locate) { - typedef typename Tensor::Dtype Dtype; +#if defined(USE_X86_PLACE) || defined(USE_CUDA) +template +static void readTensorData(HTensor tensor, const char* locate) { FILE* fp = fopen(locate, "rb"); if (fp == 0) { - LOG(ERROR) << "file open failed " << locate; + CHECK(false) << "file open failed " << locate; } else { - LOG(INFO) << "file open success [" << locate << " ],read " << tensor.valid_shape().count(); - size_t size=fread(tensor.mutable_data(), sizeof(Dtype), tensor.valid_size(), fp); - CHECK_EQ(size,tensor.valid_shape().count())<<"read data file ["< + DataType OpDtype> class Deconv : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - ConvParam -> { + ConvParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ConvParam>::BaseFunc; Deconv() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ConvParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ConvParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v &input, \ Output_v &output, Param_t ¶m) override { - Shape output_shape = (input[0]->shape()); - if (input[0]->shape().size() < 4) { - LOG(FATAL) << "using reshape2d to reshape a 1d conv?"; - } - - // append the $n and $c/$k, output: N * K * P * Q - int num_idx = input[0]->num_index(); - int channel_idx = input[0]->channel_index(); - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - output_shape[num_idx] = input[0]->num(); // N - output_shape[channel_idx] = param.weight()->num(); // K - - int kernel_extent_h = param.dilation_h * - (param.weight()->height() - 1) + 1; - int output_dim_h = (input[0]->height() - 1) * - 
param.stride_h + kernel_extent_h - 2 * param.pad_h; - int kernel_extent_w = param.dilation_w * - (param.weight()->width() - 1) + 1; - int output_dim_w = (input[0]->width() - 1) * - param.stride_w + kernel_extent_w - 2 * param.pad_w; - - output_shape[height_idx] = output_dim_h; - output_shape[width_idx] = output_dim_w; - return output[0]->set_shape(output_shape); + Shape deconv_shape = deconv_compute_shape(input[0]->valid_shape(), param); + deconv_shape.set_layout(Layout_NCHW); + return output[0]->set_shape(deconv_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: this->_impl.push_back(new VenderDeconv2D ); + OpDtype>); return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberDeconv2D ); + OpDtype>); return SaberSuccess; default: return SaberUnImplError; } } + SaberStatus trans_weights(Tensor &target_weights, + Tensor &target_bias, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int group, + ImplEnum implenum) { + if (implenum == VENDER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, target_bias, stride_h, stride_w, pad_h, pad_w, + dilation_h, dilation_w, group); + } else if (implenum == SABER_IMPL) { + return static_cast *>(this->_best_impl)->trans_weights( + target_weights, target_bias, stride_h, stride_w, pad_h, pad_w, + dilation_h, dilation_w, group); + } else { + return SaberUnImplError; + } + } private: diff --git a/saber/funcs/deconv_act.h b/saber/funcs/deconv_act.h deleted file mode 100644 index 1790ec283..000000000 --- a/saber/funcs/deconv_act.h +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -#ifndef ANAKIN_SABER_FUNCS_DECONV_ACT_H -#define ANAKIN_SABER_FUNCS_DECONV_ACT_H - -#include "saber/funcs/base.h" -#include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_deconv_act.h" -#include "saber/funcs/impl/cuda/vender_deconv_act.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_deconv_act.h" -#endif - -namespace anakin { -namespace saber { - -template -class DeconvAct : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - ConvActiveParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - ConvActiveParam>::BaseFunc; - - DeconvAct() = default; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ConvActiveParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v &input, - Output_v &output, Param_t ¶m) override { - Shape output_shape = (input[0]->shape()); - - if (input[0]->shape().size() < 4) { - LOG(FATAL) << "using reshape2d to reshape a 1d conv?"; - } - - // append the $n and $c/$k, output: N * K * P * Q - int num_idx = input[0]->num_index(); - int channel_idx = input[0]->channel_index(); - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - output_shape[num_idx] = input[0]->num(); // N - ConvParam conv_param = param.conv_param; - output_shape[channel_idx] = conv_param.weight()->num(); // K - - int kernel_extent_h = conv_param.dilation_h * - 
(conv_param.weight()->height() - 1) + 1; - int output_dim_h = (input[0]->height() - 1) * - conv_param.stride_h + kernel_extent_h - 2 * conv_param.pad_h; - int kernel_extent_w = conv_param.dilation_w * - (conv_param.weight()->width() - 1) + 1; - int output_dim_w = (input[0]->width() - 1) * - conv_param.stride_w + kernel_extent_w - 2 * conv_param.pad_w; - - output_shape[height_idx] = output_dim_h; - output_shape[width_idx] = output_dim_w; - return output[0]->set_shape(output_shape); - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderDeconv2DAct ); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberDeconv2DAct ); - return SaberSuccess; - - default: - return SaberUnImplError; - } - } - -private: - - virtual void pick_best_static() override { - if (true) // some condition? - this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_specify(ImplEnum implenum) override { - this->_best_impl = this->_impl[0]; - } - -}; - -} // namespace saber -} // namespace anakin - - -#endif \ No newline at end of file diff --git a/saber/funcs/deformable_conv.h b/saber/funcs/deformable_conv.h index 68169e9b0..c60b0137a 100644 --- a/saber/funcs/deformable_conv.h +++ b/saber/funcs/deformable_conv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,97 +16,60 @@ #define ANAKIN_SABER_FUNCS_DEFORMABLE_CONV_H #include "saber/funcs/base.h" +#include "saber/funcs/funcs_utils.h" #include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_deformable_conv.h" -#endif - -#ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_deformable_conv.h" +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_deformable_conv.h" #endif namespace anakin { namespace saber { template class DeformableConv : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - DeformableConvParam -> { + DeformableConvParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, DeformableConvParam>::BaseFunc; DeformableConv() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef DeformableConvParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef DeformableConvParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v &input, Output_v &output, Param_t ¶m) override { - Shape output_shape = (input[0]->valid_shape()); - Shape offset_shape = (input[1]->valid_shape()); - - if (input[0]->shape().size() < 4) { - LOG(FATAL) << "using reshape2d to reshape a 1d conv?"; - } - - // append the $n and $c/$k, output: N * K * P * Q - int num_idx = input[0]->num_index(); - int channel_idx = input[0]->channel_index(); - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - output_shape[num_idx] = input[0]->num(); // N - output_shape[channel_idx] = param.weight()->num(); // K - - int input_dim = input[0]->height(); // P - int kernel_exten = param.dilation_h * (param.weight()->height() - 1) + 1; - int output_dim = (input_dim + 2 * param.pad_h - kernel_exten) - / param.stride_h + 1; - - output_shape[height_idx] = 
output_dim; - input_dim = input[0]->width(); // Q - kernel_exten = param.dilation_w * (param.weight()->width() - 1) + 1; - output_dim = (input_dim + 2 * param.pad_w - kernel_exten) - / param.stride_w + 1; + Shape conv_shape = conv_compute_shape(input[0]->valid_shape(), param); + conv_shape.set_layout(Layout_NCHW); + return output[0]->set_shape(conv_shape); - output_shape[width_idx] = output_dim; - return output[0]->set_shape(output_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: this->_impl.push_back(new VenderDeformableConv2D ); + OpDtype>); return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberDeformableConv2D ); + OpDtype>); return SaberSuccess; default: diff --git a/saber/funcs/detection_output.h b/saber/funcs/detection_output.h index 866b71ae2..5b4987e4c 100644 --- a/saber/funcs/detection_output.h +++ b/saber/funcs/detection_output.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,46 +17,35 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_detection_output.h" -#endif - -#ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_detection_output.h" + +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_detection_output.h" #endif namespace anakin { namespace saber { template + DataType OpDtype> class DetectionOutput : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - DetectionOutputParam -> { + DetectionOutputParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, DetectionOutputParam>::BaseFunc; DetectionOutput() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef DetectionOutputParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef DetectionOutputParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -76,13 +65,13 @@ class DetectionOutput : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderDetectionOutput ); + this->_impl.push_back(new VenderDetectionOutput ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberDetectionOutput ); + this->_impl.push_back(new SaberDetectionOutput ); return SaberSuccess; default: diff --git a/saber/funcs/eltwise.h b/saber/funcs/eltwise.h index 7d3a4860c..c698f5d2f 100644 --- a/saber/funcs/eltwise.h +++ b/saber/funcs/eltwise.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_eltwise.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_eltwise.h" #endif @@ -25,39 +26,33 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_eltwise.h" #endif - +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_eltwise.h" +#endif namespace anakin { namespace saber { template class Eltwise : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - EltwiseParam -> { + EltwiseParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, EltwiseParam>::BaseFunc; Eltwise() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef EltwiseParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef EltwiseParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -80,13 +75,13 @@ class Eltwise : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderEltwise ); + this->_impl.push_back(new VenderEltwise ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberEltwise ); + this->_impl.push_back(new SaberEltwise ); return SaberSuccess; default: @@ -101,8 +96,6 @@ class Eltwise : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/eltwise_act.h b/saber/funcs/eltwise_act.h deleted file mode 100644 index d53c26766..000000000 --- a/saber/funcs/eltwise_act.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_ELTWISE_ACT_H -#define ANAKIN_SABER_FUNCS_ELTWISE_ACT_H - -#include "saber/funcs/base.h" -#include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_eltwise_act.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/x86/saber_eltwise_act.h" -#endif - -namespace anakin { -namespace saber { - -template -class EltwiseActive : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - EltwiseActiveParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - EltwiseActiveParam>::BaseFunc; - - EltwiseActive() = default; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef EltwiseActiveParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ - Param_t& param) override { - for (int i = 1; i < input.size(); ++i) { - CHECK_EQ(input[0]->num(), input[i]->num()); - CHECK_EQ(input[0]->channel(), input[i]->channel()); - CHECK_EQ(input[0]->height(), input[i]->height()); - CHECK_EQ(input[0]->width(), input[i]->width()); - } - - Shape output_shape = input[0]->valid_shape(); - output[0]->set_shape(output_shape); - - return SaberSuccess; - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - 
this->_impl.push_back(new VenderEltwiseActive ); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberEltwiseActive ); - return SaberSuccess; - - default: - return SaberUnImplError; - } - } - -private: - - virtual void pick_best_static() override { - if (true) // some condition? - this->_best_impl = this->_impl[0]; - } - - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - - virtual void pick_best_specify(ImplEnum implenum) override { - this->_best_impl = this->_impl[0]; - } - -}; - - -} -} - -#endif //ANAKIN_SABER_FUNCS_ELTWISE_ACTIVE_H \ No newline at end of file diff --git a/saber/funcs/embedding.h b/saber/funcs/embedding.h index 3ad5fdd0d..21b64ebc0 100644 --- a/saber/funcs/embedding.h +++ b/saber/funcs/embedding.h @@ -1,11 +1,8 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,11 +13,10 @@ #ifndef ANAKIN_SABER_FUNCS_EMBEDDING_H #define ANAKIN_SABER_FUNCS_EMBEDDING_H -#include "saber/core/tensor.h" #include "saber/funcs/base.h" -#include "saber/saber_funcs_param.h" #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_embedding.h" + #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_embedding.h" #endif @@ -29,38 +25,37 @@ #include "saber/funcs/impl/x86/saber_embedding.h" #endif +#ifdef USE_AMD +#include "saber/funcs/impl/amd/saber_embedding.h" +#endif + +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_embedding.h" +#endif + namespace anakin { namespace saber { template + DataType OpDtype> class Embedding : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - EmbeddingParam -> { + EmbeddingParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, EmbeddingParam>::BaseFunc; Embedding() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef EmbeddingParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef EmbeddingParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -68,19 +63,22 @@ class Embedding : public BaseFunc< virtual SaberStatus compute_output_shape(const Input_v &input, Output_v &output, Param_t ¶m) override { - Shape output_shape = {input[0]->valid_size(), param.emb_dim, 1, 1}; + Shape output_shape({input[0]->valid_size(), param.emb_dim, 1, 1}); + output[0]->set_seq_offset(input[0]->get_seq_offset()); return output[0]->set_shape(output_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - return SaberUnImplError; + //this->_impl.push_back(new VenderActivation _impl.push_back(new VenderEmbedding ); + return SaberSuccess; case SABER_IMPL: this->_impl.push_back(new SaberEmbedding ); + OpDtype>); return 
SaberSuccess; default: @@ -101,7 +99,9 @@ class Embedding : public BaseFunc< }; + + } // namespace saber } // namespace anakin -#endif +#endif \ No newline at end of file diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h index 06dc8695a..639eb2047 100644 --- a/saber/funcs/fc.h +++ b/saber/funcs/fc.h @@ -1,9 +1,7 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software @@ -18,48 +16,42 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_fc.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_fc.h" -#include "saber/funcs/impl/cuda/vender_fc.h" #endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/vender_fc.h" #endif - + +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_fc.h" +#endif + namespace anakin{ namespace saber{ -template +template class Fc : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - FcParam -> { + FcParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - FcParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + FcParam>::BaseFunc; Fc() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef FcParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef FcParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -77,32 +69,21 @@ class Fc : public BaseFunc< } CHECK_EQ(weights_size / n, k) << "weights size does not meet the input size"; - int num_idx = output[0]->num_index(); - int channel_idx = output[0]->channel_index(); - int 
height_idx = output[0]->height_index(); - int widht_idx = output[0]->width_index(); - if (num_idx >= 0) { - shape_out[num_idx] = m; - } - if (height_idx >= 0) { - shape_out[height_idx] = 1; - } - if (widht_idx >= 0) { - shape_out[widht_idx] = 1; - } - shape_out[channel_idx] = n; + shape_out.set_num(m); + shape_out.set_height(1); + shape_out.set_width(1); + shape_out.set_channel(n); + output[0]->set_seq_offset(input[0]->get_seq_offset()); return output[0]->set_shape(shape_out); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderFc ); + this->_impl.push_back(new VenderFc); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberFc ); + this->_impl.push_back(new SaberFc); return SaberSuccess; default: @@ -137,4 +118,4 @@ class Fc : public BaseFunc< } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_FC_H +#endif //ANAKIN_SABER_FUNCS_FC_H \ No newline at end of file diff --git a/saber/funcs/flatten.h b/saber/funcs/flatten.h index 32ecd7719..d89d5fc33 100644 --- a/saber/funcs/flatten.h +++ b/saber/funcs/flatten.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -23,35 +23,25 @@ namespace anakin{ namespace saber{ -template +template class Flatten : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - FlattenParam -> { + FlattenParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - FlattenParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + FlattenParam>::BaseFunc; Flatten() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef FlattenParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef FlattenParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -78,7 +68,7 @@ class Flatten : public BaseFunc< //flatten ops do nothing virtual SaberStatus operator()(const Input_v& input, Output_v& output, Param_t& param, \ Context &ctx) { - + return SaberSuccess; } private: @@ -88,12 +78,6 @@ class Flatten : public BaseFunc< this->_best_impl = this->_impl[0]; } - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! flatten only has saber implementation - this->_best_impl = this->_impl[0]; - } - virtual void pick_best_specify(ImplEnum implenum) override { //! flatten only has saber implementation this->_best_impl = this->_impl[0]; @@ -105,4 +89,4 @@ class Flatten : public BaseFunc< } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_FLATTEN_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_FLATTEN_H diff --git a/saber/funcs/funcs_utils.h b/saber/funcs/funcs_utils.h index 0538ec790..7f3458aee 100644 --- a/saber/funcs/funcs_utils.h +++ b/saber/funcs/funcs_utils.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,11 +19,113 @@ #include #include #include "saber/core/tensor.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_funcs_param.h" + namespace anakin{ namespace saber{ -template +template +Shape conv_compute_shape(const Shape input_shape, Param ¶m) { + Shape output_shape = (input_shape); + CHECK_GE(input_shape.size(), 4) << "using reshape2d to reshape a 1d conv?"; + + output_shape.set_num(input_shape.num()); // N + output_shape.set_channel(param.weight()->num()); // K + + int input_dim = input_shape.height(); // P + int kernel_exten = param.dilation_h * (param.weight()->height() - 1) + 1; + int output_height = (input_dim + 2 * param.pad_h - kernel_exten) + / param.stride_h + 1; + output_shape.set_height(output_height); + + input_dim = input_shape.width(); // Q + kernel_exten = param.dilation_w * (param.weight()->width() - 1) + 1; + int output_width = (input_dim + 2 * param.pad_w - kernel_exten) + / param.stride_w + 1; + output_shape.set_width(output_width); + return output_shape; +} + +template +Shape deconv_compute_shape(const Shape input_shape, ConvParam ¶m) { + Shape output_shape = input_shape; + CHECK_GE(input_shape.size(), 4) << "using reshape2d to reshape a 1d deconv?"; + + // append the $n and $c/$k, output: N * K * P * Q + + output_shape.set_num(input_shape.num()); // N + output_shape.set_channel(param.weight()->num() * param.group); // K + + int kernel_extent_h = param.dilation_h * + (param.weight()->height() - 1) + 1; + int output_dim_h = (input_shape.height() - 1) * + param.stride_h + kernel_extent_h - 2 * param.pad_h; + int kernel_extent_w = param.dilation_w * + (param.weight()->width() - 1) + 1; + int output_dim_w = (input_shape.width() - 1) * + param.stride_w + kernel_extent_w - 2 * param.pad_w; + + output_shape.set_height(output_dim_h); + output_shape.set_width(output_dim_w); + return output_shape; +} + +template +Shape pool_compute_shape(const Shape input_shape, Param ¶m) { + + Shape output_shape = input_shape; + + int in_height = 
input_shape.height(); + int in_width = input_shape.width(); + + int window_h = param.window_h; + int window_w = param.window_w; + int pad_h = param.pad_h; + int pad_w = param.pad_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + int out_height; + int out_width; + if (param.global_pooling) { + out_height = 1; + out_width = 1; + param.stride_h = in_height; + param.stride_w = in_width; + window_h = in_height; + window_w = in_width; + param.window_h = in_height; + param.window_w = in_width; + } else { + if (param.cmp_out_shape_floor_as_conv) { + out_height = static_cast((static_cast( + in_height + 2 * pad_h - window_h) / stride_h)) + 1; + + out_width = static_cast((static_cast( + in_width + 2 * pad_w - window_w) / stride_w)) + 1; + } else { + out_height = static_cast(ceilf(static_cast( + in_height + 2 * pad_h - window_h) / stride_h)) + 1; + + out_width = static_cast(ceilf(static_cast( + in_width + 2 * pad_w - window_w) / stride_w)) + 1; + } + } + + if (param.pooling_padded()) { + if ((out_height - 1) * stride_h >= in_height + pad_h) { + -- out_height; + } + if ((out_width - 1) * stride_w >= in_width + pad_w) { + -- out_width; + } + } + output_shape.set_height(out_height); + output_shape.set_width(out_width); + return output_shape; +} +template void transpose_inplace(float* output, const float* input, const int num, const int channel, const int height, const int width) { @@ -51,8 +153,6 @@ void extract_matrix_from_matrix_in_leddim(const Dtype* input, } } - - template void merge_matrix_to_matrix_in_leddim(const Dtype* input, Dtype* output,int start_index,int end_index,int stride,int dimsize){ @@ -215,33 +315,42 @@ void transpose_filter_KCRS_2_CRSK(const Dtype *input, Dtype *output, \ } template < typename Tensor_t, template class Param > -void update_conv_weights(Param& param) -{ - Tensor new_weight; - Tensor new_bias; - typedef typename Tensor_t::Dtype dtype; +void update_conv_weights(Param& param) { +#ifdef USE_ARM_PLACE + Tensor new_weight; + 
Tensor new_bias; +#elif defined(USE_CUDA) + Tensor new_weight; + Tensor new_bias; +#else + Tensor new_weight; + Tensor new_bias; +#endif //USE_ARM_PLACE + typedef typename Tensor_t::FDtype Dtype; + DataType dtype = param.conv_param.weight()->get_dtype(); + CHECK_EQ(dtype, AK_FLOAT) << "only support float type weights"; Shape weight_shape = param.conv_param.weight()->shape(); - new_weight.re_alloc(weight_shape); + new_weight.re_alloc(weight_shape, AK_FLOAT); new_weight.copy_from(*(param.conv_param.weight())); Shape bias_shape; if (param.conv_param.bias()->size() > 0) { bias_shape = param.conv_param.bias()->shape(); - new_bias.re_alloc(bias_shape); + new_bias.re_alloc(bias_shape, AK_FLOAT); new_bias.copy_from(*(param.conv_param.bias())); } else if (param.has_batchnorm) { bias_shape = {1, param.batchnorm_param.mean.size(), 1, 1}; - new_bias.re_alloc(bias_shape); + new_bias.re_alloc(bias_shape, AK_FLOAT); void* new_bias_data = new_bias.mutable_data(); - memset(new_bias_data, 0, sizeof(dtype) * new_bias.size()); + memset(new_bias_data, 0, sizeof(Dtype) * new_bias.size()); } else if (param.has_scale) { bias_shape = {1, param.scale_param.scale_w.size(), 1, 1}; - new_bias.re_alloc(bias_shape); + new_bias.re_alloc(bias_shape, AK_FLOAT); void* new_bias_data = new_bias.mutable_data(); - memset(new_bias_data, 0, sizeof(dtype) * new_bias.size()); + memset(new_bias_data, 0, sizeof(Dtype) * new_bias.size()); } else { return; } @@ -249,15 +358,15 @@ void update_conv_weights(Param& param) int filter_num = new_weight.num(); int chw = new_weight.channel(); - dtype* weight_data = new_weight.mutable_data(); - dtype* bias_data = new_bias.mutable_data(); + Dtype* weight_data = new_weight.mutable_data(); + Dtype* bias_data = new_bias.mutable_data(); chw *= new_weight.height(); chw *= new_weight.width(); for (int i = 0; i < filter_num; ++i) { - dtype alpha = 1.f; - dtype beta = 0.f; + Dtype alpha = 1.f; + Dtype beta = 0.f; if (param.has_batchnorm) { float scale_factor = 1.f; @@ -297,9 
+406,187 @@ void update_conv_weights(Param& param) param.conv_param.mutable_bias()->copy_from(new_bias); } +template < typename Tensor_t, template class Param > +void update_deconv_weights(Param& param) +{ +#ifdef USE_ARM_PLACE + Tensor new_weight; + Tensor new_bias; +#elif defined(USE_CUDA) + Tensor new_weight; + Tensor new_bias; +#else + Tensor new_weight; + Tensor new_bias; +#endif //USE_ARM_PLACE + //typedef typename Tensor_t::FDtype dtype; + CHECK_EQ(AK_FLOAT, param.conv_param.weight()->get_dtype()) << "only support float weights"; + + Shape weight_shape = param.conv_param.weight()->shape(); + new_weight.re_alloc(weight_shape, AK_FLOAT); + new_weight.copy_from(*(param.conv_param.weight())); + Shape bias_shape; + + if (param.conv_param.bias()->size() > 0) { + bias_shape = param.conv_param.bias()->shape(); + new_bias.re_alloc(bias_shape, AK_FLOAT); + new_bias.copy_from(*(param.conv_param.bias())); + + } else if (param.has_batchnorm) { + bias_shape = {1, param.batchnorm_param.mean.size(), 1, 1}; + new_bias.re_alloc(bias_shape, AK_FLOAT); + void* new_bias_data = new_bias.mutable_data(); + memset(new_bias_data, 0, sizeof(float) * new_bias.size()); + + } else if (param.has_scale) { + bias_shape = {1, param.scale_param.scale_w.size(), 1, 1}; + new_bias.re_alloc(bias_shape, AK_FLOAT); + void* new_bias_data = new_bias.mutable_data(); + memset(new_bias_data, 0, sizeof(float) * new_bias.size()); + } else { + return; + } + int filter_num = new_weight.num(); + int channel_num_per_group = new_weight.channel(); + std::vector scale(new_weight.num(), 0); + std::vector shift(new_weight.num(), 0); + for (int i = 0; i < filter_num; ++i) { + float alpha = 1.f; + float beta = 0.f; + + if (param.has_batchnorm) { + float scale_factor = 1.f; + scale_factor = (param.batchnorm_param.scale == 0) ? 
+ 1 : 1.f / param.batchnorm_param.scale; + float eps = param.batchnorm_param.eps; + float variance; + float mean; + alpha = param.batchnorm_param.variance[i] * scale_factor + eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (param.batchnorm_param.mean[i] * scale_factor); + beta *= alpha; + } + + if (param.has_scale) { + alpha *= param.scale_param.scale_w[i]; + + if (param.scale_param.bias_term) { + beta = beta * param.scale_param.scale_w[i] + + param.scale_param.scale_b[i]; + } else { + beta *= param.scale_param.scale_w[i]; + } + } + scale[i] = alpha; + shift[i] = beta; + } + + + float* weight_data = (float*)new_weight.mutable_data(); + float* bias_data = (float*)new_bias.mutable_data(); + // {Ic, Oc/group, K_h, K_w} real shape + // {Oc, Ic/group, K_h, K_w} parser return back shape + // filter_num = Oc; + // channel_num_per_group = Ic/group; + // [group, Ic/group, Oc/group, K_h, k_w] + + int hw = new_weight.height() * new_weight.width(); + int group = param.conv_param.group; + int filter_num_per_group = filter_num / group; + int id = 0; + for (int i = 0; i < group; i++) { + for (int j = 0; j < channel_num_per_group; j++) { + for (int k = 0; k < filter_num_per_group; k++) { + int out_channel_id = i * filter_num_per_group + k; + for (int m = 0; m < hw; m++) { + weight_data[id] = weight_data[id]* scale[out_channel_id]; + id++; + } + } + } + } + + for (int i = 0; i < filter_num; i++) { + bias_data[i] *= scale[i]; + bias_data[i] += shift[i]; + } + + param.conv_param.mutable_weight()->copy_from(new_weight); + Shape new_bias_shape = new_bias.shape(); + param.conv_param.mutable_bias()->re_alloc(new_bias_shape); + param.conv_param.mutable_bias()->copy_from(new_bias); +} + +inline int align_up(int a, int b) { + return (a % b != 0) ? 
(a - a % b + b) : a; +} + +template +void conv_trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group, + bool in_place = false, Tensor* weight_dev = nullptr) { + + Tensor trans_weights_host; + if (stride_h == 1 && + stride_w == 1 && + target_weights.height() == 3 && + target_weights.width() == 3 && group == 1) { + //Update weights if need + Shape weight_shape = target_weights.valid_shape(); + Tensor new_weight; + new_weight.re_alloc(weight_shape, target_weights.get_dtype()); + new_weight.copy_from(target_weights); + float *weight_data = (float *)new_weight.mutable_data(); + int round_in_channel = align_up(target_weights.channel(), 8); + int round_out_channel = align_up(target_weights.num(), 32); + int weight4x4_size = round_in_channel * round_out_channel * 4 * 4; + Shape old_shape = target_weights.valid_shape(); + Shape new_trans_weights_shape({{weight4x4_size, 1, 1 ,1}}, target_weights.get_layout()); + trans_weights_host.re_alloc(new_trans_weights_shape, target_weights.get_dtype()); + float* _host_work_space = (float*)trans_weights_host.mutable_data(); + transform_3x3_weight_2_4x4(weight_data, _host_work_space, target_weights.num(), + round_out_channel, target_weights.channel(), round_in_channel); + Shape new_weights_shape({weight4x4_size, 1, 1, 1}, target_weights.get_layout()); + if (in_place) { + target_weights.re_alloc(new_weights_shape, target_weights.get_dtype()); + target_weights.copy_from(trans_weights_host); + target_weights.set_shape(old_shape); + } else { + weight_dev->re_alloc(new_weights_shape, target_weights.get_dtype()); + weight_dev->copy_from(trans_weights_host); + weight_dev->set_shape(old_shape); + } + } else if (group == 1) { + int weight_size = (target_weights.valid_shape()).count(); + Tensor weight_host; + weight_host.re_alloc(target_weights.valid_shape(), target_weights.get_dtype()); + weight_host.copy_from(target_weights); + const float *weight_data = (const float *)weight_host.data(); + 
trans_weights_host.re_alloc(target_weights.valid_shape(), target_weights.get_dtype()); + float* _host_work_space = (float*)trans_weights_host.mutable_data(); + + transpose_filter_KCRS_2_CRSK(weight_data, _host_work_space, \ + target_weights.num(), \ + target_weights.channel(), \ + target_weights.height(), \ + target_weights.width()); + if (in_place) { + target_weights.re_alloc(target_weights.valid_shape(), target_weights.get_dtype()); + target_weights.copy_from(trans_weights_host); + } else { + weight_dev->re_alloc(target_weights.valid_shape(), target_weights.get_dtype()); + weight_dev->copy_from(trans_weights_host); + } + + } +// cudaDeviceSynchronize(); +} } // namespace saber } // namespace anakin #endif //SABER_FUNCS_UTILS_H + + + diff --git a/saber/funcs/gemm.h b/saber/funcs/gemm.h new file mode 100644 index 000000000..788d0dcfe --- /dev/null +++ b/saber/funcs/gemm.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_GEMM_H +#define ANAKIN_SABER_FUNCS_GEMM_H + +#include "anakin_config.h" +#include "saber/core/context.h" +#include "saber/saber_types.h" + +namespace anakin { +namespace saber { + +template +class Gemm { + // Row major gemm +public: + Gemm() = default; + ~Gemm() {} + + SaberStatus init(const bool trans_A, const bool trans_B, + const int m, const int n, const int k, + Context ctx) { + return SaberUnImplError; + } + + SaberStatus dispatch(const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c) { + return SaberUnImplError; + } + +private: + Context _ctx; +}; + +template +class Gemv { + // Row major gemm +public: + Gemv() = default; + ~Gemv() {} + + SaberStatus init(const bool trans_A, const int m, const int n, + const int incx, const int incy, + Context ctx) { + return SaberUnImplError; + } + + SaberStatus dispatch(const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c) { + return SaberUnImplError; + } + +private: + Context _ctx; +}; + +} +} + +#ifdef USE_CUDA +#include "saber/funcs/impl/cuda/vender_gemm.h" +#include "saber/funcs/impl/cuda/saber_gemm.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/vender_gemm.h" +#endif + +#endif \ No newline at end of file diff --git a/saber/funcs/gru.h b/saber/funcs/gru.h index 280dc7e13..764ce0a6f 100644 --- a/saber/funcs/gru.h +++ b/saber/funcs/gru.h @@ -1,63 +1,63 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_FUNCS_GRU_H #define ANAKIN_SABER_FUNCS_GRU_H #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_gru.h" + #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_gru.h" -#include "saber/funcs/impl/cuda/vender_gru.h" +//#include "saber/funcs/impl/cuda/vender_gru.h" #endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_gru.h" +#include "saber/funcs/impl/x86/vender_gru.h" #endif +#ifdef USE_ARM_PLACE +//todo +#include "saber/funcs/impl/impl_gru.h" +#endif namespace anakin { namespace saber { template class Gru : public BaseFunc < - Tensor, - Tensor, - Tensor, - ImplBase, - GruParam - > { + TargetType, + OpDtype, + ImplBase, + GruParam >{ public: using BaseFunc < - Tensor, - Tensor, - Tensor, - ImplBase, - GruParam >::BaseFunc; + TargetType, + OpDtype, + ImplBase, + GruParam >::BaseFunc; Gru() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef GruParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef GruParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -69,76 +69,42 @@ class Gru : public BaseFunc < int input_height = input[0]->height(); int input_width = input[0]->width(); + CHECK_GE(input.size(), 1) << "input must >= 1"; - CHECK_GE(input.size(),1)<<"input must >= 1"; - -// if(param.weight()->valid_shape().size()==5){ -// -// int hiddenSize = 
param.bias()->valid_size() / 3; -// int seq_sum = input[0]->num(); -// Shape output_shape = Shape(seq_sum, hiddenSize * param._num_direction/16, 1, 1,16); -// return output[0]->set_shape(output_shape); -// } - - -// if (input[0]->get_seq_offset().size()>0) { - int hiddenSize = param.bias()->valid_size() / 3; - -// if (input.size() == 0) { -// Shape output_shape = Shape(max_seq_sum, hiddenSize * param.numDirection, 1, 1); -// return output[0]->set_shape(output_shape); -// } else { - int seq_sum = input[0]->num(); -// CHECK_LE(seq_sum, max_seq_sum) << "seq_sum should le than the init shape"; - Shape output_shape = Shape(seq_sum, hiddenSize * param.num_direction, 1, 1); - return output[0]->set_shape(output_shape); -// } -// } -// else { -// int seqLength = input_channel; -// int batchSize = input_height; -// int wordSize = input_width; -// int hiddenSize = param.bias()->valid_size() / 3; -// Shape output_shape = Shape(1, seqLength, batchSize, hiddenSize * param._num_direction); -// return output[0]->set_shape(output_shape); -// } - + int hiddenSize = param.bias()->valid_size() / 3; + int seq_sum = input[0]->num(); + Shape output_shape = Shape({seq_sum, hiddenSize * param.num_direction, 1, 1},input[0]->get_layout()); + output[0]->set_shape(output_shape); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return SaberSuccess; } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderGru ); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberGru ); - return SaberSuccess; - - default: - return SaberUnImplError; + case VENDER_IMPL: + this->_impl.push_back(new VenderGru ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberGru ); + return SaberSuccess; + + default: + return SaberUnImplError; } } private: virtual void pick_best_static() override { - //! 
gru only has vendor implementation - this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context& ctx) override { - //! gru only has vendor implementation this->_best_impl = this->_impl[0]; } virtual void pick_best_specify(ImplEnum implenum) override { - //! gru only has vendor implementation this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/im2sequence.h b/saber/funcs/im2sequence.h index b5c94a63b..b32829039 100644 --- a/saber/funcs/im2sequence.h +++ b/saber/funcs/im2sequence.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,46 +18,40 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_im2sequence.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_im2sequence.h" #endif #ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_im2sequence.h" +#endif +#ifdef USE_ARM_PLACE +//todo #include "saber/funcs/impl/impl_im2sequence.h" #endif - namespace anakin { namespace saber { -template +template class Im2Sequence : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - Im2SequenceParam -> { + Im2SequenceParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - Im2SequenceParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + Im2SequenceParam>::BaseFunc; Im2Sequence() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef Im2SequenceParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef Im2SequenceParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -89,20 +83,25 @@ class Im2Sequence : 
public BaseFunc< output_shape[width_idx] = 1; output[0]->set_shape(output_shape); - + int n=input[0]->num(); + std::vector offset0(n+1); + std::vector> offset; + offset.push_back(offset0); + for(int i=0;i<=n;i++){ + offset[0].push_back(i*output_height * output_width); + } + output[0]->set_seq_offset(offset); return SaberSuccess; } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderIm2Sequence ); + this->_impl.push_back(new VenderIm2Sequence ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberIm2Sequence ); + this->_impl.push_back(new SaberIm2Sequence ); return SaberSuccess; default: @@ -117,8 +116,6 @@ class Im2Sequence : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/impl/amd/amd_impl.h b/saber/funcs/impl/amd/amd_impl.h new file mode 100644 index 000000000..c50f94aeb --- /dev/null +++ b/saber/funcs/impl/amd/amd_impl.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_AMD_IMPL_H +#define ANAKIN_SABER_FUNCS_AMD_IMPL_H + +#include "saber/funcs/impl/amd/saber_activation.h" +#include "saber/funcs/impl/amd/saber_conv.h" +#include "saber/funcs/impl/amd/saber_conv_act.h" +#include "saber/funcs/impl/amd/saber_conv_act_pooling.h" +#include "saber/funcs/impl/amd/saber_pooling.h" +#include "saber/funcs/impl/amd/saber_softmax.h" +#include "saber/funcs/impl/amd/vender_fc.h" +#endif //ANAKIN_SABER_FUNCS_NV_IMPL_H diff --git a/saber/funcs/impl/amd/amd_utils.cpp b/saber/funcs/impl/amd/amd_utils.cpp new file mode 100644 index 000000000..e10190fc4 --- /dev/null +++ b/saber/funcs/impl/amd/amd_utils.cpp @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "saber/funcs/impl/amd/amd_utils.h" +#include "utils/logger/logger.h" + +namespace anakin { +namespace saber { +#define MAX_LOG_LENGTH 65535 +cl_program CreateCLProgram(cl_context context, cl_device_id device, const char* fileName, KernelInfo* ki) +{ + cl_int errNum; + cl_program program; + + std::ifstream kFile(fileName, std::ios::in); + if (!kFile.is_open()) + { + LOG(ERROR) << "Failed to open file for reading: " << fileName; + return NULL; + } + std::string src( + (std::istreambuf_iterator(kFile)), + std::istreambuf_iterator() + ); + char *srcStr = src.c_str(); + program = clCreateProgramWithSource(context, 1, (const char**)&srcStr, NULL, NULL); + + kFile.close(); + + if (program == NULL) + { + LOG(ERROR) << "Failed to create CL program with source file."; + return NULL; + } + char *comp_options = NULL; + if(ki != NULL) + comp_options= ki->comp_options.c_str(); + errNum = clBuildProgram(program, 1, &device, comp_options, NULL, NULL); + if (errNum != CL_SUCCESS) + { + char buildErrLog[MAX_LOG_LENGTH]; + clGetProgramBuildInfo(program, + device, + CL_PROGRAM_BUILD_LOG, + sizeof(buildErrLog), + buildErrLog, + NULL); + + LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog; + clReleaseProgram(program); + return NULL; + } + return program; +}; + +cl_program CreatProgramFromBinaryFile(cl_context context, cl_device_id device, const char * binFile) +{ + cl_program program; + cl_int errNum; + + FILE * fp = fopen(binFile, "rb"); + if (fp == NULL) + { + LOG(ERROR) << "Can't open bin file: " << std::string(binFile); + return NULL; + } + + size_t binSize; + fseek(fp, 0, SEEK_END); + binSize = ftell(fp); + rewind(fp); + + unsigned char * binProgram = new unsigned char[binSize]; + fread(binProgram, 1, binSize, fp); + fclose(fp); + + program = clCreateProgramWithBinary(context, 1, &device, &binSize, (const unsigned char**)&binProgram, NULL, &errNum); + errNum = clBuildProgram(program, 1, &device, "", NULL, NULL); + + delete[] binProgram; + if 
(errNum != CL_SUCCESS) + { + char buildErrLog[MAX_LOG_LENGTH]; + clGetProgramBuildInfo(program, + device, + CL_PROGRAM_BUILD_LOG, + sizeof(buildErrLog), + buildErrLog, + NULL); + + LOG(ERROR) << "CL program build error log in kernel: " << buildErrLog; + clReleaseProgram(program); + + return NULL; + } + return program; +}; + + +} +} diff --git a/saber/funcs/impl/amd/amd_utils.h b/saber/funcs/impl/amd/amd_utils.h new file mode 100644 index 000000000..e55229014 --- /dev/null +++ b/saber/funcs/impl/amd/amd_utils.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_FUNC_IMPL_AMD_UTILS_H +#define ANAKIN_SABER_FUNC_IMPL_AMD_UTILS_H + +#include + +#include +#include +#include +#include + +#define MLO_POOLING_OP_MAX 0 +#define MLO_POOLING_OP_AVE 1 + +namespace anakin { +namespace saber { + +typedef struct ExtSolutionConfigTpye +{ + int in_tile0, in_tile1; + int grp_tile0, grp_tile1; + int out_pix_tile0, out_pix_tile1; + int n_stacks; + int n_out_pix_tiles; + int n_out_tiles_perstack; + int n_in_data_tiles; + int n_read_procs; + int alu_tile0, alu_tile1; + int horiz_out_pix; + int vert_out_pix; +}T_ExtSolutionConfig; + +struct KernelInfo +{ + std::string comp_options; + std::vector l_wk; + std::vector g_wk; + std::string kernel_file; + std::string kernel_name; + friend std::ostream& operator<<(std::ostream& os, const KernelInfo& k); +}; + +extern cl_program CreateCLProgram(cl_context context, cl_device_id device, const char* fileName, KernelInfo* ki=NULL); +extern cl_program CreatProgramFromBinaryFile(cl_context context, cl_device_id device, const char* binFile); + + +inline cl_int _setKernelArgs(cl_kernel &k,int i){ return CL_SUCCESS;} + +template +inline cl_int _setKernelArgs(cl_kernel &kernel,int i, const T &firstParameter, const Args& ...restOfParameters){ + return clSetKernelArg(kernel, i, sizeof(firstParameter), &firstParameter) | \ + _setKernelArgs(kernel,i+1,restOfParameters...); +} + +template +inline cl_int setKernelArgs(cl_kernel &kernel, const Args& ...args){ + return _setKernelArgs(kernel, 0, args...); +} +} +} +#endif diff --git a/saber/funcs/impl/amd/cl/ConvFwd3x3.cl b/saber/funcs/impl/amd/cl/ConvFwd3x3.cl new file mode 100644 index 000000000..ddc6272c1 --- /dev/null +++ b/saber/funcs/impl/amd/cl/ConvFwd3x3.cl @@ -0,0 +1,925 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +/* Compiler options: +-c -emit-llvm -target amdgcn-amd-amdhsa-amdgizcl -x cl -cl-kernel-arg-info + +-DMLO_HW_WAVE_SZ=64 +-DMLO_DIR_FORWARD=1 +-DMLO_FILTER_SIZE0=3 +-DMLO_FILTER_SIZE1=3 +-DMLO_FILTER_PAD0=1 +-DMLO_FILTER_PAD1=1 +-DMLO_FILTER_STRIDE0=1 +-DMLO_FILTER_STRIDE1=1 +-DMLO_N_OUTPUTS=64 +-DMLO_N_INPUTS=3 +-DMLO_BATCH_SZ=2 +-DMLO_OUT_WIDTH=224 +-DMLO_OUT_HEIGHT=224 +-DMLO_OUT_BATCH_STRIDE=3211264 +-DMLO_OUT_CHANNEL_STRIDE=50176 +-DMLO_OUT_STRIDE=224 +-DMLO_IN_WIDTH=224 +-DMLO_IN_HEIGHT=224 +-DMLO_IN_BATCH_STRIDE=150528 +-DMLO_IN_CHANNEL_STRIDE=50176 +-DMLO_IN_STRIDE=224 +-DMLO_IN_TILE0=32 +-DMLO_IN_TILE1=32 +-DMLO_GRP_TILE0=16 +-DMLO_GRP_TILE1=16 +-DMLO_OUT_TILE0=2 +-DMLO_OUT_TILE1=2 +-DMLO_N_STACKS=1 +-DMLO_N_OUT_TILES=8 +-DMLO_N_OUT_TILES_PERSTACK=8 +-DMLO_N_IN_TILES_PERSTACK=2 +-DMLO_N_READ_PROCS=256 +-DMLO_CONV_BIAS=0 +-DMLO_ALU_VTILE0=16 +-DMLO_ALU_VTILE1=16 + +-cl-std=CL1.2 -O3 +-mcpu=gfx900 +-mllvm +-amdgpu-early-inline-all +-mllvm -amdgpu-prelink + +-D__AMD__=1 +-D__gfx900__=1 +-D__gfx900=1 +-D__OPENCL_VERSION__=120 +-D__IMAGE_SUPPORT__=1 + +-Xclang +-cl-ext=+cl_khr_fp64,+cl_khr_global_int32_base_atomics,+cl_khr_global_int32_extended_atomics,+cl_khr_local_int32_base_atomics,+cl_khr_local_int32_extended_atomics,+cl_khr_int64_base_atomics,+cl_khr_int64_extended_atomics,+cl_khr_3d_image_writes,+cl_khr_byte_addressable_store,+cl_khr_fp16,+cl_khr_gl_sharing,+cl_amd_device_attribute_query,+cl_amd_media_ops,+cl_amd_media_ops2,+cl_khr_subgroups,+cl_amd_copy_buffer_p2p,+cl_amd_assembly_program -include opencl-c.h +*/ + +#define _FLOAT float +#define 
_FLOAT2 float2 +#define _FLOAT4 float4 +#define _FLOAT8 float8 + +#ifndef FLT_MAX +#define FLT_MAX 3.402823466e+38F /* max value */ +#endif + + + +//HCJ definitions generated by solver +//#define MLO_HW_WAVE_SZ 64 +//#define MLO_DIR_FORWARD 1 +//#define MLO_FILTER_SIZE0 3 +//#define MLO_FILTER_SIZE1 3 +//#define MLO_FILTER_PAD0 1 +//#define MLO_FILTER_PAD1 1 +//#define MLO_FILTER_STRIDE0 1 +//#define MLO_FILTER_STRIDE1 1 +//#define MLO_N_OUTPUTS 64 +//#define MLO_N_INPUTS 3 +//#define MLO_BATCH_SZ 2 +//#define MLO_OUT_WIDTH 224 +//#define MLO_OUT_HEIGHT 224 +//#define MLO_OUT_BATCH_STRIDE 3211264 +//#define MLO_OUT_CHANNEL_STRIDE 50176 +//#define MLO_OUT_STRIDE 224 +//#define MLO_IN_WIDTH 224 +//#define MLO_IN_HEIGHT 224 +//#define MLO_IN_BATCH_STRIDE 150528 +//#define MLO_IN_CHANNEL_STRIDE 50176 +//#define MLO_IN_STRIDE 224 +//#define MLO_IN_TILE0 32 +//#define MLO_IN_TILE1 32 +//#define MLO_GRP_TILE0 16 +//#define MLO_GRP_TILE1 16 +//#define MLO_OUT_TILE0 2 +//#define MLO_OUT_TILE1 2 +//#define MLO_N_STACKS 1 +//#define MLO_N_OUT_TILES 8 +//#define MLO_N_OUT_TILES_PERSTACK 8 +//#define MLO_N_IN_TILES_PERSTACK 2 +//#define MLO_N_READ_PROCS 256 +//#define MLO_CONV_BIAS 0 +//#define MLO_ALU_VTILE0 16 +//#define MLO_ALU_VTILE1 16 +#define MLO_CONV_BIAS 1 + +#define UNUSED __attribute__((__unused__)) + +#ifndef MLO_FILTER_STRIDE0 +#define MLO_FILTER_STRIDE0 1 +#endif +#ifndef MLO_FILTER_STRIDE1 +#define MLO_FILTER_STRIDE1 1 +#endif + +#define MLO_FILTER_SZ (MLO_FILTER_SIZE1 * MLO_FILTER_SIZE0) + +#define MLO_GRP_SZ0 (MLO_GRP_TILE0 * MLO_GRP_TILE1) +#define MLO_GRP_SZ1 1 +#define MLO_GRP_SZ2 1 +#define MLO_GRP_SZ (MLO_GRP_SZ0 * MLO_GRP_SZ1 * MLO_GRP_SZ2) +#define MLO_N_PROC_WAVES ((MLO_GRP_SZ + MLO_N_READ_PROCS - 1) / MLO_N_READ_PROCS) +#define MLO_OUT_TILE_SZ (MLO_OUT_TILE1 * MLO_OUT_TILE0) +#define MLO_ALU_TILE_SZ (MLO_ALU_VTILE1 * MLO_ALU_VTILE0) + +#if MLO_IN_TILE0 < MLO_OUT_WIDTH || MLO_IN_TILE1 < MLO_OUT_HEIGHT +#define MLO_LARGE_MAP 1 +#else +#define 
MLO_LARGE_MAP 0 +#endif + +#if(MLO_IN_WIDTH == MLO_OUT_WIDTH && \ + (MLO_IN_WIDTH / MLO_IN_TILE0) * MLO_IN_TILE0 == MLO_IN_WIDTH && \ + MLO_IN_HEIGHT == MLO_OUT_HEIGHT && \ + (MLO_IN_HEIGHT / MLO_IN_TILE1) * MLO_IN_TILE1 == MLO_IN_HEIGHT) +#define MLO_OUT_ALIGNED 1 +#else +#define MLO_OUT_ALIGNED 0 +#endif + +#define MLO_N_ALUTILES_TOTAL ((MLO_GRP_TILE0 * MLO_GRP_TILE1) / (MLO_ALU_TILE_SZ)) +#define MLO_N_ALUTILES_PERSTACK (MLO_N_ALUTILES_TOTAL / MLO_N_STACKS) +#define MLO_ALUTILES_STACK_SZ (MLO_N_ALUTILES_PERSTACK * MLO_ALU_TILE_SZ) +#define MLO_N_IN_TILES_TOTAL (MLO_N_IN_TILES_PERSTACK * MLO_N_STACKS) +/* +#define MLO_N_OUT_TILES_PERSTACK (MLO_N_OUT_TILES*MLO_N_ALUTILES_PERSTACK) +#if MLO_N_OUT_TILES_PERSTACK > MLO_N_OUTPUTS +#undef MLO_N_OUT_TILES_PERSTACK +#define MLO_N_OUT_TILES_PERSTACK MLO_N_OUTPUTS +#endif +*/ +#define MLO_N_OUT_TILE_BLOCKS0 ((MLO_OUT_WIDTH + MLO_IN_TILE0 - 1) / MLO_IN_TILE0) +#define MLO_N_OUT_TILE_BLOCKS1 ((MLO_OUT_HEIGHT + MLO_IN_TILE1 - 1) / MLO_IN_TILE1) +#define MLO_N_IN_PACKS ((MLO_N_INPUTS + MLO_N_IN_TILES_PERSTACK - 1) / MLO_N_IN_TILES_PERSTACK) + +#define MLO_N_IN_READ (MLO_N_IN_PACKS * MLO_N_IN_TILES_PERSTACK) +#if MLO_N_IN_READ == MLO_N_INPUTS +#define MLO_INPUTS_ALIGNED 1 +#else +#define MLO_INPUTS_ALIGNED 0 +#endif + +#define MLO_N_OUT_PACKS (MLO_N_OUTPUTS / MLO_N_OUT_TILES_PERSTACK) +#if MLO_N_OUT_PACKS * MLO_N_OUT_TILES_PERSTACK == MLO_N_OUTPUTS && \ + MLO_N_OUT_TILES_PERSTACK != MLO_N_OUTPUTS +#define MLO_OUTPUTS_ALIGNED 1 +#else +#define MLO_OUTPUTS_ALIGNED 0 +#endif + +#define MLO_N_BATCH_PACKS (MLO_BATCH_SZ / MLO_N_STACKS) +#if MLO_N_BATCH_PACKS * MLO_N_STACKS == MLO_BATCH_SZ && MLO_N_STACKS != MLO_BATCH_SZ +#define MLO_BATCH_ALIGNED 1 +#else +#define MLO_BATCH_ALIGNED 0 +#endif + +#if MLO_DIR_FORWARD == 1 +#define MLO_IN_LCL_WIDTH \ + ((MLO_IN_TILE0 - 1) * MLO_FILTER_STRIDE0 + MLO_FILTER_SIZE0) // here we use kernel size. 
it's + // important when padding == 0 2* + // MLO_FILTER_PAD0 +#define MLO_IN_LCL_HEIGHT ((MLO_IN_TILE1 - 1) * MLO_FILTER_STRIDE1 + MLO_FILTER_SIZE1) +#else +#define MLO_IN_LCL_WIDTH \ + ((MLO_IN_TILE0 + MLO_FILTER_SIZE0 - 1 + MLO_FILTER_STRIDE0 - 1) / \ + MLO_FILTER_STRIDE0) // here we use kernel size. it's important when padding == 0 2* +// MLO_FILTER_PAD0 +#define MLO_IN_LCL_HEIGHT \ + ((MLO_IN_TILE1 + MLO_FILTER_SIZE1 - 1 + MLO_FILTER_STRIDE1 - 1) / MLO_FILTER_STRIDE1) +#endif +#define MLO_IN_LCL_TILE_SZ (MLO_IN_LCL_WIDTH * MLO_IN_LCL_HEIGHT) +#define MLO_IN_LCL_PERSTACK_SZ (MLO_IN_LCL_TILE_SZ * MLO_N_IN_TILES_PERSTACK) +#define MLO_IN_LCL_SZ (MLO_IN_LCL_PERSTACK_SZ * MLO_N_STACKS) + +#define MLO_WEIGHTS_SZ (MLO_N_OUT_TILES_PERSTACK * MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) + +#define MLO_PVT_ACCUM_DATA_SZ (MLO_N_OUT_TILES * MLO_OUT_TILE_SZ) +#if MLO_DIR_FORWARD == 1 +#define MLO_PVT_IN_WIDTH ((MLO_OUT_TILE0 - 1) * MLO_FILTER_STRIDE0 + MLO_FILTER_SIZE0) +#define MLO_PVT_IN_HEIGHT ((MLO_OUT_TILE1 - 1) * MLO_FILTER_STRIDE1 + 1) +#else +#define MLO_PVT_IN_WIDTH \ + ((MLO_OUT_TILE0 + MLO_FILTER_SIZE0 - 1 + MLO_FILTER_STRIDE0 - 1) / MLO_FILTER_STRIDE0) +#define MLO_PVT_IN_HEIGHT ((MLO_OUT_TILE1 + MLO_FILTER_STRIDE1 - 1) / MLO_FILTER_STRIDE1) +#endif + +#define MLO_LCL_WEIGHTS 1 + +#define MLO_PADDING_SHIFT1 (MLO_FILTER_SIZE1 - MLO_FILTER_PAD1 - 1) +#define MLO_PADDING_SHIFT0 (MLO_FILTER_SIZE0 - MLO_FILTER_PAD0 - 1) + +#define MLO_PADDING_FIX1 (MLO_FILTER_SIZE1 % MLO_OUT_TILE1) +#define MLO_PADDING_FIX0 (MLO_FILTER_SIZE0 % MLO_OUT_TILE0) + +#if defined(__AMDGCN__) +extern uint __llvm_amdgcn_readfirstlane(uint) __asm("llvm.amdgcn.readfirstlane"); +#define uniform(x) __llvm_amdgcn_readfirstlane(x) +#else +#define uniform(x) (x) +#endif + +static inline uint iDiv(uint v, uint d) +{ + uint r = (uint)((float)v * (1.0f / (float)d) + 0.00001f); + return (r); +} + +static inline uint iMod(uint v, uint u, uint d) +{ + uint r = v - mul24((uint)u, (uint)d); + return (r); +} + 
+static inline void calculateXYPos(uint linPos, uint width, uint* __restrict x, uint* __restrict y) +{ + (*y) = (uint)((float)linPos * (1.0f / (float)width) + 0.00001f); + (*x) = linPos - mul24((*y), width); +} + +static inline uint calculateOffset(uint stride, uint x, uint y) +{ + uint ret = y * stride + x; + return (ret); +} + +static inline void readDataElem(uint linPos, + __local _FLOAT* lcl_data, + int lcl_base, + UNUSED uint lcl_height, + uint lcl_width, + int lcl_stride, + int lcl_y, + int lcl_x, + const __global _FLOAT* gbl_data, + int gbl_base, + uint gbl_height, + uint gbl_width, + int gbl_stride, + int gbl_y, + int gbl_x, + bool vis, + UNUSED bool debug) +{ + uint x, y; + calculateXYPos(linPos, lcl_width, &x, &y); + int g_x = x + gbl_x; + int g_y = y + gbl_y; + uint gbl_off0 = calculateOffset(gbl_stride, g_x, g_y); + int gbl_off = gbl_off0 + gbl_base; + +#if MLO_LARGE_MAP == 1 + int lcl_off = lcl_base + linPos; + (void)lcl_stride; + (void)lcl_x; + (void)lcl_y; +#else + int l_x = x + lcl_x; + int l_y = y + lcl_y; + int lcl_off = lcl_base + mad24(l_y, lcl_stride, l_x); +#endif + +#if MLO_LARGE_MAP == 1 + vis &= (g_x >= 0 && g_x < gbl_width && g_y >= 0 && g_y < gbl_height); +#else + (void)gbl_width; + (void)gbl_height; +#endif + gbl_off = (vis) ? gbl_off : 0; + _FLOAT gbl_val = gbl_data[gbl_off]; + gbl_val = (vis) ? 
gbl_val : 0; + + lcl_data[lcl_off] = gbl_val; +} + +static inline void readData(uint lcl_id, + int size, + int lcl_p_stride, + __local _FLOAT* lcl_data, + int lcl_base, + uint lcl_height, + uint lcl_width, + int lcl_stride, + int lcl_y, + int lcl_x, + const __global _FLOAT* gbl_data, + int gbl_base, + uint gbl_height, + uint gbl_width, + int gbl_stride, + int gbl_y, + int gbl_x, + bool vis, + bool debug) +{ + + for(uint i = lcl_id; i < size; i += lcl_p_stride) + { + readDataElem(i, + lcl_data, + lcl_base, + lcl_height, + lcl_width, + lcl_stride, + lcl_y, + lcl_x, + gbl_data, + gbl_base, + gbl_height, + gbl_width, + gbl_stride, + gbl_y, + gbl_x, + vis, + debug); + } +} + +static inline void loadData(uint lcl_id, + int lcl_p_stride, + __local _FLOAT* lcl_data, + int lcl_off, + int lcl_size, + uint lcl_height, + uint lcl_width, + int lcl_stride, + int lcl_bot_y, + int lcl_bot_x, + const __global _FLOAT* gbl_data, + int gbl_off, + int gbl_size, + uint gbl_height, + uint glb_width, + int gbl_stride, + int gbl_bot_y, + int gbl_bot_x, + int buf_block_ind, + int max_n_bufs, + int lcl_n_bufs, + bool debug) +{ + + for(uint c = 0; c < lcl_n_bufs; ++c, lcl_off += lcl_size, gbl_off += gbl_size) + { + bool vis = (buf_block_ind + c < max_n_bufs); + readData(lcl_id, + lcl_size, + lcl_p_stride, + lcl_data, + lcl_off, + lcl_height, + lcl_width, + lcl_stride, + lcl_bot_y, + lcl_bot_x, + gbl_data, + gbl_off, + gbl_height, + glb_width, + gbl_stride, + gbl_bot_y, + gbl_bot_x, + vis, + (debug)); + } +} + +static inline void Conv(uint o_map_base, + int in_stg_off, + __private _FLOAT* __restrict pvt_in_stage, + __local _FLOAT* __restrict lcl_indata, + __private _FLOAT* __restrict pvt_wei_stage, + __local _FLOAT* __restrict lcl_wei, + __private _FLOAT* __restrict pvt_accum) +{ + // convolution + + // over all inputs in stack + int in_stg_off1 = in_stg_off; + for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; ++i_c, in_stg_off1 += MLO_IN_LCL_TILE_SZ) + { + // preload input + int 
wei_stg_base_off = mad24(o_map_base, + (uint)(MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ), + mul24(i_c, (uint)MLO_FILTER_SZ)); + int in_stg_off2 = in_stg_off1; + for(uint j = 0; j < MLO_PVT_IN_HEIGHT - 1; ++j, + in_stg_off2 += (((j - MLO_PADDING_SHIFT1 + MLO_PADDING_FIX1) % MLO_FILTER_STRIDE1) + ? 0 + : MLO_IN_LCL_WIDTH)) + { + for(uint i = 0; i < MLO_PVT_IN_WIDTH; ++i) + { + pvt_in_stage[j * MLO_PVT_IN_WIDTH + i] = lcl_indata[in_stg_off2 + i]; + } + } + +// over filter rows +#ifdef __AMDGCN__ +#if MLO_FILTER_SIZE1 < 6 +#pragma unroll +#elif MLO_FILTER_SIZE1 < 9 +#pragma unroll 2 +#endif +#endif +#if MLO_DIR_FORWARD == 1 + for(uint k = 0; k < MLO_FILTER_SIZE1; ++k, in_stg_off2 += MLO_IN_LCL_WIDTH) +#else + for(uint k = 0; k < MLO_FILTER_SIZE1; ++k, + in_stg_off2 += (((k - MLO_PADDING_SHIFT1 + MLO_PADDING_FIX1) % MLO_FILTER_STRIDE1) + ? 0 + : MLO_IN_LCL_WIDTH)) +#endif + { + int k_act = 0; +#if MLO_DIR_FORWARD == 1 + k_act = k; +#else + // load filter in reverse order + k_act = MLO_FILTER_SIZE1 - 1 - k; +#endif + // load next input row + for(uint i_pvt = 0; i_pvt < MLO_PVT_IN_WIDTH; ++i_pvt) + { + pvt_in_stage[(MLO_PVT_IN_HEIGHT - 1) * MLO_PVT_IN_WIDTH + i_pvt] = + lcl_indata[in_stg_off2 + i_pvt]; + } + + // over all outputs + for(uint o_c = 0; o_c < MLO_N_OUT_TILES; ++o_c) + { + int wei_stg_off = wei_stg_base_off + o_c * MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ + + k_act * MLO_FILTER_SIZE0; + for(uint i = 0; i < MLO_FILTER_SIZE0; ++i) + { + pvt_wei_stage[i] = + lcl_wei[wei_stg_off + + i]; //(float)o_c/(float)MLO_N_OUT_TILES + (float)(i+k)/9; + } + + // actual conv + + for(uint j = 0; j < MLO_OUT_TILE1; ++j) + { +#if MLO_DIR_FORWARD == 0 + if(((j + k + 1 - MLO_PADDING_SHIFT1 + (MLO_FILTER_SIZE1 % MLO_FILTER_STRIDE1)) % + MLO_FILTER_STRIDE1) == 0) +#endif + for(uint i = 0; i < MLO_OUT_TILE0; ++i) + { + for(uint l = 0; l < MLO_FILTER_SIZE0; ++l) + { + + int l_act = 0; +#if MLO_DIR_FORWARD == 1 + l_act = l; + +#else + // in reverse horizontal and vertical orders + l_act = 
MLO_FILTER_SIZE0 - 1 - l; + +#endif + +#if MLO_DIR_FORWARD == 1 + pvt_accum[(o_c * MLO_OUT_TILE1 + j) * MLO_OUT_TILE0 + i] += + pvt_in_stage[j * MLO_PVT_IN_WIDTH * MLO_FILTER_STRIDE1 + + i * MLO_FILTER_STRIDE0 + l] * + pvt_wei_stage[l_act]; +#else + if(((i + l + 1 - MLO_PADDING_SHIFT0 + + (MLO_FILTER_SIZE0 % MLO_FILTER_STRIDE0)) % + MLO_FILTER_STRIDE0) == 0) + { + pvt_accum[(o_c * MLO_OUT_TILE1 + j) * MLO_OUT_TILE0 + i] += + pvt_in_stage[(j / MLO_FILTER_STRIDE1) * MLO_PVT_IN_WIDTH + + (i + l) / MLO_FILTER_STRIDE0] * + pvt_wei_stage[l_act]; + } +#endif + } + } + } + + } // for(uint o_c = 0; o_c < MLO_N_OUT_TILES; ++o_c) + + // move data up + for(uint j = 0; j < MLO_PVT_IN_HEIGHT - 1; ++j) + { + for(uint i = 0; i < MLO_PVT_IN_WIDTH; ++i) + { + pvt_in_stage[j * MLO_PVT_IN_WIDTH + i] = + pvt_in_stage[(j + 1) * MLO_PVT_IN_WIDTH + i]; + } + } + + } // for(uint k = 0; k < MLO_FILER_SIZE1; ++k,in_stg_off2+=MLO_IN_LCL_WIDTH) + + } // for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; ++i_c, in_stg_off1 += + // MLO_IN_LCL_PERSTACK_SZ) +} + +__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void +ConvFwd3x3(const __global _FLOAT* __restrict in, + const __global _FLOAT* __restrict weights, +#if MLO_CONV_BIAS + const __global _FLOAT* __restrict bias, +#endif + __global _FLOAT* __restrict out, + _FLOAT slope) +{ + +#if 1 + __local _FLOAT lcl_indata[MLO_IN_LCL_SZ]; + __local _FLOAT lcl_wei[MLO_WEIGHTS_SZ]; + __private _FLOAT pvt_accum[MLO_PVT_ACCUM_DATA_SZ]; + __private _FLOAT pvt_in_stage[MLO_PVT_IN_HEIGHT * MLO_PVT_IN_WIDTH]; + __private _FLOAT pvt_wei_stage[MLO_FILTER_SIZE0]; + + uint grp_id0 = get_group_id(0); +#if MLO_N_OUT_TILE_BLOCKS0 & (MLO_N_OUT_TILE_BLOCKS0 - 1) + uint y_tile_blk = iDiv(grp_id0, MLO_N_OUT_TILE_BLOCKS0); + uint x_tile_blk = iMod(grp_id0, y_tile_blk, MLO_N_OUT_TILE_BLOCKS0); +#else + uint y_tile_blk = grp_id0 / MLO_N_OUT_TILE_BLOCKS0; + uint x_tile_blk = grp_id0 & (MLO_N_OUT_TILE_BLOCKS0 - 1); +#endif + uint o_pack = 
get_group_id(1); // block of outputs + uint b_pack = get_group_id(2); // batch block + + uint lcl_id = get_local_id(0); +#if MLO_ALUTILES_STACK_SZ >= MLO_GRP_SZ + uint stack = 0; + uint alu_stack_id = lcl_id; +#elif MLO_ALUTILES_STACK_SZ & (MLO_ALUTILES_STACK_SZ - 1) + uint stack = iDiv(lcl_id, MLO_ALUTILES_STACK_SZ); // stack + uint alu_stack_id = iMod(lcl_id, stack, MLO_ALUTILES_STACK_SZ); // alu index in stack +#else + uint stack = lcl_id / MLO_ALUTILES_STACK_SZ; // stack + uint alu_stack_id = lcl_id & (MLO_ALUTILES_STACK_SZ - 1); // alu index in stack +#if MLO_ALUTILES_STACK_SZ >= 64 + stack = uniform(stack); +#endif +#endif +// ALU plane inside stack +#if MLO_ALU_TILE_SZ & (MLO_ALU_TILE_SZ - 1) + uint alu_out_plane_id = iDiv(alu_stack_id, MLO_ALU_TILE_SZ); // alu output plane index + uint alu_out_id = iMod( + alu_stack_id, alu_out_plane_id, MLO_ALU_TILE_SZ); // alu index inside an ALU output plane +#else + uint alu_out_plane_id = alu_stack_id / MLO_ALU_TILE_SZ; // alu output plane index + uint alu_out_id = alu_stack_id & (MLO_ALU_TILE_SZ - 1); // alu index inside an ALU output plane +#endif +// pos inside ALU tile +#if MLO_ALU_VTILE0 & (MLO_ALU_VTILE0 - 1) + uint alu_tl1 = iDiv(alu_out_id, MLO_ALU_VTILE0); + uint alu_tl0 = iMod(alu_out_id, alu_tl1, MLO_ALU_VTILE0); +#else + uint alu_tl1 = alu_out_id / MLO_ALU_VTILE0; + uint alu_tl0 = alu_out_id & (MLO_ALU_VTILE0 - 1); +#endif + + uint o_map_plane = + o_pack * MLO_N_OUT_TILES_PERSTACK; // first output maps index per full ALU plane stack + uint o_map_base = alu_out_plane_id * MLO_N_OUT_TILES; // local output map offset + uint o_map = o_map_plane + o_map_base; // output map index per ALU plane + uint b_index = b_pack * MLO_N_STACKS; + +#if MLO_LARGE_MAP != 1 +#if MLO_N_READ_PROCS >= MLO_GRP_SZ + uint wave_id = 0; + uint wave_lcl_id = lcl_id; +#elif MLO_N_READ_PROCS & (MLO_N_READ_PROCS - 1) + uint wave_id = iDiv(lcl_id, MLO_N_READ_PROCS); + uint wave_lcl_id = iMod(lcl_id, wave_id, MLO_N_READ_PROCS); +#else + uint 
wave_id = lcl_id / MLO_N_READ_PROCS; + uint wave_lcl_id = lcl_id & (MLO_N_READ_PROCS - 1); +#if MLO_N_READ_PROCS >= 64 + wave_id = uniform(wave_id); +#endif +#endif +#endif + +#if MLO_DIR_FORWARD == 1 + uint x_grp = x_tile_blk * MLO_IN_TILE0 * MLO_FILTER_STRIDE0; + uint y_grp = y_tile_blk * MLO_IN_TILE1 * MLO_FILTER_STRIDE1; +#if MLO_LARGE_MAP == 1 + uint x_in_grp = x_grp - MLO_FILTER_PAD0; + uint y_in_grp = y_grp - MLO_FILTER_PAD1; +#endif + uint x_in_lcl = alu_tl0 * MLO_OUT_TILE0 * MLO_FILTER_STRIDE0; + uint y_in_lcl = alu_tl1 * MLO_OUT_TILE1 * MLO_FILTER_STRIDE1; +#else + uint x_grp = x_tile_blk * (MLO_IN_TILE0 / MLO_FILTER_STRIDE0); + uint y_grp = y_tile_blk * (MLO_IN_TILE1 / MLO_FILTER_STRIDE1); +#if MLO_LARGE_MAP == 1 + uint x_in_grp = x_grp - (MLO_FILTER_PAD0 / MLO_FILTER_STRIDE0); + uint y_in_grp = y_grp - (MLO_FILTER_PAD1 / MLO_FILTER_STRIDE1); +#endif + uint x_in_lcl = alu_tl0 * (MLO_OUT_TILE0 / MLO_FILTER_STRIDE0); + uint y_in_lcl = alu_tl1 * (MLO_OUT_TILE1 / MLO_FILTER_STRIDE1); +#endif + + // base offset to read data from local input data + uint in_stg_off = stack * MLO_IN_LCL_PERSTACK_SZ + (y_in_lcl)*MLO_IN_LCL_WIDTH + x_in_lcl; + + uint in_off = b_index * MLO_IN_BATCH_STRIDE; + +#if MLO_DIR_FORWARD == 1 + uint wei_off = mul24(o_map_plane, (uint)(MLO_N_INPUTS * MLO_FILTER_SZ)); +#else + uint wei_off = mul24(o_map_plane, (uint)MLO_FILTER_SZ); +#endif + +#if MLO_LARGE_MAP == 0 + for(uint i = lcl_id; i < MLO_IN_LCL_SZ; i += MLO_GRP_SZ) + { + lcl_indata[i] = 0; + } +#endif + + for(uint i = 0; i < MLO_PVT_ACCUM_DATA_SZ; ++i) + { + pvt_accum[i] = 0; + } + + for(uint ic = 0; ic < MLO_N_INPUTS; ic += MLO_N_IN_TILES_PERSTACK, + in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_IN_TILES_PERSTACK, + wei_off += MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ +#if MLO_DIR_FORWARD == 0 + * + MLO_N_OUTPUTS +#endif + ) + { + barrier(CLK_LOCAL_MEM_FENCE); + +// small map has been read in full continiously into the lDS buffer within padded rect, +// padding has been done on 
initilization. +// large map calculates padding on the fly and fills it with 0. + +#if 1 // all inputs + +#if MLO_LARGE_MAP == 1 + int in_lcl_off1 = 0; + int in_off1 = in_off; + for(uint i_b = 0; i_b < MLO_N_STACKS; + ++i_b, in_off1 += MLO_IN_BATCH_STRIDE, in_lcl_off1 += MLO_IN_LCL_PERSTACK_SZ) + { + bool vis = true; +#if MLO_BATCH_ALIGNED == 0 + vis &= (b_index + i_b < MLO_BATCH_SZ); +#endif + + // over all inputs in stack + int in_off2 = in_off1; + int in_lcl_off2 = in_lcl_off1; + for(uint i_c = 0; i_c < MLO_N_IN_TILES_PERSTACK; + ++i_c, in_off2 += MLO_IN_CHANNEL_STRIDE, in_lcl_off2 += MLO_IN_LCL_TILE_SZ) + { +#if MLO_INPUTS_ALIGNED == 0 + vis &= (ic + i_c < MLO_N_INPUTS); +#endif + + uint elem_id = lcl_id; + int lcl_p_stride = MLO_GRP_SZ0; + int lcl_base = 0; + int lcl_y = 0; + int lcl_x = 0; + int gbl_base = in_off2; + + readData(elem_id, + (MLO_IN_LCL_HEIGHT * MLO_IN_LCL_WIDTH), + lcl_p_stride, + &lcl_indata[in_lcl_off2], + lcl_base, + MLO_IN_LCL_HEIGHT, + MLO_IN_LCL_WIDTH, + MLO_IN_LCL_WIDTH, + lcl_y, + lcl_x, + &in[0], + gbl_base, + MLO_IN_HEIGHT, + MLO_IN_WIDTH, + MLO_IN_STRIDE, + y_in_grp, + x_in_grp, + vis, + true); + } + } +#else + for(uint i = wave_id; i < MLO_N_IN_TILES_TOTAL; i += MLO_N_PROC_WAVES) + { +#if MLO_N_IN_TILES_PERSTACK & (MLO_N_IN_TILES_PERSTACK - 1) + uint i_b = iDiv(i, MLO_N_IN_TILES_PERSTACK); + uint i_c = iMod(i, i_b, MLO_N_IN_TILES_PERSTACK); +#else + uint i_b = i / MLO_N_IN_TILES_PERSTACK; + uint i_c = i & (MLO_N_IN_TILES_PERSTACK - 1); +#endif + + bool vis = true; + +#if MLO_BATCH_ALIGNED == 0 + vis &= (b_index + i_b < MLO_BATCH_SZ); +#endif + +#if MLO_INPUTS_ALIGNED == 0 + vis &= (ic + i_c < MLO_N_INPUTS); +#endif + int in_off2 = in_off + i_b * MLO_IN_BATCH_STRIDE + i_c * MLO_IN_CHANNEL_STRIDE; + int in_lcl_off2 = i_b * MLO_IN_LCL_PERSTACK_SZ + i_c * MLO_IN_LCL_TILE_SZ; + + uint elem_id = wave_lcl_id; + int lcl_p_stride = MLO_N_READ_PROCS; + int lcl_base = 0; +#if MLO_DIR_FORWARD == 1 + int lcl_y = MLO_FILTER_PAD1; + int lcl_x = 
MLO_FILTER_PAD0; +#else + int lcl_y = (MLO_FILTER_PAD1 / MLO_FILTER_STRIDE0); + int lcl_x = (MLO_FILTER_PAD0 / MLO_FILTER_STRIDE1); +#endif + int gbl_base = in_off2; + + readData(elem_id, + (MLO_IN_HEIGHT * MLO_IN_WIDTH), + lcl_p_stride, + &lcl_indata[in_lcl_off2], + lcl_base, + MLO_IN_HEIGHT, + MLO_IN_WIDTH, + MLO_IN_LCL_WIDTH, + lcl_y, + lcl_x, + &in[0], + gbl_base, + MLO_IN_HEIGHT, + MLO_IN_WIDTH, + MLO_IN_STRIDE, + y_grp, + x_grp, + vis, + true); + } +#endif + +// read inputs and weights +// put weights into LDS + +#if 1 // only weights + + for(uint i = lcl_id; i < MLO_WEIGHTS_SZ; i += MLO_GRP_SZ) + { +#if MLO_DIR_FORWARD == 1 +// here is [tops][bottoms] +#if(MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) & ((MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - 1) + uint lcl_o = iDiv(i, (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ)); + uint gbl_i = iMod(i, lcl_o, (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ)); +#else + uint lcl_o = i / (MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ); + uint gbl_i = i & ((MLO_N_IN_TILES_PERSTACK * MLO_FILTER_SZ) - 1); +#endif + uint gbl_we_off = wei_off + lcl_o * MLO_N_INPUTS * MLO_FILTER_SZ + gbl_i; + bool within_range = gbl_we_off < (MLO_N_OUTPUTS * MLO_N_INPUTS * MLO_FILTER_SZ); + + gbl_we_off = (within_range) ? gbl_we_off : 0; + _FLOAT wei = weights[gbl_we_off]; + wei = (within_range) ? 
wei : 0; + lcl_wei[i] = wei; +#else +// outputs are botoms(inputs)) +// inputs are tops(outputs) +#if(MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) & ((MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) - 1) + uint lcl_o = iDiv(i, (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ)); + uint gbl_i = iMod(i, lcl_o, (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ)); +#else + uint lcl_o = i / (MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ); + uint gbl_i = i & ((MLO_N_OUT_TILES_PERSTACK * MLO_FILTER_SZ) - 1); +#endif +#if MLO_FILTER_SZ & (MLO_FILTER_SZ - 1) + uint lcl_c = iDiv(gbl_i, MLO_FILTER_SZ); + uint lcl_i = iMod(gbl_i, lcl_c, MLO_FILTER_SZ); +#else + uint lcl_c = gbl_i / MLO_FILTER_SZ; + uint lcl_i = gbl_i & (MLO_FILTER_SZ - 1); +#endif + + uint lcl_we_off = mad24( + mad24(lcl_c, (uint)MLO_N_IN_TILES_PERSTACK, lcl_o), (uint)MLO_FILTER_SZ, lcl_i); + uint gbl_we_off = mad24( + mad24(lcl_o, (uint)MLO_N_OUTPUTS, lcl_c), (uint)MLO_FILTER_SZ, wei_off + lcl_i); + bool within_range = gbl_we_off < (MLO_N_OUTPUTS * MLO_N_INPUTS * MLO_FILTER_SZ); + gbl_we_off = (within_range) ? gbl_we_off : 0; + _FLOAT wei = weights[gbl_we_off]; + wei = (within_range) ? 
wei : 0; + lcl_wei[lcl_we_off] = wei; + +#endif + } + +#endif + +// over all batch stacks + +#endif // all input + + barrier(CLK_LOCAL_MEM_FENCE); + + // convolution + Conv(o_map_base, in_stg_off, pvt_in_stage, lcl_indata, pvt_wei_stage, lcl_wei, pvt_accum); + + // barrier(CLK_LOCAL_MEM_FENCE); + } +// write results out +#if MLO_DIR_FORWARD == 1 +#if MLO_FILTER_STRIDE0 == 1 + int x_out_grp = x_grp; +#else + int x_out_grp = x_tile_blk * MLO_IN_TILE0; +#endif +#if MLO_FILTER_STRIDE1 == 1 + int y_out_grp = y_grp; +#else + int y_out_grp = y_tile_blk * MLO_IN_TILE1; +#endif +#else + int x_out_grp = x_grp * MLO_FILTER_STRIDE0; + int y_out_grp = y_grp * MLO_FILTER_STRIDE1; +#endif + int x_out_lcl = alu_tl0 * MLO_OUT_TILE0; + int y_out_lcl = alu_tl1 * MLO_OUT_TILE1; + + uint out_off = (b_index + stack) * MLO_OUT_BATCH_STRIDE + o_map * MLO_OUT_CHANNEL_STRIDE + + (y_out_grp + y_out_lcl) * MLO_OUT_STRIDE + x_out_grp + x_out_lcl; +// over all local stacks +#if MLO_BATCH_ALIGNED == 0 + if(b_index + stack < MLO_BATCH_SZ) +#endif + { + + // over all local outputs + int out_off1 = out_off; + for(uint o = 0; o < MLO_N_OUT_TILES; ++o, out_off1 += MLO_OUT_CHANNEL_STRIDE) + { +#if MLO_OUTPUTS_ALIGNED == 0 + if(o_map + o < MLO_N_OUTPUTS) +#endif + { + // over output tile + int out_off2 = out_off1; +#if MLO_OUT_TILE0 == 1 + for(int j = 0; j < MLO_OUT_TILE1 && y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT; + ++j, out_off2 += MLO_OUT_STRIDE) + { + for(int i = 0; i < MLO_OUT_TILE0 && x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && + out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ; + ++i) + { +#else + for(uint j = 0; j < MLO_OUT_TILE1; ++j, out_off2 += MLO_OUT_STRIDE) + { + if(y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT) + for(uint i = 0; i < MLO_OUT_TILE0; ++i) + { + if (x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && + out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ) + { +#endif + out[out_off2 + i] = pvt_accum[o * MLO_OUT_TILE_SZ + j * MLO_OUT_TILE0 + i] +#if MLO_CONV_BIAS + + bias[o_map + o] 
+#endif + ; + //ReLU fusion + out[out_off2 + i] *= (out[out_off2 + i] > 0.0f ? 1.0f : slope); + } + } + } + } + } + } +#endif + + /*uint tid = get_local_id(0); + uint gid = get_group_id(0); + + __global _FLOAT* q = out + 256 * gid + tid; // 线性地址.(测试用) + *q = 256 * gid + tid;*/ +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl new file mode 100644 index 000000000..960a4e0a2 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M1.cl @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#define STRIDE (25088) +#define ITER (196) + +void reduce(__local float* buffer, int tid) +{ + if (tid < 64) + { + buffer[tid] += buffer[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { + buffer[tid << 2] += buffer[(tid << 2) + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { + buffer[tid << 3] += buffer[(tid << 3) + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { + buffer[tid << 4] += buffer[(tid << 4) + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(128, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[129]; + __local float shared_b[8][65]; + + __local float result[2][129]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a + (grid_x >> 9) * STRIDE); + __global const float* pB = (__global const float*)(b + ((grid_x & 511) << 3) * STRIDE); + + int offset = ((grid_x << 6) + ((lid_x >> 6) * 12544) + (lid_x & 63)) % STRIDE; + + int temp_offset = offset; + for (int l = 0; l < 2; l++, offset = temp_offset) + { + float sum = 0.0f; + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + shared_a[lid_x] = pA[offset]; + + for (int j = l * 4; j < (l + 1) * 4; j++) + { + shared_b[(lid_x >> 6 << 2) + (j & 3)][(lid_x & 63)] = pB[offset + j * STRIDE]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 4; k++) + { + sum += shared_a[(lid_x >> 6 << 6) + ((lid_x & 15) << 2) + k] * shared_b[(lid_x >> 6 << 2) + ((lid_x & 63) >> 4)][((lid_x & 15) << 2) + k]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + result[l][lid_x] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + + reduce(result[l], lid_x); + } + + if (lid_x < 8) + { + c[(grid_x << 3) + lid_x] = result[lid_x >> 
2][(lid_x & 3) << 4] + bias[(grid_x << 3) + lid_x]; + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl new file mode 100644 index 000000000..568c241e6 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M2.cl @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#define STRIDE (25088) +#define CSTRIDE (4096) +#define ITER (392) + +void reduce(__local float* buffer, int tid) +{ + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } + if (tid < 16) + { + buffer[tid << 2] += buffer[(tid << 2) + 2]; + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[2][66]; + __local float shared_b[8][66]; + __local float result[65]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a); + __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); + + int offset = ((grid_x << 6)) % STRIDE; + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 2; j++) + { + shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; + } + + for (int j = 0; j < 8; j++) + { + shared_b[j][lid_x + 
(lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; + } + + for (int k = 0; k < 16; k++) + { + sum += shared_a[lid_x >> 5][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)] * shared_b[(lid_x & 31) >> 2][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)]; + } + } + + result[lid_x] = sum; + reduce(result, lid_x); + + if (lid_x < 2) + { + float8 out; + float* pOut = (float*)&out; + + for (int i = 0; i < 8; i++) + { + pOut[i] = result[((lid_x * 8 + i) << 2)] + bias[(grid_x << 3) + i]; + } + + __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); + *pC = out; + } +} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl new file mode 100644 index 000000000..a135816b7 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M32.cl @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +///////////////////////////////////////////////////////// +// FC6 batch 32 Version 3 2018.6.25 + +#define STRIDE (25088) +#define CSTRIDE (4096) +#define ITER (1568) + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[1024]; + __local float shared_b[1024]; + __local float4* pShared_a = (__local float4*)shared_a; + __local float4* pShared_b = (__local float4*)shared_b; + + float4 sha; + float4 shb; + + float4 sum = 0.0f; + float* pSum = (float*)∑ + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a); + __global const float* pB = (__global const float*)(b + (grid_x << 5) * STRIDE); + __global float4* pC = (__global float4*)(c + (lid_x >> 3) * CSTRIDE + (grid_x << 5) + ((lid_x & 7) << 2)); + + int offset = (grid_x << 5) % STRIDE; + + for (int i = 0; i < ITER; i++, offset = (offset + 16) % STRIDE) + { + for (int j = 0; j < 2; j++) + { + shared_a[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pA[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; + shared_b[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pB[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 16; k++) + { + sha = shared_a[(lid_x >> 3) + k * 32 + (((lid_x >> 3) + k * 32) >> 5 << 2)]; + shb = pShared_b[(((lid_x & 7))) + k * 8 + (((((lid_x & 7))) + k * 8) >> 3)]; + sum += sha * shb; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + for (int i = 0; i < 4; i++) + { + shared_a[(lid_x >> 3) * 32 + ((lid_x & 7) << 2) + i] = pSum[i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + float4 result; + float* pResult = (float*)&result; + + for (int i = 0; i < 4; i++) 
+ { + pResult[i] = shared_a[(lid_x << 2) + i] + bias[(grid_x << 5) + ((lid_x & 7) << 2) + i]; + } + + *pC = result; +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl new file mode 100644 index 000000000..3b3fec19b --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M4.cl @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#define STRIDE (25088) +#define CSTRIDE (4096) +#define ITER (392) + +void reduce(__local float* buffer, int tid) +{ + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } + if (tid < 16) + { + buffer[tid << 2] += buffer[(tid << 2) + 2]; + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[4][66]; + __local float shared_b[4][66]; + __local float result[65]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a); + __global const float* pB = (__global const float*)(b + (grid_x << 2) * STRIDE); + + int offset = ((grid_x << 6)) % STRIDE; + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 4; j++) + { + shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; + 
shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; + } + + for (int k = 0; k < 16; k++) + { + sum += shared_a[lid_x >> 4][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)] * shared_b[(lid_x & 15) >> 2][((lid_x & 3) << 4) + k + ((((lid_x & 3) << 4) + k) >> 5)]; + } + } + + result[lid_x] = sum; + reduce(result, lid_x); + + if (lid_x < 4) + { + float4 out; + float* pOut = (float*)&out; + + for (int i = 0; i < 4; i++) + { + pOut[i] = result[((lid_x * 4 + i) << 2)] + bias[(grid_x << 2) + i]; + } + + __global float4* pC = (__global float4*)(c + (grid_x << 2) + lid_x * CSTRIDE); + *pC = out; + } +} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl new file mode 100644 index 000000000..29cdb4a6a --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC6M8.cl @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#define STRIDE (25088) +#define CSTRIDE (4096) +#define ITER (392) + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[8][66]; + __local float shared_b[8][66]; + __local float result[65]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a); + __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); + + int offset = ((grid_x << 6)) % STRIDE; + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 8; j++) + { + shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; + } + + for (int j = 0; j < 8; j++) + { + shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; + } + + for (int k = 0; k < 64; k++) + { + sum += shared_a[lid_x >> 3][k + (k >> 5)] * shared_b[(lid_x & 7)][k + (k >> 5)]; + } + } + + result[lid_x] = sum; + + if (lid_x < 8) + { + float8 out; + float* pOut = (float*)&out; + + for (int i = 0; i < 8; i++) + { + pOut[i] = result[((lid_x * 8 + i))] + bias[(grid_x << 3) + i]; + } + + __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); + *pC = out; + } +} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl new file mode 100644 index 000000000..f166c449a --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M1.cl @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#define STRIDE (4096) +#define ITER (64) + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void InnerProduct( + __constant float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_b[64][65]; + + __local float result[256]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __constant float* pA = (__constant float*)a; + __global const float* pB = (__global const float*)(b + ((grid_x << 6) + (lid_x >> 6 << 4)) * STRIDE + (lid_x & 63)); + + int offset = (grid_x << 7) % STRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 16; j++) + { + shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = pB[offset + j * STRIDE]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 16; k++) + { + sum += pA[offset + (lid_x >> 6 << 4) + k] * shared_b[lid_x & 63][(lid_x >> 6 << 4) + k];//shared_a[(lid_x >> 6 << 4) + k] + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + result[lid_x] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid_x < 64) + { + result[lid_x] += result[lid_x + 64] + result[lid_x + 128] + result[lid_x + 192]; + c[(grid_x << 6) + lid_x] = result[lid_x] + bias[(grid_x << 6) + lid_x]; + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl new file mode 100644 index 000000000..fd4442971 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M2.cl @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. 
All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#define STRIDE (4096) +#define ITER (64) + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __constant float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_b[16][65]; + + __local float result[64]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __constant float* pA = (__constant float*)(a + (grid_x >> 8 << 12)); //correct + __global const float* pB = (__global const float*)(b + ((grid_x & 255) << 4) * STRIDE + lid_x); + + int offset = ((grid_x & 255) << 6) % STRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 16; j++) + { + shared_b[j][lid_x] = pB[offset + j * STRIDE]; + } + //barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 16; k++) + { + sum += pA[offset + (lid_x >> 4 << 4) + k] * shared_b[(lid_x & 15)][(lid_x >> 4 << 4) + k]; + } + //barrier(CLK_LOCAL_MEM_FENCE); + } + + result[lid_x] = sum; + //barrier(CLK_LOCAL_MEM_FENCE); + + if (lid_x < 16) + { + result[lid_x] += result[lid_x + 16] + result[lid_x + 32] + result[lid_x + 48]; + c[(grid_x << 4) + lid_x] = result[lid_x] + bias[((grid_x & 255) << 4) + lid_x]; + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl new file mode 100644 index 000000000..88a8659d1 --- 
/dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M32.cl @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +///////////////////////////////////////////////////////// +// FC7 batch 32 Version 3 2018.6.25 + +#define STRIDE (4096) +#define CSTRIDE (4096) +#define ITER (256) + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[1024]; + __local float shared_b[1024]; + __local float4* pShared_a = (__local float4*)shared_a; + __local float4* pShared_b = (__local float4*)shared_b; + + float4 sha; + float4 shb; + + float4 sum = 0.0f; + float* pSum = (float*)∑ + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a); + __global const float* pB = (__global const float*)(b + (grid_x << 5) * STRIDE); + __global float4* pC = (__global float4*)(c + (lid_x >> 3) * CSTRIDE + (grid_x << 5) + ((lid_x & 7) << 2)); + + int offset = (grid_x << 6) % STRIDE; + + for (int i = 0; i < ITER; i++, offset = (offset + 16) % STRIDE) + { + for (int j = 0; j < 2; j++) + { + shared_a[((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pA[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; + shared_b[((j 
<< 4) + (lid_x >> 4)) + (lid_x & 15) * 32 + ((((j << 4) + (lid_x >> 4)) + (lid_x & 15) * 32) >> 5 << 2)] = pB[((j << 4) + (lid_x >> 4)) * STRIDE + (lid_x & 15) + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 16; k++) + { + sha = shared_a[(lid_x >> 3) + k * 32 + (((lid_x >> 3) + k * 32) >> 5 << 2)]; + shb = pShared_b[(((lid_x & 7))) + k * 8 + (((((lid_x & 7))) + k * 8) >> 3)]; + sum += sha * shb; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + for (int i = 0; i < 4; i++) + { + shared_a[(lid_x >> 3) * 32 + ((lid_x & 7) << 2) + i] = pSum[i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + float4 result; + float* pResult = (float*)&result; + + for (int i = 0; i < 4; i++) + { + pResult[i] = shared_a[(lid_x << 2) + i] + bias[(grid_x << 5) + ((lid_x & 7) << 2) + i]; + } + + *pC = result; +} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl new file mode 100644 index 000000000..f7eb1091b --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M4.cl @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#define STRIDE (4096) +#define CSTRIDE (4096) +#define ITER (64) + +void reduce(__local float* buffer, int tid) +{ + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[4][66]; + __local float shared_b[8][66]; + __local float result[65]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a); + __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); + + int offset = ((grid_x << 6)) % STRIDE; + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 4; j++) + { + shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; + } + + for (int j = 0; j < 8; j++) + { + shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; + } + + for (int k = 0; k < 32; k++) + { + sum += shared_a[lid_x >> 4][((lid_x & 1) << 5) + k + ((((lid_x & 1) << 5) + k) >> 5)] * shared_b[(lid_x & 15) >> 1][((lid_x & 1) << 5) + k + ((((lid_x & 1) << 5) + k) >> 5)]; + } + } + + result[lid_x] = sum; + reduce(result, lid_x); + + if (lid_x < 4) + { + float8 out; + float* pOut = (float*)&out; + + for (int i = 0; i < 8; i++) + { + pOut[i] = result[((lid_x * 8 + i) << 1)] + bias[(grid_x << 3) + i]; + } + + __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); + *pC = out; + } +} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl new file mode 100644 index 000000000..ba62863d5 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC7M8.cl @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#define STRIDE (4096) +#define CSTRIDE (4096) +#define ITER (64) +#define NOVECTOR + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + __local float shared_a[8][66]; + __local float shared_b[8][66]; + __local float result[65]; + + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __global const float* pA = (__global const float*)(a); + __global const float* pB = (__global const float*)(b + (grid_x << 3) * STRIDE); + + int offset = ((grid_x << 6)) % STRIDE; + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 8; j++) + { + shared_a[j][lid_x + (lid_x >> 5)] = pA[offset + lid_x + j * STRIDE]; + } + + for (int j = 0; j < 8; j++) + { + shared_b[j][lid_x + (lid_x >> 5)] = pB[offset + lid_x + j * STRIDE]; + } + + for (int k = 0; k < 64; k++) + { + sum += shared_a[lid_x >> 3][k + (k >> 5)] * shared_b[(lid_x & 7)][k + (k >> 5)]; + } + } + + result[lid_x] = sum; + if (lid_x < 8) + { + float8 out; + float* pOut = (float*)&out; + + for (int i = 0; i < 8; i++) + { + pOut[i] = result[((lid_x * 8 + i))] + bias[(grid_x << 3) + i]; + } + + __global float8* pC = (__global float8*)(c + (grid_x << 3) + lid_x * CSTRIDE); + *pC = out; + } +} diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl 
b/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl new file mode 100644 index 000000000..8d9257251 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC8.cl @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#define STRIDE (4096) +#define ITER (16) + +#define OUTPUT 1000 + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void InnerProduct( + __constant float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __local float shared_b[64][65]; + __local float result[256]; + + __constant float* pA = (__constant float*)a; + __global const float* pB = (__global const float*)(b + ((grid_x << 4)) * STRIDE + (lid_x & 63)); //correct + + int offset = ((grid_x << 6) + (lid_x >> 6 << 10)) % STRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 16; j++) + { + shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = (offset + j * STRIDE + ((grid_x << 4)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[(offset + j * STRIDE)] : 0.0f); //correct + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 16; k++) + { + sum += pA[(offset + ((lid_x & 3) << 4) + k) % STRIDE] * shared_b[(lid_x >> 2)][((lid_x & 3) << 4) + k]; //correct + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + result[lid_x] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid_x < 64) + { + result[(lid_x << 2)] += result[(lid_x << 2) + 1] + result[(lid_x << 2) + 2] + result[(lid_x << 2) + 3]; + barrier(CLK_LOCAL_MEM_FENCE); + result[(lid_x << 2)] += result[(lid_x << 2) + 64] + result[(lid_x << 2) + 128] + result[(lid_x << 2) + 192]; + + if (lid_x < 16 && (grid_x << 4) + lid_x < OUTPUT) + { + c[(grid_x << 4) + lid_x] = bias[(grid_x << 4) + lid_x] + result[(lid_x << 2)]; + } + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl new file mode 100644 index 000000000..7535432d2 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M1.cl @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#define STRIDE (4096) +#define ITER (16) + +#define OUTPUT 1000 + +__attribute__((reqd_work_group_size(256, 1, 1))) +__kernel void InnerProduct( + __constant float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __local float shared_b[64][65]; + __local float result[256]; + + __constant float* pA = (__constant float*)a; + __global const float* pB = (__global const float*)(b + ((grid_x << 4)) * STRIDE + (lid_x & 63)); //correct + + int offset = ((grid_x << 6) + (lid_x >> 6 << 10)) % STRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 16; j++) + { + shared_b[(lid_x >> 6 << 4) + j][(lid_x & 63)] = (offset + j * STRIDE + ((grid_x << 4)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? pB[(offset + j * STRIDE)] : 0.0f); //correct + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 16; k++) + { + sum += pA[(offset + ((lid_x & 3) << 4) + k) % STRIDE] * shared_b[(lid_x >> 2)][((lid_x & 3) << 4) + k]; //correct + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + result[lid_x] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid_x < 64) + { + result[(lid_x << 2)] += result[(lid_x << 2) + 1] + result[(lid_x << 2) + 2] + result[(lid_x << 2) + 3]; + barrier(CLK_LOCAL_MEM_FENCE); + result[(lid_x << 2)] += result[(lid_x << 2) + 64] + result[(lid_x << 2) + 128] + result[(lid_x << 2) + 192]; + + if (lid_x < 16 && (grid_x << 4) + lid_x < OUTPUT) + { + c[(grid_x << 4) + lid_x] = bias[(grid_x << 4) + lid_x] + result[(lid_x << 2)]; + } + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl new file mode 100644 index 000000000..9e8c270b7 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M2.cl @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#define STRIDE (4096) +#define HSTRIDE (2048) +#define ITER (32) + +#define OUTPUT 1000 + +void reduce(__local float* buffer, int tid) +{ + if (tid < 64) + { + buffer[tid] += buffer[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { + buffer[tid << 2] += buffer[(tid << 2) + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { + buffer[tid << 3] += buffer[(tid << 3) + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(128, 1, 1))) +__kernel void InnerProduct( + __constant float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __local float shared_b[16][65]; + __local float result[129]; + + __constant float* pA = (__constant float*)(a + (grid_x >> 7 << 12)); + __global const float* pB = (__global const float*)(b); + + int offset = (((grid_x & 127) << 6)) % HSTRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % HSTRIDE) + { + for (int j = 0; j < 8; j++) + { + shared_b[(lid_x >> 6 << 3) + j][(lid_x & 63)] = (offset + (lid_x >> 6 << 11) + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x & 63) < OUTPUT * STRIDE ? 
pB[offset + (lid_x >> 6 << 11) + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x & 63)] : 0.0f); //correct + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 0; k < 8; k++) + { + sum += pA[(offset + ((lid_x & 7) << 3) + k) % HSTRIDE + (lid_x >> 6 << 11)] * shared_b[(lid_x >> 6 << 3) + ((lid_x & 63) >> 3)][((lid_x & 7) << 3) + k]; //correct + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + result[lid_x] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + + reduce(result, lid_x); + + if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) + { + int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); + c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl new file mode 100644 index 000000000..99687a01b --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M32.cl @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#define STRIDE (4096) +#define ITER (64) + +#define OUTPUT 1000 + +void reduce(__local float* buffer, int tid) +{ + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __local float shared_a[4][65]; + __local float shared_b[8][65]; + __local float result[65]; + + __global const float* pA = (__global const float*)(a + (grid_x >> 7 << 14)); //correct + __global const float* pB = (__global const float*)(b); //correct + + int offset = (((grid_x & 127) << 6)) % STRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 4; j++) + { + shared_a[j][lid_x] = pA[offset + j * STRIDE + lid_x]; + } + + for (int j = 0; j < 8; j++) + { + shared_b[j][(lid_x)] = ((j + ((grid_x & 127) << 3)) * STRIDE + (offset + lid_x) < OUTPUT * STRIDE ? pB[(j + ((grid_x & 127) << 3)) * STRIDE + (offset + lid_x)] : 0.0f); //correct + } + + for (int k = 0; k < 32; k++) + { + sum += shared_a[lid_x >> 4][((lid_x & 1) << 5) + k] * shared_b[((lid_x & 15) >> 1)][((lid_x & 1) << 5) + k]; //correct + } + } + + result[lid_x] = sum; + reduce(result, lid_x); + + if (lid_x < 32 && ((grid_x & 127) << 3) + (lid_x & 7) < OUTPUT) + { + int out_offset = ((grid_x >> 7 << 2) + (lid_x >> 3)) * OUTPUT + ((grid_x & 127) << 3) + (lid_x & 7); + c[out_offset] = bias[((grid_x & 127) << 3) + (lid_x & 7)] + result[(lid_x << 1)]; + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl new file mode 100644 index 000000000..952e71fd5 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M4.cl @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#define STRIDE (4096) +#define ITER (64) + +#define OUTPUT 1000 + +void reduce(__local float* buffer, int tid) +{ + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } + if (tid < 16) + { + buffer[tid << 2] += buffer[(tid << 2) + 2]; + } + if (tid < 8) + { + buffer[tid << 3] += buffer[(tid << 3) + 4]; + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __constant float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __local float shared_b[8][65]; + __local float result[64]; + + __constant float* pA = (__constant float*)(a + (grid_x >> 7 << 12)); //correct + __global const float* pB = (__global const float*)(b); //correct + + int offset = (((grid_x & 127) << 6)) % STRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + for (int j = 0; j < 8; j++) + { + shared_b[j][(lid_x)] = (offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x) < OUTPUT * STRIDE ? 
pB[offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x)] : 0.0f); //correct + } + + for (int k = 0; k < 8; k++) + { + sum += pA[(offset + ((lid_x & 7) << 3) + k)] * shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k]; //correct + } + } + + result[lid_x] = sum; + + reduce(result, lid_x); + + if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) + { + int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); + c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; + } +} + diff --git a/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl new file mode 100644 index 000000000..3e72c5054 --- /dev/null +++ b/saber/funcs/impl/amd/cl/InnerProductBNTFC8M8.cl @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#define STRIDE (4096) +#define ITER (64) + +#define OUTPUT 1000 + +void reduce(__local float* buffer, int tid) +{ + if (tid < 32) + { + buffer[tid << 1] += buffer[(tid << 1) + 1]; + } + if (tid < 16) + { + buffer[tid << 2] += buffer[(tid << 2) + 2]; + } + if (tid < 8) + { + buffer[tid << 3] += buffer[(tid << 3) + 4]; + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void InnerProduct( + __global const float *a, + __global const float *b, + __global const float *bias, + __global float *c) +{ + int gid_x = get_global_id(0); + int lid_x = get_local_id(0); + int grid_x = get_group_id(0); + + __local float shared_a[65]; + __local float shared_b[8][65]; + __local float result[64]; + + __global const float* pA = (__global const float*)(a + (grid_x >> 7 << 12)); //correct + __global const float* pB = (__global const float*)(b); //correct + + int offset = (((grid_x & 127) << 6)) % STRIDE; + + float sum = 0.0f; + + for (int i = 0; i < ITER; i++, offset = (offset + 64) % STRIDE) + { + shared_a[lid_x] = pA[offset + lid_x]; + + for (int j = 0; j < 8; j++) + { + shared_b[j][(lid_x)] = (offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x) < OUTPUT * STRIDE ? 
pB[offset + (j + ((grid_x & 127) << 3)) * STRIDE + (lid_x)] : 0.0f); //correct + } + + for (int k = 0; k < 8; k++) + { +#if 1 + sum = mad(shared_a[((lid_x & 7) << 3) + k], shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k], sum); //correct +#else + sum += shared_a[((lid_x & 7) << 3) + k] * shared_b[((lid_x) >> 3)][((lid_x & 7) << 3) + k]; //correct +#endif + } + } + + result[lid_x] = sum; + + reduce(result, lid_x); + + if (lid_x < 8 && ((grid_x & 127) << 3) + lid_x < OUTPUT) + { + int out_offset = ((grid_x >> 7) * OUTPUT + ((grid_x & 127) << 3) + lid_x); + c[out_offset] = bias[((grid_x & 127) << 3) + lid_x] + result[(lid_x << 3)]; + } +} + diff --git a/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl b/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl new file mode 100644 index 000000000..a17634bc9 --- /dev/null +++ b/saber/funcs/impl/amd/cl/MIOpenBiasReLuPooling.cl @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#define _FLOAT float +#define _FLOAT2 float2 +#define _FLOAT4 float4 +#define _FLOAT8 float8 +#define _INT_MASK_GLOBAL uchar +#define _INT_MASK_LOCAL uchar + +#ifndef FLT_MAX +#define FLT_MAX 3.402823466e+38F /* max value */ +#endif + +#define UNUSED __attribute__((__unused__)) + +#define MLO_POOLING_OP_MAX 0 +#define MLO_POOLING_OP_AVE 1 +#define MLO_POOLING_OP_STC 2 + +#define MLO_POOLING_GROUP_SZ2 1 + +#ifndef MLO_POOLING_OP_ID +#define MLO_POOLING_OP_ID 0 +#endif +// max +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX +#define MLO_POOLING_OP(A, B) fmax(A, B); +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE +#define MLO_POOLING_OP(A, B) (A + B); +#endif + +/********************************************************************************* + +**********************************************************************************/ + +#define THREAD_PER_WAVE 64 +#define WAVE_PER_4SIMD 40 + +#define MLO_BOT_DATA_SZ0 2 +#define MLO_BOT_DATA_SZ1 2 + +//#define LOCAL_MEMORY + +__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void +mloPooling(const __global _FLOAT* bot, + __global _FLOAT* top, + __global _FLOAT* bias, + float slope) +{ + uint gid = get_global_id(0); + uint ob = BATCH_NUM * MLO_POOLING_N_OUTPUTS; // output * batch_sz + uint bot_off = 0; + uint top_off = gid; + + _FLOAT2 bot_data[MLO_BOT_DATA_SZ1]; + _FLOAT res; + +#ifdef LOCAL_MEMORY + __local _FLOAT write_combine[256]; + __local _FLOAT4* p_write_combine = (__local _FLOAT4*)write_combine; + __global _FLOAT4* p_top; +#endif + + uint loop_num = ((ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT + THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD - 1) / (THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD)); + uint top_loop_stride = THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD; + + for (int index = 0; index < loop_num && top_off < ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT; index++, top_off += top_loop_stride) + { + uint bot_b = (top_off / MLO_POOLING_TOP_BATCH_STRIDE); + uint bot_c = (top_off % 
MLO_POOLING_TOP_BATCH_STRIDE / MLO_POOLING_TOP_CHANNEL_STRIDE); + uint bot_y = (top_off % MLO_POOLING_TOP_CHANNEL_STRIDE / MLO_POOLING_TOP_STRIDE) << 1; + uint bot_x = (top_off % MLO_POOLING_TOP_STRIDE) << 1; + + bot_off = bot_b * MLO_POOLING_BOT_BATCH_STRIDE + bot_c * MLO_POOLING_BOT_CHANNEL_STRIDE + bot_y * MLO_POOLING_BOT_STRIDE + bot_x; +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX + res = -FLT_MAX; +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + res = 0; +#endif + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + uint pool_size = 0; +#endif + for(uint j = 0; j < MLO_BOT_DATA_SZ1; ++j) + { + uint bot_gbl_off = bot_off + j * MLO_POOLING_BOT_STRIDE; + __global _FLOAT2* read = (__global _FLOAT2*)(bot + bot_gbl_off); + bot_data[j] = *read; + bot_data[j] += bias[bot_c]; + bot_data[j].s0 *= (bot_data[j].s0 > 0.0f ? 1.0f : slope); + bot_data[j].s1 *= (bot_data[j].s1 > 0.0f ? 1.0f : slope); + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + pool_size += (uint)vis; +#endif + res = MLO_POOLING_OP(res, bot_data[j].s0); + + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + pool_size += (uint)vis; +#endif + res = MLO_POOLING_OP(res, bot_data[j].s1); + } + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + res *= 1.f / (_FLOAT)pool_size; +#endif + +#ifdef LOCAL_MEMORY + write_combine[get_local_id(0)] = res; + if (get_local_id(0) % 4 == 0) + { + p_top = (__global _FLOAT4*)(top + top_off); + *p_top = p_write_combine[get_local_id(0) / 4]; + } +#else + top[top_off] = res; +#endif + } +} diff --git a/test/saber/cuda/test_saber_device_NV.h b/saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl similarity index 50% rename from test/saber/cuda/test_saber_device_NV.h rename to saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl index 69f12abb8..676f0df07 100644 --- a/test/saber/cuda/test_saber_device_NV.h +++ b/saber/funcs/impl/amd/cl/MIOpenBiasReLuUni.cl @@ -1,36 +1,25 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ -#ifndef SABER_TEST_SABER_DEVICE_NV_H -#define SABER_TEST_SABER_DEVICE_NV_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/device.h" - -using namespace anakin::test; - -class TestSaberDeviceNV : public Test { -public: - TestSaberDeviceNV() {} - ~TestSaberDeviceNV() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //SABER_TEST_SABER_DEVICE_NV_H +__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void +MIOpenReLu(const __global float* __restrict in, + __global float* __restrict out, + __global float* bias, + float slope, int N, int C, int H, int W) +{ + int gid_x = get_global_id(0); + float intermediate = in[gid_x] + bias[(gid_x % (C * H * W)) / (H * W)]; + out[gid_x] = intermediate * (intermediate > 0.0f ? 1.0f : slope); +} diff --git a/saber/funcs/impl/amd/cl/Pooling.cl b/saber/funcs/impl/amd/cl/Pooling.cl new file mode 100644 index 000000000..31927ad16 --- /dev/null +++ b/saber/funcs/impl/amd/cl/Pooling.cl @@ -0,0 +1,164 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#define _FLOAT float +#define _FLOAT2 float2 +#define _FLOAT4 float4 +#define _FLOAT8 float8 +#define _INT_MASK_GLOBAL uchar +#define _INT_MASK_LOCAL uchar + +#ifndef FLT_MAX +#define FLT_MAX 3.402823466e+38F /* max value */ +#endif + +#define UNUSED __attribute__((__unused__)) + +#define MLO_POOLING_OP_MAX 0 +#define MLO_POOLING_OP_AVE 1 +#define MLO_POOLING_OP_STC 2 + +#define MLO_POOLING_GROUP_SZ2 1 + +#ifndef MLO_POOLING_OP_ID +#define MLO_POOLING_OP_ID 0 +#endif +// max +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX +#define MLO_POOLING_OP(A, B) fmax(A, B); +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE +#define MLO_POOLING_OP(A, B) (A + B); +#endif + +/********************************************************************************* + +**********************************************************************************/ + +#define THREAD_PER_WAVE 64 +#define WAVE_PER_4SIMD 40 + +#define MLO_BOT_DATA_SZ0 2 +#define MLO_BOT_DATA_SZ1 2 + +//#define LOCAL_MEMORY + +__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void +mloPooling(const __global _FLOAT* bot, + __global _FLOAT* top ) +// __global _INT_MASK_GLOBAL* mask) +{ + uint gid = get_global_id(0); + uint ob = BATCH_NUM * MLO_POOLING_N_OUTPUTS; // output * batch_sz + uint bot_off = 0; + uint top_off = gid; + + _FLOAT2 bot_data[MLO_BOT_DATA_SZ1]; + _FLOAT res; + +#ifdef LOCAL_MEMORY + __local _FLOAT write_combine[256]; + __local _FLOAT4* p_write_combine = (__local _FLOAT4*)write_combine; + __global _FLOAT4* p_top; +#endif + + uint loop_num = ((ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT + 
THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD - 1) / (THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD)); + uint top_loop_stride = THREAD_PER_WAVE * CU_NUM * WAVE_PER_4SIMD; + + for (int index = 0; index < loop_num && top_off < ob * MLO_POOLING_TOP_STRIDE * MLO_POOLING_TOP_HEIGHT; index++, top_off += top_loop_stride) + { + uint bot_b = (top_off / MLO_POOLING_TOP_BATCH_STRIDE); + uint bot_c = (top_off % MLO_POOLING_TOP_BATCH_STRIDE / MLO_POOLING_TOP_CHANNEL_STRIDE); + uint bot_y = (top_off % MLO_POOLING_TOP_CHANNEL_STRIDE / MLO_POOLING_TOP_STRIDE) << 1; + uint bot_x = (top_off % MLO_POOLING_TOP_STRIDE) << 1; + + bot_off = bot_b * MLO_POOLING_BOT_BATCH_STRIDE + bot_c * MLO_POOLING_BOT_CHANNEL_STRIDE + bot_y * MLO_POOLING_BOT_STRIDE + bot_x; +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX + res = -FLT_MAX; +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + res = 0; +#endif + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + uint pool_size = 0; +#endif + for(uint j = 0; j < MLO_BOT_DATA_SZ1; ++j) + { + //int run_y = (int)j; + + uint bot_gbl_off = bot_off + j * MLO_POOLING_BOT_STRIDE; + __global _FLOAT2* read = (__global _FLOAT2*)(bot + bot_gbl_off); + bot_data[j] = *read; + + //int run_x = (int)bot_x; +//#if 1 +// bool vis = true; +//#else +// bool vis = ((run_y >= 0 && run_y < MLO_POOLING_BOT_HEIGHT) && +// (run_x >= 0 && run_x < MLO_POOLING_BOT_WIDTH)) +// ? true +// : false; +//#endif +// bot_data[j].s0 = (vis) ? bot_data[j].s0 : +//#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX +// -FLT_MAX +//#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE +// 0 +//#endif +// ; + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + pool_size += (uint)vis; +#endif + res = MLO_POOLING_OP(res, bot_data[j].s0); + + + //run_x++; +//#if 1 +//#else +// vis = ((run_y >= 0 && run_y < MLO_POOLING_BOT_HEIGHT) && +// (run_x >= 0 && run_x < MLO_POOLING_BOT_WIDTH)) +// ? true +// : false; +//#endif +// bot_data[j].s1 = (vis) ? 
bot_data[j].s1 : +//#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX +// -FLT_MAX +//#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE +// 0 +//#endif +// ; + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + pool_size += (uint)vis; +#endif + res = MLO_POOLING_OP(res, bot_data[j].s1); + } + +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE + res *= 1.f / (_FLOAT)pool_size; +#endif + +#ifdef LOCAL_MEMORY + write_combine[get_local_id(0)] = res; + if (get_local_id(0) % 4 == 0) + { + p_top = (__global _FLOAT4*)(top + top_off); + *p_top = p_write_combine[get_local_id(0) / 4]; + } +#else + top[top_off] = res; +#endif + } +} diff --git a/saber/funcs/impl/amd/cl/Relu.cl b/saber/funcs/impl/amd/cl/Relu.cl new file mode 100644 index 000000000..87f0e3e42 --- /dev/null +++ b/saber/funcs/impl/amd/cl/Relu.cl @@ -0,0 +1,186 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#define _FLOAT float +#define _FLOAT2 float2 +#define _FLOAT4 float4 +#define _FLOAT8 float8 + +#ifndef FLT_MAX +#define FLT_MAX 3.402823466e+38F /* max value */ +#endif + +#define UNUSED __attribute__((__unused__)) + +#ifndef MLO_FILTER_STRIDE0 +#define MLO_FILTER_STRIDE0 1 +#endif +#ifndef MLO_FILTER_STRIDE1 +#define MLO_FILTER_STRIDE1 1 +#endif + +#define MLO_GRP_SZ0 (MLO_GRP_TILE0 * MLO_GRP_TILE1) +#define MLO_GRP_SZ1 1 +#define MLO_GRP_SZ2 1 +#define MLO_GRP_SZ (MLO_GRP_SZ0 * MLO_GRP_SZ1 * MLO_GRP_SZ2) +#define MLO_ALU_TILE_SZ (MLO_ALU_VTILE1 * MLO_ALU_VTILE0) + +#define MLO_N_ALUTILES_TOTAL ((MLO_GRP_TILE0 * MLO_GRP_TILE1) / (MLO_ALU_TILE_SZ)) +#define MLO_N_ALUTILES_PERSTACK (MLO_N_ALUTILES_TOTAL / MLO_N_STACKS) +#define MLO_ALUTILES_STACK_SZ (MLO_N_ALUTILES_PERSTACK * MLO_ALU_TILE_SZ) + +#define MLO_N_OUT_TILE_BLOCKS0 ((MLO_OUT_WIDTH + MLO_IN_TILE0 - 1) / MLO_IN_TILE0) +#define MLO_N_OUT_TILE_BLOCKS1 ((MLO_OUT_HEIGHT + MLO_IN_TILE1 - 1) / MLO_IN_TILE1) + +#define MLO_N_OUT_PACKS (MLO_N_OUTPUTS / MLO_N_OUT_TILES_PERSTACK) +#if MLO_N_OUT_PACKS * MLO_N_OUT_TILES_PERSTACK == MLO_N_OUTPUTS && \ + MLO_N_OUT_TILES_PERSTACK != MLO_N_OUTPUTS +#define MLO_OUTPUTS_ALIGNED 1 +#else +#define MLO_OUTPUTS_ALIGNED 0 +#endif + +#define MLO_N_BATCH_PACKS (MLO_BATCH_SZ / MLO_N_STACKS) +#if MLO_N_BATCH_PACKS * MLO_N_STACKS == MLO_BATCH_SZ && MLO_N_STACKS != MLO_BATCH_SZ +#define MLO_BATCH_ALIGNED 1 +#else +#define MLO_BATCH_ALIGNED 0 +#endif + +#if defined(__AMDGCN__) +extern uint __llvm_amdgcn_readfirstlane(uint) __asm("llvm.amdgcn.readfirstlane"); +#define uniform(x) __llvm_amdgcn_readfirstlane(x) +#else +#define uniform(x) (x) +#endif + +static inline uint iDiv(uint v, uint d) +{ + uint r = (uint)((float)v * (1.0f / (float)d) + 0.00001f); + return (r); +} + +static inline uint iMod(uint v, uint u, uint d) +{ + uint r = v - mul24((uint)u, (uint)d); + return (r); +} + +__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void +Relu(const 
__global _FLOAT* __restrict in, + __global _FLOAT* __restrict out, + _FLOAT slope) +{ + uint grp_id0 = get_group_id(0); +#if MLO_N_OUT_TILE_BLOCKS0 & (MLO_N_OUT_TILE_BLOCKS0 - 1) + uint y_tile_blk = iDiv(grp_id0, MLO_N_OUT_TILE_BLOCKS0); + uint x_tile_blk = iMod(grp_id0, y_tile_blk, MLO_N_OUT_TILE_BLOCKS0); +#else + uint y_tile_blk = grp_id0 / MLO_N_OUT_TILE_BLOCKS0; + uint x_tile_blk = grp_id0 & (MLO_N_OUT_TILE_BLOCKS0 - 1); +#endif + uint o_pack = get_group_id(1); // block of outputs + uint b_pack = get_group_id(2); // batch block + + uint lcl_id = get_local_id(0); +#if MLO_ALUTILES_STACK_SZ >= MLO_GRP_SZ + uint stack = 0; + uint alu_stack_id = lcl_id; +#elif MLO_ALUTILES_STACK_SZ & (MLO_ALUTILES_STACK_SZ - 1) + uint stack = iDiv(lcl_id, MLO_ALUTILES_STACK_SZ); // stack + uint alu_stack_id = iMod(lcl_id, stack, MLO_ALUTILES_STACK_SZ); // alu index in stack +#else + uint stack = lcl_id / MLO_ALUTILES_STACK_SZ; // stack + uint alu_stack_id = lcl_id & (MLO_ALUTILES_STACK_SZ - 1); // alu index in stack +#if MLO_ALUTILES_STACK_SZ >= 64 + stack = uniform(stack); +#endif +#endif +// ALU plane inside stack +#if MLO_ALU_TILE_SZ & (MLO_ALU_TILE_SZ - 1) + uint alu_out_plane_id = iDiv(alu_stack_id, MLO_ALU_TILE_SZ); // alu output plane index + uint alu_out_id = iMod( + alu_stack_id, alu_out_plane_id, MLO_ALU_TILE_SZ); // alu index inside an ALU output plane +#else + uint alu_out_plane_id = alu_stack_id / MLO_ALU_TILE_SZ; // alu output plane index + uint alu_out_id = alu_stack_id & (MLO_ALU_TILE_SZ - 1); // alu index inside an ALU output plane +#endif +// pos inside ALU tile +#if MLO_ALU_VTILE0 & (MLO_ALU_VTILE0 - 1) + uint alu_tl1 = iDiv(alu_out_id, MLO_ALU_VTILE0); + uint alu_tl0 = iMod(alu_out_id, alu_tl1, MLO_ALU_VTILE0); +#else + uint alu_tl1 = alu_out_id / MLO_ALU_VTILE0; + uint alu_tl0 = alu_out_id & (MLO_ALU_VTILE0 - 1); +#endif + + uint o_map_plane = + o_pack * MLO_N_OUT_TILES_PERSTACK; // first output maps index per full ALU plane stack + uint o_map_base = 
alu_out_plane_id * MLO_N_OUT_TILES; // local output map offset + uint o_map = o_map_plane + o_map_base; // output map index per ALU plane + uint b_index = b_pack * MLO_N_STACKS; + + uint x_grp = x_tile_blk * MLO_IN_TILE0 * MLO_FILTER_STRIDE0; + uint y_grp = y_tile_blk * MLO_IN_TILE1 * MLO_FILTER_STRIDE1; + +// write results out +#if MLO_FILTER_STRIDE0 == 1 + int x_out_grp = x_grp; +#else + int x_out_grp = x_tile_blk * MLO_IN_TILE0; +#endif +#if MLO_FILTER_STRIDE1 == 1 + int y_out_grp = y_grp; +#else + int y_out_grp = y_tile_blk * MLO_IN_TILE1; +#endif + int x_out_lcl = alu_tl0 * MLO_OUT_TILE0; + int y_out_lcl = alu_tl1 * MLO_OUT_TILE1; + + uint out_off = (b_index + stack) * MLO_OUT_BATCH_STRIDE + o_map * MLO_OUT_CHANNEL_STRIDE + + (y_out_grp + y_out_lcl) * MLO_OUT_STRIDE + x_out_grp + x_out_lcl; +// over all local stacks +#if MLO_BATCH_ALIGNED == 0 + if(b_index + stack < MLO_BATCH_SZ) +#endif + { + // over all local outputs + int out_off1 = out_off; + for(uint o = 0; o < MLO_N_OUT_TILES; ++o, out_off1 += MLO_OUT_CHANNEL_STRIDE) + { +#if MLO_OUTPUTS_ALIGNED == 0 + if(o_map + o < MLO_N_OUTPUTS) +#endif + { + // over output tile + int out_off2 = out_off1; + for(uint j = 0; j < MLO_OUT_TILE1; ++j, out_off2 += MLO_OUT_STRIDE) + { + if(y_out_grp + y_out_lcl + j < MLO_OUT_HEIGHT) + { + for (uint i = 0; i < MLO_OUT_TILE0; ++i) + { + if (x_out_grp + x_out_lcl + i < MLO_OUT_WIDTH && out_off2 + i < MLO_OUT_BATCH_STRIDE * MLO_BATCH_SZ) + { + //ReLU + //out[out_off2 + i] = max(in[out_off2 + i], 0.0f); + out[out_off2 + i] = (in[out_off2 + i] > 0) ? 
in[out_off2 + i] : in[out_off2 + i] * slope; + } + } + } + } + } + } + } +} diff --git a/test/saber/cuda/test_saber_context_NV.h b/saber/funcs/impl/amd/cl/ReluUni.cl similarity index 53% rename from test/saber/cuda/test_saber_context_NV.h rename to saber/funcs/impl/amd/cl/ReluUni.cl index 8f99f9c32..5656b0155 100644 --- a/test/saber/cuda/test_saber_context_NV.h +++ b/saber/funcs/impl/amd/cl/ReluUni.cl @@ -1,36 +1,22 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ -#ifndef SABER_TEST_SABER_CONTEXT_NV_H -#define SABER_TEST_SABER_CONTEXT_NV_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "core/context.h" - -using namespace anakin::test; - -class TestSaberContextNV : public Test { -public: - TestSaberContextNV() {} - ~TestSaberContextNV() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //SABER_TEST_SABER_CONTEXT_NV_H +__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void +ReluUni(const __global float* __restrict in, + __global float* __restrict out, + float slope) +{ + out[get_global_id(0)] = in[get_global_id(0)] * (in[get_global_id(0)] > 0.0f ? 
1.0f : slope);
+}
diff --git a/saber/funcs/impl/amd/cl/Softmax.cl b/saber/funcs/impl/amd/cl/Softmax.cl
new file mode 100644
index 000000000..32e61fa13
--- /dev/null
+++ b/saber/funcs/impl/amd/cl/Softmax.cl
@@ -0,0 +1,220 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+/* Steps to compute softmax:
+ * 1. Compute the max per channel.
+ * 2. Subtract the max from each value in the channel.
+ * 3. Compute the exponent of all the values.
+ * 4. Compute the sum of the values per channel.
+ * 5. Normalize based on the sum.
+ *
+ * We use a CSR-{Vector / Stream} approach to pick an algorithm depending on the
+ * number of channels each workgroup has to work with.
+ * J. L. Greathouse, M. Daga, Efficient sparse matrix-vector multiplication
+ * on GPUs using the CSR storage format, in: Proc. Int'l Conf. High Performance
+ * Computing, Networking, Storage and Analysis (SC'14)
+*/
+
+kernel void Softmax(global float* y, const int c, const int grid_size, const int spatial_dim)
+{
+#if NUM_BATCH == 1 // CSR-Vector like approach
+
+    /* Entire workgroup works on one spatial_dim.
+     * We use logarithmic reductions to compute max and sum per channel.
+     * This approach reads in the same data thrice from DRAM but is still better
+     * than launching three different kernels.
+     * The workgroup begins by computing the nth image and s (spatial_dim) it
+     * is working on and iterates over the entire grid until finished.
+ */ + + local float l_helper[256]; + + int gid = get_group_id(0); + int lid = get_local_id(0); + + // Total number of workgroups launched can be less than the gridsize, hence iterate over. + for(gid = get_group_id(0); gid < grid_size; gid += get_num_groups(0)) + { + + int n = gid / spatial_dim; // nth image + int s = gid % spatial_dim; // spatial dimension (h*w) + + l_helper[lid] = -FLT_MAX; + + float t_helper = -FLT_MAX; // thread_local helper var + + // Compute max per channel + // Iterate over all the channels one thread is supposed to loop over + // and compute max + for(int i = lid; i < c; i += get_local_size(0)) + { + t_helper = max(y[mad24(n, c, i) * spatial_dim + s], t_helper); + } + + // Now we have to compute the max from 256 values (one per each thread) + l_helper[lid] = t_helper; + barrier(CLK_LOCAL_MEM_FENCE); + + // Logarithmic reduction to compute the max. + for(int i = (get_local_size(0) >> 1); i > 0; i >>= 1) + { + if(lid < i) + { + l_helper[lid] = max(l_helper[lid], l_helper[lid + i]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + float channel_max = l_helper[0]; + t_helper = 0.; + + // Subtract channel_max from each value + for(int i = lid; i < c; i += get_local_size(0)) + { + float value = y[mad24(n, c, i) * spatial_dim + s]; + + // Compute exponent of each value + // Then sum all the values touched by this thread + t_helper += exp(value - channel_max); + } + + l_helper[lid] = t_helper; + barrier(CLK_LOCAL_MEM_FENCE); + + // Compute sum of 256 values (one for each thread) + // Logarithmic reduction to compute the sum + for(int i = (get_local_size(0) >> 1); i > 0; i >>= 1) + { + if(lid < i) + { + l_helper[lid] += l_helper[lid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + float channel_sum = l_helper[0]; + + // Normalize each value in the channel by the channel_sum + for(int i = lid; i < c; i += get_local_size(0)) + { + float value = y[mad24(n, c, i) * spatial_dim + s]; + + // Subtracting max again because we do not write the output of + // 
value-max to DRAM above. Doing a subtraction again is much
+            // faster than writing uncoalesced to DRAM
+            value = exp(value - channel_max);
+
+            y[mad24(n, c, i) * spatial_dim + s] = value / channel_sum;
+        }
+    }
+
+#else // CSR-Stream like approach
+
+    /* Each workgroup is computing the softmax for NUM_BATCH spatial_dims ala CSR-Stream.
+     * The number of threads iterating over channels to compute softmax for one batch is BATCH_SIZE.
+     * The number of values each thread works on is U_BATCH_SIZE (read micro batch size).
+     * Each batch in the workgroup works on its nth image and s (spatial_dim).
+     * E.g. a 256 thread workgroup with c=31 has 8 batches and a batchsize of 32.
+     * The number of workgroups launched is exactly the number required,
+     * hence there is no for-loop.
+     */
+
+    local float l_helper[256];
+
+    int gid = get_group_id(0);
+    int lid = get_local_id(0);
+
+    // ID of the thread within the batch
+    int batch_lid = lid & (BATCH_SIZE - 1); // thread specific channel_st
+    int batch     = lid / BATCH_SIZE;       // which spatial_dim or pixel
+
+    // Batch specific n and s
+    int batch_n = (NUM_BATCH * gid + batch) / spatial_dim; // nth image
+    int batch_s = (NUM_BATCH * gid + batch) % spatial_dim; // which spatial_dim/pixel
+
+    l_helper[lid] = -FLT_MAX;
+
+    float t_helper = -FLT_MAX; // thread_local helper var
+
+    // stores all the values touched by one thread so that we do not have to load
+    // them again, as in the CSR-Vector approach
+    float value[U_BATCH_SIZE];
+    for(int i = 0; i < U_BATCH_SIZE; i++)
+    {
+        value[i] = -FLT_MAX;
+    }
+
+    // Compute max per channel
+    // BATCH_SIZE threads iterate over the channels
+    for(int i = batch_lid; i < c; i += BATCH_SIZE)
+    {
+        if(mad24(batch_n, c, i) * spatial_dim + batch_s < c * grid_size)
+            value[i / BATCH_SIZE] = y[mad24(batch_n, c, i) * spatial_dim + batch_s];
+        t_helper = max(value[i / BATCH_SIZE], t_helper);
+    }
+
+    // Now we have to compute the max from 256 values (one per each thread)
+    l_helper[lid] = t_helper;
+    
barrier(CLK_LOCAL_MEM_FENCE); + + // Logarithmic reduction to compute the max. + for(int i = (BATCH_SIZE >> 1); i > 0; i >>= 1) + { + if(batch_lid < i) + { + l_helper[lid] = max(l_helper[lid], l_helper[lid + i]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + float channel_max = l_helper[batch * BATCH_SIZE]; + t_helper = 0.; + + // Subtract channel_max from each value + for(int i = batch_lid; i < c; i += BATCH_SIZE) + { + + // Compute exponent of each value + // Then sum all the values touched by this thread + t_helper += exp(value[i / BATCH_SIZE] - channel_max); + } + + l_helper[lid] = t_helper; + barrier(CLK_LOCAL_MEM_FENCE); + + // Compute sum of 256 values (one for each thread) + // Logarithmic reduction to compute the sum + for(int i = (BATCH_SIZE >> 1); i > 0; i >>= 1) + { + if(batch_lid < i) + { + l_helper[lid] += l_helper[lid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + float channel_sum = l_helper[batch * BATCH_SIZE]; + + // Normalize each value in the channel by the channel_sum + for(int i = batch_lid; i < c; i += BATCH_SIZE) + { + value[i / BATCH_SIZE] = exp(value[i / BATCH_SIZE] - channel_max); + + if(mad24(batch_n, c, i) * spatial_dim + batch_s < c * grid_size) + y[mad24(batch_n, c, i) * spatial_dim + batch_s] = value[i / BATCH_SIZE] / channel_sum; + } + +#endif // CSR-Vector vs CSR-Stream +} diff --git a/saber/funcs/impl/amd/lib/wino_conv_3x3.so b/saber/funcs/impl/amd/lib/wino_conv_3x3.so new file mode 100644 index 000000000..591fad67a Binary files /dev/null and b/saber/funcs/impl/amd/lib/wino_conv_3x3.so differ diff --git a/saber/funcs/impl/amd/saber_activation.cpp b/saber/funcs/impl/amd/saber_activation.cpp new file mode 100644 index 000000000..26538c175 --- /dev/null +++ b/saber/funcs/impl/amd/saber_activation.cpp @@ -0,0 +1,338 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "saber/funcs/base.h" +#include "saber/funcs/impl/amd/saber_activation.h" +#include "saber/funcs/impl/amd/amd_utils.h" + +namespace anakin{ +namespace saber { + +typedef TargetWrapper AMD_API; + +template +SaberStatus SaberActivation::init( + const std::vector& inputs, + std::vector& outputs, + ActivationParam ¶m, + Context &ctx) +{ + + typedef typename DataTensor_in::Dtype DataType_in; + typedef typename DataTensor_out::Dtype DataType_out; + typedef typename OpTensor::Dtype DataType_op; + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberActivation::create( + const std::vector& inputs, + std::vector& outputs, + ActivationParam ¶m, + Context &ctx) +{ + this->_ctx = &ctx; + + cl_context context = 0; + cl_device_id device = 0; + + Device dev = Env::cur_env()[inputs[0]->device_id()]; //anakin device id to AMD device + device = dev.get_device(); + context = dev.get_context(); + + //LOG(INFO) << "device id= " << device << " conext = " << context; + + KernelInfo kernelInfo; + + switch (param.active){ + case Active_relu: + //TODO + //Rewrite here once solver is ready.////////////// + T_ExtSolutionConfig extSolution; + //LOG(INFO) << inputs[0]->width() << " x " << inputs[0]->height() << " x " << inputs[0]->channel(); + switch(inputs[0]->width()) + { + case 224: + kernelInfo.l_wk = {256, 1, 1}; + kernelInfo.g_wk = {12544, 8, 1}; + + extSolution.in_tile0 = 32; + extSolution.in_tile1 = 32; + extSolution.grp_tile0 = 16; + extSolution.grp_tile1 = 16; + extSolution.out_pix_tile0 = 2; + extSolution.out_pix_tile1 = 2; + 
extSolution.n_stacks = 1; + extSolution.n_out_pix_tiles = 8; + extSolution.n_out_tiles_perstack = 8; + extSolution.n_in_data_tiles = 2; + extSolution.n_read_procs = 256; + extSolution.alu_tile0 = 16; + extSolution.alu_tile1 = 16; + break; + + case 112: + kernelInfo.l_wk = {256, 1, 1}; + kernelInfo.g_wk = {4096, 16, 1}; + + extSolution.in_tile0 = 32; + extSolution.in_tile1 = 32; + extSolution.grp_tile0 = 16; + extSolution.grp_tile1 = 16; + extSolution.out_pix_tile0 = 2; + extSolution.out_pix_tile1 = 2; + extSolution.n_stacks = 1; + extSolution.n_out_pix_tiles = 8; + extSolution.n_out_tiles_perstack = 8; + extSolution.n_in_data_tiles = 2; + extSolution.n_read_procs = 256; + extSolution.alu_tile0 = 16; + extSolution.alu_tile1 = 16; + break; + + case 56: + kernelInfo.l_wk = {256, 1, 1}; + kernelInfo.g_wk = {1024, 32, 1}; + + extSolution.in_tile0 = 32; + extSolution.in_tile1 = 32; + extSolution.grp_tile0 = 16; + extSolution.grp_tile1 = 16; + extSolution.out_pix_tile0 = 2; + extSolution.out_pix_tile1 = 2; + extSolution.n_stacks = 1; + extSolution.n_out_pix_tiles = 8; + extSolution.n_out_tiles_perstack = 8; + extSolution.n_in_data_tiles = 2; + extSolution.n_read_procs = 256; + extSolution.alu_tile0 = 16; + extSolution.alu_tile1 = 16; + break; + + case 28: + kernelInfo.l_wk = {256, 1, 1}; + kernelInfo.g_wk = {256, 64, 1}; + + extSolution.in_tile0 = 32; + extSolution.in_tile1 = 32; + extSolution.grp_tile0 = 16; + extSolution.grp_tile1 = 16; + extSolution.out_pix_tile0 = 2; + extSolution.out_pix_tile1 = 2; + extSolution.n_stacks = 1; + extSolution.n_out_pix_tiles = 8; + extSolution.n_out_tiles_perstack = 8; + extSolution.n_in_data_tiles = 2; + extSolution.n_read_procs = 256; + extSolution.alu_tile0 = 16; + extSolution.alu_tile1 = 16; + break; + case 14: + kernelInfo.l_wk = {64, 1, 1}; + kernelInfo.g_wk = {64, 64, 1}; + + extSolution.in_tile0 = 16; + extSolution.in_tile1 = 16; + extSolution.grp_tile0 = 8; + extSolution.grp_tile1 = 8; + extSolution.out_pix_tile0 = 2; + 
extSolution.out_pix_tile1 = 2; + extSolution.n_stacks = 1; + extSolution.n_out_pix_tiles = 8; + extSolution.n_out_tiles_perstack = 8; + extSolution.n_in_data_tiles = 2; + extSolution.n_read_procs = 64; + extSolution.alu_tile0 = 8; + extSolution.alu_tile1 = 8; + break; + case 1: + if (inputs[0]->channel() == 4096) { + kernelInfo.l_wk = {256, 1, 1}; + kernelInfo.g_wk = {4096, 1, 1}; + }else if(inputs[0]->channel() == 1000) { + kernelInfo.l_wk = {256, 1, 1}; + kernelInfo.g_wk = {1024, 1, 1}; + } + break; + } + + kernelInfo.comp_options = + std::string(" -DMLO_HW_WAVE_SZ=64") + // (fixed) wave=64 + std::string(" -DMLO_DIR_FORWARD=1") + // (fixed) forward + std::string(" -DMLO_FILTER_STRIDE0=1") + // (fixed temp) + std::string(" -DMLO_FILTER_STRIDE1=1") + // (fixed temp) + std::string(" -DMLO_N_OUTPUTS=") + std::to_string(outputs[0]->channel()) + + std::string(" -DMLO_N_INPUTS=") + std::to_string(inputs[0]->channel()) + + std::string(" -DMLO_BATCH_SZ=") + std::to_string(inputs[0]->num()) + + std::string(" -DMLO_OUT_WIDTH=") + std::to_string(outputs[0]->width()) + + std::string(" -DMLO_OUT_HEIGHT=") + std::to_string(outputs[0]->height()) + + std::string(" -DMLO_OUT_BATCH_STRIDE=") + std::to_string(outputs[0]->width() * outputs[0]->height() * outputs[0]->channel()) + + std::string(" -DMLO_OUT_CHANNEL_STRIDE=") + std::to_string(outputs[0]->width() * outputs[0]->height()) + + std::string(" -DMLO_OUT_STRIDE=") + std::to_string(outputs[0]->width()) + + std::string(" -DMLO_IN_WIDTH=") + std::to_string(inputs[0]->width()) + + std::string(" -DMLO_IN_HEIGHT=") + std::to_string(inputs[0]->height()) + + std::string(" -DMLO_IN_BATCH_STRIDE=") + std::to_string(inputs[0]->width() * inputs[0]->height() * inputs[0]->channel()) + + std::string(" -DMLO_IN_CHANNEL_STRIDE=") + std::to_string(inputs[0]->width() * inputs[0]->height()) + + std::string(" -DMLO_IN_STRIDE=") + std::to_string(inputs[0]->width()) + + std::string(" -DMLO_CONV_BIAS=0") ; // (for now not support) + + 
if(inputs[0]->width() == 1) { + + kernelInfo.kernel_file = "ReluUni.cl"; + kernelInfo.kernel_name = "ReluUni"; + + } else { + //set comp_options... + + kernelInfo.comp_options += + std::string(" -DMLO_IN_TILE0=") + std::to_string(extSolution.in_tile0) + + std::string(" -DMLO_IN_TILE1=") + std::to_string(extSolution.in_tile1) + + std::string(" -DMLO_GRP_TILE0=") + std::to_string(extSolution.grp_tile0) + + std::string(" -DMLO_GRP_TILE1=") + std::to_string(extSolution.grp_tile1) + + std::string(" -DMLO_OUT_TILE0=") + std::to_string(extSolution.out_pix_tile0) + + std::string(" -DMLO_OUT_TILE1=") + std::to_string(extSolution.out_pix_tile1) + + std::string(" -DMLO_N_STACKS=") + std::to_string(extSolution.n_stacks) + + std::string(" -DMLO_N_OUT_TILES=") + std::to_string(extSolution.n_out_pix_tiles) + + std::string(" -DMLO_N_OUT_TILES_PERSTACK=") + std::to_string(extSolution.n_out_tiles_perstack) + + std::string(" -DMLO_N_IN_TILES_PERSTACK=") + std::to_string(extSolution.n_in_data_tiles) + + std::string(" -DMLO_N_READ_PROCS=") + std::to_string(extSolution.n_read_procs) + + std::string(" -DMLO_ALU_VTILE0=") + std::to_string(extSolution.alu_tile0) + + std::string(" -DMLO_ALU_VTILE1=") + std::to_string(extSolution.alu_tile1); + + kernelInfo.kernel_file = "Relu.cl"; + kernelInfo.kernel_name = "Relu"; + } + break; + + case Active_sigmoid: + + break; + + case Active_tanh: + + break; + + case Active_clipped_relu: + + break; + + case Active_elu: + + break; + } + std::copy(kernelInfo.g_wk.begin(), kernelInfo.g_wk.end(), _globalWorkSize); + std::copy(kernelInfo.l_wk.begin(), kernelInfo.l_wk.end(), _localWorkSize); + + //LOG(INFO) << "kernel file name: " << kernelInfo.kernel_file; + //LOG(INFO) << "kernel name: " << kernelInfo.kernel_name; + //LOG(INFO) << "local work size: " << kernelInfo.l_wk[0] << " " << kernelInfo.l_wk[1] << " " << kernelInfo.l_wk[2]; + //LOG(INFO) << "global work size: " << kernelInfo.g_wk[0] << " " << kernelInfo.g_wk[1] << " " << kernelInfo.g_wk[2]; + 
//LOG(INFO) << "compile option: " << kernelInfo.comp_options;
+
+    //To create the program
+    cl_program program = CreateCLProgram(context, device, kernelInfo.kernel_file.c_str(), &kernelInfo);
+    if (program == NULL)
+    {
+        LOG(ERROR) << "Failed to load program";
+        return SaberInvalidValue;
+    }
+
+    //LOG(INFO) << "COMPILE OCL KERNEL CODE";
+
+    //To create kernel
+    _kernel = clCreateKernel(program, kernelInfo.kernel_name.c_str(), NULL);
+    if (_kernel == NULL)
+    {
+        LOG(ERROR) << "Failed to create kernel";
+        return SaberInvalidValue;
+    }
+
+    //LOG(INFO) << "COMPLETE CREATE KERNEL";
+
+    return SaberSuccess;
+}
+
+template
+SaberStatus SaberActivation::dispatch(
+    const std::vector& inputs,
+    std::vector& outputs,
+    ActivationParam &param)
+{
+    cl_int errNum = 0;
+    //To get the compute command queue
+    AMD_API::stream_t cm = this->_ctx->get_compute_stream();
+
+    //To set the argument
+    cl_mem memObjects[2] = { 0, 0 };
+
+    size_t offset_in, offset_out;
+
+    const ClMem clin = inputs[0]->data();
+    ClMem clout = outputs[0]->mutable_data();
+    offset_in = clin.offset;
+    offset_out = clout.offset;
+
+    memObjects[0] = clin.dmem;//(cl_mem)inputs[0]->data();
+    memObjects[1] = clout.dmem;//(cl_mem)outputs[0]->mutable_data();
+
+    errNum = clSetKernelArg(_kernel, 0, sizeof(cl_mem), &memObjects[0]);
+    errNum |= clSetKernelArg(_kernel, 1, sizeof(cl_mem), &memObjects[1]);
+    errNum |= clSetKernelArg(_kernel, 2, sizeof(float), &param.negative_slope);
+    if (errNum != CL_SUCCESS)
+    {
+        LOG(ERROR) << "Fail to set kernel arguments";
+        return SaberInvalidValue;
+    }
+    cl_event event;
+    //LOG(INFO) << "COMPLETE SET ARGUMENT";
+    errNum = clEnqueueNDRangeKernel(cm, _kernel, 3, NULL,
+                                    _globalWorkSize, _localWorkSize,
+                                    0, NULL, &event);
+    if (errNum != CL_SUCCESS)
+    {
+        LOG(ERROR) << "Fail to set execution: " << errNum;
+        return SaberInvalidValue;
+    }
+    //LOG(INFO) << "COMPLETE EXECUTION";
+    cl_event_list list;
+    list.push_back(event);
+    Env::add_event(list);
+    return SaberSuccess;
+}
+
+template class 
SaberActivation; +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, AMD, AK_HALF); +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, AMD, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_scale.h b/saber/funcs/impl/amd/saber_activation.h similarity index 53% rename from saber/funcs/impl/x86/saber_scale.h rename to saber/funcs/impl/amd/saber_activation.h index 3bc19cf19..8412bfd27 100644 --- a/saber/funcs/impl/x86/saber_scale.h +++ b/saber/funcs/impl/amd/saber_activation.h @@ -1,21 +1,21 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H +#define ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SCALE_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SCALE_H - -#include "saber/funcs/impl/impl_scale.h" +#include "saber/funcs/impl/impl_activation.h" #include "saber/saber_types.h" #include "saber/funcs/impl/impl_base.h" #include "saber/saber_funcs_param.h" @@ -29,40 +29,42 @@ template -class SaberScale : public ImplBase< - Tensor, - Tensor, - Tensor, - ScaleParam > > + Tensor, + Tensor, + Tensor, + ActivationParam > > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; - SaberScale() + SaberActivation() {} - ~SaberScale() { + ~SaberActivation() { } virtual SaberStatus init(const std::vector& inputs, std::vector& outputs, - ScaleParam ¶m, - Context &ctx) override; + ActivationParam ¶m, + Context &ctx) override; virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - ScaleParam ¶m, - Context &ctx) override; + ActivationParam ¶m, + Context &ctx) override; virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, - ScaleParam ¶m) override; + ActivationParam ¶m) override; private: - + cl_kernel _kernel; + size_t _globalWorkSize[3]; + size_t _localWorkSize[3]; }; } diff --git a/saber/funcs/impl/arm/impl/neon_mathfun.h b/saber/funcs/impl/arm/impl/neon_mathfun.h new file mode 100644 index 000000000..8c074b56d --- /dev/null +++ b/saber/funcs/impl/arm/impl/neon_mathfun.h @@ -0,0 +1,320 @@ +/* NEON implementation of sin, cos, exp and log + * + * Inspired by Intel Approximate Math library, and based on the + * corresponding algorithms of the cephes math library + */ + +/* Copyright (C) 2011 Julien Pommier + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * (this is the zlib license) + */ +#ifndef ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H +#define ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H + +#include "saber/core/common.h" + +#define c_inv_mant_mask ~0x7f800000u +#define c_cephes_SQRTHF 0.707106781186547524 +#define c_cephes_log_p0 7.0376836292E-2 +#define c_cephes_log_p1 - 1.1514610310E-1 +#define c_cephes_log_p2 1.1676998740E-1 +#define c_cephes_log_p3 - 1.2420140846E-1 +#define c_cephes_log_p4 + 1.4249322787E-1 +#define c_cephes_log_p5 - 1.6668057665E-1 +#define c_cephes_log_p6 + 2.0000714765E-1 +#define c_cephes_log_p7 - 2.4999993993E-1 +#define c_cephes_log_p8 + 3.3333331174E-1 +#define c_cephes_log_q1 -2.12194440e-4 +#define c_cephes_log_q2 0.693359375 + +/* natural logarithm computed for 4 simultaneous float + * return NaN for x <= 0 + */ +static inline float32x4_t log_ps(float32x4_t x) +{ + float32x4_t one = vdupq_n_f32(1); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + int32x4_t ux = vreinterpretq_s32_f32(x); + + int32x4_t emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, 
vdupq_n_s32(c_inv_mant_mask)); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); + x = vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); + float32x4_t e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, one); + + /* part2: + * if( x < SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ + uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); + float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, one); + e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); + x = vaddq_f32(x, tmp); + + float32x4_t z = vmulq_f32(x,x); + + float32x4_t y = vdupq_n_f32(c_cephes_log_p0); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); + y = vaddq_f32(y, tmp); + + + tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + +#define c_exp_hi 88.3762626647949f +#define c_exp_lo -88.3762626647949f + +#define c_cephes_LOG2EF 1.44269504088896341 +#define c_cephes_exp_C1 0.693359375 +#define c_cephes_exp_C2 -2.12194440e-4 + +#define c_cephes_exp_p0 1.9875691500E-4 +#define c_cephes_exp_p1 
1.3981999507E-3 +#define c_cephes_exp_p2 8.3334519073E-3 +#define c_cephes_exp_p3 4.1665795894E-2 +#define c_cephes_exp_p4 1.6666665459E-1 +#define c_cephes_exp_p5 5.0000001201E-1 + +/* exp() computed for 4 float at once */ +static inline float32x4_t exp_ps(float32x4_t x) +{ + float32x4_t tmp, fx; + + float32x4_t one = vdupq_n_f32(1); + x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); + x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + uint32x4_t mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); + + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); + float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 }; + float32x4_t y = vld1q_dup_f32(cephes_exp_p+0); + float32x4_t c1 = vld1q_dup_f32(cephes_exp_p+1); + float32x4_t c2 = vld1q_dup_f32(cephes_exp_p+2); + float32x4_t c3 = vld1q_dup_f32(cephes_exp_p+3); + float32x4_t c4 = vld1q_dup_f32(cephes_exp_p+4); + float32x4_t c5 = vld1q_dup_f32(cephes_exp_p+5); + + y = vmulq_f32(y, x); + z = vmulq_f32(x, x); + + y = vaddq_f32(y, c1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, one); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); + mm = vshlq_n_s32(mm, 23); + float32x4_t pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +#define c_minus_cephes_DP1 
-0.78515625 +#define c_minus_cephes_DP2 -2.4187564849853515625e-4 +#define c_minus_cephes_DP3 -3.77489497744594108e-8 +#define c_sincof_p0 -1.9515295891E-4 +#define c_sincof_p1 8.3321608736E-3 +#define c_sincof_p2 -1.6666654611E-1 +#define c_coscof_p0 2.443315711809948E-005 +#define c_coscof_p1 -1.388731625493765E-003 +#define c_coscof_p2 4.166664568298827E-002 +#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI + +/* evaluation of 4 sines & cosines at once. + * + * The code is the exact rewriting of the cephes sinf function. + * Precision is excellent as long as x < 8192 (I did not bother to + * take into account the special handling they have for greater values + * -- it does not return garbage for arguments over 8192, though, but + * the extra precision is missing). + * + * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + * surprising but correct result. + * + * Note also that when you compute sin(x), cos(x) is available at + * almost no extra price so both sin_ps and cos_ps make use of + * sincos_ps.. 
+ */ +static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos) +{ + // any x + float32x4_t xmm1, xmm2, xmm3, y; + + uint32x4_t emm2; + + uint32x4_t sign_mask_sin, sign_mask_cos; + sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); + x = vabsq_f32(x); + + /* scale by 4/Pi */ + y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); + + /* store the integer part of y in mm0 */ + emm2 = vcvtq_u32_f32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); + emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); + y = vcvtq_f32_u32(emm2); + + /* get the polynom selection mask + * there is one polynom for 0 <= x <= Pi/4 + * and another one for Pi/4 +SaberStatus SaberActivation::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam ¶m) { + + int num = inputs[0]->num(); + int channel = inputs[0]->channel(); + float* ptr_out = (float*)outputs[0]->mutable_data(); + const float* ptr_in = (const float*)inputs[0]->data(); + int size = inputs[0]->valid_size(); + int csize= size / (channel * num); + int threads = 1; + this->_ctx->get_mode(threads); + //multi threads + int nums_per_thread = size / threads; + int remain = size - threads * nums_per_thread; + //openmp 16 + int neon_loop_cnt = nums_per_thread >> 4; + int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); + //deal with 4 data + int neon_loop_cnt_dim4 = nums_per_thread >> 2; + int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); + float32x4_t vzero = vdupq_n_f32(0.f); + float coef = param.coef; + float slope = param.negative_slope; + bool channel_shared = param.prelu_param.channel_shared; + float* slopes_ptr = nullptr; + switch (param.active){ + //x > 0 ? 
x :0 + case Active_relu: + #pragma omp parallel for + for (int i = 0; i < threads; ++i) { + const float* ptr_in_thread = ptr_in + i * nums_per_thread; + float* ptr_out_thread = ptr_out + i * nums_per_thread; + int cnt = neon_loop_cnt; +#ifdef __aarch64__ + for (int num = 0; num < neon_loop_cnt; num++){ + float32x4_t vr0 = vld1q_f32(ptr_in_thread); + // ptr_in_thread+=4; + float32x4_t vr1 = vld1q_f32(ptr_in_thread + 4); + // ptr_in_thread+=4; + float32x4_t vr2 = vld1q_f32(ptr_in_thread + 8); + // ptr_in_thread+=4; + float32x4_t vr3 = vld1q_f32(ptr_in_thread + 12); + //ptr_in_thread+=4; + ptr_in_thread += 16; + vr0 = vmaxq_f32(vr0, vzero); + vr1 = vmaxq_f32(vr1, vzero); + vr2 = vmaxq_f32(vr2, vzero); + vr3 = vmaxq_f32(vr3, vzero); + vst1q_f32(ptr_out_thread, vr0); + //ptr_out_thread+=4; + vst1q_f32(ptr_out_thread + 4, vr1); + // ptr_out_thread+=4; + vst1q_f32(ptr_out_thread + 8, vr2); + // ptr_out_thread+=4; + vst1q_f32(ptr_out_thread + 12, vr3); + //ptr_out_thread+=4; + ptr_out_thread += 16; + } +#else + if (cnt > 0) { + asm volatile ( + "1: @ loop header\n" + "vld1.32 {d0-d1}, [%[din]]! @ load din 0\n" + "vld1.32 {d2-d3}, [%[din]]! @ load din 0\n" + "vld1.32 {d4-d5}, [%[din]]! @ load din 0\n" + "vld1.32 {d6-d7}, [%[din]]! @ load din 0\n" + + "vmax.f32 q8, q0, %q[vzero] @ relu\n" + "vmax.f32 q9, q1, %q[vzero] @ relu\n" + "vmax.f32 q10, q2, %q[vzero] @ relu\n" + "vmax.f32 q11, q3, %q[vzero] @ relu\n" + + "vst1.32 {d16-d17}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din]] @ preload data\n" + "vst1.32 {d18-d19}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #128] @ preload data\n" + "vst1.32 {d20-d21}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #256] @ preload data\n" + "vst1.32 {d22-d23}, [%[dout]]! 
@ store result, add pointer\n" + "pld [%[din], #384] @ preload data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne 1b @ jump to main loop start point\n" + :[dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread), [cnt] "+r"(cnt) + :[vzero] "w" (vzero) + :"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); + } +#endif + for (int j = 0; j < neon_loop_remain; j++) { + ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f; + ptr_in_thread++; + ptr_out_thread++; + } + } + ptr_out = ptr_out + threads * nums_per_thread; + ptr_in = ptr_in + threads * nums_per_thread; + for (int i = 0; i < remain; i++) { + ptr_out[0] = ptr_in[0] > 0.f ? ptr_in[0] : 0.f; + ptr_in++; + ptr_out++; + } + break; + + // x > 0 ? x : 0; + // x < threshold ? x : threshold + case Active_clipped_relu: + //coef = param.coef; + #pragma omp parallel for + for (int i = 0; i < threads; ++i) { + const float* ptr_in_thread = ptr_in + i * nums_per_thread; + float* ptr_out_thread = ptr_out + i * nums_per_thread; + int cnt = neon_loop_cnt; + float32x4_t vthreshold = vdupq_n_f32(coef); +#ifdef __aarch64__ + for (int num = 0; num < neon_loop_cnt; num++){ + float32x4_t vr0 = vld1q_f32(ptr_in_thread); + float32x4_t vr1 = vld1q_f32(ptr_in_thread + 4); + float32x4_t vr2 = vld1q_f32(ptr_in_thread + 8); + float32x4_t vr3 = vld1q_f32(ptr_in_thread + 12); + ptr_in_thread += 16; + + vr0 = vmaxq_f32(vr0,vzero); + vr1 = vmaxq_f32(vr1,vzero); + vr2 = vmaxq_f32(vr2,vzero); + vr3 = vmaxq_f32(vr3,vzero); + + uint32x4_t vmask0 = vcgeq_f32(vr0, vthreshold); + uint32x4_t vmask1 = vcgeq_f32(vr1, vthreshold); + uint32x4_t vmask2 = vcgeq_f32(vr2, vthreshold); + uint32x4_t vmask3 = vcgeq_f32(vr3, vthreshold); + + float32x4_t vout0 =vbslq_f32(vmask0, vthreshold, vr0); + float32x4_t vout1 =vbslq_f32(vmask1, vthreshold, vr1); + float32x4_t vout2 =vbslq_f32(vmask2, vthreshold, vr2); + float32x4_t vout3 =vbslq_f32(vmask3, vthreshold, vr3); + + + vst1q_f32(ptr_out_thread, vout0); + vst1q_f32(ptr_out_thread + 4, 
vout1); + vst1q_f32(ptr_out_thread + 8, vout2); + vst1q_f32(ptr_out_thread + 12, vout3); + //ptr_out_thread+=4; + ptr_out_thread += 16; + } +#else + if (cnt > 0) { + asm volatile ( + "3: @ loop header\n" + "vld1.32 {d0-d1}, [%[din]]! @ load din 0\n" + "vld1.32 {d2-d3}, [%[din]]! @ load din 0\n" + "vld1.32 {d4-d5}, [%[din]]! @ load din 0\n" + "vld1.32 {d6-d7}, [%[din]]! @ load din 0\n" + + "vmax.f32 q8, q0, %q[vzero] @ relu\n" + "vmax.f32 q9, q1, %q[vzero] @ relu\n" + "vmax.f32 q10, q2, %q[vzero] @ relu\n" + "vmax.f32 q11, q3, %q[vzero] @ relu\n" + + "vcgt.f32 q0, q8, %q[vthreshold] @ v0 > threshold\n" + "vcgt.f32 q1, q9, %q[vthreshold] @ v0 > threshold\n" + "vcgt.f32 q2, q10, %q[vthreshold] @ v0 > threshold\n" + "vcgt.f32 q3, q11, %q[vthreshold] @ v0 > threshold\n" + + "vbit.f32 q8, %q[vthreshold], q0 @ \n" + "vbit.f32 q9, %q[vthreshold], q1 @ \n" + "vbit.f32 q10, %q[vthreshold], q2 @ \n" + "vbit.f32 q11, %q[vthreshold], q3 @ \n" + + "vst1.32 {d16-d17}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din]] @ preload data\n" + "vst1.32 {d18-d19}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #128] @ preload data\n" + "vst1.32 {d20-d21}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #256] @ preload data\n" + "vst1.32 {d22-d23}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #384] @ preload data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne 3b @ jump to main loop start point\n" + :[dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread), [cnt] "+r"(cnt) + :[vzero] "w" (vzero), [vthreshold] "w" (vthreshold) + :"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); + } +#endif + for (int j = 0; j < neon_loop_remain; j++) { + ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? (ptr_in_thread[0] > coef ? 
coef : ptr_in_thread[0]) : 0.f; + ptr_in_thread++; + ptr_out_thread++; + } + } + ptr_out = ptr_out + threads * nums_per_thread; + ptr_in = ptr_in + threads * nums_per_thread; + for (int i = 0; i < remain; i++) { + ptr_out[0] = ptr_in[0] > 0.f ? (ptr_in[0] > coef ? coef : ptr_in[0]) : 0.f; + ptr_in++; + ptr_out++; + } + break; + //sigmoid: 1/(exp(-x) + 1) + case Active_sigmoid: + #pragma omp parallel for + for (int i = 0; i < threads; i++) { + float32x4_t exp_vec = vdupq_n_f32(0.0f); + float32x4_t recip = vdupq_n_f32(0.0f); + const float* ptr_in_thread = ptr_in + i * nums_per_thread; + float* ptr_out_thread = ptr_out + i * nums_per_thread; + for (int j = 0; j < neon_loop_cnt_dim4; j++ ) { + exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread))); + exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f)); + recip = vrecpeq_f32(exp_vec); + recip = vmulq_f32 (vrecpsq_f32 (exp_vec, recip), recip); + recip = vmulq_f32 (vrecpsq_f32 (exp_vec, recip), recip); + vst1q_f32(ptr_out_thread, recip); + ptr_out_thread += 4; + ptr_in_thread += 4; + } + for (int j = 0; j < neon_loop_remain_dim4; j++){ + ptr_out_thread[0] = 1 / (1 + exp(-ptr_in_thread[0])); + ptr_in_thread++; + ptr_out_thread++; + } + } + ptr_out = ptr_out + threads * nums_per_thread; + ptr_in = ptr_in + threads * nums_per_thread; + for (int i = 0; i < remain; i++) { + ptr_out[0] = 1/(1+exp(-ptr_in[0])); + ptr_in++; + ptr_out++; + } + break; + + // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) + case Active_tanh: + //LOG(INFO) << "Active_tanh"; + #pragma omp parallel for + for (int i = 0; i < threads; ++i) { + float32x4_t vtwo = vdupq_n_f32(2.0f); + float32x4_t vone = vdupq_n_f32(1.0f); + const float* ptr_in_thread = ptr_in + i * nums_per_thread; + float* ptr_out_thread = ptr_out + i * nums_per_thread; + int cnt4 = neon_loop_cnt_dim4; + int remain4 = size; + cnt4 = cnt4 < 5 ? cnt4 : 0; + remain4 = cnt4 == 0 ? 
remain4 : neon_loop_remain_dim4; + for (int j = 0; j < cnt4; j++) { + float32x4_t vdin = vld1q_f32(ptr_in_thread); + float32x4_t vsum = vmulq_f32(vdin, vtwo); + float32x4_t vexp_sum = exp_ps(vsum); + float32x4_t vadd_sum = vaddq_f32(vexp_sum, vone); + float32x4_t vrecip = div_ps(vtwo, vadd_sum); + float32x4_t vout = vsubq_f32(vone, vrecip); + vst1q_f32(ptr_out_thread, vout); + ptr_out_thread += 4; + ptr_in_thread += 4; + } + for(int j = 0; j < remain4; j++){ + ptr_out_thread[0] = 1.0 - 2.0 / (1.0 + exp(2.0 * ptr_in_thread[0])); + //(exp(ptr_in_thread[0]) - exp(-ptr_in_thread[0])) / (exp(ptr_in_thread[0]) + exp(-ptr_in_thread[0])); + ptr_in_thread++; + ptr_out_thread++; + } + } + ptr_out = ptr_out + threads * nums_per_thread; + ptr_in = ptr_in + threads * nums_per_thread; + for (int j = 0; j < remain; ++j) { + ptr_out[0] = 1.0 - 2.0 / (1.0 + exp(2.0 * ptr_in[0]));//(exp(ptr_in[0]) - exp(-ptr_in[0])) / (exp(ptr_in[0]) + exp(-ptr_in[0])); + ptr_in++; + ptr_out++; + } + break; + + // stanh : b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} + case Active_stanh: + #pragma omp parallel for + for (int i = 0; i < threads; ++i) { + float32x4_t vcoef = vdupq_n_f32(coef); + float32x4_t vslope = vdupq_n_f32(slope); + float32x4_t vtwo = vdupq_n_f32(2.0f); + float32x4_t vone = vdupq_n_f32(1.0f); + const float* ptr_in_thread = ptr_in + i * nums_per_thread; + float* ptr_out_thread = ptr_out + i * nums_per_thread; + int cnt4 = neon_loop_cnt_dim4; + int remain4 = size; + cnt4 = cnt4 < 10 ? cnt4 : 0; + remain4 = cnt4 == 0 ? 
remain4 : neon_loop_remain_dim4; + for (int j = 0; j < cnt4; j++) { + float32x4_t vdin = vld1q_f32(ptr_in_thread); + float32x4_t vmul_sum = vmulq_f32(vdin, vslope); + float32x4_t vsum = vmulq_f32(vmul_sum, vtwo); + float32x4_t vexp_sum = exp_ps(vsum); + float32x4_t vadd_sum = vaddq_f32(vexp_sum, vone); + float32x4_t vrecip = div_ps(vtwo, vadd_sum); + float32x4_t vout = vsubq_f32(vone, vrecip); + vout = vmulq_f32(vout, vcoef); + vst1q_f32(ptr_out_thread, vout); + ptr_out_thread += 4; + ptr_in_thread += 4; + } + for(int j = 0; j < remain4; j++){ + float din = ptr_in_thread[0] * slope; + ptr_out_thread[0] = coef * (1.0 - 2.0 / (1.0 + exp(2.0 * din))); + ptr_in_thread++; + ptr_out_thread++; + } + } + ptr_out = ptr_out + threads * nums_per_thread; + ptr_in = ptr_in + threads * nums_per_thread; + for (int j = 0; j < remain; ++j) { + float din = ptr_in[0] * slope; + ptr_out[0] = coef * (1.0 - 2.0 / (1.0 + exp(2.0 * din))); + ptr_in++; + ptr_out++; + } + break; + + //prelu: x > 0 ? x : slope[c] * x + case Active_prelu: + slopes_ptr = (float*)param.prelu_param.slope->data(); + for (int n = 0; n < num; n++){ + const float* data_in_batch = ptr_in + n * channel * csize; + float* data_out_batch = ptr_out + n * channel * csize; +#pragma omp parallel for + for (int c = 0; c < channel; c++){ + const float* data_in_channel = data_in_batch + c * csize; + float* data_out_channel = data_out_batch + c * csize; + float slope_val = channel_shared ? 
slopes_ptr[0] : slopes_ptr[c]; + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vslope = vdupq_n_f32(slope_val); + int dim4 = csize >> 2; + int dim4_remain = csize - (dim4 * 4); +#ifdef __aarch64__ + for (int i = 0; i < dim4; i++){ + float32x4_t vr0 = vld1q_f32(data_in_channel); + uint32x4_t vmask = vcltq_f32(vr0, vzero);//vr0 <= vzero + float32x4_t vout = vmulq_f32(vr0, vslope);//vr0 * vslope + float32x4_t vout_sel = vbslq_f32(vmask, vout, vr0); + vst1q_f32(data_out_channel, vout_sel); + data_in_channel += 4; + data_out_channel += 4; + } +#else + int cnt = dim4; + if (dim4 > 0){ + asm volatile( + "2: @main loop\n" + "vld1.f32 {d0-d1}, [%[ptr_in]]! @load q1\n" + "vclt.f32 q1, q0, %q[vzero] @vcle q0 <= vzero\n" + "vmul.f32 q2, q0, %q[vslope] @vmul q0 * vslope\n" + "vbit.32 q0, q2, q1 @vbit q0, q2, q1\n" + "subs %[cnt], #1 @subs nn, 1\n" + "vst1.f32 {d0-d1}, [%[ptr_out]]! @store data\n" + "bne 2b @bne nn\n" + :[ptr_in] "+r" (data_in_channel), [cnt] "+r" (cnt), \ + [ptr_out] "+r" (data_out_channel) + :[vzero] "w" (vzero), [vslope] "w" (vslope) + :"q0", "q1", "q2" + ); + } +#endif //__aarch64__ + for (int i = 0 ; i < dim4_remain ; i++) { + data_out_channel[0] = data_in_channel[0] > 0 ? data_in_channel[0] : data_in_channel[0] * slope_val; + data_in_channel++; + data_out_channel++; + } + } + } + break; + + //elu: x > 0 ? 
x : coef * (exp(x) - 1) + case Active_elu: + #pragma omp parallel for + for (int i = 0; i < threads; ++i) { + const float* ptr_in_thread = ptr_in + i * nums_per_thread; + float* ptr_out_thread = ptr_out + i * nums_per_thread; + int cnt = neon_loop_cnt; + float32x4_t vone = vdupq_n_f32(1.0f); + float32x4_t vcoef = vdupq_n_f32(coef); + for (int num = 0; num < neon_loop_cnt; num++){ + float32x4_t vr0 = vld1q_f32(ptr_in_thread); + // ptr_in_thread+=4; + float32x4_t vr1 = vld1q_f32(ptr_in_thread + 4); + // ptr_in_thread+=4; + float32x4_t vr2 = vld1q_f32(ptr_in_thread + 8); + // ptr_in_thread+=4; + float32x4_t vr3 = vld1q_f32(ptr_in_thread + 12); + //ptr_in_thread+=4; + ptr_in_thread += 16; + + float32x4_t vsum0 = exp_ps(vr0); + float32x4_t vsum1 = exp_ps(vr1); + float32x4_t vsum2 = exp_ps(vr2); + float32x4_t vsum3 = exp_ps(vr3); + uint32x4_t vmask0 = vcgeq_f32(vr0, vzero); + uint32x4_t vmask1 = vcgeq_f32(vr1, vzero); + uint32x4_t vmask2 = vcgeq_f32(vr2, vzero); + uint32x4_t vmask3 = vcgeq_f32(vr3, vzero); + vsum0 = vsubq_f32(vsum0, vone); + vsum1 = vsubq_f32(vsum1, vone); + vsum2 = vsubq_f32(vsum2, vone); + vsum3 = vsubq_f32(vsum3, vone); + + vsum0 = vmulq_f32(vsum0, vcoef); + vsum1 = vmulq_f32(vsum1, vcoef); + vsum2 = vmulq_f32(vsum2, vcoef); + vsum3 = vmulq_f32(vsum3, vcoef); + + + + float32x4_t vout0 =vbslq_f32(vmask0, vr0, vsum0); + float32x4_t vout1 =vbslq_f32(vmask1, vr1, vsum1); + float32x4_t vout2 =vbslq_f32(vmask2, vr2, vsum2); + float32x4_t vout3 =vbslq_f32(vmask3, vr3, vsum3); + + vst1q_f32(ptr_out_thread, vout0); + //ptr_out_thread+=4; + vst1q_f32(ptr_out_thread + 4, vout1); + // ptr_out_thread+=4; + vst1q_f32(ptr_out_thread + 8, vout2); + // ptr_out_thread+=4; + vst1q_f32(ptr_out_thread + 12, vout3); + //ptr_out_thread+=4; + ptr_out_thread += 16; + } + + for (int j = 0; j < neon_loop_remain; j++) { + ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? 
ptr_in_thread[0] : coef * (exp(ptr_in_thread[0]) - 1); + ptr_in_thread++; + ptr_out_thread++; + } + } + ptr_out = ptr_out + threads * nums_per_thread; + ptr_in = ptr_in + threads * nums_per_thread; + for (int i = 0; i < remain; i++) { + ptr_out[0] = ptr_in[0] > 0.f ? ptr_in[0] : coef * (exp(ptr_in[0]) - 1); + ptr_in++; + ptr_out++; + } + break; + default: + return SaberUnKownError; + } + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, ARM, AK_HALF); +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, ARM, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/arm/saber_activation.h b/saber/funcs/impl/arm/saber_activation.h new file mode 100644 index 000000000..10ef82f8a --- /dev/null +++ b/saber/funcs/impl/arm/saber_activation.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_ACTIVATION_H +#define ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_ACTIVATION_H + +#include "saber/funcs/impl/impl_activation.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberActivation : \ + public ImplBase< + ARM, + OpDtype, + ActivationParam > +{ +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberActivation() + {} + + ~SaberActivation() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ActivationParam& param, Context& ctx) { + this->_ctx = &ctx; + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ActivationParam& param, Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ActivationParam& param); + + +}; + +//template class SaberActivation; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_ACTIVATION_H diff --git a/saber/funcs/impl/arm/saber_concat.cpp b/saber/funcs/impl/arm/saber_concat.cpp new file mode 100644 index 000000000..6fb3e3af5 --- /dev/null +++ b/saber/funcs/impl/arm/saber_concat.cpp @@ -0,0 +1,54 @@ +#include "saber/funcs/impl/arm/saber_concat.h" + +namespace anakin{ + +namespace saber{ + +template +void concat_kernel_arm(const int len, const dtype* src, dtype* dst) { + if (dst != src) { + memcpy(dst, src, sizeof(dtype) * len); + } +} + +template <> +SaberStatus SaberConcat::dispatch(\ + const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam ¶m) { + + int input_size = inputs.size(); + + //! 
get output data, valid shape and stride shape + int offset_concat_axis = 0; + Shape out_shape = outputs[0]->valid_shape(); + const int out_concat_axis = out_shape[param.axis]; + + if (inputs.size() == 1) { + outputs[0]->copy_from(*inputs[0]); + return SaberSuccess; + } + + OpDataType* dout = (OpDataType*)outputs[0]->mutable_data(); + + for (int i = 0; i < input_size; ++i) { + Shape sh_in = inputs[i]->valid_shape(); + const OpDataType* din = (const OpDataType*)inputs[i]->data(); + const int in_concat_axis = sh_in[param.axis]; + for (int n = 0; n < _num_concats; ++n) { + concat_kernel_arm(in_concat_axis * _concat_input_size, + din + n * in_concat_axis * _concat_input_size, + dout + (n * out_concat_axis + offset_concat_axis) + * _concat_input_size); + } + offset_concat_axis += in_concat_axis; + } + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, ARM, AK_HALF); +DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, ARM, AK_INT8); +//template class SaberConcat; + +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/arm/saber_concat.h b/saber/funcs/impl/arm/saber_concat.h new file mode 100644 index 000000000..1370b7ed8 --- /dev/null +++ b/saber/funcs/impl/arm/saber_concat.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_CONCAT_H +#define ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_CONCAT_H + +#include "saber/funcs/impl/impl_concat.h" +#include "saber/core/tensor.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +template +class SaberConcat : \ + public ImplBase< + ARM, OpDtype, + ConcatParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberConcat() = default; + ~SaberConcat() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam ¶m, Context &ctx){ + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam ¶m, Context &ctx){ + + _num_concats = inputs[0]->count_valid(0, param.axis); + _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam ¶m); + +private: + int _num_concats; + int _concat_input_size; +}; + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE + +#endif //ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_CONCAT_H diff --git a/saber/funcs/impl/bm/.vender_conv.cpp.swp b/saber/funcs/impl/bm/.vender_conv.cpp.swp new file mode 100644 index 000000000..0f2720a4b Binary files /dev/null and b/saber/funcs/impl/bm/.vender_conv.cpp.swp differ diff --git a/saber/funcs/impl/bm/CMakeLists.txt b/saber/funcs/impl/bm/CMakeLists.txt new file mode 100644 index 000000000..9bc297213 --- /dev/null +++ b/saber/funcs/impl/bm/CMakeLists.txt @@ -0,0 +1,59 @@ +function(anakin_fetch_files_with_suffix search_dir suffix outputs) + exec_program(ls ${search_dir} + ARGS "*.${suffix}" + OUTPUT_VARIABLE OUTPUT + RETURN_VALUE VALUE) + if(NOT VALUE) + string(REPLACE "\n" ";" OUTPUT_LIST "${OUTPUT}") + set(abs_dir "") + foreach(var ${OUTPUT_LIST}) + set(abs_dir ${abs_dir} ${search_dir}/${var}) + 
#message(STATUS "fetch_result: ${abs_dir}") + endforeach() + set(${outputs} ${${outputs}} ${abs_dir} PARENT_SCOPE) + else() + #message(WARNING "anakin_fetch_files_recursively ${BoldRed}failed${ColourReset}:\n" + # "real_dir:${BoldYellow}${search_dir}${ColourReset}\n" + # "suffix:*.${BoldYellow}${suffix}${ColourReset} \n") + endif() +endfunction() + + + +set(BIN_NAME bmkernel_bin) +set(LINK_CONFIG link/bm1682_ddr.lds) +set(BM_ROOT /usr/local/include/bm) +anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/bm/device "c" DEVICE_KERNEL_SRC) +message(${DEVICE_KERNEL_SRC}) +#string(REPLACE ".c" ".o" DEVICE_KERNEL_OBJ ${DEVICE_KERNEL_SRC}) +#message(${DEVICE_KERNEL_OBJ}) + +set(OBJ_PATH "") +FOREACH(FILE_PATH ${DEVICE_KERNEL_SRC}) + STRING(REGEX REPLACE ".+/(.+)\\..*" "\\1" FILE_NAME ${FILE_PATH}) + set(OBJ_PATH ${OBJ_PATH} ${FILE_NAME}.o) + message(${FILE_NAME}.o) +ENDFOREACH(FILE_PATH) +message(${OBJ_PATH}) + +add_custom_command(OUTPUT bm_kernel_tmp + COMMAND arm-none-eabi-gcc ${DEVICE_KERNEL_SRC} -c -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -std=gnu99 -O2 -Wall -Werror -ffunction-sections -fdata-sections -nostdlib -DENABLE_PRINT -I${BM_ROOT}/include/config -I${BM_ROOT}/include/common -I${BM_ROOT}/include/c_model -I${BM_ROOT}/include/firmware_core -I${BM_ROOT}/include/bmlib + COMMAND arm-none-eabi-gcc -T ${BM_ROOT}/${LINK_CONFIG} -mcpu=arm926ej-s -mfpu=vfp -fno-short-enums -Wl,--check-sections -Wl,--gc-sections -Wl,--unresolved-symbols=report-all -Wl,--no-enum-size-warning -o ${BIN_NAME}.elf -Wl,--start-group -lc -lm ${OBJ_PATH} ${BM_ROOT}/lib/device/fw-top.a ${BM_ROOT}/lib/device/fw-arm.a -Wl,--end-group + COMMAND arm-none-eabi-objcopy -O binary -R *.slow* ${BIN_NAME}.elf ${BIN_NAME}_itcm.bin + COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_itcm.bin > ${BIN_NAME}_itcm.hex.sim + COMMAND arm-none-eabi-objcopy -O binary -j *.slow* ${BIN_NAME}.elf ${BIN_NAME}_ddr.bin + COMMAND hexdump -v -e '1/4 \"%08x\\n\"' ${BIN_NAME}_ddr.bin > ${BIN_NAME}_ddr.hex.sim + 
COMMAND printf "%x" 0xAABBCCDD > ${BIN_NAME}.bin + COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin + COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin + COMMAND printf "%x" 0x0 >> ${BIN_NAME}.bin + + COMMAND printf \"%x\" `wc -c < ${BIN_NAME}_itcm.hex.sim` >> ${BIN_NAME}.bin + + COMMAND cat ${BIN_NAME}_itcm.hex.sim >> ${BIN_NAME}.bin + COMMAND cat ${BIN_NAME}_ddr.hex.sim >> ${BIN_NAME}.bin + COMMAND mkdir -p /usr/local/include/bm + COMMAND cp ${BIN_NAME}.bin /usr/local/include/bm/ + COMMENT "BM Kernel compilation..." +) +add_custom_target(ANAKIN ALL DEPENDS bm_kernel_tmp) diff --git a/saber/funcs/impl/bm/base/CMakeLists.txt b/saber/funcs/impl/bm/base/CMakeLists.txt deleted file mode 100644 index 59b82abb5..000000000 --- a/saber/funcs/impl/bm/base/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file CMakeLists files in the saber subdirectory for nvidia gpu code -# @auther cuichaowen -# @date 2017-11-29 -# ---------------------------------------------------------------------------- - -if(USE_BM) - anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/include "h" ANAKIN_SABER_BM_C_SRC) - anakin_fetch_files_with_suffix_recursively(${BM_BASE_CODE_ROOT}/lib "so" ANAKIN_SABER_BM_STATIC_LIB) -endif() - -macro(anakin_set_upscope src) - set(${src} ${${src}} PARENT_SCOPE) -endmacro() - -if(USE_BM) - anakin_set_upscope(ANAKIN_SABER_BM_C_SRC) - anakin_set_upscope(ANAKIN_SABER_BM_STATIC_LIB) -endif() diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h deleted file mode 100644 index 97feb1972..000000000 --- a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_api.h +++ /dev/null @@ -1,814 +0,0 @@ -#ifndef BMDNN_API_H -#define BMDNN_API_H - -#include "bmdnn_runtime.h" -#include "op_code.h" - -#if defined (__cplusplus) -extern "C" { -#endif - -/* - * All the name-style of input/output are 
in the viewpoint of forward operation - */ - -typedef struct kernel_param{ - int g; - int oc; - int ic; - int h; - int w; -}bm_kernel_param_t; - -typedef struct bm_conv_param{ - int stride_h; - int stride_w; - int pad_h; - int pad_w; - int dilation_h; - int dilation_w; - bool result_add; -}bm_conv_param_t; - -typedef struct bm_pool_param{ - int stride_h; - int stride_w; - int pad_h; - int pad_w; - int kh; - int kw; - bool is_avg_pooling; -}bm_pool_param_t; - -bm_status_t bmdnn_conv_relu_pool_forward( - bm_handle_t handle, - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - bm_tensor_4d_t input_shape, - bm_kernel_param_t kernel_param, - bm_pool_param_t pool_param, - bm_conv_param_t conv_param, - bool with_bias, - bm_device_mem_t output); - -bm_status_t bmdnn_conv_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - bm_tensor_4d_t input_shape, - bm_kernel_param_t kernel_param, - bm_tensor_4d_t output_shape, - bm_conv_param_t conv_param, - bool with_bias, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_deconv_forward( - bm_handle_t handle, - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - bm_tensor_4d_t input_shape, - bm_kernel_param_t kernel_param, - bm_tensor_4d_t output_shape, - bm_conv_param_t conv_param, - bool with_bias, - bm_device_mem_t output); - -bm_status_t bmdnn_conv_backward_bias( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - int input_n, - int input_c, - int input_h, - int input_w, - int groups, - int output_c, - int kh, - int kw, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int result_add, - //output - bm_device_mem_t bias_diff); - -bm_status_t bmdnn_pooling_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_c, - int input_h, - int input_w, - int kh, - int kw, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int is_avg_pooling, - //output - 
bm_device_mem_t output - ); -bm_status_t bmdnn_upsample_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_c, - int input_h, - int input_w, - int size, - //output - bm_device_mem_t output - ); -bm_status_t bmdnn_roi_pooling_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t rois, - int input_n, - int input_c, - int input_h, - int input_w, - int pooled_h, - int pooled_w, - int roi_num, - int spatial_scale, - //output - bm_device_mem_t output - ); - -bm_status_t bmdnn_fc_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - int batch_size, - int num_output_neuron, - int num_input_neuron, - int transpose, - int using_bias, - int using_relu, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_fc_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input, - bm_device_mem_t weight, - int num_output_neuron, - int batch_size, - int num_input_neuron, - int using_bias, - int propagate_down_bias_diff, - int propagate_down_weight_diff, - int propagate_down_bottom, - //output - bm_device_mem_t weight_diff, - bm_device_mem_t bias_diff, - bm_device_mem_t input_diff); - -bm_status_t bmdnn_dropout_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - float dropout_ratio, - int input_n, - int input_dim, - //output - bm_device_mem_t output, - bm_device_mem_t mask); - -bm_status_t bmdnn_dropout_backward( - bm_handle_t handle, - //input - bm_device_mem_t input, - float dropout_ratio, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_batchnorm_forward_inference( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t mean_ma, - bm_device_mem_t variance_ma, - float scale_ma, - bm_device_mem_t variance, - float eps, - int input_n, - int input_c, - int input_h, - int input_w, - //output - bm_device_mem_t output); - -bm_status_t 
bmdnn_batchnorm_forward_train( - bm_handle_t handle, - //input - bm_device_mem_t input, - float ma_fraction, - float eps, - int input_n, - int input_c, - int input_h, - int input_w, - //output - bm_device_mem_t output, - bm_device_mem_t mean, - bm_device_mem_t variance, - bm_device_mem_t mean_ma, - bm_device_mem_t variance_ma); - -bm_status_t bmdnn_batchnorm_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - bm_device_mem_t variance, - int input_n, - int input_c, - int input_h, - int input_w, - int using_global_stats, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_lrn_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_c, - int input_h, - int input_w, - int lrn_n, - float alpha, - float beta, - float k, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_lrn_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - bm_device_mem_t input, - int lrn_n, - float alpha, - float beta, - float k, - int input_n, - int input_c, - int input_h, - int input_w, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_relu_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - float negative_slope, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_relu_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input, - float negative_slope, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_sigmoid_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_sigmoid_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_tanh_forward( - bm_handle_t handle, - 
//input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_tanh_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_softmax_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_c, - int input_inner_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_softmax_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - int input_n, - int input_c, - int input_inner_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_softmax_loss_forward( - bm_handle_t handle, - bm_device_mem_t input, - bm_device_mem_t label, - float normalizer, - int input_n, - int input_c, - int input_inner_dim, - bm_device_mem_t output, - bm_device_mem_t loss); -bm_status_t bmdnn_interp_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_c, - int input_h, - int input_w, - int pad_bag, - int pad_end, - int output_h, - int output_w, - //output - bm_device_mem_t output - ); -bm_status_t bmdnn_softmax_loss_backward( - bm_handle_t handle, - bm_device_mem_t output, - bm_device_mem_t label, - bm_device_mem_t loss, - float normalizer, - int input_n, - int input_c, - int input_inner_dim, - bm_device_mem_t input_diff); - -bm_status_t bmdnn_softmax_loss_bidirection( - bm_handle_t handle, - bm_device_mem_t input, - bm_device_mem_t label, - float normalizer, - int input_n, - int input_c, - int input_inner_dim, - bm_device_mem_t output_diff, - bm_device_mem_t loss); - -bm_status_t bmdnn_multiregion_forward_parallel( - bm_handle_t handle, - //input - bm_device_mem_t* input, - int* input_n, - int* input_c, - int* input_h, - int* input_w, - int input_num, - int classes, - int coords, - int nums, - int* Activate_parm, - //output - bm_device_mem_t* output -); - 
-bm_status_t bmdnn_accuracy( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t label_idx, - bm_device_mem_t input_mem_buffer, - int input_num, - int input_dim, - int top_k, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_coeff_update_sgd( - bm_handle_t handle, - bm_device_mem_t weight_diff, - bm_device_mem_t weight, - bm_device_mem_t history_weight, - int weight_count, - float base_lr, - float momentum, - float weight_decay); - -bm_status_t bmdnn_fc_backward_sgd( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input, - //input and output - bm_device_mem_t weight, - bm_device_mem_t weight_history, - int num_output_neuron, - int batch_size, - int num_input_neuron, - int using_bias, - int propagate_down_bias_diff, - int propagate_down_weight_diff, - int propagate_down_bottom, - float base_lr, - float momentum, - float weight_decay, - //output - bm_device_mem_t bias_diff, - bm_device_mem_t input_diff); - -bm_status_t bmdnn_permute( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_c, - int input_h, - int input_w, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_normalize_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t scale, - float eps, - float scale_val, - bool across_spatial, - bool channel_share, - int input_n, - int input_c, - int input_h, - int input_w, - //output - bm_device_mem_t output); - -/* - * MD Operations for user - */ - - -bm_status_t bmdnn_md_scalar( - bm_handle_t handle, - //input - bm_device_mem_t tensor_A, - bm_device_mem_t tensor_B, - int input_n, - int input_c, - int input_h, - int input_w, - ALIGN_TENSOR_OP align_tensor_op, - int result_add, - int A_is_constant, - int B_is_constant, - float A_const_val, - float B_const_val, - int B_N_is_1, - int B_index_is_1, - //output - bm_device_mem_t tensor_R); - -bm_status_t bmdnn_md_cmp( - bm_handle_t handle, - //input - bm_device_mem_t tensor_A, - 
bm_device_mem_t tensor_B, - bm_device_mem_t tensor_C, - bm_device_mem_t tensor_D, - int input_n, - int input_c, - int input_h, - int input_w, - int A_is_constant, - int B_is_constant, - int C_is_constant, - int D_is_constant, - float A_constant, - float B_constant, - unsigned int C_constant, - unsigned int D_constant, - int result_skip, - //output - bm_device_mem_t tensor_Y, - bm_device_mem_t tensor_R); - -bm_status_t bmdnn_md_sfu( - bm_handle_t handle, - //input - bm_device_mem_t tensor_A, - int input_n, - int input_c, - int input_h, - int input_w, - SFU_OP sfu_op, - float a, - int n, - //output - bm_device_mem_t tensor_Y); - -bm_status_t bmdnn_md_sum( - bm_handle_t handle, - //input - bm_device_mem_t tensor_A, - int input_n, - int input_c, - int input_h, - int input_w, - int result_add, - //output - bm_device_mem_t tensor_Y); - - -bm_status_t bmdnn_md_linear( - bm_handle_t handle, - //input - bm_device_mem_t tensor_A, - bm_device_mem_t tensor_B, - bm_device_mem_t tensor_S, - int input_n, - int input_c, - int input_h, - int input_w, - LINEAR_OP linear_op, - int result_add, - int B_is_const, - int S_is_const, - float B_const_val, - float S_const_val, - //output - bm_device_mem_t tensor_Y); - -bm_status_t bmdnn_img_sum( - bm_handle_t handle, - //input - bm_device_mem_t tensor_A, - int input_n, - int input_c, - int input_h, - int input_w, - int result_add, - //output - bm_device_mem_t tensor_Y); - -/* - * fullnet mode - */ -bm_status_t bmdnn_fullnet( - bm_handle_t handle, - unsigned long long bdc_cmd_offset, - unsigned long long gdma_cmd_offset, - unsigned long long cdma_cmd_offset, - unsigned long long cmd_num_offset - ); - -/* - * multiple fullnet mode - */ -bm_status_t bmdnn_multi_fullnet( - bm_handle_t handle, - int input_num, - unsigned long long* user_input_global_offset, - unsigned long long* cmd_input_global_offset, - int* input_tensor_size, - int output_num, - unsigned long long* user_output_global_offset, - unsigned long long* cmd_output_global_offset, - 
int* output_tensor_size, - unsigned long long bdc_cmd_offset, - unsigned long long gdma_cmd_offset, - unsigned long long cdma_cmd_offset, - int* bdc_cmd_num, - int* gdma_cmd_num, - int* cdma_cmd_num, - int cmdgroup_num - ); - -/* - * dynamic fullnet mode - */ -bm_status_t bmdnn_dynamic_fullnet( - bm_handle_t handle, - unsigned long long compiled_ir_global_addr, - unsigned int compiled_ir_length, - unsigned int batch_num, - unsigned int input_num, - unsigned long long* input_global_offset, - unsigned int* input_height, - unsigned int* input_width, - unsigned int output_num, - unsigned long long* output_global_offset, - unsigned long long apd_ctx_mem_offset -#if defined(USING_CMODEL) && !defined(USING_FULLNET) - ,float** p_refer_result -#endif - ); - -/** - * Depthwise convolution. - */ -bm_status_t bmdnn_depthwise_forward( - bm_handle_t handle, - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - int input_n, - int input_c, - int input_h, - int input_w, - int kernel_h, - int kernel_w, - int dilation_h, - int dilation_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int using_bias, - bm_device_mem_t output); - -bm_status_t bmdnn_lstm_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t cont, - bm_device_mem_t input_static, - /*bm_device_mem_t w_hc, - bm_device_mem_t w_xc,*/ - bm_device_mem_t w_hxc, - bm_device_mem_t w_xstatic, - bm_device_mem_t b_c, - bm_device_mem_t h_0, - bm_device_mem_t c_0, - int input_n, - int seq_len, - int input_dim, - int input_static_dim, - int output_dim, - int with_input_static, - //output - bm_device_mem_t c, - bm_device_mem_t gate, - bm_device_mem_t h_T, - bm_device_mem_t c_T, - bm_device_mem_t h); - -bm_status_t bmdnn_netease_ocr_forward( - bm_handle_t handle, - //input - bm_device_mem_t conv1_ifmap, - bm_device_mem_t params, - bm_device_mem_t result); - -typedef struct dim4_s { - int n, c, h, w; -} dim4_t; -enum -{ - CONV_DEPTHWISE, - CONV_3D -}; -typedef struct 
mobilenet_conv_param_s -{ - /** convolution. */ - int type; - bm_device_mem_t kernel; - bm_device_mem_t bias; - dim4_t kernel_shape; - int dilation_h, dilation_w; - int pad_h, pad_w; - int stride_h, stride_w; - bool using_bias; - /** batchnorm. */ - bm_device_mem_t mean; - bm_device_mem_t variance; - /** relu. */ - float slope; -} mobilenet_conv_param_t; -bm_status_t bmdnn_mobilenet_forward( - bm_handle_t handle, - const mobilenet_conv_param_t *conv, - int num, - const dim4_t &input_shape, - const bm_device_mem_t &input_global_mem, - dim4_t &output_shape, - bm_device_mem_t &output_global_mem, - float parallel_performance_factor = 1.f); - -bm_status_t bmdnn_conv_forward_bank_conflict( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - bm_tensor_4d_t input_shape, - bm_kernel_param_t kernel_param, - bm_tensor_4d_t output_shape, - bm_conv_param_t conv_param, - bool with_bias, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_pooling_forward_bank_conflict( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_c, - int input_h, - int input_w, - int kh, - int kw, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int is_avg_pooling, - bm_device_mem_t output); - -bm_status_t bmdnn_fc_forward_bank_conflict( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - int batch_size, - int num_output_neuron, - int num_input_neuron, - int transpose, - int using_bias, - int using_relu, - bm_device_mem_t output); - -bm_status_t bmdnn_conv_forward_power_evaluation( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t weight, - bm_device_mem_t bias, - bm_tensor_4d_t input_shape, - bm_kernel_param_t kernel_param, - bm_tensor_4d_t output_shape, - bm_conv_param_t conv_param, - bool with_bias, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_img_scale( - bm_handle_t handle, bm_device_mem_t dst, 
bm_device_mem_t src, int n, - int c, int dh, int sh, int dw, int sw); - -#if defined (__cplusplus) -} -#endif - -#endif /* BMDNN_API_H */ diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h deleted file mode 100644 index 384cd4108..000000000 --- a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_ext_api.h +++ /dev/null @@ -1,438 +0,0 @@ -#ifndef BMDNN_EXT_API_H -#define BMDNN_EXT_API_H - -#include "bmdnn_runtime.h" - -#if defined (__cplusplus) -extern "C" { -#endif - -bm_status_t bmdnn_threshold_forward( - bm_handle_t handle, - float threshold, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output - ); - -bm_status_t bmdnn_exp_forward( - bm_handle_t handle, - float base, - float input_scale, - float input_shift, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output - ); - -bm_status_t bmdnn_exp_backward( - bm_handle_t handle, - float base, - float input_scale, - float input_shift, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff - ); - -bm_status_t bmdnn_power_forward( - bm_handle_t handle, - float power_, - float scale_, - float shift_, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output - ); - -bm_status_t bmdnn_power_backward( - bm_handle_t handle, - float power_, - float scale_, - float shift_, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff - ); - -bm_status_t bmdnn_euclidean_loss_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t label, - bm_device_mem_t temp_, - int input_n, - int input_dim, - //output - bm_device_mem_t diff, - bm_device_mem_t loss); - -bm_status_t bmdnn_euclidean_loss_backward( - bm_handle_t 
handle, - float alpha, - //input - bm_device_mem_t output, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_silence_backward( - bm_handle_t handle, - //input - //bm_device_mem_t output_data, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_lstm_unit_forward( - bm_handle_t handle, - //input - bm_device_mem_t X_i, - bm_device_mem_t X_f, - bm_device_mem_t X_o, - bm_device_mem_t X_g, - bm_device_mem_t C_prev, - bm_device_mem_t cont_expand, - int num, - int hidden_dim, - //output - bm_device_mem_t C, - bm_device_mem_t H); - -bm_status_t bmdnn_lstm_unit_backward( - bm_handle_t handle, - //input - bm_device_mem_t C_diff, - bm_device_mem_t H_diff, - bm_device_mem_t X_i, - bm_device_mem_t X_f, - bm_device_mem_t X_o, - bm_device_mem_t X_g, - bm_device_mem_t C_prev, - bm_device_mem_t C, - bm_device_mem_t cont_expand, - int num, - int hidden_dim, - //output - bm_device_mem_t C_prev_diff, - bm_device_mem_t X_i_diff, - bm_device_mem_t X_f_diff, - bm_device_mem_t X_o_diff, - bm_device_mem_t X_g_diff); - -bm_status_t bmdnn_eltwise_forward( - bm_handle_t handle, - int op_, - int flag_first, - float coeffs_, - int index, - //input - bm_device_mem_t input, - bm_device_mem_t target, - int input_n, - int input_dim, - //output - bm_device_mem_t mask_data, - bm_device_mem_t output); - -bm_status_t bmdnn_eltwise_backward( - bm_handle_t handle, - int op_, - int flag_first, - float coeffs_, - int index, - //input - bm_device_mem_t output_data, - bm_device_mem_t output_diff, - bm_device_mem_t input_data, - bm_device_mem_t mask_data, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_bias_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t bias, - int outer_dim, - int dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_bias_backward( - bm_handle_t handle, - int flag, - //input - bm_device_mem_t output_diff, - int 
outer_dim, - int bias_dim, - int inner_dim, - //output - bm_device_mem_t input_diff, - bm_device_mem_t bias_diff); - -bm_status_t bmdnn_log_forward( - bm_handle_t handle, - float scale, - float shift, - float base, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_log_backward( - bm_handle_t handle, - float scale, - float shift, - float base, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_absval_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_absval_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_sigmoid_cross_entropy_loss_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t target, - bm_device_mem_t buffer, - int input_n, - int input_dim, - //output - bm_device_mem_t output, - bm_device_mem_t loss); - -bm_status_t bmdnn_sigmoid_cross_entropy_loss_backward( - bm_handle_t handle, - //input - bm_device_mem_t output, - bm_device_mem_t target, - bm_device_mem_t output_diff, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_contrastive_loss_forward( - bm_handle_t handle, - //input - bm_device_mem_t input_0, - bm_device_mem_t input_1, - bm_device_mem_t label, - bm_device_mem_t buffer, - int input_n, - int input_c, - float margin, - bool legacy_version, - //output - bm_device_mem_t diff, - bm_device_mem_t dist_sq, - bm_device_mem_t loss); - -bm_status_t bmdnn_contrastive_loss_backward( - bm_handle_t handle, - //input - bm_device_mem_t label, - bm_device_mem_t diff, - bm_device_mem_t dist_sq, - bm_device_mem_t output_diff, - bm_device_mem_t buffer, - int input_n, 
- int input_dim, - float margin, - bool legacy_version, - int propagate_down_flag, - //output - bm_device_mem_t input_diff_0, - bm_device_mem_t input_diff_1); - -bm_status_t bmdnn_filter_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t filter, - int input_n, - int output_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_filter_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t filter, - int input_n, - int output_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_split_backward( - bm_handle_t handle, - //input - int is_first, - bm_device_mem_t output_diff, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_bnll_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_bnll_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input, - float threshold, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -bm_status_t bmdnn_prelu_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t slope, - float slope0, - int channel_shared, - int input_n, - int input_c, - int input_h, - int input_w, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_prelu_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input, - bm_device_mem_t slope, - int propagate_down_flag, - int channel_shared, - int input_n, - int input_c, - int input_h, - int input_w, - //output - bm_device_mem_t slope_diff, - bm_device_mem_t input_diff); - -bm_status_t bmdnn_scale_forward( - bm_handle_t handle, - //input - bm_device_mem_t input, - bm_device_mem_t scale, - int input_n, - int input_c, - int input_h, - int input_w, - int scale_dim, - int inner_dim, - int scale_is_neuron, - //output - bm_device_mem_t 
scale_extension, - bm_device_mem_t output); - -bm_status_t bmdnn_scale_backward( - bm_handle_t handle, - //input - bm_device_mem_t output_diff, - bm_device_mem_t input_data, - bm_device_mem_t scale_extension, - int propagate_down_flag, - int input_n, - int input_c, - int input_h, - int input_w, - int scale_dim, - int inner_dim, - int scale_is_neuron, - //output - bm_device_mem_t scale_diff, - bm_device_mem_t input_diff); - -bm_status_t bmdnn_elu_forward( - bm_handle_t handle, - float alpha, - //input - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t output); - -bm_status_t bmdnn_elu_backward( - bm_handle_t handle, - float alpha, - //input - bm_device_mem_t output_diff, - bm_device_mem_t output, - bm_device_mem_t input, - int input_n, - int input_dim, - //output - bm_device_mem_t input_diff); - -#if defined (__cplusplus) -} -#endif - -#endif /* BMDNN_EXT_API_H */ diff --git a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h b/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h deleted file mode 100644 index 6fede1338..000000000 --- a/saber/funcs/impl/bm/base/include/bmdnn/bmdnn_runtime.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef BMDNN_RUNTIME_H_ -#define BMDNN_RUNTIME_H_ - -#include "bmlib_runtime.h" - -#if defined (__cplusplus) -extern "C" { -#endif - -bm_status_t bmdnn_init( - bm_handle_t *handle); - -void bmdnn_deinit( - bm_handle_t handle); - -#if defined (__cplusplus) -} -#endif - -#endif diff --git a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h b/saber/funcs/impl/bm/base/include/bmdnn/op_code.h deleted file mode 100644 index f85846a8a..000000000 --- a/saber/funcs/impl/bm/base/include/bmdnn/op_code.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef OP_CODE_H_ -#define OP_CODE_H_ - - -typedef enum align_tensor_op { - ALIGN_TENSOR_ADD, - ALIGN_TENSOR_SUB, - ALIGN_TENSOR_MUL, - ALIGN_TENSOR_DIV, - TENSOR_INVALID -} ALIGN_TENSOR_OP; - -typedef enum linear_op { - LINEAR_MAC, - LINEAR_ADD_SQR, - LINEAR_SUB_SQR -} LINEAR_OP; - 
-typedef enum sfu_op { - SFU_XN, - SFU_EX, - SFU_LNX, - SFU_RSQ, - SFU_INVALID -} SFU_OP; -typedef struct tensor_4d_t { - int n; - int c; - int h; - int w; -}bm_tensor_4d_t; - - -#define TENSOR_ADD 0 -#define TENSOR_SUB 1 -#define TENSOR_MUL 2 -//Note the div should be implmented by KAMAKE algorithm -#define TENSOR_DIV 3 -#define TENSOR_MAX 4 -#define TENSOR_CPY 5 -#define TENSOR_MAC 6 - -#define TENSOR_N_DIM 0 -#define TENSOR_C_DIM 1 -#define TENSOR_H_DIM 2 -#define TENSOR_W_DIM 3 - -#define SHARE_REG_MESSAGE_WP 0 -#define SHARE_REG_MESSAGE_RP 1 -#define SHARE_REG_MESSAGE_IRQSTATUS 2 -#define SHARE_REG_CDMA_IRQSTATUS 3 - -#define SHAREMEM_MSG_FIXED_OFFSET (8192) -#define SHAREMEM_SIZE_BIT 8 -#define SHAREMEM_MASK ((1< -#include - -#if !defined(__x86_64__) && !defined(__aarch64__) -#error "BM needs 64-bit to compile" -#endif - -#if defined (__cplusplus) -extern "C" { -#endif - -typedef enum { - BM_SUCCESS = 0, - BM_ERR_DEVNOTREADY = 1, /* Device not ready yet */ - BM_ERR_FAILURE = 2, /* General failure */ - BM_ERR_TIMEOUT = 3, /* Timeout */ - BM_ERR_PARAM = 4, /* Parameters invalid */ - BM_ERR_NOMEM = 5, /* Not enough memory */ - BM_ERR_DATA = 6, /* Data error */ - BM_ERR_BUSY = 7, /* Busy */ - BM_ERR_NOFEATURE = 8, /* Not supported yet */ - BM_NOT_SUPPORTED = 9 -} bm_status_t; - -typedef enum { - BM_MEM_TYPE_DEVICE = 0, - BM_MEM_TYPE_HOST = 1, - BM_MEM_TYPE_SYSTEM = 2, - BM_MEM_TYPE_INT8_DEVICE = 3, - BM_MEM_TYPE_INVALID = 4 -} bm_mem_type_t; - -#define BM_MEM_ADDR_NULL (0xfffffffff) - -typedef struct bm_mem_desc { - unsigned char desc[16]; -} bm_mem_desc_t; - -struct bm_context; -typedef struct bm_context * bm_handle_t; -typedef struct bm_mem_desc bm_device_mem_t; -typedef struct bm_mem_desc bm_host_mem_t; -typedef struct bm_mem_desc bm_system_mem_t; - -#define BM_CHECK_RET(call) \ - do { \ - bm_status_t ret = call; \ - if ( ret != BM_SUCCESS ) { \ - printf("BM_CHECK_RET failed %d\n", ret); \ - ASSERT(0); \ - exit(-ret); \ - } \ - } while(0) - -/* - * control - 
*/ -void bm_flush( - bm_handle_t handle); -/* - * brief malloc host memory according to a tensor shape(each neuron is 32 bits) -*/ - -bm_status_t bm_malloc_neuron_device( - bm_handle_t handle, - bm_device_mem_t *pmem, - int n, - int c, - int h, - int w); - -/* - * brief malloc host memory in size of dword(32 bits) -*/ - -bm_status_t bm_malloc_device_dword( - bm_handle_t handle, - bm_device_mem_t *pmem, - int count); - -/* - * brief malloc host memory in size of byte -*/ - -bm_status_t bm_malloc_device_byte( - bm_handle_t handle, - bm_device_mem_t *pmem, - unsigned int size); - -void bm_free_device( - bm_handle_t handle, - bm_device_mem_t mem); - -/* - * brief malloc host memory in size of byte - */ -bm_status_t bm_malloc_host( - bm_handle_t handle, - bm_host_mem_t *pmem, - unsigned int size); - -void bm_free_host( - bm_handle_t handle, - bm_host_mem_t mem); - -void *bm_host_mem_get_pointer( - bm_host_mem_t mem); - -/* - * Memory copy and set - */ -bm_status_t bm_memcpy_h2d( - bm_handle_t handle, - bm_device_mem_t dst, - bm_host_mem_t src); - -bm_status_t bm_memcpy_d2h( - bm_handle_t handle, - bm_host_mem_t dst, - bm_device_mem_t src); - - -bm_status_t bm_memcpy_s2d( - bm_handle_t handle, - bm_device_mem_t dst, - bm_system_mem_t src); - -bm_status_t bm_memcpy_d2s( - bm_handle_t handle, - bm_system_mem_t dst, - bm_device_mem_t src); - -bm_status_t bm_memcpy_d2d( - bm_handle_t handle, - bm_device_mem_t dst, - int dst_offset, - bm_device_mem_t src, - int src_offset, - int len); - -bm_status_t bm_memset_device( - bm_handle_t handle, - const int value, - bm_device_mem_t mem); - -bm_device_mem_t bm_mem_from_system( - void * system_addr); - -bm_device_mem_t bm_mem_from_device( - void * device_addr); - -/* -*brief malloc one device memory with the shape of (N,C,H,W), copy the sys_mem to -device mem if need_copy is true -*/ - -bm_status_t bm_mem_convert_system_to_device_neuron( - bm_handle_t handle, - struct bm_mem_desc *dev_mem, - struct bm_mem_desc sys_mem, - bool 
need_copy, - int n, - int c, - int h, - int w); - -/* -*brief malloc one device memory with the size of coeff_count, copy the sys_mem to -device mem if need_copy is true -*/ -bm_status_t bm_mem_convert_system_to_device_coeff( - bm_handle_t handle, - struct bm_mem_desc *dev_mem, - struct bm_mem_desc sys_mem, - bool need_copy, - int coeff_count); - -/* - * memory info get and set - */ -unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem); -void bm_mem_set_device_addr(struct bm_mem_desc & mem, unsigned long long addr); -unsigned int bm_mem_get_device_size(struct bm_mem_desc mem); -void bm_mem_set_device_size(struct bm_mem_desc & mem, unsigned int size); -bm_mem_type_t bm_mem_get_type(struct bm_mem_desc mem); - -/* -* brief Get the handle of bmlib_runtime -* return : If the handle has been inited, return the handle it self , else init one and return it -*/ -bm_handle_t get_bm_handle(); - -/* - * Helper functions - */ - -/** -* \brief Get the number of nodechip (Constant 1 in bm1682) -* \return -* \ref NO -*/ -int bm_get_nodechip_num( - bm_handle_t handle); - -/** -* \brief Get the number of nodechip (Constant 64 in bm1682) -* \return -* \ref NO -*/ -int bm_get_npu_num( - bm_handle_t handle); -int bm_get_eu_num( bm_handle_t handle); -/** -* \brief Get the number of nodechip (Constant 64 in bm1682) -* \return -* \ref NO -*/ -bm_device_mem_t bm_mem_null(void); -#define BM_MEM_NULL (bm_mem_null()) - -bm_status_t bm_dev_getcount(int* count); -bm_status_t bm_dev_query(int devid); -bm_status_t bm_dev_request(bm_handle_t *handle, bool bmkernel_used, int devid); -void bm_dev_free(bm_handle_t handle); - -#if defined (__cplusplus) -} -#endif - -#endif /* BM_RUNTIME_H_ */ diff --git a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h b/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h deleted file mode 100644 index e878343ef..000000000 --- a/saber/funcs/impl/bm/base/include/bmlib/bmlib_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef BMLIB_UTILS_H -#define 
BMLIB_UTILS_H -#include - -/* - * Debug definitions for user app only - * Copy from common.h - * Don't include for internal usage - */ -#ifdef __cplusplus -extern "C" { -#endif - -#define UNUSED(x) (void)(x) - -#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) -#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) - -int array_cmp( - float *p_exp, - float *p_got, - int len, - const char *info_label, - float delta); - -int tri_array_cmp( - float *p_exp, - float *p_got, - float *third_party, - int len, - const char *info_label, - float delta, - int* err_idx); - -int array_cmp_int( - int *p_exp, - int *p_got, - int len, - const char *info_label -); - -void dump_hex(char *desc, void *addr, int len); -void dump_data_float(char *desc, void *addr, int n, int c, int h, int w); -void dump_data_int(char *desc, void *addr, int n, int c, int h, int w); -void dump_matrix_float(char *desc, void *addr, int row, int col); -void dump_array_file(char * file, int row_num, int col_num, int transpose, float * parr); - -/* dump to file */ -void dump_float_tensor(const char * filename, - int length, float * dump_data); - -#ifdef __cplusplus -/* not available in C */ -void random_param( - int &n, int &c, int &h, int &w, - int &kh, int &kw, int &ph, int &pw, int &sh, int &sw, - int &oc); - -void random_conv_param( - int &n, int &ic, int &ih, int &iw, int &oc, - int &kh, int &kw, int &dh, int &dw, - int &ph, int &pw, int &sh, int &sw); -#endif - -int conv_coeff_storage_convert(float * coeff_orig, float ** coeff_reformat, unsigned int oc, unsigned int ic, unsigned int kh, unsigned int kw, unsigned int npu_num); - - -#ifdef __cplusplus -} -#endif - -#endif /* BMLIB_UTILS_H */ diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h b/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h deleted file mode 100644 index f3e086f91..000000000 --- a/saber/funcs/impl/bm/base/include/bmruntime/bmblob.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef __BM_BLOB_H__ -#define __BM_BLOB_H__ - -struct 
bm_mem_desc; -typedef struct bm_mem_desc bm_device_mem_t; -namespace bmcnn { - -typedef struct { int n, c, h, w; } Shape; - -class BMBlob -{ -public: - /** - * \brief Constructor of blob. - * - * \param shape - Shape of blob - */ - explicit BMBlob(const Shape &shape, void *handle); - /** - * \brief Deconstructor of blob. - */ - virtual ~BMBlob(); - /** - * \brief Reshape blob. - * - * \param n - Batch number of blob - * \param c - Channel number of blob - * \param h - Height of blob section - * \param w - Width of blob section - * - * \note - * (1) For now, number of channels is not allowed to be reshaped.\n - * (2) After reshaping, data in this blob will be set vanished.\n - */ - void Reshape(int n, int c, int h, int w); - /** - * \brief Get shape. - */ - inline Shape shape() const - { return shape_; } - /** - * \brief Get batch size. - */ - inline int batch_num() const - { return shape_.n; } - /** - * \brief Get feature - * - * \return Channel number of the blob\n - */ - inline int channels() const - { return shape_.c; } - /** - * \brief Get height of section - */ - int height() const - { return shape_.h; } - /** - * \brief Get width of section. - */ - int width() const - { return shape_.w; } - /** - * \brief Get read-only pointer to data in cpu. - */ - const float *cpu_data(); - /** - * \brief Get mutable pointer of data in cpu. - */ - float *mutable_cpu_data(); - /** - * \brief Get mutable pointer of memory in device. - */ - bm_device_mem_t *mutable_dev_mem(); - /** - * \brief Get read-only pointer of memory in device. 
- */ - const bm_device_mem_t *dev_mem(); -private: - BMBlob(const BMBlob &other); - BMBlob &operator=(const BMBlob &other); - - bm_device_mem_t *dev_mem_; - float *sys_data_; - Shape shape_; - int data_pos_; - int capacity_; - void *handle_; - - enum { AIR = 0x00, SYS = 0x01, DEV = 0x10 }; - void sync_s2d(); - void sync_d2s(); -}; - -} /* namespace bmcnn */ - -#endif /* __BM_BLOB_H__ */ diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h deleted file mode 100644 index daa101fce..000000000 --- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime.h +++ /dev/null @@ -1,154 +0,0 @@ -#ifndef BMRUNTIME_H_ -#define BMRUNTIME_H_ -#include -#include -#include "bmlib_runtime.h" -#include "bmruntime_common.h" -#include "stdio.h" -#include -#include -#include -#include - -using std::vector; -using std::map; -using std::set; -using std::string; -using std::pair; -using std::make_pair; -using std::cout; -using std::endl; -typedef unsigned int u32; -typedef unsigned long long u64; - -typedef struct stage_param_with_idx{ - int height_high; - int height_low; - int width_high; - int width_low; - int stage_index; -}stage_param_with_idx_t; - -class bmruntime { - public: - bmruntime(bm_handle_t bm_handle); - ~bmruntime(); - - bool load_context(const string& ctx_dir); - - const set& get_input_tensor(int net_idx) const; - const set& get_input_tensor(const string& net_name); - - const set& get_output_tensor(int net_idx) const; - const set& get_output_tensor(const string& net_name); - - const bm_device_mem_t* get_input_blob(const string& tensor_name, int net_idx); - const bm_device_mem_t* get_input_blob(const string& tensor_name, const string& net_name); - - const bm_device_mem_t* get_output_blob(const string& tensor_name, int net_idx); - const bm_device_mem_t* get_output_blob(const string& tensor_name, const string& net_name); - - bool launch(int net_idx); - bool launch(const string& net_name); - - bool launch(int 
net_idx, const bm_device_mem_t* input_tensors, int input_num, - const bm_device_mem_t* output_tensors, int output_num); - bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num, - const bm_device_mem_t* output_tensors, int output_num); - - bool launch(int net_idx, int n, int h , int w); - bool launch(const string& net_name, int n, int h, int w); - bool launch(int net_idx, const bm_device_mem_t* input_tensors, int input_num, - const bm_device_mem_t* output_tensors, int output_num, int n, int h, int w); - bool launch(const string& net_name, const bm_device_mem_t* input_tensors, int input_num, - const bm_device_mem_t* output_tensors, int output_num, int n , int h, int w); - - void get_input_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w); - void get_input_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int * max_c, int * max_h, int * max_w); - void get_output_blob_max_nhw(const string& tensor_name, int net_idx, int * max_n, int * max_c, int * max_h, int * max_w); - void get_output_blob_max_nhw(const string& tensor_name, const string& net_name, int * max_n, int *max_c, int * max_h, int * max_w); - - int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int ih); - int get_oh_from_ih(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int ih); - int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, const string& net_name, int iw); - int get_ow_from_iw(const string& input_tensor_name, const string& output_tensor_name, int net_idx, int iw); - - - - - bool can_batch_size_change(int net_idx); - bool can_batch_size_change(const string& net_name); - bool can_height_and_width_change(int net_idx); - bool can_height_and_width_change(const string& net_name); - - void show_neuron_network(); - - int get_network_number() {return net_num;} - - inline 
bm_handle_t get_bm_handle() {return m_handle;} - - protected: - bool setup_mem_context(const string& ctx_dir); - bool setup_cmd_context(const string& ctx_dir); - bool set_using_cmd_file(const string& ctx_dir); - void load_cmd(u32* cmd, int engine_id, bool last_cmd, u64 start_address, u64 append_mem_offset); - bool setup_ir_context(const string& ctx_dir); - - void wrong_net_idx_handle(int net_idx) const; - - int get_net_idx(const string& net_name); - int get_stage_idx(int net_idx, int h, int w); - u64 get_stage_offset(int net_idx, int stage_idx); - - int compute_output_height(int input_height, int global_kh, int global_stride_h, int global_pad_h, int global_pool_kh); - int compute_output_width(int input_width, int global_kw, int global_stride_w, int global_pad_w, int global_pool_kw); - - bm_handle_t m_handle; - std::vector m_device_mem_info_vec; - std::vector m_device_mem_vec; - - vector m_gdma_total_id_v; - vector m_cdma_total_id_v; - vector m_bdc_total_id_v; - vector > m_gdma_group_id_v; - vector > m_cdma_group_id_v; - vector > m_bdc_group_id_v; - vector m_cmdgroup_num; - vector m_gdma_cmd_start_address_v; - vector m_cdma_cmd_start_address_v; - vector m_bdc_cmd_start_address_v; - vector > input_tensor_mem_map_v; - vector > output_tensor_mem_map_v; - vector > m_input_tensor_set_v; - vector > m_output_tensor_set_v; - int net_num; - map net_name_to_idx; - vector stage_num; - - bool have_ir_info; - vector > m_ir_info_len; - vector m_ir_info_start_address_v; - vector > stage_param_with_idx_vv; - - //io tensor param - vector n_can_change_v; - vector h_w_can_change_v; - - vector > > input_tensor_max_shape_vv; - vector > > output_tensor_max_shape_vv; - vector > > global_output_tensor_param_vv; - - bool m_using_cmd_file; - FILE * m_gdma_cmd_file; - FILE * m_cdma_cmd_file; - FILE * m_bdc_cmd_file; - - //previous value or state - int pre_net_num; - int pre_m_device_mem_info_vec_size; - - //append mem offset when appending another framework's context. 
- vector apd_ctx_mem_offset; -}; - -#endif diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h deleted file mode 100644 index 200656739..000000000 --- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_common.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef BMRUNTIME_COMMON_H -#define BMRUNTIME_COMMON_H - -#define BMRT_ASSERT(_cond) \ - do { \ - if (!(_cond)) { \ - printf("ASSERT %s: %s: %d: %s\n", \ - __FILE__, __func__, __LINE__, #_cond); \ - exit(-1); \ - } \ - } while(0) - -typedef enum neuron_device_mem_type { - INPUT_NEURON_TENSOR = 0, - INTERMEDIATE_NEURON_TENSOR = 1, - OUTPUT_NEURON_TENSOR = 2, - CMD_BUF_TENSOR = 3, - CMD_NUM_TENSOR = 4 -} NEURON_DEVICE_MEM_TYPE; - -typedef enum device_mem_type { - NEURON = 0, - COEFF = 1, -#ifdef INT8_COEFF_FUNC - COEFF_INT8 = 2, - COEFF_INT8SCALE = 3, - LOCAL = 4 -#else - LOCAL = 2 -#endif -} DEVICE_MEM_TYPE; - -typedef struct device_mem_info { - DEVICE_MEM_TYPE device_mem_type; - NEURON_DEVICE_MEM_TYPE neuron_device_mem_type; - int n; - int c; - int h; - int w; - int coeff_count; - int groups; - unsigned long long address; -} DEVICE_MEM_INFO; - -//info for compute output tensor -typedef struct tensor_max_shape { - int max_n; - int channel; - int max_h; - int max_w; -} tensor_max_shape_t; - -typedef struct global_output_tensor_param { - int input_idx; - int global_kh; - int global_kw; - int global_stride_h; - int global_stride_w; - int global_pad_h; - int global_pad_w; - int global_pool_kh; - int global_pool_kw; -} global_output_tensor_param_t; - -#endif diff --git a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h b/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h deleted file mode 100644 index 4214674f3..000000000 --- a/saber/funcs/impl/bm/base/include/bmruntime/bmruntime_interface.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef BMRUNTIME_INTERFACE_H_ -#define BMRUNTIME_INTERFACE_H_ - -#include "bmruntime.h" 
-#include "bmdnn_runtime.h" - -bmruntime* create_bmruntime(bm_handle_t* bm_handle); - -void destroy_bmruntime(bm_handle_t bm_handle, bmruntime* p_bmrt); - -#endif diff --git a/saber/funcs/impl/bm/device/.bmkernel_base.h.swp b/saber/funcs/impl/bm/device/.bmkernel_base.h.swp new file mode 100644 index 000000000..dc242fa50 Binary files /dev/null and b/saber/funcs/impl/bm/device/.bmkernel_base.h.swp differ diff --git a/saber/funcs/impl/bm/device/bm_common.h b/saber/funcs/impl/bm/device/bm_common.h new file mode 100644 index 000000000..b22f1b6e2 --- /dev/null +++ b/saber/funcs/impl/bm/device/bm_common.h @@ -0,0 +1,158 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H +#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bm_config.h" +#include "op_code.h" +#include "bm_memmap.h" +#include "firmware_core_kernel.h" +#include "bmkernel_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//#define DEBUG_MESSAGE +#ifdef DEBUG_MESSAGE +#define MSG_DBG(fmt, ...) printf("MSG: "fmt, ##__VA_ARGS__) +#else +#define MSG_DBG(fmt, ...) +#endif + + +#define INLINE inline + +#define UNUSED(x) (void)(x) + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +#define ROUND_UP(A, B) ((A)/(B) + ((A) % (B) == 0 ? 0 : 1)) + +#define bm_min(x, y) ((x) < (y) ? (x) : (y)) +#define bm_max(x, y) ((x) > (y) ? 
(x) : (y)) + + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long long u64; + +typedef union { + int ival; + float fval; +} IF_VAL; + +typedef u32 tuple4_u32[4]; + +typedef struct tensor_info{ + u32 n,c,h,w; + u32 w_stride, n_stride, c_stride, h_stride; + u32 address; + u32 data_format; + u32 neuron_matrix; //0: neuron, 1: matrix + u32 matrix_col_magin; //the magin is not 0, when column_num%w_param!=0 +}TENSOR_INFO; + +#define FLOAT_SIZE 4 +#define INT8_SIZE 1 +#define FLOAT_BITWIDTH 32 +#define GET_U64(U32_H, U32_L) (((u64)(U32_H) << 32) | (u64)(U32_L)) + +typedef enum { + CAFFE_SUPPORT = 0, + TENSORFLOW_SUPPORT = 1 +} PLATFORM_SUPPORT; + +typedef enum { + NODECHIP_REG = 0, + HOST_REG = 1 +} REG_TYPE; + +typedef struct kernel_param{ + int g; + int oc; + int ic; + int h; + int w; +} bm_kernel_param_t; + +typedef struct bm_conv_param{ + int stride_h; + int stride_w; + int pad_h; + int pad_w; + int dilation_h; + int dilation_w; + bool result_add; +} bm_conv_param_t; + +typedef struct conv_secs_info{ + int ocsecs; + int icsecs; + int nsecs; + int hsecs; +} conv_secs_info_t; + +static INLINE int ceiling_func(int numerator, int denominator) +{ + return (numerator + denominator - 1) / denominator; +} + +static INLINE int ceiling_func_shift(int numerator, int shift) +{ + return (numerator + (1 << shift) - 1) >> shift; +} + +static int INLINE calc_offset(int *shape, int *offset) +{ + return ((offset[0] * shape[1] + offset[1]) * shape[2] + offset[2]) + * shape[3] + offset[3]; +} + +//All the size are in the units of bytes +static int INLINE get_index_csize_global(int h, int w, int index_bitwidth) +{ + int size = h * w * index_bitwidth; + //32 bit align + return (((size >> 5)) + ((size & 0x1f) != 0)) * FLOAT_SIZE; +} + +static int INLINE get_index_cstride_global(int h, int w, int index_bitwidth) +{ + int size = h * w * index_bitwidth; + //32 bit align + return (((size >> 5)) + + ((size & 0x1f) != 0)) * FLOAT_BITWIDTH / 
index_bitwidth; +} + +static int INLINE get_neuron_csize_local(int h, int w) +{ + int size = h * w; + //EU_NUM neurons align + return ALIGN(size,EU_NUM) * FLOAT_SIZE; +} + +static int INLINE addr_EU_align(int addr){ + addr = addr / FLOAT_SIZE; + return ALIGN( addr, EU_NUM ) * FLOAT_SIZE; +} + +static int INLINE get_cstride_local(int h, int w) +{ + int size = h * w; + //EU_NUM neurons align + return ALIGN(size,EU_NUM); +} + +#ifdef __cplusplus +} +#endif +#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H */ diff --git a/saber/funcs/impl/bm/device/bm_memmap.h b/saber/funcs/impl/bm/device/bm_memmap.h new file mode 100644 index 000000000..18d8185e6 --- /dev/null +++ b/saber/funcs/impl/bm/device/bm_memmap.h @@ -0,0 +1,61 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H +#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H + +#define ITCM_MEM_START_ADDR 0x00000000 +#define ITCM_MEM_SIZE 0x00080000 // 512KB +#define DTCM_MEM_START_ADDR 0x02000000 +#define DTCM_MEM_SIZE 0x00010000 // 64KB +#define SHARE_MEM_START_ADDR (DTCM_MEM_START_ADDR + DTCM_MEM_SIZE) +#define SHARE_MEM_SIZE 0x00010000 // 64KB + +#define LOCAL_MEM_ADDRWIDTH 18 +#define LOCAL_MEM_START_ADDR 0x04000000 +//#define LOCAL_MEM_SIZE (1<ifmap_offset_global; + u64 ofmap_offset_global = conv_param->ofmap_offset_global; + u64 weight_offset_global = conv_param->weight_offset_global; + u64 bias_offset_global = conv_param->bias_offset_global; + int input_n = conv_param->input_n; + int input_c = conv_param->input_c; + int input_h = conv_param->input_h; + int input_w = conv_param->input_w; + int groups = conv_param->groups; + int output_c = conv_param->output_c; + int kh = conv_param->kh; + int kw = conv_param->kw; + int dh = conv_param->dh; + int dw = conv_param->dw; + int pad_h = conv_param->pad_h; + int pad_w = conv_param->pad_w; + int stride_h = conv_param->stride_h; + int stride_w = conv_param->stride_w; + int using_bias = conv_param->using_bias; + int result_add = conv_param->result_add; + int 
icsecs = conv_param->icsecs; + int ocsecs = conv_param->ocsecs; + int nsecs = conv_param->nsecs; + int hsecs = conv_param->hsecs; + + P_COMMAND dma_command; + CMD_ID_NODE id_node; + resync_cmd_id( &id_node ); + + int kh_ext = dh * (kh - 1) + 1; + int kw_ext = dw * (kw - 1) + 1; + int output_h = (input_h + 2 * pad_h - kh_ext) / stride_h + 1; + int output_w = (input_w + 2 * pad_w - kw_ext) / stride_w + 1; + + int ic = input_c / groups; + int oc = output_c / groups; + int ic_per_NPU = ceiling_func_shift(ic, NPU_SHIFT); + int oc_per_NPU = ceiling_func_shift(oc, NPU_SHIFT); + int bias_offset_local = 0; + int bias_tensor_size = oc_per_NPU * FLOAT_SIZE; + int weight_offset_local = bias_offset_local + bias_tensor_size; + int weight_group_offset = oc * ic * kh * kw; + int weight_tensor_size = ic * oc_per_NPU * kh * kw * FLOAT_SIZE; + int weight_capacity = addr_EU_align(weight_tensor_size + bias_tensor_size); + int ifmap_group_offset = ic * input_h * input_w; + int ofmap_group_offset = oc * output_h * output_w; + int global_ifmap_Nstride = ifmap_group_offset * groups; + int global_ofmap_Nstride = ofmap_group_offset * groups; + int nslice = input_n, ocslice = oc, icslice = ic, hslice = output_h; + nslice = input_n / nsecs; + int n_residual = input_n - nslice * nsecs; + hslice = output_h / hsecs; + int h_residual = output_h - hslice * hsecs; + icslice = ic / icsecs; + int ic_residual = ic - icslice * icsecs; + ocslice = oc / ocsecs; + int oc_residual = oc - ocslice * ocsecs; + int bias_group_offset = oc; + int max_icslice = icslice + (ic_residual > 0); + int max_ic_per_NPU = ceiling_func_shift(max_icslice, NPU_SHIFT); + int max_ocslice = ocslice + (oc_residual > 0); + int max_oc_per_NPU = ceiling_func_shift(max_ocslice, NPU_SHIFT); +// int nodechip_idx = 0; + for (int ig = 0; ig < groups; ig++){ + int ocend = 0; + for (int ocidx = 0; ocidx < ocsecs; ocidx++){ + int ocstart = ocend; + int cur_ocslice = ocslice + (oc_residual > ocidx); + ocend = ocstart + cur_ocslice; + 
oc_per_NPU = ceiling_func_shift(cur_ocslice, NPU_SHIFT); + if (using_bias){ + dma_command = get_command(ENGINE_GDMA); + tensor_compact_move_gen_cmd( + bias_offset_local, // local mem start address + bias_offset_global + (ig * bias_group_offset + ocstart) * FLOAT_SIZE, // global mem start address + 1, cur_ocslice, 1, 1, // n, c, h, w + 0, // direction G2L + 0, // transpose + (void *)dma_command, + 0, // local mem index + &id_node + ); + call_atomic(0, atomic_global_dma, dma_command, ENGINE_GDMA); + } + weight_capacity = max_icslice * oc_per_NPU * kh * kw * FLOAT_SIZE; + int ofmap_offset_local = addr_EU_align(weight_capacity + weight_offset_local); + int nend = 0; + for (int nidx = 0; nidx < nsecs; nidx++){ + int nstart = nend; + int sec_len_n = nslice + (nidx < n_residual); + nend = nstart + sec_len_n; + int o_hb = 0; + for (int hidx = 0; hidx < hsecs; hidx++){ + int o_ht = o_hb; + int o_h = hslice + (h_residual > hidx); + o_hb = o_ht + o_h; + int i_ht = bm_max(o_ht * stride_h - pad_h, 0); + int pad_h_t = 0; + if (i_ht == 0){ + pad_h_t = pad_h - o_ht * stride_h; + } + int i_hb = bm_min(o_hb * stride_h + kh_ext - 1 - pad_h, input_h); + int pad_h_b = 0; + if (i_hb == input_h){ + pad_h_b = o_hb * stride_h + kh_ext - 1 - pad_h - input_h; + } + int i_h = i_hb - i_ht; + int ifmap_align_size = get_neuron_csize_local(i_h, input_w); + int ifmap_tensor_size = sec_len_n * max_ic_per_NPU * ifmap_align_size; + int ofmap_align_size = get_neuron_csize_local(o_h, output_w); + int ofmap_tensor_size = sec_len_n * max_oc_per_NPU * ofmap_align_size; + int ifmap_offset_local = ofmap_offset_local + ofmap_tensor_size; + int offset_local_end = ifmap_offset_local + ifmap_tensor_size; + ASSERT(offset_local_end <= LOCAL_MEM_SIZE); + if (result_add){ + dma_command = get_command(ENGINE_GDMA); + u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset + + (ocstart * output_h + o_ht) * output_w; + int local_cstride = get_cstride_local(o_h, output_w); + tensor_stride_move_gen_cmd( + 
ofmap_offset_local, // local mem start address + ofmap_offset_global + shift * FLOAT_SIZE, // global mem start address + sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w + 0, // local mem index + 0, // direction G2L + global_ofmap_Nstride, output_h * output_w, output_w, // src stride n,c,h + oc_per_NPU * local_cstride, local_cstride, output_w, // dst stride n,c,h + GDMA_TYPE_f32, + 0, // transpose + dma_command, &id_node + ); + call_atomic(0, atomic_global_dma, dma_command, ENGINE_GDMA); + } + int icend = 0; + for (int icidx = 0; icidx < icsecs; icidx++){ + int icstart = icend; + int cur_icslice = icslice + (ic_residual > icidx); + icend = icstart + cur_icslice; + ic_per_NPU = ceiling_func_shift(cur_icslice, NPU_SHIFT); + u64 shift = (ocstart * ic + icstart) * kh * kw + ig * weight_group_offset; + if ((icsecs != 1) || (nidx == 0 && hidx == 0)){ + dma_command = get_command(ENGINE_GDMA); + tensor_stride_move_gen_cmd( + weight_offset_local, // local mem start address + weight_offset_global + shift * FLOAT_SIZE, // global mem start address + 1, cur_ocslice, cur_icslice, kh * kw, // n, c, h, w + 0, // local mem index + 0, // direction G2L + 0, ic * kh * kw, kh * kw, // src stride n,c,h + 0, cur_icslice * kh * kw, kh * kw, // dst stride n,c,h + GDMA_TYPE_f32, + 0, // transpose + dma_command, &id_node + ); + call_atomic(0, atomic_global_dma, dma_command, ENGINE_GDMA); + } + shift = nstart * global_ifmap_Nstride + ig * ifmap_group_offset + + (icstart * input_h + i_ht) * input_w; + int local_cstride = get_cstride_local(i_h, input_w); + dma_command = (float*)get_command(ENGINE_GDMA); + tensor_stride_move_gen_cmd( + ifmap_offset_local, // local mem start address + ifmap_offset_global + shift * FLOAT_SIZE, // global mem start address + sec_len_n, cur_icslice, i_h, input_w, // n, c, h, w + 0, // local mem index + 0, // direction G2L + global_ifmap_Nstride, input_h * input_w, input_w, // src stride n,c,h + ic_per_NPU * local_cstride, local_cstride, input_w, // dst stride 
n,c,h + GDMA_TYPE_f32, + 0, // transpose + dma_command, &id_node + ); + call_atomic(0, atomic_global_dma, dma_command, ENGINE_GDMA); + + local_shape_t ifshape, ofshape; + ifshape.n = sec_len_n; + ifshape.c = cur_icslice; + ifshape.h = i_h; + ifshape.w = input_w; + ofshape.c = cur_ocslice; + ofshape.h = o_h; + ofshape.w = output_w; + P_COMMAND conv_command = get_command(ENGINE_BD); + atomic_conv_kernel_stride_gen_cmd( + conv_command, + LOCAL_MEM_START_ADDR | ifmap_offset_local, // input address + LOCAL_MEM_START_ADDR | ofmap_offset_local, // output address + LOCAL_MEM_START_ADDR | weight_offset_local, // weight address + LOCAL_MEM_START_ADDR | bias_offset_local, // bias address + ifshape, // input shape + ofshape, // output shape + kh, kw, // kernel h, w + dh, dw, // dilation h, w + kh * kw, cur_icslice * kh * kw, kw, // kernel stride n,c,h + pad_h_t, pad_h_b, pad_w, pad_w, // pad top, bottom, left, right + stride_h, stride_w, // stride h, w + icidx == icsecs - 1 ? using_bias: 0, // use bias + result_add || icidx > 0, // add result + &id_node + ); + call_atomic(0, atomic_conv_neuron, conv_command, ENGINE_BD); + } + u64 shift = nstart * global_ofmap_Nstride + ig * ofmap_group_offset + + (ocstart * output_h + o_ht) * output_w; + int local_cstride = get_cstride_local(o_h, output_w); + + dma_command = get_command(ENGINE_GDMA); + tensor_stride_move_gen_cmd( + ofmap_offset_local, // local mem start address + ofmap_offset_global + shift * FLOAT_SIZE, // global mem start address + sec_len_n, cur_ocslice, o_h, output_w, // n, c, h, w + 0, // local mem index + 1, // direction L2G + oc_per_NPU * local_cstride, local_cstride, output_w, // src stride n,c,h + global_ofmap_Nstride, output_h * output_w, output_w, // dst stride n,c,h + GDMA_TYPE_f32, + 0, // transpose + dma_command, &id_node + ); + call_atomic(0, atomic_global_dma, dma_command, ENGINE_GDMA); + } + } + } + } + poll_all_engine_done(&id_node); + return 0; +} diff --git a/saber/funcs/impl/bm/device/bmk_conv.h 
b/saber/funcs/impl/bm/device/bmk_conv.h new file mode 100644 index 000000000..57ce3a1d1 --- /dev/null +++ b/saber/funcs/impl/bm/device/bmk_conv.h @@ -0,0 +1,18 @@ +#ifndef BM_CONV_H +#define BM_CONV_H + +#include +#include "bm_common.h" +#include "atomic_dma_gen_cmd.h" +#include "atomic_conv_gen_cmd.h" +#include "atomic_md_sum_gen_cmd.h" + +#ifdef USING_CMODEL +#include "cmodel_runtime.h" +#include "atomic_dma.h" +#include "atomic_conv.h" +#include "atomic_md_sum.h" +#endif +#include "bmkernel_base.h" +int bm_conv_fwd(bm_api_conv_forward *conv_param); +#endif diff --git a/saber/funcs/impl/bm/device/bmkernel_base.c b/saber/funcs/impl/bm/device/bmkernel_base.c new file mode 100644 index 000000000..f65dc47ae --- /dev/null +++ b/saber/funcs/impl/bm/device/bmkernel_base.c @@ -0,0 +1,30 @@ +#include "bmkernel_base.h" +#include "bm_config.h" +#include "bmk_conv.h" +#include +/** + * bmkernel_func is the user entry to BMKERNEL just like "main" to some applications. + * + * \param args - Pointer to arguments that user sends from host. + * op - Flag to determine the operation type. 
+ */ + +int bmkernel_func(void *args) +{ + bmkernel_api_base* param = (bmkernel_api_base *)args; + switch (param->op) { + case ACTIVATION: { + // bm_activation_fwd(param) + return 0; + } + case CONV: { + bm_api_conv_forward* api = &(param->opParam.convParam); +// printf("BM conv op.\n"); + return bm_conv_fwd(api); + } + default: { + printf("op %d is not supported by BM yet.\n", param->op); + return -1; + } + } +} diff --git a/saber/funcs/impl/bm/device/bmkernel_base.h b/saber/funcs/impl/bm/device/bmkernel_base.h new file mode 100644 index 000000000..e4925e209 --- /dev/null +++ b/saber/funcs/impl/bm/device/bmkernel_base.h @@ -0,0 +1,49 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H +#define ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H +#ifdef __cplusplus +extern "C" { +#endif + +enum BmOpType { + ACTIVATION, + CONV +}; + +typedef struct { + unsigned long long ifmap_offset_global; + unsigned long long ofmap_offset_global; + unsigned long long weight_offset_global; + unsigned long long bias_offset_global; + int input_n; // note this is total input_n + int input_c; + int input_h; + int input_w; + int groups; + int output_c; + int kh; + int kw; + int dh; + int dw; + int pad_h; + int pad_w; + int stride_h; + int stride_w; + int using_bias; + int result_add; + int icsecs; + int ocsecs; + int nsecs; + int hsecs; +} __attribute__((packed)) bm_api_conv_forward; + +typedef struct { + enum BmOpType op; // Flag to determine the operation type. 
+ union U1{ + bm_api_conv_forward convParam; + } opParam; +} __attribute__((packed)) bmkernel_api_base; + +#ifdef __cplusplus +} +#endif +#endif /* ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H */ diff --git a/saber/funcs/impl/bm/vender_activation.h b/saber/funcs/impl/bm/vender_activation.h deleted file mode 100644 index ec27ac054..000000000 --- a/saber/funcs/impl/bm/vender_activation.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_BMDNN_ACT_H -#define ANAKIN_SABER_FUNCS_BMDNN_ACT_H -#include "saber/funcs/impl/impl_activation.h" -namespace anakin { - -namespace saber { - -template -class VenderActivation : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ActivationParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderActivation(): _handle(NULL), _active_type(Active_relu) {} - - ~VenderActivation() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ActivationParam& param, Context& ctx) { - // not sure - _handle = get_bm_handle(); - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ActivationParam& param, Context& ctx) { - // not sure - return SaberSuccess; - } - - //call bmdnn activation funcs here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ActivationParam& param) { - const InDataType in_data = *(inputs[0]->data()); - OutDataType out_data = *(outputs[0]->mutable_data()); - int input_dim = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width(); - int input_n = inputs[0]->num(); - - _active_type = param.active; - switch (_active_type) { - case Active_relu: - BMDNN_CHECK(bmdnn_relu_forward(_handle, in_data, 0.0, input_n, input_dim, out_data)); - break; - case 
Active_sigmoid: - BMDNN_CHECK(bmdnn_sigmoid_forward(_handle, in_data, input_n, input_dim, out_data)); - break; - case Active_tanh: - BMDNN_CHECK(bmdnn_tanh_forward(_handle, in_data, input_n, input_dim, out_data)); - break; - } - return SaberSuccess; - } - -private: - bm_handle_t _handle; - ActiveType _active_type; -}; - -template class VenderActivation; -} // namespace saber - -} // namespace anakin -#endif //ANAKIN_SABER_FUNCS_BMDNN_ACT_H diff --git a/saber/funcs/impl/bm/vender_batch_norm.h b/saber/funcs/impl/bm/vender_batch_norm.h deleted file mode 100644 index 4f433a4a9..000000000 --- a/saber/funcs/impl/bm/vender_batch_norm.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H -#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H - -#include "saber/funcs/impl/impl_batch_norm.h" - -namespace anakin{ - -namespace saber { - -template -class VenderBatchNorm:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - BatchnormParam>> { -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderBatchNorm() : _handle(NULL) {} - - ~VenderBatchNorm() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - BatchnormParam &batch_norm_param, Context &ctx) { - - _handle = get_bm_handle(); - return create(inputs, outputs, batch_norm_param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - BatchnormParam &batch_norm_param, Context &ctx) { - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - BatchnormParam ¶m) { - - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); - - int input_n = inputs[0]->num(); - int input_c = inputs[0]->channel(); - int input_h = 
inputs[0]->height(); - int input_w = inputs[0]->width(); - - float eps = param.eps; - float scale = param.scale; - - bm_device_mem_t mean_ma = bm_mem_from_system(¶m.mean[0]); - bm_device_mem_t variance_ma = bm_mem_from_system(¶m.variance[0]); - - bm_device_mem_t* variance_holder = new bm_device_mem_t(); - - bmdnn_batchnorm_forward_inference( - _handle, - //input - *in_data, - mean_ma, - variance_ma, - scale, - *variance_holder, - eps, - input_n, - input_c, - input_h, - input_w, - //output - *out_data - ); - - return SaberSuccess; - } - -private: - bm_handle_t _handle; -}; - -} //namespace saber - -} // namespace anakin - -#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_BATCH_NORM_H diff --git a/saber/funcs/impl/bm/vender_conv.cpp b/saber/funcs/impl/bm/vender_conv.cpp new file mode 100644 index 000000000..ca45c8b7d --- /dev/null +++ b/saber/funcs/impl/bm/vender_conv.cpp @@ -0,0 +1,304 @@ + +#include "saber/funcs/impl/bm/vender_conv.h" +#include "bmkernel_base.h" +#include "bm_common.h" +#include +#include +#include +#include "tensor_op.h" + +namespace anakin +{ +namespace saber +{ + +int get_align_tensor_size(bm_tensor_4d_t shape){ + int c_per_npu = ceiling_func_shift(shape.c, NPU_SHIFT); + return shape.n * c_per_npu * get_neuron_csize_local(shape.h, shape.w); +} + +void conv_splitc(bm_kernel_param_t kernel_param, conv_secs_info_t *secs_info){ + int oc_per_NPU = ceiling_func_shift(kernel_param.oc, NPU_SHIFT); + int kernel_size = kernel_param.h * kernel_param.w * FLOAT_SIZE; + int weight_capacity = kernel_param.ic * oc_per_NPU * kernel_size; + secs_info->icsecs = 1; + secs_info->ocsecs = 1; + const int quart_local_size = (LOCAL_MEM_SIZE >> 2); + if( weight_capacity > (LOCAL_MEM_SIZE >> 1) ){ + const int max_weight_size = quart_local_size; + secs_info->icsecs = weight_capacity / max_weight_size + 1; + if(secs_info->icsecs > kernel_param.ic){ + secs_info->icsecs = kernel_param.ic; + } + int icslice = (kernel_param.ic + secs_info->icsecs - 1) / secs_info->icsecs; + 
weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE; + weight_capacity = addr_EU_align( weight_capacity); + int max_ocsecs = oc_per_NPU; + while( weight_capacity > max_weight_size ){ + if(secs_info->ocsecs == 1){ + secs_info->ocsecs = weight_capacity / quart_local_size + 1; + } + if(secs_info->ocsecs > max_ocsecs){ + secs_info->ocsecs = max_ocsecs; + break; + }else{ + secs_info->ocsecs++; + } + int ocslice = (kernel_param.oc + secs_info->ocsecs - 1) / secs_info->ocsecs; + oc_per_NPU = ceiling_func_shift(ocslice, NPU_SHIFT); + weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE; + weight_capacity = addr_EU_align(weight_capacity); + } + } +} + +static bm_status_t conv_splith(bm_tensor_4d_t input_shape, bm_tensor_4d_t output_shape, + bm_conv_param_t conv_param, int local_mem_capacity, int kh, conv_secs_info_t *secs_info){ + int io_need = get_align_tensor_size(input_shape) + + get_align_tensor_size(output_shape); + secs_info->hsecs = io_need / local_mem_capacity; + int output_h = output_shape.h; + output_shape.h = (output_h + secs_info->hsecs - 1) / secs_info->hsecs; + input_shape.h = output_shape.h * conv_param.stride_h + kh; + while(io_need > local_mem_capacity){ + if(secs_info->hsecs == output_h){ + return BM_NOT_SUPPORTED; + } + secs_info->hsecs++; + output_shape.h = (output_h + secs_info->hsecs - 1) / secs_info->hsecs; + input_shape.h = output_shape.h * conv_param.stride_h + kh; + io_need = get_align_tensor_size(input_shape) + + get_align_tensor_size(output_shape); + } + return BM_SUCCESS; +} + +static bm_status_t get_conv_secs_info( + bm_tensor_4d_t input_shape, + bm_kernel_param_t kernel_param, + bm_tensor_4d_t output_shape, + bool with_bias, + bm_conv_param_t conv_param, + conv_secs_info_t *secs_info){ + int ic = kernel_param.ic; + int oc = kernel_param.oc; + int oc_per_NPU = ceiling_func_shift(oc, NPU_SHIFT); + int bias_tensor_size = oc_per_NPU * FLOAT_SIZE; + if(!with_bias){ + bias_tensor_size = 0; + } + int kernel_size = 
kernel_param.h * kernel_param.w; + int weight_tensor_size = ic * oc_per_NPU * kernel_size * FLOAT_SIZE; + int weight_capacity = addr_EU_align( weight_tensor_size + bias_tensor_size); + int ifmap_total_tensor_size = get_align_tensor_size(input_shape); + int ofmap_total_tensor_size = get_align_tensor_size(output_shape); + int totalneed_local_size = ifmap_total_tensor_size + + ofmap_total_tensor_size + weight_capacity; + secs_info->nsecs = 1; secs_info->hsecs = 1; + if(totalneed_local_size > LOCAL_MEM_SIZE){ + //if weight_capacity > 2 * bank_size then split oc and ic + conv_splitc(kernel_param, secs_info); + int ocslice = (oc + secs_info->ocsecs - 1) / secs_info->ocsecs; + int icslice = (ic + secs_info->icsecs - 1) / secs_info->icsecs; + oc_per_NPU = ceiling_func_shift(ocslice, NPU_SHIFT); + + weight_capacity = icslice * oc_per_NPU * kernel_size * FLOAT_SIZE; + weight_capacity = addr_EU_align( weight_capacity + bias_tensor_size ); + int local_mem_capacity = LOCAL_MEM_SIZE - weight_capacity; + CHECK_GT(local_mem_capacity, 0) << "local memory capacity not enough"; + input_shape.c = icslice; + output_shape.c = ocslice; + ifmap_total_tensor_size = get_align_tensor_size(input_shape); + ofmap_total_tensor_size = get_align_tensor_size(output_shape); + int totalneed_local_size = ifmap_total_tensor_size + ofmap_total_tensor_size; + if(totalneed_local_size > local_mem_capacity){ + int kh_ext = conv_param.dilation_h * (kernel_param.h - 1) + 1; + if(input_shape.n > 1){ + if( totalneed_local_size > local_mem_capacity * input_shape.n){ + secs_info->nsecs = input_shape.n; + output_shape.n = input_shape.n = 1; + bm_status_t result = conv_splith(input_shape, output_shape, + conv_param, local_mem_capacity, kh_ext, secs_info); + if(result == BM_NOT_SUPPORTED){ + return result; + } + }else{ + int input_n = input_shape.n; + secs_info->nsecs = (totalneed_local_size + local_mem_capacity - 1) / local_mem_capacity; + input_shape.n = (input_n + secs_info->nsecs - 1) / secs_info->nsecs; + 
output_shape.n = input_shape.n; + totalneed_local_size = get_align_tensor_size(input_shape) + + get_align_tensor_size(output_shape); + while(totalneed_local_size > local_mem_capacity){ + secs_info->nsecs++; + input_shape.n = (input_n + secs_info->nsecs - 1) / secs_info->nsecs; + output_shape.n = input_shape.n; + totalneed_local_size = get_align_tensor_size(input_shape) + + get_align_tensor_size(output_shape); + } + } + }else{ + bm_status_t result = conv_splith(input_shape, output_shape, + conv_param, local_mem_capacity, kh_ext, secs_info); + if(result == BM_NOT_SUPPORTED){ + return result; + } + } + } + }else{ + secs_info->icsecs = 1; + secs_info->ocsecs = 1; + } + return BM_SUCCESS; +} + +// FP32 part +template <> +SaberStatus VenderConv2D::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) +{ +} + +template <> +SaberStatus VenderConv2D::\ + init(const std::vector *> &inputs, + std::vector *> &outputs, + ConvParam ¶m, Context &ctx) +{ + + _handle = ctx.get_handle(); + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderConv2D::\ + dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param) +{ + const BM_mem_addr in_data = (const BM_mem_addr) inputs[0]->data(); + BM_mem_addr out_data = (BM_mem_addr) outputs[0]->mutable_data(); + const BM_mem_addr weight = (const BM_mem_addr) param.weight()->data(); + + int input_n = inputs[0]->num(); + int input_c = inputs[0]->channel(); + int input_h = inputs[0]->height(); + int input_w = inputs[0]->width(); + + int output_n = outputs[0]->num(); + int output_c = outputs[0]->channel(); + int output_h = outputs[0]->height(); + int output_w = outputs[0]->width(); + + int group = param.group; + int kh = param.weight()->height(); + int kw = param.weight()->width(); + int pad_h = param.pad_h; + int pad_w = param.pad_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + int dilation_h = param.dilation_h; + int 
dilation_w = param.dilation_w; + + bool with_bias = param.bias()->size() > 0; + const bm_mem_desc bias = with_bias ? (const bm_mem_desc) param.bias()->data() : BM_MEM_NULL; + + bm_tensor_4d_t input_shape = { + input_n, + input_c, + input_h, + input_w}; + + bm_tensor_4d_t output_shape = { + output_n, + output_c, + output_h, + output_w}; + + bm_kernel_param_t kernel_param = { + group, + output_c, + input_c, + kh, + kw}; + + bm_conv_param_t conv_param = { + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + 0}; + + bm_device_mem_t input_buf_mem = in_data; + // TODO: handle special case with pooling op + + conv_secs_info_t secs_info; + bm_status_t result = get_conv_secs_info(input_shape, kernel_param, + output_shape, with_bias, conv_param, &secs_info); + CHECK_EQ(BM_SUCCESS, result) << "local memory is not enough in conv."; + + + bmkernel_api_base api; + api.op = CONV; + api.opParam.convParam = { + bm_mem_get_device_addr(input_buf_mem), + bm_mem_get_device_addr(out_data), + bm_mem_get_device_addr(weight), + with_bias ? bm_mem_get_device_addr(bias) : BM_MEM_ADDR_NULL, + input_shape.n, + input_shape.c, + input_shape.h, + input_shape.w, + kernel_param.g, + output_shape.c, + kernel_param.h, + kernel_param.w, + conv_param.dilation_h, + conv_param.dilation_w, + conv_param.pad_h, + conv_param.pad_w, + conv_param.stride_h, + conv_param.stride_w, + with_bias, + conv_param.result_add, + secs_info.icsecs, + secs_info.ocsecs, + secs_info.nsecs, + secs_info.hsecs + }; + + LOG(INFO)<<"BM Conv starts..."; + print_tensor(*inputs[0]); + print_tensor(*(param.mutable_weight())); + + bm_status_t bm_stat = bmlib_kernel_launch(_handle, "/usr/local/include/bm/bmkernel_bin.bin"); + CHECK_EQ(BM_SUCCESS, bm_stat) << "bmlib_kernel_launch failed."; + + /* Send arguments. 
*/ + enum BmOpType op = CONV; + BM_CHECK(bmlib_kernel_send_args(_handle, reinterpret_cast(&api), sizeof(api))); + + + LOG(INFO)<<"BM Conv ends..."; + print_tensor(*outputs[0]); + + char *buffer; + if ((buffer = getcwd(NULL, 0)) == NULL){ + perror("getcwd error"); + } else { + printf("%s\n", buffer); + } + + return SaberSuccess; +} + +// INT8 part +// Not supported yet + +template class VenderConv2D; +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/bm/vender_conv.h b/saber/funcs/impl/bm/vender_conv.h index 7243fd6a4..b37ed1f9b 100644 --- a/saber/funcs/impl/bm/vender_conv.h +++ b/saber/funcs/impl/bm/vender_conv.h @@ -1,5 +1,5 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H -#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_CONV2D_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H +#define ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H #include "saber/funcs/impl/impl_conv.h" @@ -7,112 +7,25 @@ namespace anakin{ namespace saber{ -template -class VenderConv2D : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvParam > > -{ +template +class VenderConv2D : public ImplBase< + BM, OpDtype, ConvParam > { + public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - VenderConv2D(): _handle(NULL) {} ~VenderConv2D() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - - _handle = get_bm_handle(); - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param) { - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - const InDataType *weight = (const InDataType *) param.weight()->data(); 
- OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); - - int input_n = inputs[0]->num(); - int input_c = inputs[0]->channel(); - int input_h = inputs[0]->height(); - int input_w = inputs[0]->width(); - - int output_n = outputs[0]->num(); - int output_c = outputs[0]->channel(); - int output_h = outputs[0]->height(); - int output_w = outputs[0]->width(); - - int group = param.group; - int kh = param.weight()->height(); - int kw = param.weight()->width(); - int pad_h = param.pad_h; - int pad_w = param.pad_w; - int stride_h = param.stride_h; - int stride_w = param.stride_w; - int dilation_h = param.dilation_h; - int dilation_w = param.dilation_w; - - bool with_bias = param.bias()->size() > 0; - const InDataType *bias = with_bias? (const InDataType *) param.bias()->data() : &bm_mem_null(); - - bm_tensor_4d_t input_shape = { - input_n, - input_c, - input_h, - input_w - }; - - bm_tensor_4d_t output_shape = { - output_n, - output_c, - output_h, - output_w - }; - - bm_kernel_param_t kernel_param = { - group, - output_c, - input_c, - kh, - kw - }; + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); - bm_conv_param_t conv_param = { - stride_h, - stride_w, - pad_h, - pad_w, - dilation_h, - dilation_w, - 0 - }; + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); - BMDNN_CHECK(bmdnn_conv_forward(_handle, *in_data, *weight, *bias, input_shape, - kernel_param, output_shape, conv_param, with_bias, *out_data)); - - return SaberSuccess; - } + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param); private: bm_handle_t _handle; @@ -120,4 +33,4 @@ class VenderConv2D - -namespace anakin{ - -namespace saber{ - -template -class VenderConv2DAct : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActiveParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor 
DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderConv2DAct() - : _handle(NULL) - , _workspaceData(NULL) - , _workspace(NULL) - , _conv_descs(NULL) - , _input_descs(NULL) - , _output_descs(NULL) - , _filter_desc(NULL) - , _workspace_fwd_sizes(0) - , _workspaceSizeInBytes(0) - , _fwd_algo((cudnnConvolutionFwdAlgo_t)0) - , _input_nchw_descs(NULL) - , _output_nchw_descs(NULL) - , x8_data(NULL) - , y8_data(NULL) - , x8_data_size(0) - , y8_data_size(0) - {} - - ~VenderConv2DAct() { - - if (_conv_descs) { - CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); - } - if (_input_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); - } - if (_output_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); - } - if (_filter_desc) { - CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); - } - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - if (_input_nchw_descs != NULL) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs)); - } - if (_output_nchw_descs != NULL) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs)); - } - if (x8_data != NULL) { - CUDA_CHECK(cudaFree(x8_data)); - } - if (y8_data != NULL) { - CUDA_CHECK(cudaFree(y8_data)); - } - } - - /** - * [Create description] Init all cudnn resource here - * @AuthorHTL - * @DateTime 2018-02-01T16:13:06+0800 - * @param inputs [description] - * @param outputs [description] - * @param conv_param [conv parameters] - */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx) { - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - - _workspace_fwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t 
cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - cudnn::create_activation_des(&_active_descs); - - if (param.conv_param.bias()->size() > 0) { - cudnn::createTensorDesc(&_bias_desc); - } - - cudnnCreateTensorDescriptor(&_input_nchw_descs); - cudnnCreateTensorDescriptor(&_output_nchw_descs); - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx); - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param); -private: - cudnnHandle_t _handle; - cudnnConvolutionFwdAlgo_t _fwd_algo; - - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnTensorDescriptor_t _bias_desc; - - cudnnFilterDescriptor_t _filter_desc; - - cudnnConvolutionDescriptor_t _conv_descs; - - size_t _workspace_fwd_sizes; - size_t _workspaceSizeInBytes; // size of underlying storage - - void *_workspaceData; // underlying storage - void *_workspace; // aliases into workspaceData - - const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; - - // activation descriptor - cudnnActivationDescriptor_t _active_descs; - - // create transform descriptor - cudnnTensorDescriptor_t _input_nchw_descs; - cudnnTensorDescriptor_t _output_nchw_descs; - - void *x8_data; - void *y8_data; - - int x8_data_size; - int y8_data_size; -}; - - -} - -} -#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H 
diff --git a/saber/funcs/impl/bm/vender_conv_act_pooling.h b/saber/funcs/impl/bm/vender_conv_act_pooling.h deleted file mode 100644 index e602a693d..000000000 --- a/saber/funcs/impl/bm/vender_conv_act_pooling.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H - -#include "saber/funcs/impl/impl_conv_act_pooling.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" -#include - -namespace anakin{ - -namespace saber{ - -template -class VenderConv2DActPooling : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActivePoolingParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderConv2DActPooling() - : _handle(NULL) - , _workspaceData(NULL) - , _workspace(NULL) - , _conv_descs(NULL) - , _input_descs(NULL) - , _output_descs(NULL) - , _filter_desc(NULL) - , _workspace_fwd_sizes(0) - , _workspaceSizeInBytes(0) - , _fwd_algo((cudnnConvolutionFwdAlgo_t)0) - {} - ~VenderConv2DActPooling() { - - if (_conv_descs) { - CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); - } - if (_input_descs) { - 
CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); - } - if (_output_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); - } - if (_filter_desc) { - CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); - } - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - } - - /** - * [Create description] Init all cudnn resource here - * @AuthorHTL - * @DateTime 2018-02-01T16:13:06+0800 - * @param inputs [description] - * @param outputs [description] - * @param conv_param [conv parameters] - */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param, Context& ctx) { - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - - _workspace_fwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_inner_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - if (param.has_activation) { - cudnn::create_activation_des(&_active_descs); - } - if (param.has_pooling) { - cudnn::create_pooling_des(&_pooling_descs); - } - if (param.conv_param.bias()->size() > 0) { - cudnn::createTensorDesc(&_bias_desc); - } - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param, Context& ctx); - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param); -private: - cudnnHandle_t 
_handle; - cudnnConvolutionFwdAlgo_t _fwd_algo; - - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnTensorDescriptor_t _inner_descs; - cudnnTensorDescriptor_t _bias_desc; - - cudnnFilterDescriptor_t _filter_desc; - - cudnnConvolutionDescriptor_t _conv_descs; - cudnnPoolingDescriptor_t _pooling_descs; - - size_t _workspace_fwd_sizes; - size_t _workspaceSizeInBytes; // size of underlying storage - - void *_workspaceData; // underlying storage - void *_workspace; // aliases into workspaceData - - const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; - - // activation descriptor - cudnnActivationDescriptor_t _active_descs; - - Shape _inner_shape; - DataTensor_out _inner_tensor; -}; - - -} - -} -#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/bm/vender_fc.h b/saber/funcs/impl/bm/vender_fc.h deleted file mode 100644 index 82dd6000c..000000000 --- a/saber/funcs/impl/bm/vender_fc.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_BMDNN_FC_H -#define ANAKIN_SABER_FUNCS_BMDNN_FC_H - -#include "saber/funcs/impl/impl_fc.h" - -namespace anakin{ - -namespace saber{ - -template -class VenderFc: \ - public ImplBase< - Tensor, \ - Tensor, \ - Tensor, \ - FcParam>> { - -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderFc(): _handle(NULL) {}; - ~VenderFc() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - FcParam& param, Context& ctx){ - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - FcParam& param, Context& ctx){ - return SaberSuccess; - } - - virtual 
SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - FcParam& param){ - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - const InDataType *weights = (const InDataType *) param.weights->get_buf()->get_data(); - const InDataType *bias = (const InDataType *) param.bias->get_buf()->get_data(); - OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); - int batch_size = inputs[0]->num(); - int input_len = inputs[0]->channel(); - int output_len = param.num_output; - int is_transpose = param.is_transpose_weights ? 1 : 0; - BMDNN_CHECK(bmdnn_fc_forward(_handle, in_data, weights, bias, - batch_size, output_len, input_len, is_transpose, 1, 0, - out_data)); - return SaberSuccess; - }; - -private: - bm_handle_t _handle; -}; - -template class VenderFc; -} //namespace saber - -} //namespace anakin - -#endif // ANAKIN_SABER_FUNCS_BMDNN_FC_H diff --git a/saber/funcs/impl/bm/vender_pooling.h b/saber/funcs/impl/bm/vender_pooling.h deleted file mode 100644 index 1bdcfdecb..000000000 --- a/saber/funcs/impl/bm/vender_pooling.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H - -#include "saber/funcs/impl/impl_pooling.h" - -namespace anakin{ - -namespace saber { - -template -class VenderPooling:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam>> { -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderPooling() : _handle(NULL) {} - - ~VenderPooling() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PoolingParam &pooling_param, Context &ctx) { - - _handle = get_bm_handle(); - return create(inputs, outputs, pooling_param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - 
std::vector& outputs, - PoolingParam &pooling_param, Context &ctx) { - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m) { - const InDataType in_data = *(inputs[0]->data()); - OutDataType out_data = *(outputs[0]->mutable_data()); - int input_n = inputs[0]->num(); - int input_c = inputs[0]->channel(); - int input_h = inputs[0]->height(); - int input_w = inputs[0]->width(); - int kh = param.window_h; - int kw = param.window_w; - int pad_h = param.pad_h; - int pad_w = param.pad_w; - int stride_h = param.stride_h; - int stride_w = param.stride_w; - int is_avg_pooling; - if(param.pooling_type == Pooling_max){ - is_avg_pooling = 0; - } else { - is_avg_pooling = 1; - } - - BMDNN_CHECK(bmdnn_pooling_forward(_handle, in_data, - input_n, input_c, input_h, input_w, kh, kw, pad_h, pad_w, - stride_h, stride_w, is_avg_pooling, out_data)); - return SaberSuccess; - } - -private: - bm_handle_t _handle; - PoolingType _pooling_type; -}; - -template class VenderPooling; - -} //namespace saber - -} // namespace anakin - -#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_POOLING_H diff --git a/saber/funcs/impl/bm/vender_scale.h b/saber/funcs/impl/bm/vender_scale.h deleted file mode 100644 index 4e9402a43..000000000 --- a/saber/funcs/impl/bm/vender_scale.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H -#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H - -#include "saber/funcs/impl/impl_scale.h" - -namespace anakin{ - -namespace saber{ - -template -class VenderScale : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ScaleParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderScale() {} - - ~VenderScale() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - 
ScaleParam& param, Context& ctx) { - - _handle = get_bm_handle(); - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ScaleParam& param, Context &ctx) { - - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ScaleParam& param) { - - const InDataType in_data = *(inputs[0]->data()); - OutDataType out_data = *(outputs[0]->mutable_data()); - - int input_n = inputs[0]->num(); - int input_c = inputs[0]->channel(); - int input_h = inputs[0]->height(); - int input_w = inputs[0]->width(); - - int axis = (param.num_axes == 0) ? 0 : param.axis; - int num_axes = param.num_axes >=0 ? param.num_axes : inputs[0]->shape().dims() - axis; - - int outer_dim = inputs[0]->count(0, axis); - int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims()); - int scale_dim = inputs[0]->count(axis, axis + num_axes); - /* if (inputs.size() == 1) { */ - /* CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; */ - /* } */ - - float* scale_data = ¶m.scale_w[0]; - bm_device_mem_t* data_extension = new bm_device_mem_t(); - int size = input_n * input_c * input_h * input_w; - bm_malloc_device_byte(_handle, data_extension, size * sizeof(float)); - BMDNN_CHECK(bmdnn_scale_forward(_handle, in_data, bm_mem_from_system(scale_data), - input_n, input_c, input_h, input_w, - scale_dim, inner_dim, 0, - *data_extension, out_data)); - - if (param.bias_term) { - float* host_bias = ¶m.scale_b[0]; - float* host_extension = new float[size]; - int dim = inner_dim * scale_dim; - for (int i = 0; i < size; ++i) { - int bias_dim = (i % dim) / inner_dim; - host_extension[i] = host_bias[bias_dim]; - } - - bm_flush(get_bm_handle()); - BMDNN_CHECK(bmdnn_bias_forward(_handle, out_data, bm_mem_from_system(host_extension), - outer_dim, scale_dim * inner_dim, out_data)); - - delete [] host_bias; - delete [] host_extension; - } - bm_free_device(_handle, *data_extension); - return 
SaberSuccess; - } -private: - bm_handle_t _handle; -}; - -} -} -#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SCALE_H diff --git a/saber/funcs/impl/bm/vender_softmax.h b/saber/funcs/impl/bm/vender_softmax.h deleted file mode 100644 index 55612f66a..000000000 --- a/saber/funcs/impl/bm/vender_softmax.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H -#define ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H - -#include "saber/funcs/impl/impl_softmax.h" -#include "saber/saber_funcs_param.h" -#include "saber/saber_types.h" - -namespace anakin{ - -namespace saber{ - -template -class VenderSoftmax : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - SoftmaxParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderSoftmax(): _handle(NULL) {} - ~VenderSoftmax() {} - - /** - * \brief initial all bmdnn resources here - * @param inputs - * @param outputs - * @param param - * @param ctx - */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - SoftmaxParam& param, Context& ctx) { - - _handle = get_bm_handle(); - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - SoftmaxParam& param, Context &ctx) { - - } - - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - SoftmaxParam ¶m){ - - const InDataType *in_data = (const InDataType *) inputs[0]->data(); - OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); - - /* - int input_n = inputs[0]->num(); - int input_c = inputs[0]->channel(); - int input_h = inputs[0]->height(); - int input_w = inputs[0]->width(); - */ - - int outer_num = inputs[0]->count(0, param.axis); - int inner_num = inputs[0]->count(param.axis + 1, 
inputs[0]->dims()); - - int N = outer_num; - int K = inputs[0]->valid_shape()[param.axis]; - int H = inner_num; - int W = 1; - - /* - const int stride_w = 1; - const int stride_h = W * stride_w; - const int stride_c = H * stride_h; - const int stride_n = K * stride_c; - */ - - bmdnn_softmax_forward( - _handle, - *in_data, - N, - K, - H * W, - *out_data - ); - - return SaberSuccess; - } - -private: - bm_handle_t _handle; -}; - -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_IMPL_BMDNN_SOFTMAX_H diff --git a/saber/funcs/impl/cuda/base/CMakeLists.txt b/saber/funcs/impl/cuda/base/CMakeLists.txt index 1f66c814d..933efddf0 100644 --- a/saber/funcs/impl/cuda/base/CMakeLists.txt +++ b/saber/funcs/impl/cuda/base/CMakeLists.txt @@ -1,9 +1,16 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# @file CMakeLists files in the saber subdirectory for nvidia gpu code -# @auther cuichaowen -# @date 2017-11-29 -# ---------------------------------------------------------------------------- +# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
if(USE_CUDA) anakin_fetch_files_with_suffix(${CUDA_BASE_CODE_ROOT}/cuda_c "cu" ANAKIN_SABER_CUDA_C_SRC) diff --git a/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu b/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu index a8d3689e5..76b2732f6 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu @@ -1,91 +1,322 @@ -#include "anakin_config.h" -#include -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" -#include "saber/core/tensor.h" -#include "saber/core/context.h" + #include "saber/core/common.h" -#include +#include "saber/core/tensor.h" +#include "saber/funcs/calibrate.h" +#include namespace anakin { -namespace saber{ - - __global__ void ker_calibrate_from_fp32_to_int8(void* data_int8, - const void* data_fp32, int size, float scale, - int in_n, int in_c, int in_h, int in_w, - int stride_n, int stride_c, int stride_h, int stride_w) { - - CUDA_KERNEL_LOOP(tid, size){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; - - int in_idx = read_n * stride_n - + read_c * stride_c - + read_h * stride_h - + read_w * stride_w; - - float* data_in = (float*)data_fp32; - if (scale <= 1e-6) { - scale = 1e-6; - } +namespace saber { - ((char*)data_int8)[tid] = (char)(data_in[in_idx] / scale); -// printf("%f, ", data_in[in_idx] /scale); - } +__global__ +void transform_nchw_2_c4(char* out_data, const float* in_data, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + float scale, + int count) { + + int load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int write_w = (gid) % valid_width; + int write_h = (gid / (out_h_stride)) % valid_height; + int write_c = (gid / (out_c_stride)) % 
valid_channel_4; + int write_n = (gid / (out_n_stride)) % valid_num; + + int in_offset = write_n * in_n_stride + + write_c * (in_c_stride << 2) + + write_h * in_h_stride + + write_w * in_w_stride; + + int out_offset = write_n * out_n_stride + + write_c * out_c_stride + + write_h * out_h_stride + + write_w; + + if (gid < count) { + + char4 write; + load0 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + write.x = static_cast(load0); + + in_offset += in_c_stride; + load1 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + write.y = static_cast(load1); + + in_offset += in_c_stride; + load2 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + write.z = static_cast(load2); + + in_offset += in_c_stride; + load3 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + write.w = static_cast(load3); + + ((char4*)out_data)[out_offset] = write; + + } +} + +template<> +SaberStatus conv_calibrate_fp32_int8_c4(Tensor &out_tensor, + const Tensor &in_tensor, float in_scale, Context ctx) { + + const float * in_data = (const float*)in_tensor.data(); + char * out_data = (char*)out_tensor.mutable_data(); + + Shape in_stride = in_tensor.get_stride(); + + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]; + + cudaStream_t cuda_stream = ctx.get_compute_stream(); + transform_nchw_2_c4<<>>(out_data, in_data, + out_shape[0], out_shape[1], out_shape[2], out_shape[3], + in_stride[0], in_stride[1], in_stride[2], in_stride[3], + out_shape[1] * out_shape[2] * out_shape[3], + out_shape[2] * out_shape[3], out_shape[3], 1, + (1.f / in_scale), count); + + return SaberSuccess; +} + +__global__ void transform_nchw_2_nchw(float * out_data, + const float* in_data, const int count, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n, int out_c, int out_h, int out_w, + int out_n_stride, int out_c_stride, int 
out_h_stride, int out_w_stride, + float *scale, float input_scale) { + CUDA_KERNEL_LOOP(tid, count){ + int read_w = tid % in_w; + int read_h = (tid / (in_w)) % in_h; + int read_c = (tid / (in_h * in_w)) % in_c; + int read_n = (tid / (in_c * in_h * in_w)) % in_n; + + int write_w = tid % out_w; + int write_h = (tid / (out_w)) % out_h; + int write_c = (tid / (out_h * out_w)) % out_c; + int write_n = (tid / (out_c * out_h * out_w)) % out_n; + + int in_idx = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w * in_w_stride; + + int out_idx = write_n * out_n_stride + + write_c * out_c_stride + + write_h * out_h_stride + + write_w * out_w_stride; + + float in_var = in_data[in_idx]; + float in_scale = scale[read_c]; + out_data[out_idx] = in_var * in_scale * input_scale; } +} - __global__ void ker_calibrate_from_int8_to_fp32(void* data_fp32, - const void* data_int8, int size, float scale, - int out_n, int out_c, int out_h, int out_w, - int stride_n, int stride_c, int stride_h, int stride_w) { +template<> +SaberStatus conv_calibrate_int32_fp32( + Tensor &out_tensor, const Tensor &in_tensor, + float in_scale, float* weight_scale, Context ctx) { - CUDA_KERNEL_LOOP(tid, size) { + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); - int write_w = tid % out_w; - int write_h = (tid / (out_w)) % out_h; - int write_c = (tid / (out_h * out_w)) % out_c; - int write_n = (tid / (out_c * out_h * out_w)) % out_n; + Shape stride_in = in_tensor.get_stride(); + Shape stride_out = out_tensor.get_stride(); - int out_idx = write_n * stride_n - + write_c * stride_c - + write_h * stride_h - + write_w * stride_w; + const float *in_data = (const float*)in_tensor.data(); + float *out_data = (float*)out_tensor.mutable_data(); - const char* data_in = (const char*)data_int8; - if (scale <= 1e-6) { - scale = 1e-6; - } - ((float*)data_fp32)[out_idx] = (float)(data_in[tid]) * scale; -// printf("%d, ", data_in[tid]); - } + const int count = 
in_tensor.valid_size(); + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + transform_nchw_2_nchw + <<>>( + out_data, in_data, count, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + out_shape[0], out_shape[1], out_shape[2], out_shape[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3], + weight_scale, in_scale); + return SaberSuccess; +} + +__global__ +void int8nchwc4_fp32nchw(float* out_data, const char* in_data, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + float* scale, int count) { + + float load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int read_w = (gid) % valid_width; + int read_h = (gid / (in_h_stride)) % valid_height; + int read_c = (gid / (in_c_stride)) % valid_channel_4; + int read_n = (gid / (in_n_stride)) % valid_num; + int scale_index = read_c << 2; + + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; + + int out_offset = read_n * out_n_stride + + read_c * (out_c_stride << 2) + + read_h * out_h_stride + + read_w * out_w_stride; + + if (gid < count) { + + char4 readin = __ldg(&((const char4*)in_data)[in_offset]); + + load0 = static_cast(readin.x); + load1 = static_cast(readin.y); + load2 = static_cast(readin.z); + load3 = static_cast(readin.w); + + out_data[out_offset] = load0 * scale[scale_index]; out_offset += out_c_stride; + out_data[out_offset] = load1 * scale[scale_index + 1]; out_offset += out_c_stride; + out_data[out_offset] = load2 * scale[scale_index + 2]; out_offset += out_c_stride; + out_data[out_offset] = load3 * scale[scale_index + 3]; } +} + +template<> +SaberStatus conv_calibrate_int8_c4_fp32( + Tensor &out_tensor, + const Tensor &in_tensor, + float* weight_scale, + Context ctx) { + + Shape 
out_stride = out_tensor.get_stride(); + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3]; + + const char * in_data = (const char*)in_tensor.data(); + float * out_data = (float*)out_tensor.mutable_data(); + + cudaStream_t cuda_stream = ctx.get_compute_stream(); + int8nchwc4_fp32nchw<<>>(out_data, in_data, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3], + in_shape[2] * in_shape[3], + in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], + weight_scale, count); + + return SaberSuccess; +} - void calibrate_to_int8(void* data_int8, const void* data_fp32, int size, float* scale, Context ctx, - int in_n, int in_c, int in_h, int in_w, - int stride_n, int stride_c, int stride_h, int stride_w) { +#define JUDGESIGN(x) (((x) >= 0) ? +1 : -1) - cudaStream_t cuda_stream = ctx.get_compute_stream(); - ker_calibrate_from_fp32_to_int8<<>>(data_int8, data_fp32, size, *scale, - in_n, in_c, in_h, in_w, - stride_n, stride_c, stride_h, stride_w); +__global__ +void calibrate_float2char_col(signed char* dst, const float* src, + float * scale, int height, int width) { + int gid = threadIdx.x + blockIdx.x * blockDim.x; + float col_max = 0.0f; + const float *data = src + gid; + for(int idx = 0; idx < height; ++idx){ + if (gid < width) { + float temp = fabsf(data[idx * width]); + col_max = (col_max >= temp)? 
col_max : temp; + } } + signed char* target = dst + gid; + float col_scale = (float)((1 << 7) - 1) / col_max; + for(int idx = 0; idx < height; ++idx) { + if(gid < width) { + float temp = data[idx * width]; + if(temp >= col_max - FLT_EPSILON) { + target[idx * width] = (signed char)((1 << 7) - 1); + } else if(temp <= -col_max + FLT_EPSILON) { + target[idx * width] = (signed char)(-(1 << 7)); + } else { + target[idx * width] = (signed char)(temp * col_scale + JUDGESIGN(temp) * 0.5); + } + } + } + scale[gid] = 1.f / col_scale; +} - void calibrate_to_fp32(void* data_fp32, const void* data_int8, int size, float* scale, Context ctx, - int out_n, int out_c, int out_h, int out_w, - int stride_n, int stride_c, int stride_h, int stride_w) { +__global__ +void calibrate_float2char_row(signed char* dst, const float* src, + float * scale, int height, int width) { - cudaStream_t cuda_stream = ctx.get_compute_stream(); - ker_calibrate_from_int8_to_fp32<<>>(data_fp32, data_int8, size, *scale, - out_n, out_c, out_h, out_w, - stride_n, stride_c, stride_h, stride_w); + int gid = threadIdx.x + blockIdx.x * blockDim.x; + float row_max = 0.0f; + const float * data = src + width * gid; + for(int idx = 0; idx < width; ++idx) { + if(gid < height){ + float temp = fabsf(data[idx]); + row_max = (row_max >= temp) ? 
row_max : temp; + } + } + signed char * target = dst + width * gid; + float row_scale = (float)((1 << 7) - 1) / row_max; + for(int idx = 0; idx < width; ++idx) { + if(gid < height) { + float temp = data[idx]; + if(temp >= row_max - FLT_EPSILON) { + target[idx] = (signed char)((1 << 7) - 1); + } else if(temp <= -row_max + FLT_EPSILON) { + target[idx] = (signed char)(-(1 << 7)); + } else { + target[idx] = (signed char)(temp * row_scale + JUDGESIGN(temp) * 0.5); + } + } + } + scale[gid] = 1.f / row_scale; +} +__global__ void calibrate_fix2float(float * dst, + const float* sA, const float* sB, + float alpha, float beta, int height, + int width, int threads) { + int ri = blockIdx.x; + int tid = threadIdx.x; + int loop = (width / threads) + ((width % threads == 0) ? 0 : 1); + + float rscale = (sA[ri] == 0.0f) ? 1.0f : sA[ri]; + float * data = dst + width * ri; + int idx = 0; + for (int i = 0; i < loop; ++i) { + if(idx + tid < width){ + float temp = data[idx + tid]; + float cscale = (sB[idx + tid] == 0.0f) ? 255.0f : sB[idx + tid]; + data[idx + tid] = beta * temp + alpha * temp * rscale * cscale; + } + idx += threads; } +} +template <> +void float2char(bool col_direct, signed char* dst, const float* src, + float *scale, int height, int width, Context ctx) { + int threads = 32; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + if (col_direct) { + calibrate_float2char_col <<< (width / threads) + (((width % threads) == 0) ? 0 : 1), threads, 0, + cuda_stream >>> ( + dst, src, scale, height, width); + } else { + calibrate_float2char_row<<<(height / threads) + (((height % threads)==0) ? 
0 : 1), threads, 0, cuda_stream>>>( + dst, src, scale, height, width); + } +} +template <> +void fix2float(float * dst, + const float *sA, const float *sB, + const float alpha, const float beta, int height, int width, Context ctx) { + int threads = 256; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + calibrate_fix2float<<>>(dst, sA, sB, alpha, beta, + height, width, threads); +} } } \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/cuda_utils.cu b/saber/funcs/impl/cuda/base/cuda_c/cuda_utils.cu new file mode 100644 index 000000000..7c817dd96 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/cuda_utils.cu @@ -0,0 +1,69 @@ +#include "cuda_utils.h" +namespace anakin { + +namespace saber { + + +template +__global__ void trans_map2in(Dtype* output, const Dtype* input, const int* map, int count, + int lastdim) { + CUDA_KERNEL_LE(tid, count) { + int seq = tid / lastdim; + output[tid] = input[map[seq] * lastdim + tid % lastdim]; + // printf("in %d = %f\n",tid,output[tid]); + } +} + +template +__global__ void trans_map2out(Dtype* output, const Dtype* input, const int* map, int count, + int lastdim) { + CUDA_KERNEL_LE(tid, count) { + int seq = tid / lastdim; + output[map[seq] * lastdim + tid % lastdim] = input[tid]; + // printf("out %d = %f\n",map[seq]*lastdim + tid % lastdim,output[map[seq]*lastdim + tid % lastdim]); + } +} + +template +void trans_map2out_cfunc(const Dtype* input, Dtype* output, int word_size, int seq_sum, + cudaStream_t stream, + int* dev_map_vec) { + int count = seq_sum * word_size; + int block_dim = count; + int grid_dim = 1; + + if (count > 1024) { + block_dim = 256; + grid_dim = (count + block_dim - 1) / block_dim; + } + + trans_map2out << < grid_dim, block_dim, 0, stream >> > (output, input, dev_map_vec, + count, word_size); + + // cudaDeviceSynchronize(); +} + +template +void trans_map2in_cfunc(const Dtype* input, Dtype* output, int hidden_size, int seq_sum, + cudaStream_t stream, + int* dev_map_vec) { + int 
count = seq_sum * hidden_size; + int block_dim = count; + int grid_dim = 1; + if (count > 1024) { + block_dim = 256; + grid_dim = (count + block_dim - 1) / block_dim; + } + + trans_map2in << < grid_dim, block_dim, 0, stream >> > (output, input, dev_map_vec, + count, hidden_size); + +} +template void trans_map2in_cfunc(const float* input, float* output, int hidden_size, int seq_sum, + cudaStream_t stream, + int* dev_map_vec); +template void trans_map2out_cfunc(const float* input, float* output, int word_size, int seq_sum, + cudaStream_t stream, + int* dev_map_vec); +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/detection_helper.cu b/saber/funcs/impl/cuda/base/cuda_c/detection_helper.cu deleted file mode 100644 index eb54ade9d..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/detection_helper.cu +++ /dev/null @@ -1,246 +0,0 @@ -#include "saber/funcs/impl/detection_helper.h" -namespace anakin{ - -namespace saber{ - -template -__global__ void decode_bbox_corner_variance_kernel(const int count, \ - const dtype* loc_data, const dtype* prior_data, const dtype* variance, \ - const int num_priors, const bool share_location, const int num_loc_classes, \ - const int background_label_id, dtype* bbox_data) { - CUDA_KERNEL_LOOP(index, count) { - const int c = index % num_loc_classes; - const int idx_p = (index % num_priors) * 4; - const int idx = index * 4; - if (!share_location && c == background_label_id) { - //! Ignore background class if not share_location. - return; - } - //! variance is encoded in target, we simply need to add the offset predictions. 
- bbox_data[idx] = prior_data[idx_p] + loc_data[idx]; - bbox_data[idx + 1] = prior_data[idx_p + 1] + loc_data[idx + 1]; - bbox_data[idx + 2] = prior_data[idx_p + 2] + loc_data[idx + 2]; - bbox_data[idx + 3] = prior_data[idx_p + 3] + loc_data[idx + 3]; - } -} - -template -__global__ void decode_bbox_corner_no_variance_kernel(const int count, \ - const dtype* loc_data, const dtype* prior_data, const dtype* variance, \ - const int num_priors, const bool share_location, const int num_loc_classes, \ - const int background_label_id, dtype* bbox_data) { - CUDA_KERNEL_LOOP(index, count) { - const int c = index % num_loc_classes; - const int idx_p = (index % num_priors) * 4; - const int idx = index * 4; - if (!share_location && c == background_label_id) { - //! Ignore background class if not share_location. - return; - } - //! variance is encoded in bbox, we need to scale the offset accordingly. - bbox_data[idx] = prior_data[idx_p] + loc_data[idx] * variance[idx_p]; - bbox_data[idx + 1] = prior_data[idx_p + 1] + loc_data[idx + 1] * variance[idx_p + 1]; - bbox_data[idx + 2] = prior_data[idx_p + 2] + loc_data[idx + 2] * variance[idx_p + 2]; - bbox_data[idx + 3] = prior_data[idx_p + 3] + loc_data[idx + 3] * variance[idx_p + 3]; - } -} - -template -__global__ void decode_bbox_center_variance_kernel(const int count, \ - const dtype* loc_data, const dtype* prior_data, const dtype* variance, \ - const int num_priors, const bool share_location, const int num_loc_classes, \ - const int background_label_id, dtype* bbox_data) { - CUDA_KERNEL_LOOP(index, count) { - const int c = index % num_loc_classes; - const int idx_p = (index % num_priors) * 4; - const int idx = index * 4; - if (!share_location && c == background_label_id) { - //! Ignore background class if not share_location. 
- return; - } - const dtype p_xmin = prior_data[idx_p]; - const dtype p_ymin = prior_data[idx_p + 1]; - const dtype p_xmax = prior_data[idx_p + 2]; - const dtype p_ymax = prior_data[idx_p + 3]; - const dtype prior_width = p_xmax - p_xmin; - const dtype prior_height = p_ymax - p_ymin; - const dtype prior_center_x = (p_xmin + p_xmax) / 2.; - const dtype prior_center_y = (p_ymin + p_ymax) / 2.; - - const dtype xmin = loc_data[idx]; - const dtype ymin = loc_data[idx + 1]; - const dtype xmax = loc_data[idx + 2]; - const dtype ymax = loc_data[idx + 3]; - - //! variance is encoded in target, we simply need to retore the offset predictions. - dtype decode_bbox_center_x = xmin * prior_width + prior_center_x; - dtype decode_bbox_center_y = ymin * prior_height + prior_center_y; - dtype decode_bbox_width = exp(xmax) * prior_width; - dtype decode_bbox_height = exp(ymax) * prior_height; - - bbox_data[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - bbox_data[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; - bbox_data[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; - bbox_data[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; - } -} - -template -__global__ void decode_bbox_center_no_variance_kernel(const int count, \ - const dtype* loc_data, const dtype* prior_data, const dtype* variance, \ - const int num_priors, const bool share_location, const int num_loc_classes, \ - const int background_label_id, dtype* bbox_data) { - CUDA_KERNEL_LOOP(index, count) { - const int c = index % num_loc_classes; - const int idx_p = (index % num_priors) * 4; - const int idx = index * 4; - if (!share_location && c == background_label_id) { - //! Ignore background class if not share_location. 
- return; - } - const dtype p_xmin = prior_data[idx_p]; - const dtype p_ymin = prior_data[idx_p + 1]; - const dtype p_xmax = prior_data[idx_p + 2]; - const dtype p_ymax = prior_data[idx_p + 3]; - const dtype prior_width = p_xmax - p_xmin; - const dtype prior_height = p_ymax - p_ymin; - const dtype prior_center_x = (p_xmin + p_xmax) / 2.; - const dtype prior_center_y = (p_ymin + p_ymax) / 2.; - - const dtype xmin = loc_data[idx]; - const dtype ymin = loc_data[idx + 1]; - const dtype xmax = loc_data[idx + 2]; - const dtype ymax = loc_data[idx + 3]; - - //! variance is encoded in bbox, we need to scale the offset accordingly. - dtype decode_bbox_center_x = - variance[idx_p] * xmin * prior_width + prior_center_x; - dtype decode_bbox_center_y = - variance[idx_p + 1] * ymin * prior_height + prior_center_y; - dtype decode_bbox_width = - exp(variance[idx_p + 2] * xmax) * prior_width; - dtype decode_bbox_height = - exp(variance[idx_p + 3] * ymax) * prior_height; - - bbox_data[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - bbox_data[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; - bbox_data[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; - bbox_data[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; - } -} - -template -__global__ void decode_bbox_corner_size_variance_kernel(const int count, \ - const dtype* loc_data, const dtype* prior_data, const dtype* variance, \ - const int num_priors, const bool share_location, const int num_loc_classes, \ - const int background_label_id, dtype* bbox_data) { - CUDA_KERNEL_LOOP(index, count) { - const int c = index % num_loc_classes; - const int idx_p = (index % num_priors) * 4; - const int idx = index * 4; - if (!share_location && c == background_label_id) { - //! Ignore background class if not share_location. 
- return; - } - const dtype p_xmin = prior_data[idx_p]; - const dtype p_ymin = prior_data[idx_p + 1]; - const dtype p_xmax = prior_data[idx_p + 2]; - const dtype p_ymax = prior_data[idx_p + 3]; - const dtype prior_width = p_xmax - p_xmin; - const dtype prior_height = p_ymax - p_ymin; - //! variance is encoded in target, we simply need to add the offset predictions. - bbox_data[idx] = p_xmin + loc_data[idx] * prior_width; - bbox_data[idx + 1] = p_ymin + loc_data[idx + 1] * prior_height; - bbox_data[idx + 2] = p_xmax + loc_data[idx + 2] * prior_width; - bbox_data[idx + 3] = p_ymax + loc_data[idx + 3] * prior_height; - } -} - -template -__global__ void decode_bbox_corner_size_no_variance_kernel(const int count, \ - const dtype* loc_data, const dtype* prior_data, const dtype* variance, \ - const int num_priors, const bool share_location, const int num_loc_classes, \ - const int background_label_id, dtype* bbox_data) { - CUDA_KERNEL_LOOP(index, count) { - const int c = index % num_loc_classes; - const int idx_p = (index % num_priors) * 4; - const int idx = index * 4; - if (!share_location && c == background_label_id) { - //! Ignore background class if not share_location. - return; - } - const dtype p_xmin = prior_data[idx_p]; - const dtype p_ymin = prior_data[idx_p + 1]; - const dtype p_xmax = prior_data[idx_p + 2]; - const dtype p_ymax = prior_data[idx_p + 3]; - const dtype prior_width = p_xmax - p_xmin; - const dtype prior_height = p_ymax - p_ymin; - //! variance is encoded in bbox, we need to scale the offset accordingly. 
- bbox_data[idx] = - p_xmin + loc_data[idx] * variance[idx_p] * prior_width; - bbox_data[idx + 1] = - p_ymin + loc_data[idx + 1] * variance[idx_p + 1] * prior_height; - bbox_data[idx + 2] = - p_xmax + loc_data[idx + 2] * variance[idx_p + 2] * prior_width; - bbox_data[idx + 3] = - p_ymax + loc_data[idx + 3] * variance[idx_p + 3] * prior_height; - } -} - -template -void decode_bboxes(const int nthreads, - const Dtype* loc_data, const Dtype* prior_data, - const CodeType code_type, const bool variance_encoded_in_target, - const int num_priors, const bool share_location, - const int num_loc_classes, const int background_label_id, - Dtype* bbox_data, cudaStream_t stream) { - int count = nthreads / 4; - const Dtype* variance_data = prior_data + 4 * num_priors; - if (code_type == CORNER) { - if (variance_encoded_in_target) { - decode_bbox_corner_variance_kernel\ - <<>>\ - (count, loc_data, prior_data, variance_data, num_priors, share_location, \ - num_loc_classes, background_label_id, bbox_data); - } else { - decode_bbox_corner_no_variance_kernel\ - <<>>\ - (count, loc_data, prior_data, variance_data, num_priors, share_location, \ - num_loc_classes, background_label_id, bbox_data); - } - } else if (code_type == CENTER_SIZE) { - if (variance_encoded_in_target) { - decode_bbox_center_variance_kernel\ - <<>>\ - (count, loc_data, prior_data, variance_data, num_priors, share_location, \ - num_loc_classes, background_label_id, bbox_data); - } else { - decode_bbox_center_no_variance_kernel\ - <<>>\ - (count, loc_data, prior_data, variance_data, num_priors, share_location, \ - num_loc_classes, background_label_id, bbox_data); - } - } else if (code_type == CORNER_SIZE) { - if (variance_encoded_in_target) { - decode_bbox_corner_size_variance_kernel\ - <<>>\ - (count, loc_data, prior_data, variance_data, num_priors, share_location, \ - num_loc_classes, background_label_id, bbox_data); - } else { - decode_bbox_corner_size_no_variance_kernel\ - <<>>\ - (count, loc_data, prior_data, 
variance_data, num_priors, share_location, \ - num_loc_classes, background_label_id, bbox_data); - } - } -} - -template void decode_bboxes(const int nthreads, - const float* loc_data, const float* prior_data, - const CodeType code_type, const bool variance_encoded_in_target, - const int num_priors, const bool share_location, - const int num_loc_classes, const int background_label_id, - float* bbox_data, cudaStream_t stream); -} //namespace anakin - -} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu index 2b72b2a6f..4717081bb 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu @@ -6,31 +6,25 @@ namespace saber{ template __global__ void ker_relu_fwd(Dtype * out_data, - const Dtype* in_data, const int count, Dtype neg_slop, - int in_n, int in_c, int in_h, int in_w, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n, int out_c, int out_h, int out_w, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { - CUDA_KERNEL_LOOP(tid, count){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; - - int write_w = tid % out_w; - int write_h = (tid / (out_w)) % out_h; - int write_c = (tid / (out_h * out_w)) % out_c; - int write_n = (tid / (out_c * out_h * out_w)) % out_n; - - int in_idx = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w * in_w_stride; - - int out_idx = write_n * out_n_stride - + write_c * out_c_stride - + write_h * out_h_stride - + write_w * out_w_stride; + const Dtype* in_data, const int count, Dtype neg_slop, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { + 
CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; Dtype in_var = in_data[in_idx]; out_data[out_idx] = in_var > Dtype(0) ? in_var : in_var * neg_slop; @@ -39,101 +33,115 @@ __global__ void ker_relu_fwd(Dtype * out_data, template __global__ void ker_sigmoid_fwd(Dtype * out_data, + const Dtype* in_data, const int count, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { + + CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; + + Dtype in_var = in_data[in_idx]; + out_data[out_idx] = Dtype( Dtype(1) / (Dtype(1)+ exp(-in_var))); + + } +} + +template +__global__ void ker_tanh_fwd(Dtype * out_data, const Dtype* in_data, const int count, int in_n, int in_c, int in_h, int in_w, int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n, int out_c, int out_h, int out_w, int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { - CUDA_KERNEL_LOOP(tid, count){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; - - int write_w = tid % out_w; - int write_h = (tid / (out_w)) % out_h; - int write_c = (tid / (out_h * out_w)) % out_c; - int write_n = (tid / (out_c * out_h * out_w)) % 
out_n; - - int in_idx = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w * in_w_stride; - - int out_idx = write_n * out_n_stride - + write_c * out_c_stride - + write_h * out_h_stride - + write_w * out_w_stride; + CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; Dtype in_var = in_data[in_idx]; - out_data[out_idx] = Dtype( Dtype(1) / (Dtype(1)+ expf(-in_var))); + //(expf(in_var) - expf(-in_var)) / (expf(in_var) + expf(-in_var));exp + out_data[out_idx] = Dtype(1) - (Dtype(2) / (Dtype(1) + exp(in_var * 2))); } } template -__global__ void ker_tanh_fwd(Dtype * out_data, - const Dtype* in_data, const int count, - int in_n, int in_c, int in_h, int in_w, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n, int out_c, int out_h, int out_w, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { - - CUDA_KERNEL_LOOP(tid, count){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; +__global__ void ker_stanh_fwd(Dtype * out_data, + const Dtype* in_data, const int count, const Dtype slope, const Dtype coef, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { - int write_w = tid % out_w; - int write_h = (tid / (out_w)) % out_h; - int write_c = (tid / (out_h * out_w)) % out_c; - int write_n = (tid / (out_c * out_h * out_w)) % out_n; + CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * 
in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; - int in_idx = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w * in_w_stride; + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; - int out_idx = write_n * out_n_stride - + write_c * out_c_stride - + write_h * out_h_stride - + write_w * out_w_stride; + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; Dtype in_var = in_data[in_idx]; - out_data[out_idx] = Dtype( (expf(in_var) - expf(-in_var)) / (expf(in_var)+ expf(-in_var))); + Dtype var = in_var * slope; + //output_data[j] = param.coef * tanh(param.negative_slope * input_data[j]); + out_data[out_idx] = Dtype( coef * (Dtype(1) - (Dtype(2) / (Dtype(1) + exp(var * 2))))); } } template __global__ void ker_clipped_relu_fwd(Dtype * out_data, - const Dtype* in_data, const int count, Dtype clipped_threadhold, - int in_n, int in_c, int in_h, int in_w, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n, int out_c, int out_h, int out_w, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { - CUDA_KERNEL_LOOP(tid, count){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; - - int write_w = tid % out_w; - int write_h = (tid / (out_w)) % out_h; - int write_c = (tid / (out_h * out_w)) % out_c; - int write_n = (tid / (out_c * out_h * out_w)) % out_n; - - int in_idx = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w * in_w_stride; - - int out_idx = write_n * out_n_stride - + write_c * out_c_stride - + write_h * out_h_stride - + write_w * out_w_stride; + const Dtype* in_data, const int count, Dtype clipped_threadhold, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int 
out_c_stride, int out_h_stride, int out_w_stride) { + CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; Dtype in_var = in_data[in_idx]; in_var = in_var > 0 ? in_var : 0; @@ -142,43 +150,68 @@ __global__ void ker_clipped_relu_fwd(Dtype * out_data, } template __global__ void ker_elu_fwd(Dtype * out_data, - const Dtype* in_data, const int count, Dtype coef, - int in_n, int in_c, int in_h, int in_w, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n, int out_c, int out_h, int out_w, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { + const Dtype* in_data, const int count, Dtype coef, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { CUDA_KERNEL_LOOP(tid, count){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; - - int write_w = tid % out_w; - int write_h = (tid / (out_w)) % out_h; - int write_c = (tid / (out_h * out_w)) % out_c; - int write_n = (tid / (out_c * out_h * out_w)) % out_n; - - int in_idx = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w * in_w_stride; - - int out_idx = write_n * out_n_stride - + write_c * out_c_stride - + write_h * out_h_stride - + write_w * out_w_stride; + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int 
out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; Dtype in_var = in_data[in_idx]; - out_data[out_idx] = in_var > 0 ? in_var : coef * (expf(in_var)-1); + out_data[out_idx] = in_var > 0 ? in_var : coef * (exp(in_var)-1); } } -template <> -SaberStatus SaberActivation::dispatch( \ - const std::vector& inputs, - std::vector& outputs, - ActivationParam& param) { +template +__global__ void ker_prelu_fwd(Dtype * out_data, + const Dtype* in_data, const int count, + const Dtype* slope, bool is_channel_shared, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { + CUDA_KERNEL_LOOP(tid, count){ + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; + + Dtype in_var = in_data[in_idx]; + if (is_channel_shared) { + out_data[out_idx] = in_var > 0 ? in_var : slope[0] * in_var; + } else { + out_data[out_idx] = in_var > 0 ? 
in_var : slope[c] * in_var; + } + } +} + +template +SaberStatus SaberActivation::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param) { Shape in_shape = inputs[0]->valid_shape(); Shape out_shape = outputs[0]->valid_shape(); @@ -186,74 +219,102 @@ SaberStatus SaberActivationget_stride(); Shape stride_out = outputs[0]->get_stride(); - const InDataType *in_data = (const InDataType*)inputs[0]->data(); - OutDataType *out_data = (OutDataType*)outputs[0]->mutable_data(); + const OpDataType *in_data = (const OpDataType*)inputs[0]->data(); + OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); const int count = inputs[0]->valid_size(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); - InDataType negative_slope = param.negative_slope; - InDataType coef = param.coef; - switch (param.active){ + OpDataType negative_slope = param.negative_slope; + OpDataType coef = param.coef; + switch (param.active) { + //x > 0 ? 
x : 0 case Active_relu: - ker_relu_fwd + ker_relu_fwd <<>>( out_data, in_data, count, negative_slope, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - stride_in[0], stride_in[1], stride_in[2], stride_in[3], - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; + // sigmoid: 1/(exp(-x) + 1) case Active_sigmoid: - ker_sigmoid_fwd + ker_sigmoid_fwd <<>>( out_data, in_data, count, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - stride_in[0], stride_in[1], stride_in[2], stride_in[3], - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; + // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) case Active_tanh: - - ker_tanh_fwd + + ker_tanh_fwd <<>>( out_data, in_data, count, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - stride_in[0], stride_in[1], stride_in[2], stride_in[3], - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; + + // stanh : b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} + case Active_stanh: + ker_stanh_fwd + <<>>( + out_data, in_data, count, negative_slope, coef, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + break; + + // x > 0 ? x : 0; + // x < threshold ? 
x : threshold case Active_clipped_relu: - ker_clipped_relu_fwd + ker_clipped_relu_fwd <<>>( out_data, in_data, count, coef, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - stride_in[0], stride_in[1], stride_in[2], stride_in[3], - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; + //elu: x > 0 ? x : coef * (exp(x) - 1) case Active_elu: - ker_elu_fwd + ker_elu_fwd <<>>( out_data, in_data, count, coef, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - stride_in[0], stride_in[1], stride_in[2], stride_in[3], - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; - } + //prelu: x > 0 ? 
x : slope[c] * x + case Active_prelu: + auto prelu_param = param.prelu_param; + const OpDataType* slope_ptr = (const OpDataType*)prelu_param.slope->data(); + bool shared = prelu_param.channel_shared; + ker_prelu_fwd + <<>>( + out_data, in_data, count, + slope_ptr, shared, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + break; + } CUDA_POST_KERNEL_CHECK; - return SaberSuccess; + return SaberSuccess; } +template class SaberActivation; +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, NV, AK_HALF); +} } -} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu index 77a66b1fd..0f9827f46 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu @@ -35,20 +35,7 @@ __global__ void top1(const Dtype* in_data, share_index[index] = -1; } __syncthreads(); - #if 0 - for (int stride = blockDim.x >> 1; - stride > 0; - stride >>= 1) { - if (index < stride) { - int index2 = index + stride; - if (share_data[index2] > share_data[index]) { - share_data[index] = share_data[index2]; - share_index[index] = share_index[index2]; - } - } - __syncthreads(); - } - #else + if (blockSize >= 512) { if (index < 256) { int index2 = index + 256; @@ -127,7 +114,6 @@ __global__ void top1(const Dtype* in_data, } __syncthreads(); - #endif if (index == 0) { if (!out_max_val) { out_data[blockIdx.x] = share_index[0]; @@ -159,20 +145,7 @@ __global__ void block_top1(const Dtype* in_data, } __syncthreads(); - #if 0 - for (int stride = blockDim.x >> 1; - stride > 0; - stride >>= 1) { - if (index < stride) { - int index2 = index + stride; - if (share_data[index2] > share_data[index]) { - share_data[index] = share_data[index2]; - share_index[index] = 
share_index[index2]; - } - } - __syncthreads(); - } - #else + if (blockSize >= 512) { if (index < 256) { int index2 = index + 256; @@ -251,7 +224,6 @@ __global__ void block_top1(const Dtype* in_data, } __syncthreads(); - #endif if (index == 0) { int offset = blockIdx.y * gridDim.x + blockIdx.x; out_data[offset] = share_data[0]; @@ -288,20 +260,6 @@ __global__ void top1(const Dtype* in_data, share_index[index] = -1; } __syncthreads(); - #if 0 - for (int stride = blockDim.x >> 1; - stride > 0; - stride >>= 1) { - if (index < stride) { - int index2 = index + stride; - if (share_data[index2] > share_data[index]) { - share_data[index] = share_data[index2]; - share_index[index] = share_index[index2]; - } - } - __syncthreads(); - } - #else if (blockSize >= 512) { if (index < 256) { int index2 = index + 256; @@ -380,7 +338,6 @@ __global__ void top1(const Dtype* in_data, } __syncthreads(); - #endif if (index == 0) { int block_id = share_index[0]; if (!out_max_val) { @@ -484,8 +441,8 @@ __global__ void topk_channel(const Dtype* in_data, // const Dtype* tmp_in_data = in_data + num_id * channel * inner_dim + inner_id; extern __shared__ Dtype trees[]; - Dtype* small_heap_tree = trees + thread_id * top_k; - Dtype* tree_index = trees + thread_id * top_k + blockDim.x * top_k; + Dtype* small_heap_tree = trees + threadIdx.x * top_k; + Dtype* tree_index = trees + threadIdx.x * top_k + blockDim.x * top_k; for (int i = 0; i < top_k; i++) { small_heap_tree[i] = -FLT_MAX; tree_index[i] = -1; @@ -494,7 +451,7 @@ __global__ void topk_channel(const Dtype* in_data, Dtype data = tmp_in_data[i*inner_dim]; if (data > small_heap_tree[0]) { small_heap_tree[0] = data; - tree_index[i] = i; + tree_index[0] = i; adjust_small_heap_with_index_device(small_heap_tree, tree_index, 0, top_k); } } @@ -656,99 +613,7 @@ __global__ void topk_heap_shared(Dtype *out_data, int n, int inner_dim, const in } __syncthreads(); } - // if (tid < 32) { - // if (blockSize >= 64) { - // volatile Dtype* cur = cur_tree; - // 
volatile Dtype* cur_index = cur_tree_index; - // volatile Dtype* next = cur + 32 * top_k; - // volatile Dtype* next_index = cur_index + 32 * top_k; - // for (int i = 0; i < top_k; i++) { - // if (next[i] > cur[0]) { - // cur[0] = next[i]; - // cur_index[0] = next_index[i]; - // adjust_small_heap_with_index_device(cur, cur_index, 0, top_k); - // } - // } - // } - // if (blockSize >= 32) { - // volatile Dtype* cur = cur_tree; - // volatile Dtype* cur_index = cur_tree_index; - // volatile Dtype* next = cur + 16 * top_k; - // volatile Dtype* next_index = cur_index + 16 * top_k; - // for (int i = 0; i < top_k; i++) { - // if (next[i] > cur[0]) { - // cur[0] = next[i]; - // cur_index[0] = next_index[i]; - // adjust_small_heap_with_index_device(cur, cur_index, 0, top_k); - // } - // } - // } - // if (blockSize >= 16) { - // volatile Dtype* cur = cur_tree; - // volatile Dtype* cur_index = cur_tree_index; - // volatile Dtype* next = cur + 8 * top_k; - // volatile Dtype* next_index = cur_index + 8 * top_k; - // for (int i = 0; i < top_k; i++) { - // if (next[i] > cur[0]) { - // cur[0] = next[i]; - // cur_index[0] = next_index[i]; - // adjust_small_heap_with_index_device(cur, cur_index, 0, top_k); - // } - // } - // if (tid < 8) { - // for(int i = 0; i < top_k; i++) { - // printf("block_id:%d, tid:%d, i:%d, cur_tree:%f, \n", block_id, tid, i, cur[i]); - // } - // } - // } - // if (blockSize >= 8) { - // volatile Dtype* cur = cur_tree; - // volatile Dtype* cur_index = cur_tree_index; - // volatile Dtype* next = cur + 4 * top_k; - // volatile Dtype* next_index = cur_index + 4 * top_k; - // for (int i = 0; i < top_k; i++) { - // if (next[i] > cur[0]) { - // cur[0] = next[i]; - // cur_index[0] = next_index[i]; - // adjust_small_heap_with_index_device(cur, cur_index, 0, top_k); - // if (block_id == 0 && tid < 1) { - // for(int m = 0; m < top_k; m++) { - // printf("block_id:%d, tid:%d, i:%d, m:%d, cur_tree:%f, next:%f\n", block_id, tid, i, m, cur[m], next[m]); - // } - // } - // } 
- // } - // } - // if (blockSize >= 4) { - // volatile Dtype* cur = cur_tree; - // volatile Dtype* cur_index = cur_tree_index; - // volatile Dtype* next = cur + 2 * top_k; - // volatile Dtype* next_index = cur_index + 2 * top_k; - // for (int i = 0; i < top_k; i++) { - // if (next[i] > cur[0]) { - // cur[0] = next[i]; - // cur_index[0] = next_index[i]; - // adjust_small_heap_with_index_device(cur, cur_index, 0, top_k); - // } - // } - // } - // if (blockSize >= 2) { - // volatile Dtype* cur = cur_tree; - // volatile Dtype* cur_index = cur_tree_index; - // volatile Dtype* next = cur + 1 * top_k; - // volatile Dtype* next_index = cur_index + 1 * top_k; - // if (tid == 0) { - // for (int i = 0; i < top_k; i++) { - // printf("block_id:%d, i:%d, cur_val:%f, cur_index:%f, next_val:%f, next_val:%f\n", block_id, i, cur[i], cur_index[i], next[i], next_index[i]); - // //if (next[i] > cur[0]) { - // // cur[0] = next[i]; - // // cur_index[0] = next_index[i]; - // // adjust_small_heap_with_index_device(cur, cur_index, 0, top_k); - // //} - // } - // } - // } - // } + if (tid == 0) { int stride = out_max_val ? 
block_id * top_k * 2 : block_id * top_k; Dtype* out = out_data + stride; @@ -765,205 +630,16 @@ __global__ void topk_heap_shared(Dtype *out_data, int n, int inner_dim, const in } } } - -template -__global__ void topk_heap_shared_no_bank(Dtype *out_data, int n, int inner_dim, const int top_k, const bool out_max_val, const Dtype *in_data){ - extern __shared__ Dtype trees[]; - const int block_id = blockIdx.x; - const int tid = threadIdx.x; - Dtype *cur_tree = trees + tid ; - Dtype *cur_tree_index = cur_tree + top_k * blockDim.x; - for (int i = 0; i < top_k; i++){ - cur_tree[i*blockDim.x] = -FLT_MAX; - cur_tree_index[i * blockDim.x] = -1; - } - int stride = blockDim.x; - -/*build small heap for every thread in one picture*/ - const Dtype* in = in_data + block_id * inner_dim; - for (int i = tid; i < inner_dim; i += blockDim.x){ - if (in[i] > cur_tree[0]) { - cur_tree[0] = in[i]; - cur_tree_index[0] = i; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - __syncthreads(); - if (blockSize >= 512) { - if (tid < 256) { - Dtype* next_tree = cur_tree + 256; - Dtype* next_tree_index = cur_tree_index + 256; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (blockSize >= 256) { - if (tid < 128) { - Dtype* next_tree = cur_tree + 128; - Dtype* next_tree_index = cur_tree_index + 128; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (blockSize >= 128) { - if (tid < 64) { - Dtype* next_tree = cur_tree + 64; - Dtype* next_tree_index = 
cur_tree_index + 64; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (blockSize >= 64) { - if (tid < 32) { - Dtype* next_tree = cur_tree + 32; - Dtype* next_tree_index = cur_tree_index + 32; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (blockSize >= 32) { - if (tid < 16) { - Dtype* next_tree = cur_tree + 16; - Dtype* next_tree_index = cur_tree_index + 16; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (blockSize >= 16) { - if (tid < 8) { - Dtype* next_tree = cur_tree + 8; - Dtype* next_tree_index = cur_tree_index + 8; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (blockSize >= 8) { - if (tid < 4) { - Dtype* next_tree = cur_tree + 4; - Dtype* next_tree_index = cur_tree_index + 4; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - 
__syncthreads(); - } - if (blockSize >= 4) { - if (tid < 2) { - Dtype* next_tree = cur_tree + 2; - Dtype* next_tree_index = cur_tree_index + 2; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (blockSize >= 2) { - if (tid < 1) { - Dtype* next_tree = cur_tree + 1; - Dtype* next_tree_index = cur_tree_index + 1; - for (int i = 0; i < top_k; i++) { - int off = i*stride; - if (next_tree[off] > cur_tree[0]) { - cur_tree[0] = next_tree[off]; - cur_tree_index[0] = next_tree_index[off]; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } - } - __syncthreads(); - } - if (tid == 0) { - int stride = out_max_val ? block_id * top_k * 2 : block_id * top_k; - Dtype* out = out_data + stride; - for (int i = top_k - 1; i >= 0; i--) { - if (!out_max_val) { - out[i] = cur_tree_index[0]; - } else { - out[i] = cur_tree[0]; - out[i + top_k] = cur_tree_index[0]; - } - cur_tree[0] = FLT_MAX; - cur_tree_index[0] = -1; - adjust_small_heap_with_index_device_stride(cur_tree, cur_tree_index, 0, top_k, stride); - } - } -} - -/* -template -SaberStatus SaberArgmax::dispatch( - const std::vector inputs, - std::vector outputs, - ArgmaxParam ¶m) { -*/ -template -SaberStatus SaberArgmax::dispatch(const std::vector& inputs, - std::vector& outputs, - ArgmaxParam& param) { - - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); +template <> +SaberStatus SaberArgmax::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ArgmaxParam& param) { + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); - const InDataType * in_data = inputs[0]->data(); - OutDataType * out_data = outputs[0]->mutable_data(); + const OpDataType * 
in_data = (const OpDataType*)inputs[0]->data(); + OpDataType * out_data = (OpDataType*)outputs[0]->mutable_data(); int outer_dim = inputs[0]->count(0, param.axis); if (param.has_axis) { int count = inputs[0]->count(0, inputs[0]->dims()); @@ -971,9 +647,9 @@ SaberStatus SaberArgmaxcount(param.axis + 1, inputs[0]->dims()); int total_threads = count / dim; if (param.top_k == 1) { - top1_channel<<>>(in_data, outer_dim, dim, inner_dim, param.out_max_val, out_data); + top1_channel<<>>(in_data, outer_dim, dim, inner_dim, param.out_max_val, out_data); } else { - topk_channel<<>>(in_data, outer_dim, dim, inner_dim, param.top_k, param.out_max_val, out_data); + topk_channel<<>>(in_data, outer_dim, dim, inner_dim, param.top_k, param.out_max_val, out_data); } } else { int inner_dim = inputs[0]->count(1, inputs[0]->dims()); @@ -982,21 +658,21 @@ SaberStatus SaberArgmax CUDA_NUM_THREADS ? CUDA_NUM_THREADS : block_size; - top1<<>>(in_data, outer_dim, inner_dim, param.out_max_val, out_data); + top1<<>>(in_data, outer_dim, inner_dim, param.out_max_val, out_data); } else { int block_num = CUDA_GET_BLOCKS(inner_dim); dim3 grid(block_num, outer_dim); - block_top1<<>>(in_data, outer_dim, inner_dim, _block_max_value.mutable_data(), _block_max_index.mutable_data()); - top1<<>>(_block_max_value.data(), _block_max_index.data(), outer_dim, block_num, param.out_max_val, out_data); + block_top1<<>>(in_data, outer_dim, inner_dim, (OpDataType*)_block_max_value.mutable_data(), (OpDataType*)_block_max_index.mutable_data()); + top1<<>>((OpDataType*)_block_max_value.data(), (OpDataType*)_block_max_index.data(), outer_dim, block_num, param.out_max_val, out_data); } } else { - //topk_heap_shared<<>>(out_data, outer_dim, inner_dim, param.top_k, param.out_max_val, in_data); - topk_heap_shared_no_bank<<>>(out_data, outer_dim, inner_dim, param.top_k, param.out_max_val, in_data); + topk_heap_shared<<>>(out_data, outer_dim, inner_dim, param.top_k, param.out_max_val, in_data); } } return SaberSuccess; } - 
+DEFINE_OP_TEMPLATE(SaberArgmax, ArgmaxParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberArgmax, ArgmaxParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_axpy.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_axpy.cu index 15fbd8fad..49f1c3f46 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_axpy.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_axpy.cu @@ -6,45 +6,42 @@ namespace anakin{ namespace saber{ -template +template __global__ void ker_axpy_fwd(int n, int img_size, - const DataDtype* scale, const DataDtype* x, const DataDtype* y, DataDtype* dst) { + const Dtype* scale, const Dtype* x, const Dtype* y, Dtype* dst) { CUDA_KERNEL_LOOP(idx, n) { int scale_id = idx / img_size; dst[idx] = scale[scale_id] * x[idx] + y[idx]; } } -template -SaberStatus SaberAxpy::dispatch(const std::vector& inputs, - std::vector& outputs, - AxpyParam& param) { - - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); +template <> +SaberStatus SaberAxpy::dispatch( \ + const std::vector *>& inputs, + std::vector *>& outputs, + AxpyParam& param){ + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); if (!(inputs[1]->valid_shape() == outputs[0]->valid_shape()) || !(inputs[2]->valid_shape() == outputs[0]->valid_shape())) { return SaberUnKownError; } - const InDataType* scale = inputs[0]->data(); - const InDataType* x = inputs[1]->data(); - const InDataType* y = inputs[2]->data(); - OutDataType* dst = outputs[0]->mutable_data(); + const OpDataType* scale = (OpDataType*)inputs[0]->data(); + const OpDataType* x = (OpDataType*)inputs[1]->data(); + const OpDataType* y = (OpDataType*)inputs[2]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); int img_size = outputs[0]->height() * outputs[0]->width(); int count = outputs[0]->valid_size(); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem() && inputs[1]->is_continue_mem() && inputs[2]->is_continue_mem()) { - ker_axpy_fwd<<>>\ + ker_axpy_fwd<<>>\ (count, img_size, scale, x, y, 
dst); } + //LOG(INFO) << "passed"; return SaberSuccess; } +DEFINE_OP_TEMPLATE(SaberAxpy, AxpyParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberAxpy, AxpyParam, NV, AK_HALF); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_cast.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_cast.cu index 6aafbc3e6..f8c57ee0a 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_cast.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_cast.cu @@ -17,33 +17,46 @@ __global__ void ker_cast_fwd(Ttype * out_data, \ -template -SaberStatus SaberCast::dispatch(const std::vector& inputs, - std::vector& outputs, - CastParam& param) { - - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); +template +SaberStatus SaberCast::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + CastParam& param) { + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = outputs[0]->valid_size(); outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); - if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - ker_cast_fwd\ + if(_inDtype == _outDtype){ + outputs[0]->copy_from(*inputs[0]); + return SaberSuccess; + } + + if(inputs[0]->get_dtype() == 1){//AK_FLOAT + const float* in_data = (const float*)inputs[0]->data(); + int* out_data = (int*)outputs[0]->mutable_data(); + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { + ker_cast_fwd\ <<>>(\ - out_data, in_data, \ - count); + out_data, in_data, count); + } + + } + + if(inputs[0]->get_dtype() == 5){//AK_INT32 + const int* in_data = (const int*)inputs[0]->data(); + float* out_data = (float*)outputs[0]->mutable_data(); + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { + ker_cast_fwd\ + <<>>(\ + out_data, in_data, count); + } } return SaberSuccess; } - +template class SaberCast; +template class SaberCast; +DEFINE_OP_TEMPLATE(SaberCast, 
CastParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberCast, CastParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu index 2a1966914..d07708a2b 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu @@ -41,55 +41,16 @@ __global__ void concat_impl_2d_impl(const int inner_size, const int num_concats, } -template -SaberStatus SaberConcat::dispatch(const std::vector& inputs, - std::vector& outputs, - ConcatParam& param) { - - cudaStream_t stream = this->_ctx.get_compute_stream(); +template <> +SaberStatus SaberConcat::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, ConcatParam& param) { - int input_size = inputs.size(); - - #if 0 //disable share memory - //! inputs only has one tensor - if (input_size == 1) { - outputs[0]->set_shape(outputs[0]->valid_shape(), inputs[0]->shape(), \ - inputs[0]->offset()); - outputs[0]->share_from(*inputs[0]); - return; - } + cudaStream_t stream = this->_ctx->get_compute_stream(); - //! check whether the output is shared from input tensors - bool share_mem = false; - Shape offset_min = inputs[0]->offset(); - const dtype* ptr = inputs[0]->data(); - for (int i = 1; i < input_size; ++i) { - const dtype* ptr2= inputs[i]->data(); - if (inputs[i]->offset() < offset_min) { - offset_min = inputs[i]->offset(); - } - share_mem = (ptr == ptr2); - if (!share_mem){ - break; - } - } - //! input tensors are sharing one tensor - if (share_mem){ - CHECK_LE(outputs[0]->valid_size(), inputs[0]->size()) << "input shared tensors overlap"; - outputs[0]->set_shape(outputs[0]->valid_shape(), inputs[0]->shape(), offset_min); - outputs[0]->share_from(*inputs[0]); - return; - } - #endif // disable share memory + int input_size = inputs.size(); //! 
get output data, valid shape and stride shape - OutDataType* out_data = outputs[0]->mutable_data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); int offset_concat_axis = 0; Shape out_shape = outputs[0]->valid_shape(); const int out_concat_axis = out_shape[param.axis]; @@ -104,7 +65,7 @@ SaberStatus SaberConcatvalid_shape(); //std::vector bottom_shape = {tmp[3], tmp[2], tmp[1], tmp[0]}; - const InDataType* in_data = inputs[i]->data(); + const OpDataType* in_data = (const OpDataType*)inputs[i]->data(); const int in_concat_axis = in_shape[param.axis]; const int in_concat_size = in_concat_axis * _concat_input_size; const int nthreads = in_concat_size * _num_concats; @@ -117,54 +78,32 @@ SaberStatus SaberConcat<<>>( + concat_impl_2d_impl<<>>( in_concat_size, _num_concats, in_data, _concat_input_size, out_concat_axis, offset_concat_axis, out_data ); } else { // NOLINT_NEXT_LINE(whitespace/operators) - concat_impl_cuda<<>>( \ + concat_impl_cuda<<>>( \ nthreads, in_data, _num_concats, _concat_input_size, \ out_concat_axis, in_concat_axis, offset_concat_axis, out_data); } offset_concat_axis += in_concat_axis; } } else { //! 
inputs or outputs memory is not continuous -#if 1 Shape offset_out = outputs[0]->offset(); - OpTensor tsub; + Tensor tsub; for (int i = 0; i < input_size; ++i) { Shape in_shape = inputs[i]->valid_shape(); tsub.share_sub_buffer(*outputs[0], in_shape, offset_out); offset_out[param.axis] += in_shape[param.axis]; tsub.async_copy_from(*inputs[i], stream); } -#endif } - - //outputs[0]->record_event(stream); return SaberSuccess; } -#if 0 -typedef Tensor Tensor4f_1; -typedef Tensor Tensor4f_2; -typedef Tensor Tensor2f; -typedef Tensor Tensor4c_1; -typedef Tensor Tensor4c_2; -typedef Tensor Tensor2c; -template SaberStatus SaberConcat::dispatch(const std::vector inputs, std::vector outputs, - ConcatParam ¶m); -template SaberStatus SaberConcat::dispatch(const std::vector inputs, std::vector outputs, - ConcatParam ¶m); -template SaberStatus SaberConcat::dispatch(const std::vector inputs, std::vector outputs, - ConcatParam ¶m); -template SaberStatus SaberConcat::dispatch(const std::vector inputs, std::vector outputs, - ConcatParam ¶m); -template SaberStatus SaberConcat::dispatch(const std::vector inputs, std::vector outputs, - ConcatParam ¶m); -template SaberStatus SaberConcat::dispatch(const std::vector inputs, std::vector outputs, - ConcatParam ¶m); -#endif +DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, NV, AK_HALF); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_conv_upadding_padding.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_conv_upadding_padding.cu new file mode 100644 index 000000000..b1d87c926 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_conv_upadding_padding.cu @@ -0,0 +1,107 @@ +#include "saber/funcs/impl/cuda/saber_conv_upadding_padding.h" +#include "saber/funcs/saber_util.h" + +namespace anakin { +namespace saber { + +template +__global__ void var_conv_unpadding_padding(Dtype* output, + const Dtype* input, + const int* offset_w, + const int 
batch_size, + const int channel_num, + const int src_height, + const int src_width, + const int dst_height, + const int dst_width) { + // each thread process one channel of a matching matrix + int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + int thread_num = gridDim.x * blockDim.x; + + for (int mm_idx = thread_idx; mm_idx < channel_num * batch_size; mm_idx += thread_num) { + int batch_idx = mm_idx / channel_num; + int channel_idx = mm_idx % channel_num; + int width = offset_w[batch_idx + 1] - offset_w[batch_idx]; + Dtype* p_dst = output + mm_idx * dst_height * dst_width; + const Dtype* p_src = input + mm_idx * src_height * src_width; + + for (int i = 0; i < dst_height; ++i) { + Dtype* p_dst_tmp = p_dst + i * dst_width; + const Dtype* p_src_tmp = p_src + i * src_width; + + for (int j = 0; j < dst_width; ++j) { + if (i < dst_height && j < width) { + *(p_dst_tmp + j) = *(p_src_tmp + j); + } else { + *(p_dst_tmp + j) = 0; + } + } + } + } +} + +template +void anakin_gpu_var_conv_unpadding_padding(Dtype* output_data, + const Dtype* input, + const int* offset_w, + const int batch_size, + const int channel_num, + const int src_height, + const int src_width, + const int dst_height, + const int dst_width, + cudaStream_t stream) { + + int blocks=CUDA_GET_BLOCKS(batch_size * channel_num); + int threads=CUDA_NUM_THREADS; + var_conv_unpadding_padding <<< blocks, threads, 0, stream>>>(output_data, + input, + offset_w, + batch_size, + channel_num, + src_height, + src_width, + dst_height, + dst_width); +} + + +template <> +SaberStatus SaberConvUnpaddingPadding::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvUnpaddingPaddingParam& param) { + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + + const OpDataType* in_ptr= static_cast(inputs[0]->data()); + OpDataType* out_ptr= static_cast(outputs[0]->mutable_data()); +// const int* gpu_height_offset_ptr=static_cast(_height_offset_tensor.data()); + const int* 
gpu_width_offset_ptr=static_cast(_width_offset_tensor.data()); + + Shape in_shape=inputs[0]->valid_shape(); + int in_num=in_shape[0]; + int in_channel=in_shape[1]; + int in_height=in_shape[2]; + int in_width=in_shape[3]; + Shape out_shape=outputs[0]->valid_shape(); + int out_height=out_shape[2]; + int out_width=out_shape[3]; + + anakin_gpu_var_conv_unpadding_padding(out_ptr, + in_ptr, + gpu_width_offset_ptr, + in_num, + in_channel, + in_height, + in_width, + out_height, + out_width, + this->_ctx->get_compute_stream()); + return SaberSuccess; +} + +template class SaberConvUnpaddingPadding; +DEFINE_OP_TEMPLATE(SaberConvUnpaddingPadding, ConvUnpaddingPaddingParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberConvUnpaddingPadding, ConvUnpaddingPaddingParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_crf_decoding.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_crf_decoding.cu new file mode 100644 index 000000000..c1e60980f --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_crf_decoding.cu @@ -0,0 +1,156 @@ +#include "saber/funcs/impl/cuda/saber_crf_decoding.h" +#include "cuda_fp16.h" + + +namespace anakin{ + +namespace saber{ + +template +__global__ void decoding_kernel2(Dtype* decode_path, const Dtype* emission_ptr, const Dtype* trans_ptr, \ + Dtype* alpha_ptr, int* track_ptr, int* seq_offset, int seq_num, int slice_size, int tag_num, const int base_idx){ + + int bdx = blockIdx.x; + if (bdx >= seq_num){ + return; + } + int seq_len = seq_offset[bdx]; + int sum = 0; + int sum2 = 0; + for (int i = 0; i < bdx; i++){ + int tmp = seq_offset[i]; + sum += tmp; + sum2 += tmp * slice_size; + } + Dtype* path = decode_path + sum; + const Dtype* emission = emission_ptr + sum2; + + int idx = threadIdx.x; + const Dtype* x = emission; + const Dtype* w = trans_ptr; + if (idx < tag_num){ + alpha_ptr[idx] = trans_ptr[idx] + emission_ptr[idx]; + } + for (int k = 1; k < seq_len; ++k) { + if (idx < tag_num) { + Dtype max_score = 
-1e32;//-std::numeric_limits::max(); + int max_j = 0; + for (int j = 0; j < tag_num; ++j) { + Dtype score = alpha_ptr[(k - 1) * tag_num + j] + + w[(j + base_idx) * tag_num + idx]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + alpha_ptr[k * tag_num + idx] = max_score + x[k * tag_num + idx]; + track_ptr[k * tag_num + idx] = max_j; + } + } + __syncthreads(); +//only run block times + Dtype max_score = -1e32; + int max_i = 0; + for (int i = 0; i < tag_num; i++) { + Dtype score = alpha_ptr[(seq_len - 1) * tag_num + i] + w[tag_num + i]; + if (score > max_score) { + max_score = score; + max_i = i; + } + } + path[seq_len - 1] = max_i; + for (int k = seq_len - 1; k >= 1; k--) { + max_i = track_ptr[k * tag_num + max_i]; + path[k - 1] = max_i; + } +} + +template +__global__ void decoding_kernel(Dtype* decode_path, const Dtype* emission_ptr, const Dtype* trans_ptr, \ + Dtype* alpha_ptr, int* track_ptr, int seq_len, int tag_num, const int base_idx){ + int idx = threadIdx.x; + const Dtype* x = emission_ptr; + const Dtype* w = trans_ptr; + Dtype* alpha_value = alpha_ptr; + + for (int i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (int k = 1; k < seq_len; ++k) { + for (int i = 0; i < tag_num; ++i) { + Dtype max_score = -1e32;//-std::numeric_limits::max(); + int max_j = 0; + for (int j = 0; j < tag_num; ++j) { + Dtype score = alpha_value[(k - 1) * tag_num + j] + + w[(j + base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_ptr[k * tag_num + i] = max_j; + } + } + Dtype max_score = -1e32; + int max_i = 0; + for (size_t i = 0; i < tag_num; i++) { + Dtype score = alpha_ptr[(seq_len - 1) * tag_num + i] + trans_ptr[tag_num + i]; + if (score > max_score) { + max_score = score; + max_i = i; + } + } + decode_path[seq_len - 1] = max_i; + for (int k = seq_len - 1; k >= 1; k--) { + max_i = track_ptr[k * tag_num + max_i]; + decode_path[k - 
1] = max_i; + } +} + +template <> +SaberStatus SaberCrfDecoding::dispatch( \ + const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam& param){ + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + const OpDataType* emission_ptr = (const OpDataType*)inputs[0]->data(); + const OpDataType* trans_ptr = (const OpDataType*)param.mutable_transition_weight()->data(); + OpDataType* decode_path = (OpDataType*)outputs[0]->mutable_data(); + + int tag_num = inputs[0]->channel(); + int slice_size = tag_num * inputs[0]->height() * inputs[0]->width(); + std::vector> seq_offset = inputs[0]->get_seq_offset(); + int seq_num = seq_offset[0].size() - 1; + const int base_idx = 2; + #if 1 + for (int i = 0; i < seq_num; i++){ + int seq_len = seq_offset[0][i+1] - seq_offset[0][i]; + if (seq_len < 1) continue; + decoding_kernel<<<1, 1, 0, cuda_stream>>>(decode_path, \ + emission_ptr, trans_ptr, (OpDataType*)_alpha.mutable_data(), \ + (int*)_track.mutable_data(), seq_len, tag_num, base_idx); + + emission_ptr += slice_size * seq_len; + decode_path += seq_len; + } + #else + Tensor seq_host; + seq_host.re_alloc(Shape({1, 1, 1, seq_num}, Layout_NCHW), AK_INT32); + _seq.re_alloc(Shape({1, 1, 1, seq_num}, Layout_NCHW), AK_INT32); + int* seq = (int*)seq_host.mutable_data(); + for (int i = 0; i < seq_num; i++){ + seq[i] = seq_offset[0][i+1] - seq_offset[0][i]; + } + _seq.copy_from(seq_host); + decoding_kernel2<<>>(decode_path, \ + emission_ptr, trans_ptr, (OpDataType*)_alpha.mutable_data(), \ + (int*)_track.mutable_data(), (int*)_seq.mutable_data(), seq_num, slice_size, tag_num, base_idx); + // delete seq_host; + #endif + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(SaberCrfDecoding, CrfDecodingParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberCrfDecoding, CrfDecodingParam, NV, AK_HALF); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_crop.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_crop.cu index ad9a267da..29b9132ec 
100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_crop.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_crop.cu @@ -32,20 +32,14 @@ __global__ void ker_crop_fwd(Dtype * out_data, \ } -template -SaberStatus SaberCrop::dispatch(const std::vector& inputs, - std::vector& outputs, - CropParam& param) { - - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); +template <> +SaberStatus SaberCrop::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + CropParam& param) { + + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = outputs[0]->valid_size(); int out_n = outputs[0]->num(); int out_c = outputs[0]->channel(); @@ -53,7 +47,7 @@ SaberStatus SaberCropwidth(); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - ker_crop_fwd\ + ker_crop_fwd\ <<>>(\ out_data, in_data + _img_offset, \ _in_n_stride, _in_c_stride, _in_h_stride, _in_w_stride,\ @@ -63,6 +57,7 @@ SaberStatus SaberCrop -__global__ void ker_ctc_align_fwd(Dtype * out_data, \ - int* out_offset, - const Dtype* in_data, - const int* in_offset, - const int seq_num, - const int blank, - const bool merge_repeated, - const int num_threads) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid == 0) { - int index = 0; - for (int seq_id = 0; seq_id < seq_num; seq_id++) { - Dtype prev_token = -1; - out_offset[seq_id] = index; - for (int i = in_offset[seq_id]; i < in_offset[seq_id + 1]; i++) { - if (in_data[i] != blank && !(merge_repeated && in_data[i] == prev_token)) { - out_data[index++] = in_data[i]; - prev_token = in_data[i]; - } - } - } - out_offset[seq_num] = index; - } -} - -template -SaberStatus SaberCtcAlign::dispatch(const std::vector& inputs, - std::vector& outputs, - CtcAlignParam& param) { - - const 
InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - int count = outputs[0]->valid_size(); - int out_n = outputs[0]->num(); - int* in_offset = _in_offset.mutable_data(); - int* out_offset = _out_offset.mutable_data(); - int seq_num = (inputs[0]->get_seq_offset()).size() - 1; - if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - cudaMemcpyAsync(in_offset, &(inputs[0]->get_seq_offset())[0], sizeof(int) * (seq_num + 1), cudaMemcpyHostToDevice, cuda_stream); - ker_ctc_align_fwd\ - <<>>(\ - out_data, out_offset, in_data, \ - in_offset, seq_num, param.blank, param.merge_repeated, - 1); - - std::vector seq_offset; - seq_offset.resize((inputs[0]->get_seq_offset()).size()); - cudaMemcpyAsync(&seq_offset[0], out_offset, sizeof(int) * (seq_num + 1), cudaMemcpyDeviceToHost, cuda_stream); - outputs[0]->set_seq_offset(seq_offset); - } - - return SaberSuccess; -} - -} -} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_deconv.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_deconv.cu index 79c869f83..3239c9250 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_deconv.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_deconv.cu @@ -1,16 +1,110 @@ #include "saber/funcs/impl/cuda/saber_deconv.h" -#include "saber/funcs/impl/cuda/base/sass_funcs.h" -#include -namespace anakin{ +#include "saber/funcs/saber_util.h" -namespace saber{ +namespace anakin { +namespace saber { + +template +static __global__ void ker_bias_relu(Dtype* tensor, const Dtype* bias, int channel_num, + int channel_size) { + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int channel_id = thread_id / channel_size; + const int channel_inner_index = thread_id % channel_size; + + if (channel_id < channel_num) { + Dtype tmp = tensor[thread_id] + bias[channel_id]; + + if (with_relu) { + tensor[thread_id] = tmp > 0 ? 
tmp : 0; + } else { + tensor[thread_id] = tmp; + } + } +}; + +template +static inline void bias_relu(Dtype* tensor, const Dtype* bias, int channel_num, int channel_size, + int with_relu, cudaStream_t stream) { + if (with_relu) { + ker_bias_relu <<< CUDA_GET_BLOCKS(channel_num* channel_size), + CUDA_NUM_THREADS, 0, stream>>>(tensor, bias, channel_num, channel_size); + } else { + ker_bias_relu <<< CUDA_GET_BLOCKS(channel_num* channel_size), + CUDA_NUM_THREADS, 0, stream>>>(tensor, bias, channel_num, channel_size); + } +} + +template +static __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, + const int height, const int width, const int channels, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int height_col, const int width_col, + Dtype* data_im) { + CUDA_KERNEL_LOOP(index, n) { + Dtype val = 0; + const int w_im = index % width + pad_w; + const int h_im = (index / width) % height + pad_h; + const int c_im = index / (width * height); + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + // compute the start and end of the output + const int w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_im / stride_w + 1, width_col); + const int h_col_start = + (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; + const int h_col_end = min(h_im / stride_h + 1, height_col); + + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int h_k = (h_im - h_col * stride_h); + int w_k = (w_im - w_col * stride_w); + + if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { + h_k /= dilation_h; + w_k /= dilation_w; + int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * + height_col + h_col) * width_col + w_col; + val += data_col[data_col_index]; + } + } + } + + data_im[index] = val; + } +} + +template +static void col2im_gpu(const Dtype* data_col, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + Dtype* data_im, cudaStream_t stream) { + int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + 1; + int num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ // NOLINT_NEXT_LINE(whitespace/operators) + col2im_gpu_kernel <<< CUDA_GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, 0, stream>>>( + num_kernels, data_col, height, width, channels, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + height_col, width_col, data_im); +} template __global__ void direct_deconv(const dtype* const din, const dtype* bias_data, const dtype* const weight_data, const int num, const int in_channels, const int out_channels, - const int hout,const int wout, const int channel_out_stride, + const int hout, const int wout, const int channel_out_stride, const int hin, const int win, const int channel_in_stride, const int kernel_h, const int kernel_w, const int kernel_size, const int stride_h, const int stride_w, @@ -29,16 +123,17 @@ __global__ void direct_deconv(const dtype* const din, int idx_out = iout * channel_out_stride + ho * wout + wo; extern __shared__ dtype sharedw[]; - dtype val = 0; if (wo < wout && ho < hout) { - for(int ic = 0; ic < in_channels; ic++) { + for (int ic = 0; ic < in_channels; ic++) { //! read weights int idx_weight = threadIdx.y * blockDim.x + threadIdx.x; + if (idx_weight < kernel_size) { sharedw[idx_weight] = weight_data[(ic * out_channels + cout) * kernel_size + idx_weight]; } + __syncthreads(); //! get start and end index const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; @@ -65,10 +160,9 @@ __global__ void direct_deconv(const dtype* const din, val += bias_data[cout]; } if (flag_act) { - val = val > (dtype)0? val : (dtype)0; + val = val > (dtype)0 ? 
val : (dtype)0; } dout[idx_out] = val; - } } @@ -92,6 +186,7 @@ __global__ void depthwise_deconv_2d(const int channel_in_stride, const int chann extern __shared__ dtype sharedw[]; int idx = threadIdx.y * blockDim.x + threadIdx.x; + if (idx < kernel_size) { sharedw[idx] = weight[c * kernel_size + idx]; } @@ -103,12 +198,11 @@ __global__ void depthwise_deconv_2d(const int channel_in_stride, const int chann const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, win); - const int khstart=(h >= kernel_h) ? ((h-kernel_h)%stride_h)+(kernel_h-stride_h): h; - const int kwstart=(w >= kernel_w) ? ((w-kernel_w)%stride_w)+(kernel_w-stride_w) : w; + const int khstart = (h >= kernel_h) ? ((h - kernel_h) % stride_h) + (kernel_h - stride_h) : h; + const int kwstart = (w >= kernel_w) ? ((w - kernel_w) % stride_w) + (kernel_w - stride_w) : w; dtype gradient = 0; const dtype* const top_diff_slice = din + i * channel_in_stride; - const dtype* const weight_slice = weight + c * kernel_size; for (int ph = phstart; ph < phend; ++ph) { @@ -122,24 +216,60 @@ __global__ void depthwise_deconv_2d(const int channel_in_stride, const int chann if (bias_flag) { gradient += bias[c]; } + if (relu_flag) { - gradient = gradient > (dtype)0? gradient : (dtype)0; + gradient = gradient > (dtype)0 ? 
gradient : (dtype)0; } + dout[index] = gradient; } } + + +template <> +SaberStatus SaberDeconv2D::create( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) { + _use_k4_s2_p1 = true; + _use_k4_s2_p1 = _use_k4_s2_p1 && (param.weight()->width() == 4); + _use_k4_s2_p1 = _use_k4_s2_p1 && (param.weight()->height() == 4); + _use_k4_s2_p1 = _use_k4_s2_p1 && (param.stride_h == 2); + _use_k4_s2_p1 = _use_k4_s2_p1 && (param.stride_w == 2); + _use_k4_s2_p1 = _use_k4_s2_p1 && (param.pad_h == 1); + _use_k4_s2_p1 = _use_k4_s2_p1 && (param.pad_w == 1); + _use_k4_s2_p1 = _use_k4_s2_p1 && (param.group == 1); + _use_k4_s2_p1 = _use_k4_s2_p1 && (inputs[0]->width() % 64 == 0); + if (_use_k4_s2_p1) { + int in_channel = inputs[0]->channel(); + int out_channel = outputs[0]->channel(); + scale_to_new_tensor_k4_s2_p1_deconv<4>(param.mutable_weight(), + in_channel, out_channel); + return SaberSuccess; + } else { + return SaberUnImplError; + } +} + +template <> +SaberStatus SaberDeconv2D::init( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + template <> -SaberStatus SaberDeconv2D::dispatch( \ - const std::vector& inputs, - std::vector& outputs, - ConvParam& param) { - cudaStream_t stream = this->_ctx.get_compute_stream(); - //! 
inputs only has one tensor - - const float* din = inputs[0]->data(); - float* dout = outputs[0]->mutable_data(); - const float* weight = param.weight()->data(); +SaberStatus SaberDeconv2D::dispatch(\ + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + cudaStream_t stream = this->_ctx->get_compute_stream(); + + const float* din = (const float*)inputs[0]->data(); + float* dout = (float*)outputs[0]->mutable_data(); + const float* weight = (const float*)param.weight()->data(); int win = inputs[0]->width(); int hin = inputs[0]->height(); @@ -152,67 +282,23 @@ SaberStatus SaberDeconv2Dwidth(); int kernel_h = param.weight()->height(); - dim3 block(32, 32); - int gx = (wout + block.x - 1) / block.x; - int gy = (hout + block.y - 1) / block.y; - dim3 grid(gx, gy, num * ch_out); - int channel_in_stride = hin * win; - int channel_out_stride = hout * wout; - int kernel_size = kernel_h * kernel_w; - int shared_mem_size = kernel_size * sizeof(float); - if (_use_k4_s2_p1) { const float * bias_data = (param.bias()->valid_size() > 0) ? - param.bias()->data() : NULL; - const float *weights_data = new_weights_dev.data(); + (const float*)param.bias()->data() : NULL; + const float *weights_data = (const float*)param.weight()->data(); ker_deconv_implicit_gemm_k4_s2_p1_16x64(dout, din, weights_data, bias_data, num, hin, win, hout, wout, ch_in, ch_out, stream); return SaberSuccess; + } else { + return SaberUnImplError; } - if (param.bias()->valid_size() > 0) { // deconv with bias - const float* bias = param.bias()->data(); - //! 
depthwise deconv - if (param.group == ch_in && ch_in == ch_out) { -// LOG(ERROR) << "In deconv cu"; - depthwise_deconv_2d<<>>( - channel_in_stride, channel_out_stride, kernel_size, \ - din, num, ch_in, hin, win, hout, wout, kernel_h, \ - kernel_w, param.stride_h, param.stride_w, \ - param.pad_h, param.pad_w, \ - dout, weight, bias); - } else { - direct_deconv<<>>\ - (din, bias, weight, num, ch_in, ch_out, hout, wout, channel_out_stride, \ - hin, win, channel_in_stride, kernel_h, kernel_w, kernel_size, \ - param.stride_h, param.stride_w, param.pad_h, param.pad_w, \ - param.dilation_h, param.dilation_w, dout); - } - } else { //deconv without bias - //! depthwise deconv - if (param.group == ch_in && ch_in == ch_out) { -// LOG(ERROR) << "In deconv cu"; - depthwise_deconv_2d << < grid, block, shared_mem_size, stream>> > ( - channel_in_stride, channel_out_stride, kernel_size, \ - din, num, ch_in, hin, win, hout, wout, kernel_h, \ - kernel_w, param.stride_h, param.stride_w, \ - param.pad_h, param.pad_w, \ - dout, weight, nullptr); - } else { -// LOG(INFO)<<"Calling This "; - direct_deconv<<>>\ - (din, nullptr, weight, num, ch_in, ch_out, hout, wout, channel_out_stride, \ - hin, win, channel_in_stride, kernel_h, kernel_w, kernel_size, \ - param.stride_h, param.stride_w, param.pad_h, param.pad_w, \ - param.dilation_h, param.dilation_w, dout); - } - } - - return SaberSuccess; } - +template class SaberDeconv2D; +DEFINE_OP_TEMPLATE(SaberDeconv2D, ConvParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberDeconv2D, ConvParam, NV, AK_INT8); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_deconv_act.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_deconv_act.cu deleted file mode 100644 index 75a2c47d1..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_deconv_act.cu +++ /dev/null @@ -1,227 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_deconv_act.h" -#include "saber/funcs/impl/cuda/base/sass_funcs.h" - -namespace anakin{ - -namespace saber{ 
- -template -__global__ void direct_deconv(const dtype* const din, - const dtype* bias_data, const dtype* const weight_data, - const int num, const int in_channels, const int out_channels, - const int hout,const int wout, const int channel_out_stride, - const int hin, const int win, const int channel_in_stride, - const int kernel_h, const int kernel_w, const int kernel_size, - const int stride_h, const int stride_w, - const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, - dtype* dout) { - - int wo = blockIdx.x * blockDim.x + threadIdx.x; - int w = wo + pad_w; - int ho = blockIdx.y * blockDim.y + threadIdx.y; - int h = ho + pad_h; - int iout = blockIdx.z; - int cout = iout % out_channels; - int n = iout / out_channels; - int iin = n * in_channels; - int idx_out = iout * channel_out_stride + ho * wout + wo; - - extern __shared__ dtype sharedw[]; - - dtype val = 0; - - if (wo < wout && ho < hout) { - for(int ic = 0; ic < in_channels; ic++) { - //! read weights - int idx_weight = threadIdx.y * blockDim.x + threadIdx.x; - if (idx_weight < kernel_size) { - sharedw[idx_weight] = weight_data[(cout * in_channels + ic) * kernel_size + idx_weight]; - } - __syncthreads(); - //! get start and end index - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, hin); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, win); - - const int khstart = (h >= kernel_h) ? ((h - kernel_h) % stride_h) + (kernel_h - stride_h) : h; - const int kwstart = (w >= kernel_w) ? ((w - kernel_w) % stride_w) + (kernel_w - stride_w) : w; - - const dtype* const din_c = din + (iin + ic) * channel_in_stride; - - //! 
start computation - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - int kh = khstart - (ph - phstart) * stride_h; - int kw = kwstart - (pw - pwstart) * stride_w; - val += din_c[ph * win + pw] * sharedw[kh * kernel_w + kw]; - } - } - } - //! finnal computation - if (flag_bias) { - val += bias_data[cout]; - } - if (flag_act) { - val = val > (dtype)0? val : (dtype)0; - } - dout[idx_out] = val; - - } -} - -template -__global__ void depthwise_deconv_2d(const int channel_in_stride, const int channel_out_stride, - const int kernel_size, - const dtype* const din, const int num, const int channels, - const int hin, const int win, const int hout, - const int wout, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - dtype* const dout, const dtype* const weight, const dtype* const bias) { - - int wo = blockIdx.x * blockDim.x + threadIdx.x; - int w = wo + pad_w; - int ho = blockIdx.y * blockDim.y + threadIdx.y; - int h = ho + pad_h; - int c = blockIdx.z % channels; - //int n = blockIdx.z / channels; - int i = blockIdx.z; - int index = i * channel_out_stride + ho * wout + wo; - - extern __shared__ dtype sharedw[]; - int idx = threadIdx.y * blockDim.x + threadIdx.x; - if (idx < kernel_size) { - sharedw[idx] = weight[c * kernel_size + idx]; - } - __syncthreads(); - - if (wo < wout && ho < hout) { - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, hin); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, win); - - const int khstart=(h >= kernel_h) ? ((h-kernel_h)%stride_h)+(kernel_h-stride_h): h; - const int kwstart=(w >= kernel_w) ? 
((w-kernel_w)%stride_w)+(kernel_w-stride_w) : w; - - dtype gradient = 0; - const dtype* const top_diff_slice = din + i * channel_in_stride; - - const dtype* const weight_slice = weight + c * kernel_size; - - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - int kh = khstart - (ph - phstart) * stride_h; - int kw = kwstart - (pw - pwstart) * stride_w; - gradient += top_diff_slice[ph * win + pw] * sharedw[kh * kernel_w + kw]; - //gradient += top_diff_slice[ph * win + pw] * weight_slice[kh * kernel_w + kw]; - } - } - if (bias_flag) { - gradient += bias[c]; - } - if (relu_flag) { - gradient = gradient > (dtype)0? gradient : (dtype)0; - } - dout[index] = gradient; - } -} - -template -SaberStatus SaberDeconv2DAct::dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param) { - cudaStream_t stream = this->_ctx.get_compute_stream(); - //! inputs only has one tensor - - const InDataType* din = inputs[0]->data(); - OutDataType* dout = outputs[0]->mutable_data(); - const OpDataType* weight = param.conv_param.weight()->data(); - - int win = inputs[0]->width(); - int hin = inputs[0]->height(); - int num = inputs[0]->num(); - int ch_in = inputs[0]->channel(); - int wout = outputs[0]->width(); - int hout = outputs[0]->height(); - int ch_out = outputs[0]->channel(); - - int kernel_w = param.conv_param.weight()->width(); - int kernel_h = param.conv_param.weight()->height(); - - dim3 block(32, 32); - int gx = (wout + block.x - 1) / block.x; - int gy = (hout + block.y - 1) / block.y; - dim3 grid(gx, gy, num * ch_out); - int channel_in_stride = hin * win; - int channel_out_stride = hout * wout; - int kernel_size = kernel_h * kernel_w; - int shared_mem_size = kernel_size * sizeof(OpDataType); - - if (_use_k4_s2_p1) { - const InDataType * bias_data = (param.conv_param.bias()->valid_size() > 0) ? 
- param.conv_param.bias()->data() : NULL; - const OpDataType *weights_data = new_weights_dev.data(); - ker_deconv_implicit_gemm_k4_s2_p1_32x32_relu(dout, din, - weights_data, bias_data, - num, - hin, win, hout, wout, - ch_in, ch_out, stream); - return SaberSuccess; - } - - if (param.conv_param.bias()->valid_size() > 0) { // deconv with bias - const InDataType* bias = param.conv_param.bias()->data(); - //! depthwise deconv - if (param.conv_param.group == ch_in && ch_in == ch_out) { - depthwise_deconv_2d<<>>( - channel_in_stride, channel_out_stride, kernel_size, \ - din, num, ch_in, hin, win, hout, wout, kernel_h, \ - kernel_w, param.conv_param.stride_h, param.conv_param.stride_w, \ - param.conv_param.pad_h, param.conv_param.pad_w, \ - dout, weight, bias); - } else { - direct_deconv<<>> - (din, bias, weight, - num, ch_in, ch_out, hout, wout, channel_out_stride, - hin, win, channel_in_stride, - kernel_h, kernel_w, kernel_size, - param.conv_param.stride_h, param.conv_param.stride_w, - param.conv_param.pad_h, param.conv_param.pad_w, - param.conv_param.dilation_h, param.conv_param.dilation_w, - dout); - - } - } else { //deconv without bias - //! 
depthwise deconv - if (param.conv_param.group == ch_in && ch_in == ch_out) { - depthwise_deconv_2d << < grid, block, shared_mem_size, stream>> > ( - channel_in_stride, channel_out_stride, kernel_size, \ - din, num, ch_in, hin, win, hout, wout, kernel_h, \ - kernel_w, param.conv_param.stride_h, param.conv_param.stride_w, \ - param.conv_param.pad_h, param.conv_param.pad_w, \ - dout, weight, nullptr); - } else { - direct_deconv<<>> - (din, nullptr, weight, num, ch_in, ch_out, hout, wout, channel_out_stride, - hin, win, channel_in_stride, kernel_h, kernel_w, kernel_size, - param.conv_param.stride_h, param.conv_param.stride_w, - param.conv_param.pad_h, param.conv_param.pad_w, - param.conv_param.dilation_h, param.conv_param.dilation_w, - dout); - } - } - - return SaberSuccess; -} - -} //namespace anakin - -} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_deformable_conv.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_deformable_conv.cu deleted file mode 100644 index 60ca9a7c4..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_deformable_conv.cu +++ /dev/null @@ -1,191 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_deformable_conv.h" -#include "cuda_fp16.h" -#include "saber/core/tensor_op.h" - -namespace anakin { -namespace saber { - -__device__ float deformable_im2col_bilinear(const float* bottom_data, const int data_width, - const int height, const int width, float h, float w) { - int h_low = floor(h); - int w_low = floor(w); - int h_high; - int w_high; - if (h_low >= height - 1) { - h_high = h_low = height - 1; - h = (float) h_low; - } else { - h_high = h_low + 1; - } - - if (w_low >= width - 1) { - w_high = w_low = width - 1; - w = (float) w_low; - } else { - w_high = w_low + 1; - } - float lh = h - h_low; - float lw = w - w_low; - float hh = 1 - lh, hw = 1 - lw; - float v1 = bottom_data[h_low * data_width + w_low]; - float v2 = bottom_data[h_low * data_width + w_high]; - float v3 = bottom_data[h_high * data_width + w_low]; - float v4 = 
bottom_data[h_high * data_width + w_high]; - float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - -__global__ void deformable_im2col_gpu_kernel(const int n, const float* data_im, - const float* data_offset, const int height, const int width, - const int kernel_h, const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, const int height_col, - const int width_col, float* data_col) { - - CUDA_KERNEL_LOOP(index, n) { - const int w_col = index % width_col; - const int h_col = (index / width_col) % height_col; - const int c_im = (index / width_col) / height_col; - const int c_col = c_im * kernel_h * kernel_w; - - // compute deformable group index - // THIS IS THE TRUE CHANNEL - const int deformable_group_index = c_im / channel_per_deformable_group; - - //input map coord(h_in, w_in) - const int h_in = h_col * stride_h - pad_h; - const int w_in = w_col * stride_w - pad_w; - //data_col (data & offset) - float* data_col_ptr = data_col - + (c_col * height_col + h_col) * width_col + w_col; - const float* data_im_ptr = data_im + (c_im * height + h_in) * width - + w_in; - const float* data_offset_ptr = data_offset - + deformable_group_index * 2 * kernel_h * kernel_w * height_col - * width_col; - - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - //offset_h and offset_w in the same channel - const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) - * height_col + h_col) * width_col + w_col; - - const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) - * height_col + h_col) * width_col + w_col; - const float offset_h = data_offset_ptr[data_offset_h_ptr]; - const float offset_w = data_offset_ptr[data_offset_w_ptr]; - float val = 0.f; - const float h_im = h_in + i * dilation_h + offset_h; - const float w_im = w_in + j * dilation_w + offset_w; 
- if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { - const float map_h = i * dilation_h + offset_h; - const float map_w = j * dilation_w + offset_w; - // cur_height (from h_in to height) - const int cur_height = height - h_in; - const int cur_width = width - w_in; - val = deformable_im2col_bilinear(data_im_ptr, width, - cur_height, cur_width, map_h, map_w); - } - *data_col_ptr = val; - data_col_ptr += height_col * width_col; - } - } - } -} -__global__ void gpu_add_bias(float * out_data, const int count, - int in_n, int in_c, int in_h, int in_w, - int in_n_stride, int in_c_stride, - int in_h_stride, int in_w_stride, - const float *bias) { - CUDA_KERNEL_LOOP(tid, count){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; - - int in_idx = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w * in_w_stride; - - float in_var = out_data[in_idx]; - float in_bias = bias[read_c]; - out_data[in_idx] = in_var + in_bias; - } -} -template <> -SaberStatus SaberDeformableConv2D::dispatch(const std::vector& inputs, - std::vector& outputs, - DeformableConvParam& param) { - - int in_channel = inputs[0]->channel(); - int conv_out_channel = outputs[0]->channel(); - - const OpDataType* weight = (const float*)param.weight()->data(); - const InDataType* data = inputs[0]->data(); - const InDataType* offset = inputs[1]->data(); - - InDataType* top_data = outputs[0]->mutable_data(); - - InDataType* deformable_col_buffer_data = _deform_col_buffer.mutable_data(); - const InDataType* deform_col_buffer_data_const = _deform_col_buffer.data(); - - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - - for (int n = 0; n < inputs[0]->num(); ++n) { - - // transform image to col_buffer in order to use gemm - - int channel_per_group = in_channel / param.group; - int num_kernels = in_channel * _deform_col_buffer.height() * 
_deform_col_buffer.width(); - - deformable_im2col_gpu_kernel - <<>>( - num_kernels, data + n * _bottom_dim, offset + n * _offset_dim, - inputs[0]->height(), inputs[0]->width(), - param.weight()->height(), param.weight()->width(), - param.pad_h, param.pad_w, param.stride_h, param.stride_w, - param.dilation_h, param.dilation_w, - channel_per_group, _deform_col_buffer.height(), - _deform_col_buffer.width(), - deformable_col_buffer_data); - - for (int g = 0; g < param.group; ++g) { - float alpha = 1.f; - float beta = 0.f; - CUBLAS_CHECK(cublasSgemm(_handle, CUBLAS_OP_N, CUBLAS_OP_N, - _conv_out_spatial_dim, - conv_out_channel / param.group, - _kernel_dim / param.group, - &alpha, - deform_col_buffer_data_const + _col_offset * g, - _conv_out_spatial_dim, - weight + _kernel_offset * g, - _kernel_dim / param.group, - &beta, - top_data + _output_offset * g, - _conv_out_spatial_dim)); - } - if (param.bias()->size() > 0) { - Shape out_shape = outputs[0]->valid_shape(); - Shape out_stride = outputs[0]->get_stride(); - int out_count = outputs[0]->size(); - const float* bias_data = (const float*)param.bias()->data(); - gpu_add_bias<<>> (top_data, out_count, - out_shape[0], out_shape[1], - out_shape[2], out_shape[3], - out_stride[0], out_stride[1], - out_stride[2], out_stride[3], - bias_data); - } - CUDA_POST_KERNEL_CHECK; - } - - return SaberSuccess; -} - -} -} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu index 9630f0164..ea3aef53d 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu @@ -1,5 +1,6 @@ -#include "saber/funcs/impl/cuda/saber_conv_act.h" - +//#include "saber/funcs/impl/cuda/saber_conv_act.h" +#include "saber/saber_types.h" +#include "saber/core/common.h" namespace anakin{ namespace saber{ @@ -34,8 +35,8 @@ __global__ void depthwise_conv_1d(const int nthreads, const Dtype* const weight_slice 
= weight + c * size_kernel; - int khstart = hend < kernel_h? kernel_h - hend : 0; - int kwstart = wend < kernel_w? kernel_w - wend : 0; + int khstart = hend < kernel_h ? kernel_h - hend : 0; + int kwstart = wend < kernel_w ? kernel_w - wend : 0; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -104,7 +105,7 @@ template SaberStatus saber_depthwise_conv_act(const dtype* input, dtype* output, \ int num, int cin, int hin, int win, int hout, int wout, \ int kw, int kh, int stride_w, int stride_h, \ - int pad_h, int pad_w, const dtype* weights, const dtype* bias, \ + int pad_w, int pad_h, const dtype* weights, const dtype* bias, \ cudaStream_t stream) { #define D1 @@ -123,12 +124,12 @@ SaberStatus saber_depthwise_conv_act(const dtype* input, dtype* output, \ if (bias_flag) { #ifdef D1 - depthwise_conv_1d<<>>( + depthwise_conv_1d<<>>( count, input, num, cin, hin, win, hout, wout, kh, \ kw, stride_h, stride_w, pad_h, pad_w, \ output, weights, bias); #else - depthwise_conv_2d<<>>( + depthwise_conv_2d<<>>( channel_in_stride, channel_out_stride, kernel_size, \ input, num, cin, hin, win, hout, wout, kh, \ kw, stride_h, stride_w, pad_h, pad_w, \ @@ -136,12 +137,12 @@ SaberStatus saber_depthwise_conv_act(const dtype* input, dtype* output, \ #endif } else { #ifdef D1 - depthwise_conv_1d<<< CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>> ( + depthwise_conv_1d<<< CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>> ( count, input, num, cin, hin, win, hout, wout, kh, \ kw, stride_h, stride_w, pad_h, \ pad_w, output, weights, nullptr); #else - depthwise_conv_2d<<>>( + depthwise_conv_2d<<>>( channel_in_stride, channel_out_stride, kernel_size, \ input, num, cin, hin, win, hout, wout, kh, \ kw, stride_h, stride_w, pad_h, pad_w, \ diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_detection_output.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_detection_output.cu deleted file mode 100644 index af08e4dbe..000000000 --- 
a/saber/funcs/impl/cuda/base/cuda_c/saber_detection_output.cu +++ /dev/null @@ -1,100 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_detection_output.h" -#include "saber/funcs/impl/detection_helper.h" -namespace anakin{ - -namespace saber{ -template -__global__ void permute_data_kernel(const int nthreads, - const dtype* data, const int num_classes, const int num_data, - const int num_dim, dtype* new_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int i = index % num_dim; - const int c = (index / num_dim) % num_classes; - const int d = (index / num_dim / num_classes) % num_data; - const int n = index / num_dim / num_classes / num_data; - const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; - new_data[new_index] = data[index]; - } -} - -template -void permute_data(const int nthreads, - const dtype* data, const int num_classes, const int num_data, - const int num_dim, dtype* new_data, cudaStream_t stream) { - // NOLINT_NEXT_LINE(whitespace/operators) - permute_data_kernel<<>>(nthreads, data, num_classes, num_data, - num_dim, new_data); -} - -template -SaberStatus SaberDetectionOutput::dispatch(const std::vector& inputs, - std::vector& outputs, - DetectionOutputParam& param) { - - //typedef typename DataTensor_in::Dtype InDataType; - //typedef typename - cudaStream_t stream = this->_ctx.get_compute_stream(); - - DataTensor_in* t_loc = inputs[0]; - DataTensor_in* t_conf = inputs[1]; - DataTensor_in* t_prior = inputs[2]; - - const InDataType* loc_data = t_loc->data(); - const InDataType* prior_data = t_prior->data(); - const int num = t_loc->num(); - - // Decode predictions. - InDataType* bbox_data = _bbox_preds.mutable_data(); - const int loc_count = _bbox_preds.valid_size(); - decode_bboxes(loc_count, loc_data, prior_data, param.type, \ - param.variance_encode_in_target, _num_priors, param.share_location, \ - _num_loc_classes, param.background_id, bbox_data, stream); - // Retrieve all decoded location predictions. 
- if (!param.share_location) { - InDataType * bbox_permute_data = _bbox_permute.mutable_data(); - permute_data(loc_count, bbox_data, _num_loc_classes, _num_priors, - 4, bbox_permute_data, stream); - } - // Retrieve all confidences. - InDataType* conf_permute_data = _conf_permute.mutable_data(); - permute_data(t_conf->valid_size(), t_conf->data(), \ - this->_num_classes, _num_priors, 1, conf_permute_data, stream); - - CUDA_CHECK(cudaMemcpyAsync(_bbox_cpu_data, _bbox_preds.data(), \ - _bbox_preds.valid_size() * sizeof(InDataType), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(_conf_cpu_data, _conf_permute.data(), \ - _conf_permute.valid_size() * sizeof(InDataType), cudaMemcpyDeviceToHost, stream)); - cudaStreamSynchronize(stream); - - std::vector result; - - nms_detect(_bbox_cpu_data, _conf_cpu_data, result, num, this->_num_classes, _num_priors, param.background_id, \ - param.keep_top_k, param.nms_top_k, param.conf_thresh, param.nms_thresh, param.nms_eta, param.share_location); - - if(result.size() == 0) { - result.resize(7); - for (int i = 0; i < 7; ++i) { - result[i] = (InDataType)-1; - } - outputs[0]->reshape({1, 1, 1, 7}); - } else { - outputs[0]->reshape({1, 1, result.size() / 7, 7}); - } - - CUDA_CHECK(cudaMemcpyAsync(outputs[0]->mutable_data(), result.data(), \ - result.size() * sizeof(InDataType), cudaMemcpyHostToDevice, stream)); - - return SaberSuccess; -} - -//template class SaberDetectionOutput; -} //namespace anakin - -} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu index b4f2e28c9..8fdf09bbd 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu @@ -1,99 +1,241 @@ #include "saber/funcs/impl/cuda/saber_eltwise.h" namespace anakin { namespace saber { +#if 0 +template +static __global__ void ker_multi_elt_production(Dtype* out_data, const Dtype** in_data, int count, + int input_size) { 
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x; -template -__global__ void ker_elt_production(Dtype* out_data, const Dtype * in_data_a, const Dtype * in_data_b, int count){ - CUDA_KERNEL_LOOP(tid, count){ - out_data[tid] = in_data_a[tid] * in_data_b[tid]; + if (tid < count) { + Dtype tmp = in_data[0][tid]; + + for (int i = 1; i < input_size; i++) { + tmp *= in_data[i][tid]; + } + + if (with_relu) { + out_data[tid] = tmp > static_cast(0) ? tmp : static_cast(0); + } else { + out_data[tid] = tmp; + } } } -template -__global__ void ker_elt_sum(Dtype* out_data, const Dtype * in_data1,const Dtype * in_data2, Dtype coeff1, Dtype coeff2, int count){ - CUDA_KERNEL_LOOP(tid, count){ - out_data[tid] = coeff1*in_data1[tid] + coeff2 * in_data2[tid]; +template +static __global__ void ker_multi_elt_sum(Dtype* out_data, const Dtype** in_data, const Dtype* coeff, + int count, int input_size) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid < count) { + Dtype tmp = coeff[0] * in_data[0][tid]; + + for (int i = 1; i < input_size; i++) { + tmp += coeff[i] * in_data[i][tid]; + } + + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } } } -template -__global__ void ker_elt_max(Dtype * out_data, float * mask, const Dtype * in_data_a, const Dtype * in_data_b, int count, int bid){ - if(bid == 0){ - CUDA_KERNEL_LOOP(tid, count){ - Dtype var_a = in_data_a[tid]; - Dtype var_b = in_data_b[tid]; - bool a_gt_b = var_a > var_b; - out_data[tid] = a_gt_b ? var_a : var_b; - mask[tid] = a_gt_b ? 0 : 1; +template +static __global__ void ker_multi_elt_max(Dtype* out_data, const Dtype** in_data, int count, + int input_size) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid < count) { + Dtype tmp = in_data[0][tid]; + + for (int i = 1; i < input_size; i++) { + tmp = tmp >= in_data[i][tid] ? tmp : in_data[i][tid]; + } + + if (with_relu) { + out_data[tid] = tmp > static_cast(0) ? 
tmp : static_cast(0); + } else { + out_data[tid] = tmp; } } - else{ - CUDA_KERNEL_LOOP(tid, count){ - Dtype var_a = in_data_a[tid]; - Dtype var_b = in_data_b[tid]; - bool a_gt_b = var_a > var_b; - if( ! a_gt_b){ - out_data[tid] = var_b; - mask[tid] = bid; - } +} +#endif + +template +__global__ void ker_elt_production(Dtype* out_data, const Dtype* in_data_a, const Dtype* in_data_b, + int count) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp = in_data_a[tid] * in_data_b[tid]; + + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} + +template +__global__ void ker_elt_sum(Dtype* out_data, const Dtype* in_data1, const Dtype* in_data2, + Dtype coeff1, Dtype coeff2, int count) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp = coeff1 * in_data1[tid] + coeff2 * in_data2[tid]; + + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; } } } +template +__global__ void ker_elt_max(Dtype* out_data, const Dtype* in_data_a, const Dtype* in_data_b, + int count) { + + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp; + Dtype var_a = in_data_a[tid]; + Dtype var_b = in_data_b[tid]; + bool a_gt_b = var_a > var_b; + tmp = a_gt_b ? var_a : var_b; + + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? 
tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} + + +template +SaberStatus SaberEltwise::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + EltwiseParam& param) { + const int count = outputs[0]->valid_size(); + OpDataType* out_data = static_cast(outputs[0]->mutable_data()); + const OpDataType* in_data_a = static_cast(inputs[0]->data()); + const OpDataType* in_data_b = static_cast(inputs[1]->data()); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + + int grid_dim = CUDA_GET_BLOCKS(count); + int block_dim = CUDA_NUM_THREADS; -template <> -SaberStatus SaberEltwise::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - EltwiseParam ¶m) { - float * mask = NULL; - const int count = outputs[0]->size(); - float *out_data = outputs[0]->mutable_data(); - const float *in_data_a = inputs[0]->data(); - const float *in_data_b = inputs[1]->data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - switch(param.operation){ - case Eltwise_prod: - ker_elt_production - <<>>(out_data, in_data_a, + switch (param.operation) { + case Eltwise_prod: + if (_with_relu) { + if (inputs.size() <= 2) { + ker_elt_production <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, in_data_a, + in_data_b, count); + } else { + ker_elt_production <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, + in_data_a, + in_data_b, count); + + for (int i = 2; i < inputs.size() - 1; i++) { + ker_elt_production + <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, + static_cast(inputs[i]->data()), count); + } + + ker_elt_production + <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, + static_cast(inputs[inputs.size() - 1]->data()), count); + } + + } else { + + ker_elt_production <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, + in_data_a, in_data_b, count); - for(int i = 2; i < inputs.size(); i++){ - ker_elt_production - <<>>(out_data, out_data, - inputs[i]->data(), count); - } - break; - case 
Eltwise_sum: - ker_elt_sum - <<>>(out_data, - inputs[0]->data(), inputs[1]->data(), + for (int i = 2; i < inputs.size(); i++) { + ker_elt_production + <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, + static_cast(inputs[i]->data()), count); + } + + } + + break; + + case Eltwise_sum: + if (_with_relu) { + ker_elt_sum + <<< + grid_dim, block_dim, 0, cuda_stream >>> (out_data, + in_data_a, in_data_b, param.coeff[0], param.coeff[1], count); - break; - case Eltwise_max: - mask = _max_idx.mutable_data(); - ker_elt_max - <<>>(out_data, mask, - in_data_a, in_data_b, count, 0); - - for(int i = 2; i < inputs.size(); i++){ - ker_elt_max - <<>>(out_data, mask, - out_data, inputs[i]->data(), count, i); - } - break; - default: - LOG(FATAL) << "unknown elementwise operation. "; - } + } else { + ker_elt_sum + <<< + grid_dim, block_dim, 0, cuda_stream >>> (out_data, + in_data_a, in_data_b, + param.coeff[0], param.coeff[1], count); + } + + break; + + case Eltwise_max: + + // mask = (float *) _max_idx.mutable_data(); + if (_with_relu) { + if (inputs.size() <= 2) { + ker_elt_max + <<< grid_dim, block_dim, 0, cuda_stream >>>(out_data, + in_data_a, in_data_b, count); + } else { + ker_elt_max <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, + in_data_a, + in_data_b, count); + + for (int i = 2; i < inputs.size() - 1; i++) { + ker_elt_max + <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, + static_cast(inputs[i]->data()), count); + } + + ker_elt_max + <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, + static_cast(inputs[inputs.size() - 1]->data()), count); + } + } else { + + ker_elt_max <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, + in_data_a, + in_data_b, count); + + for (int i = 2; i < inputs.size() ; i++) { + ker_elt_max + <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, + static_cast(inputs[i]->data()), count); + } + + } + + + break; + + default: + LOG(FATAL) << "unknown elementwise operation. 
"; + } + + if (_other_activation) { + SABER_CHECK(_saber_activation.dispatch(inputs, outputs, param.activation_param)); + } CUDA_POST_KERNEL_CHECK; return SaberSuccess; } +template class SaberEltwise; +DEFINE_OP_TEMPLATE(SaberEltwise, EltwiseParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberEltwise, EltwiseParam, NV, AK_INT8); + } } \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise_act.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise_act.cu deleted file mode 100644 index 3a770c33e..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise_act.cu +++ /dev/null @@ -1,50 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_eltwise_act.h" - -namespace anakin { -namespace saber { - -template -__global__ void ker_elt_sum_with_relu(Dtype* out_data, const Dtype * in_data1,const Dtype * in_data2, Dtype coeff1,Dtype coeff2, int count){ - CUDA_KERNEL_LOOP(tid, count){ - Dtype temp = coeff1*in_data1[tid] + coeff2 * in_data2[tid]; - out_data[tid] = temp > 0.0 ? temp : 0.0; - } -} - -template <> -SaberStatus SaberEltwiseActive::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - EltwiseActiveParam ¶m) { - - const int count = outputs[0]->size(); - float * out_data = outputs[0]->mutable_data(); - const float *in_data_a = inputs[0]->data(); - const float *in_data_b = inputs[1]->data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - switch(param.eltwise_param.operation){ - case Eltwise_prod: - LOG(FATAL)<<"NOT IMPLEMENT yet!!"; - - break; - case Eltwise_sum: - ker_elt_sum_with_relu - <<>>(out_data, - inputs[0]->data(), inputs[1]->data(), - param.eltwise_param.coeff[0], param.eltwise_param.coeff[1], count); - break; - case Eltwise_max: - LOG(FATAL)<<"NOT IMPLEMENT yet!!"; - - break; - default: - LOG(FATAL) << "unknown elementwise operation. 
"; - } - - CUDA_POST_KERNEL_CHECK; - return SaberSuccess; -} - -} -} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_embedding.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_embedding.cu index 529c38661..9a6fda535 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_embedding.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_embedding.cu @@ -26,28 +26,35 @@ __global__ void ker_embedding_fwd(OutDataType * out_data, } } -template <> -SaberStatus SaberEmbedding::dispatch( \ - const std::vector& inputs, - std::vector& outputs, - EmbeddingParam& param) { +template +SaberStatus SaberEmbedding::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam& param) { - const InDataType *in_data = (const InDataType*)inputs[0]->data(); - const OpDataType *op_data = (const InDataType*)(param.weight()->data()); - OutDataType *out_data = (OutDataType*)outputs[0]->mutable_data(); + CHECK_EQ(inputs[0]->get_dtype(), AK_FLOAT) <<" Embedding only support float inputs."; + const OpDataType *op_data = (const OpDataType*)(param.weight()->data()); const int count = outputs[0]->valid_size(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - - ker_embedding_fwd - <<>>( - out_data, in_data, op_data, param.word_num, param.emb_dim, inputs[0]->num(), - param.padding_idx, outputs[0]->valid_size()); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + //outputs: chose corresponding informations of words. + //inputs: word_id [Its type maybe float or int] + //outputs = weights[inputs[j]]. 
+ ker_embedding_fwd + <<>>( + (OpDataType*)outputs[0]->mutable_data(), + (const float*)inputs[0]->data(), op_data, + param.word_num, param.emb_dim, inputs[0]->num(), + param.padding_idx, outputs[0]->valid_size()); + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); CUDA_POST_KERNEL_CHECK; return SaberSuccess; } +template class SaberEmbedding; +template class SaberEmbedding; +DEFINE_OP_TEMPLATE(SaberEmbedding, EmbeddingParam, NV, AK_HALF); } -} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu index cef5f8fc3..37e83c89f 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu @@ -13,41 +13,40 @@ __global__ void add_bias(int n, int output_size, const dtype* bias, dtype* dout) } } -template -SaberStatus SaberFc::dispatch( - const std::vector& inputs, - std::vector& outputs, - FcParam& param) { - - cudaStream_t stream = this->_ctx.get_compute_stream(); - - const InDataType* din = inputs[0]->data(); - OutDataType* dout = outputs[0]->mutable_data(); - const OpDataType* weight = param.weights->data(); - const InDataType* bias = nullptr; +template +SaberStatus SaberFc::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param) { + + cudaStream_t stream = this->_ctx->get_compute_stream(); + + const OpDataType *din = (const OpDataType *)inputs[0]->data(); + OpDataType *dout = (float *)outputs[0]->mutable_data(); + const OpDataType *weight = (OpDataType *)param.weights->data(); + const OpDataType *bias = nullptr; bool bias_term = param.bias != nullptr; if (bias_term) { - bias = param.bias->data(); + bias = (const OpDataType *)param.bias->data(); } - + float alpha = 1.f; float beta = 0.f; + _kernel(_M, _N, _K, alpha, din, beta, weight, dout, stream); + if (bias_term) { int total_size = _M * _N; - add_bias<<>>\ + add_bias<<>>\ (total_size, _N, bias, dout); } return SaberSuccess; } +template class 
SaberFc; +DEFINE_OP_TEMPLATE(SaberFc, FcParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberFc, FcParam, NV, AK_INT8); } //namespace anakin -} //namespace anakin +} //namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu index 294b9cc45..657a2e5ae 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu @@ -1,804 +1,232 @@ #include "saber/funcs/impl/cuda/saber_gru.h" #include "saber/core/tensor_op.h" -#include "cuda_fp16.h" +#include "cuda_inline_activation.h" namespace anakin { namespace saber { -////TODO:can try record vector in shared -template -__global__ void trans_map2in(Dtype* output, const Dtype* input, const int* map, int count, - int lastdim) { - CUDA_KERNEL_LE(tid, count) { - int seq = tid / lastdim; - output[tid] = input[map[seq] * lastdim + tid % lastdim]; - } -} - -template -__global__ void trans_map2out(Dtype* output, const Dtype* input, const int* map, int count, - int lastdim) { - CUDA_KERNEL_LE(tid, count) { - int seq = tid / lastdim; - output[map[seq]*lastdim + tid % lastdim] = input[tid]; - } -} - -template <> -void SaberGru::seq2hw(\ - std::vector outputs, std::vector inputs, - GruParam& param, int hidden_size, - void* real_temp_out - ) { - DataTensor_in* din = inputs[0]; - DataTensor_out* dout = outputs[0]; - int wordsize = din->channel(); - std::vector offset_vec = din->get_seq_offset(); - CHECK_GE(offset_vec.size(), 2) << "offset must >=2" ; - int batch_size = offset_vec.size() - 1; - - int max_len = 0; - std::vector length_vec; - - if ((void*)(outputs[0]->data()) == real_temp_out) { - DLOG(INFO) << "not use inner space"; - return; - } - - const OutDataType* origin = _temp_tensor_out.data(); - OutDataType* target = dout->mutable_data(); - - //source is sequence id in seq target is hw id in seq,map is source to target ptr offset - int seq_sum = offset_vec[batch_size]; - 
CUDA_CHECK(cudaMemcpyAsync(_temp_map_dev.mutable_data(), _temp_map_host.data(), sizeof(int)*seq_sum, - cudaMemcpyHostToDevice, _ctx.get_compute_stream())); - int count=seq_sum * hidden_size; - int block_dim=count; - int grid_dim=1; - if(count>1024){ - block_dim=256; - grid_dim=(count+block_dim-1)/block_dim; - } - trans_map2in <<< grid_dim, block_dim, 0, _ctx.get_compute_stream()>>>(target, origin, _temp_map_dev.data(), - count, hidden_size); - -// trans_map2in_old <<< 4, 128, 0, _ctx.get_compute_stream()>>>(target, origin, _temp_map_dev.data(), -// count, hidden_size); - -} - -//TODO:gem by self, flatten by time, padding by nothing (zhangs) -template <> -const float* SaberGru::hw2seq(\ - std::vector inputs, GruParam& param, \ - int word_size, int hidden_size, int& sequence_len) { - DataTensor_in* din = inputs[0]; - - std::vector offset_vec = din->get_seq_offset(); - CHECK_GE(offset_vec.size(), 2) << "offset must >=2" ; - int batch_size = offset_vec.size() - 1; - int seq_sum = offset_vec[offset_vec.size() - 1]; - int wordsize = din->channel(); - int max_len = 0; - std::vector length_vec(batch_size); - - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - max_len = max_len > len ? 
max_len : len; - length_vec[i] = len; - } - - Shape seq_shape(1, max_len, batch_size, word_size); - _temp_tensor_in.try_expand_size(seq_shape); - - Shape seq_out_shape(1, max_len, batch_size, hidden_size); - _temp_tensor_out.try_expand_size(seq_out_shape); - - sequence_len = max_len; - - if (batch_size == 1 || max_len == 1) { - return din->mutable_data(); - } - - InDataType* target = _temp_tensor_in.mutable_data(); - const InDataType* origin = din->data(); - - _temp_map_host.try_expand_size(seq_sum); - _temp_map_dev.try_expand_size(seq_sum); - int* map = _temp_map_host.mutable_data(); - - if (param.is_reverse) { - for (int batchid = 0; batchid < batch_size; ++batchid) { - int batch_offset = max_len - length_vec[batchid]; - - for (int seqid = 0; seqid < length_vec[batchid]; ++seqid) { - int source = (offset_vec[batchid] + seqid); - int target = ((seqid + batch_offset) * batch_size + batchid); - map[source] = target; - } - } - } else { - for (int batchid = 0; batchid < batch_size; ++batchid) { - for (int seqid = 0; seqid < length_vec[batchid]; ++seqid) { - int source = (offset_vec[batchid] + seqid); - int target = (seqid * batch_size + batchid); - map[source] = target; - } - } - } - - CUDA_CHECK(cudaMemcpyAsync(_temp_map_dev.mutable_data(), _temp_map_host.data(), sizeof(int)*seq_sum, - cudaMemcpyHostToDevice, _ctx.get_compute_stream())); - int count=seq_sum * wordsize; - int block_dim=count; - int grid_dim=1; - if(count>1024){ - block_dim=256; - grid_dim=(count+block_dim-1)/block_dim; - } - trans_map2out <<< grid_dim, block_dim, 0, _ctx.get_compute_stream()>>>(target, origin, _temp_map_dev.data(), - count, wordsize); - -// trans_map2out_old <<< 4, 128, 0, _ctx.get_compute_stream()>>>(target, origin, _temp_map_dev.data(), -// count, wordsize); - - - return _temp_tensor_in.data(); -} - -#define SIGMOID_THRESHOLD_MIN_PADDLE -40.0 -#define SIGMOID_THRESHOLD_MAX_PADDLE 13.0 -#define EXP_MAX_INPUT_PADDLE 40.0 - -template -inline static __device__ T identity(const T a) { - 
return a; -} - -template -inline static __device__ T relu(const T a) { - return a > static_cast(0.0) ? a : static_cast(0.0); -} - - -template -inline static __device__ T sigmoid_paddle(const T a) { - const T min = SIGMOID_THRESHOLD_MIN_PADDLE; - const T max = SIGMOID_THRESHOLD_MAX_PADDLE; - T tmp = (a < min) ? min : ((a > max) ? max : a); - return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); -} - -template -inline static __device__ T tanh_paddle(const T a) { - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT_PADDLE) ? EXP_MAX_INPUT_PADDLE : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - -static void anakin_NV_gemm(cublasHandle_t handle, const bool TransA, - const bool TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - // Note that cublas follows fortran order. - int lda = (!TransA/* == CblasNoTrans*/) ? K : M; - int ldb = (!TransB/* == CblasNoTrans*/) ? N : K; - cublasOperation_t cuTransA = - (!TransA/* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (!TransB/* == CblasNoTrans*/) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(handle, cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -/** - * gridDim=batchsize - * @tparam Dtype - * @param w_x_r - * @param w_h_r - * @param br - * @param hidden_size - * @param output_r - * @param w_x_z - * @param w_h_z - * @param bz - * @param output_z - */ -template -__global__ void cal_reset_update(Dtype* w_x_r, Dtype* w_h_r, const Dtype* b_r, - const int hidden_size, Dtype* output_r, - Dtype* w_x_z, Dtype* w_h_z, const Dtype* b_z, Dtype* output_z) { - int w_base_index = blockIdx.x * hidden_size * 3; - int h_base_index = blockIdx.x * hidden_size; - Dtype* in_w_x_r = w_x_r + w_base_index; - Dtype* in_w_h_r = w_h_r + w_base_index; - Dtype* in_w_x_z = w_x_z + w_base_index; - Dtype* in_w_h_z = w_h_z + w_base_index; - Dtype* out_output_r = output_r + h_base_index; - Dtype* out_output_z = output_z + h_base_index; - - for (int index = threadIdx.x; index < hidden_size; index += blockDim.x) { - Dtype before_act_r = in_w_x_r[index] + in_w_h_r[index] + b_r[index]; - out_output_r[index] = Dtype(Dtype(1) / (Dtype(1) + expf(-before_act_r))); - Dtype before_act_z = in_w_x_z[index] + in_w_h_z[index] + b_z[index]; - out_output_z[index] = Dtype(Dtype(1) / (Dtype(1) + expf(-before_act_z))); - - } -} - -template -__global__ void cal_final(Dtype* w_x_o, Dtype* w_h_o, Dtype* reset, const Dtype* b_o, - const int hidden_size, Dtype* update, Dtype* output, Dtype* hidden_pre) { - int w_base_index = blockIdx.x * hidden_size * 3; - int h_base_index = blockIdx.x * hidden_size; - - Dtype* in_w_x_o = w_x_o + w_base_index; - Dtype* in_w_h_o = w_h_o + w_base_index; - Dtype* in_hidden_pre = hidden_pre + h_base_index; - Dtype* in_update = update + h_base_index; - Dtype* in_reset = reset + h_base_index; - Dtype* out_output = output + h_base_index; - - for (int index = threadIdx.x; index < hidden_size; index += blockDim.x) { - Dtype before_act_h = in_w_x_o[index] + in_w_h_o[index] * in_reset[index] - + 
b_o[index]; - Dtype acted = tanhf(before_act_h); - Dtype update_t = in_update[index]; - out_output[index] = (1 - update_t) * acted + update_t* in_hidden_pre[index]; - } -} - -template -__global__ void cal_one_kernel_paddlesigmoid_tanh_cudnn_formula(Dtype* w_x_r, Dtype* w_x_z, - Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, Dtype* w_h_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre) { - int w_base_index = blockIdx.x * hidden_size * 3; - int h_base_index = blockIdx.x * hidden_size; - Dtype* in_w_x_r = w_x_r + w_base_index; - Dtype* in_w_h_r = w_h_r + w_base_index; - Dtype* in_w_x_z = w_x_z + w_base_index; - Dtype* in_w_h_z = w_h_z + w_base_index; - Dtype* in_w_x_o = w_x_o + w_base_index; - Dtype* in_w_h_o = w_h_o + w_base_index; - const Dtype* in_hidden_pre = hidden_pre + h_base_index; - Dtype* out_output = output + h_base_index; - - for (int index = threadIdx.x; index < hidden_size; index += blockDim.x) { - const Dtype min = SIGMOID_THRESHOLD_MIN_PADDLE; - const Dtype max = SIGMOID_THRESHOLD_MAX_PADDLE; - - Dtype before_act_r = in_w_x_r[index] + in_w_h_r[index] + b_r[index]; - before_act_r = (before_act_r < min) ? min : ((before_act_r > max) ? max : before_act_r); - Dtype act_r = static_cast(1.0) / (static_cast(1.0) + exp(-before_act_r)); - - Dtype before_act_z = in_w_x_z[index] + in_w_h_z[index] + b_z[index]; - before_act_z = (before_act_z < min) ? min : ((before_act_z > max) ? max : before_act_z); - Dtype act_z = static_cast(1.0) / (static_cast(1.0) + exp(-before_act_z)); - - Dtype before_act_h = in_w_x_o[index] + in_w_h_o[index] * act_r - + b_o[index]; - before_act_h = (before_act_h > EXP_MAX_INPUT_PADDLE) ? 
EXP_MAX_INPUT_PADDLE : before_act_h; - Dtype acted = tanhf(before_act_h); - out_output[index] = (1 - act_z) * acted + act_z * in_hidden_pre[index]; - } -} - -template -__global__ void cal_one_kernel_sigmoid_tanh_modi_cudnn_formula(Dtype* w_x_r, Dtype* w_x_z, - Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, Dtype* w_h_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre) { - - int w_base_index = blockIdx.x * hidden_size * 3 + threadIdx.x; - int h_base_index = blockIdx.x * hidden_size + threadIdx.x; - - for (int index = threadIdx.x; index < hidden_size; - index += blockDim.x, w_base_index += blockDim.x, h_base_index += blockDim.x) { - Dtype before_act_r = w_x_r[w_base_index] + w_h_r[w_base_index] + b_r[index]; - Dtype act_r = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_r)); - Dtype before_act_z = w_x_z[w_base_index] + w_h_z[w_base_index] + b_z[index]; - Dtype act_z = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_z)); - Dtype before_act_h = w_x_o[w_base_index] + w_h_o[w_base_index] * act_r - + b_o[index]; - Dtype acted = tanh(before_act_h); - output[h_base_index] = (static_cast(1.0) - act_z) * acted + act_z * hidden_pre[h_base_index]; - } -} - -template -__global__ void cal_one_kernel_paddlesigmoid_relu_paddle_formula(Dtype* w_x_r, Dtype* w_x_z, - Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, const Dtype* w_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre) { - int index = threadIdx.x; - - if (index > hidden_size) { - return; - } - - int w_base_index = blockIdx.x * hidden_size * 3 + index; - int u_base_index = blockIdx.x * hidden_size * 2 + index; - int h_base_index = blockIdx.x * hidden_size + index; - extern __shared__ Dtype shared_hidden_pre[]; - Dtype hidden_pre_value = hidden_pre[h_base_index]; - Dtype before_act_r = w_x_r[w_base_index] + w_h_r[u_base_index] + b_r[index]; - Dtype act_r = sigmoid_paddle(before_act_r); 
- shared_hidden_pre[index] = hidden_pre_value * act_r; - Dtype before_act_z = w_x_z[w_base_index] + w_h_z[u_base_index] + b_z[index]; - Dtype act_z = sigmoid_paddle(before_act_z); - Dtype w_h_o = static_cast(0.0); - int k_index = index; - __syncthreads(); - - for (int w_index = 0; w_index < hidden_size; ++w_index) { - w_h_o += shared_hidden_pre[w_index] * w_o[k_index]; - k_index += hidden_size; - } - - Dtype before_act_h = w_x_o[w_base_index] + w_h_o - + b_o[index]; - Dtype acted = relu(before_act_h); - output[h_base_index] = (static_cast(1.0) - act_z) * hidden_pre_value + act_z * acted; +static int round_up(int k, int c) { + return ((k + c - 1) / c) * c; } template -__global__ void cal_one_kernel_sigmoid_tanh_paddle_formula(Dtype* w_x_r, Dtype* w_x_z, Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, const Dtype* w_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre) { - int index = threadIdx.x; - - if (index > hidden_size) { - return; +__global__ void cal_reset_kernel(Dtype* w_x_r,Dtype* w_h_r,const Dtype* b_r,int hidden_size,int batch_size, Dtype* output, + const Dtype* hidden_pre,const ActiveType gate_activity) { + const int thread_id = blockIdx.x*blockDim.x+threadIdx.x; + const int batch_id = thread_id/hidden_size; + const int index=thread_id%hidden_size; + if (index < hidden_size&&batch_id(1.0) / (static_cast(1.0) + expf(-before_act_r)); -// printf("%d %f=[%f , %f ,%f]\n",index,act_r,w_x_r[w_base_index],w_h_r[u_base_index],b_r[index]); - shared_hidden_pre[index] = hidden_pre_value * act_r; - Dtype before_act_z = w_x_z[w_base_index] + w_h_z[u_base_index] + b_z[index]; - Dtype act_z = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_z)); - Dtype w_h_o = static_cast(0.0); - int k_index = index; - __syncthreads(); - - for (int w_index = 0; w_index < hidden_size; ++w_index) { - w_h_o += shared_hidden_pre[w_index] * w_o[k_index]; - k_index += hidden_size; - } - - Dtype before_act_h = 
w_x_o[w_base_index] + w_h_o - + b_o[index]; - Dtype acted = tanhf(before_act_h); - output[h_base_index] = (static_cast(1.0) - act_z) * hidden_pre_value + act_z * acted; -// printf("output %d = %f\n",index,output[h_base_index]); -} +}; template -__global__ void cal_one_kernel_sigmoid_tanh(Dtype* w_x_r, Dtype* w_x_z, Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, Dtype* w_h_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre) { - int w_base_index = blockIdx.x * hidden_size * 3; - int h_base_index = blockIdx.x * hidden_size; - Dtype* in_w_x_r = w_x_r + w_base_index; - Dtype* in_w_h_r = w_h_r + w_base_index; - Dtype* in_w_x_z = w_x_z + w_base_index; - Dtype* in_w_h_z = w_h_z + w_base_index; - Dtype* in_w_x_o = w_x_o + w_base_index; - Dtype* in_w_h_o = w_h_o + w_base_index; - const Dtype* in_hidden_pre = hidden_pre + h_base_index; - Dtype* out_output = output + h_base_index; - - for (int index = threadIdx.x; index < hidden_size; index += blockDim.x) { - Dtype before_act_r = in_w_x_r[index] + in_w_h_r[index] + b_r[index]; - Dtype act_r = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_r)); - Dtype before_act_z = in_w_x_z[index] + in_w_h_z[index] + b_z[index]; - Dtype act_z = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_z)); - Dtype before_act_h = in_w_x_o[index] + in_w_h_o[index] * act_r +__global__ void cal_final_kernel( Dtype* w_x_z, Dtype* w_x_o,Dtype* w_h_z,const Dtype* b_z, const Dtype* b_o, + int hidden_size, int batch_size,Dtype* output, const Dtype* hidden_pre,const Dtype* w_h_o, + const ActiveType gate_activity,const ActiveType h_activity) { + const int thread_id = blockIdx.x*blockDim.x+threadIdx.x; + const int batch_id = thread_id/hidden_size; + const int index=thread_id%hidden_size; + if (index < hidden_size&&batch_id(1.0) - act_z) * acted + act_z * in_hidden_pre[index]; - } -} + Dtype acted = activate_cuda_float(before_act_h, h_activity); -template -__global__ void 
cal_one_kernel_sigmoid_tanh_index_modi(Dtype* w_x_r, Dtype* w_x_z, Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, Dtype* w_h_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre, - int seq_batch_hidden, int batch_size) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid >= seq_batch_hidden) { - return; - } - - int batch_id = tid / hidden_size % batch_size; - int index = tid % hidden_size; - int w_base_index = batch_id * hidden_size * 3; - int h_base_index = batch_id * hidden_size; - int index_w = index + w_base_index; - int index_h = index + h_base_index; - - { - Dtype before_act_r = w_x_r[index_w] + w_h_r[index_w] + b_r[index]; - Dtype act_r = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_r)); - Dtype before_act_z = w_x_z[index_w] + w_h_z[index_w] + b_z[index]; - Dtype act_z = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_z)); - Dtype before_act_h = w_x_o[index_w] + w_h_o[index_w] * act_r - + b_o[index]; - Dtype acted = tanhf(before_act_h); - output[index_h] = (static_cast(1.0) - act_z) * acted + act_z * hidden_pre[index_h]; + output[h_base_index] = (static_cast(1.0) - act_z) * hidden_pre_value + act_z * acted; } } template -__global__ void cal_one_kernel_sigmoid_tanh_index(Dtype* w_x_r, Dtype* w_x_z, Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, Dtype* w_h_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre, - int seq_batch_hidden, int batch_size) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid >= seq_batch_hidden) { - return; - } +__global__ void cal_cudnn_kernel( const Dtype* w_x_r,const Dtype* w_x_z, const Dtype* w_x_o, + const Dtype* w_h_r,const Dtype* w_h_z,const Dtype* w_h_o, + const Dtype* b_r,const Dtype* b_z, const Dtype* b_o, + int hidden_size, int batch_size,Dtype* output, const Dtype* hidden_pre) { + const int thread_id = blockIdx.x*blockDim.x+threadIdx.x; + const int batch_id = 
thread_id/hidden_size; + const int index=thread_id%hidden_size; + if (index < hidden_size&&batch_id(1.0) - z) * _h + z * hidden_pre_value; } } -template -__global__ void cal_one_kernel_paddlesigmoid_relu_cudnn_formula(Dtype* w_x_r, Dtype* w_x_z, - Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, Dtype* w_h_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre) { - int w_base_index = blockIdx.x * hidden_size * 3; - int h_base_index = blockIdx.x * hidden_size; - Dtype* in_w_x_r = w_x_r + w_base_index; - Dtype* in_w_h_r = w_h_r + w_base_index; - Dtype* in_w_x_z = w_x_z + w_base_index; - Dtype* in_w_h_z = w_h_z + w_base_index; - Dtype* in_w_x_o = w_x_o + w_base_index; - Dtype* in_w_h_o = w_h_o + w_base_index; - const Dtype* in_hidden_pre = hidden_pre + h_base_index; - Dtype* out_output = output + h_base_index; - - for (int index = threadIdx.x; index < hidden_size; index += blockDim.x) { - const Dtype min = SIGMOID_THRESHOLD_MIN_PADDLE; - const Dtype max = SIGMOID_THRESHOLD_MAX_PADDLE; - - Dtype before_act_r = in_w_x_r[index] + in_w_h_r[index] + b_r[index]; - before_act_r = (before_act_r < min) ? min : ((before_act_r > max) ? max : before_act_r); - Dtype act_r = static_cast(1.0) / (static_cast(1.0) + exp(-before_act_r)); - - Dtype before_act_z = in_w_x_z[index] + in_w_h_z[index] + b_z[index]; - before_act_z = (before_act_z < min) ? min : ((before_act_z > max) ? max : before_act_z); - Dtype act_z = static_cast(1.0) / (static_cast(1.0) + exp(-before_act_z)); - - Dtype before_act_h = in_w_x_o[index] + in_w_h_o[index] * act_r - + b_o[index]; - Dtype acted = before_act_h > static_cast(0.0) ? 
before_act_h : static_cast(0.0); - out_output[index] = (1 - act_z) * acted + act_z * in_hidden_pre[index]; - } -} - -template -__global__ void cal_one_kernel_sigmoid_tanh_modi(Dtype* w_x_r, Dtype* w_x_z, Dtype* w_x_o, - Dtype* w_h_r, Dtype* w_h_z, Dtype* w_h_o, - const Dtype* b_r, const Dtype* b_z, const Dtype* b_o, - int hidden_size, Dtype* output, const Dtype* hidden_pre) { - - int w_base_index = blockIdx.x * hidden_size * 3 + threadIdx.x; - int h_base_index = blockIdx.x * hidden_size + threadIdx.x; - - for (int index = threadIdx.x; index < hidden_size; - index += blockDim.x, w_base_index += blockDim.x, h_base_index += blockDim.x) { - Dtype before_act_r = w_x_r[w_base_index] + w_h_r[w_base_index] + b_r[index]; - - Dtype act_r = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_r)); - Dtype before_act_z = w_x_z[w_base_index] + w_h_z[w_base_index] + b_z[index]; - Dtype act_z = static_cast(1.0) / (static_cast(1.0) + expf(-before_act_z)); - Dtype before_act_h = w_x_o[w_base_index] + w_h_o[w_base_index] * act_r - + b_o[index]; - Dtype acted = tanhf(before_act_h); - output[h_base_index] = (static_cast(1.0) - act_z) * acted + act_z * hidden_pre[h_base_index]; - } -} - -template <> -SaberStatus SaberGru::gru_cudnn( - const std::vector inputs, - std::vector outputs, - GruParam& param) { - - DataTensor_in* x = inputs[0]; - const InDataType* x_data = x->data(); - std::vector offset=x->get_seq_offset(); - const InDataType* h; - DataTensor_out* dout = outputs[0]; - OutDataType* dout_data = dout->mutable_data(); - - //TODO:check shape first - const OpTensor* b = param.bias(); - - int batch_size = offset.size() - 1;; //x->get_seq_offset().size()-1; - int sequence = x->num(); - int hidden_size = b->valid_size() / 3; - bool isHW2Seq=offset.size()>2; +template<> +SaberStatus SaberGru::dispatch(\ + const std::vector& inputs, + std::vector& outputs, + GruParam & param) { +// CHECK_GE(param.formula,GRU_ORIGIN)<<"ONLY SUPPORT GRU_ORIGIN NOW"; + OpTensor* x = inputs[0]; + 
std::vector> offset_vec_vec = x->get_seq_offset(); + std::vector offset = offset_vec_vec[offset_vec_vec.size()-1]; + + const OpDataType* x_data = static_cast(x->data()); + const OpDataType* h; + OpTensor* dout = outputs[0]; + OpDataType* dout_data = static_cast(dout->mutable_data()); + + const OpDataType* weights_i2h=static_cast(param.weight()->data()); + const OpDataType* weights_h2h=weights_i2h+3*_hidden_size*_word_size; + const OpDataType* weights_bias=static_cast(param.bias()->data()); + + int batch_size = offset.size() - 1; + int seq_sum = x->num(); + bool is_batched = offset.size() > 2; int o_offset = 0; int r_offset = 1; int z_offset = 2; -// CHECK_EQ(w_h2h->height(), hidden_size) << "w_h2h->height()==batch_size"; -// CHECK_EQ(w_h2h->width(), hidden_size * 3) << "w_h2h->width()==hidden_size*3"; -// -// CHECK_EQ(w_i2h->height(), word_size) << "w_i2h->height()==word_size"; -// CHECK_EQ(w_i2h->width(), hidden_size * 3) << "w_i2h->width()==hidden_size*3"; + std::vector emit_offset_vec; + int emit_length = 0; + utils::try_expand_tensor(_temp_map_dev,seq_sum); + is_batched = _seq_util.get_sorted_map(offset, emit_offset_vec, emit_length, + _ctx->get_compute_stream()); - if (isHW2Seq) { - x_data = hw2seq(inputs, param, _word_size, hidden_size, sequence); - batch_size = offset.size() - 1; - - if (x_data != x->data()) { - dout_data = _temp_tensor_out.mutable_data(); - } + if (is_batched) { + Shape seq_shape({1, 1, seq_sum, _word_size}); + utils::try_expand_tensor(_temp_tensor_in,seq_shape); + Shape seq_out_shape({1, 1, seq_sum, _hidden_size}); + utils::try_expand_tensor(_temp_tensor_out,seq_out_shape); + _seq_util.seq_2_sorted_seq(x_data, static_cast(_temp_tensor_in.mutable_data()), _word_size, + _ctx->get_compute_stream()); + x_data = static_cast(_temp_tensor_in.data()); + dout_data = static_cast(_temp_tensor_out.mutable_data()); } - Shape shape_wx(sequence, batch_size, 3, hidden_size); - _temp_WX.try_expand_size(shape_wx); + Shape shape_wx({seq_sum, 1, 3, 
_hidden_size}); + utils::try_expand_tensor(_temp_wx,shape_wx); - Shape shape_wh(1, batch_size, 3, hidden_size); - _temp_WH.try_expand_size(shape_wh); + Shape shape_wh({1, batch_size, 2, _hidden_size}); + utils::try_expand_tensor(_temp_wh,shape_wh); - anakin_NV_gemm(_cublas_handle, false, false, sequence * batch_size, 3 * hidden_size, - _word_size, 1.0, x_data, _weights_i2h.data(), 0.0, _temp_WX.mutable_data()); + Shape shape_whr({1, batch_size, 1, _hidden_size}); + utils::try_expand_tensor(_temp_whr,shape_whr); + _gemm_wx(seq_sum, 3 * _hidden_size, _word_size, 1.f, x_data, 0.f, weights_i2h, + static_cast(_temp_wx.mutable_data()), _ctx->get_compute_stream()); - const OpDataType* b_r = b->data() + r_offset * hidden_size; - const OpDataType* b_z = b->data() + z_offset * hidden_size; - const OpDataType* b_o = b->data() + o_offset * hidden_size; + const OpDataType* b_r = weights_bias + r_offset * _hidden_size; + const OpDataType* b_z = weights_bias + z_offset * _hidden_size; + const OpDataType* b_o = weights_bias + o_offset * _hidden_size; if (inputs.size() == 1) { - CUDA_CHECK(cudaMemsetAsync(dout_data, 0, sizeof(InDataType) * batch_size * hidden_size, - _ctx.get_compute_stream())); - h = dout_data; - } else { - h = inputs[1]->data(); - CHECK_EQ(inputs[1]->valid_size(), batch_size * hidden_size) << - "h size should be batch_size * hidden_size"; - } - - for (int seq = 0; seq < sequence; seq++) { - const InDataType* hidden_in; - InDataType* hidden_out = dout_data + seq * batch_size * hidden_size; - - if (seq == 0) { - hidden_in = h; - } else { - hidden_in = dout_data + (seq - 1) * batch_size * hidden_size; - } - - anakin_NV_gemm(_cublas_handle, false, false, batch_size, - 3 * hidden_size, hidden_size, 1.0, hidden_in, - _weights_h2h.data(), 0.0, _temp_WH.mutable_data()); - - OpDataType* w_x_r = _temp_WX.mutable_data() + r_offset * hidden_size - + seq * batch_size * hidden_size * 3; - OpDataType* w_x_z = _temp_WX.mutable_data() + z_offset * hidden_size - + seq * batch_size 
* hidden_size * 3; - OpDataType* w_x_o = _temp_WX.mutable_data() + o_offset * hidden_size - + seq * batch_size * hidden_size * 3; - - OpDataType* w_h_r = _temp_WH.mutable_data() + r_offset * hidden_size; - OpDataType* w_h_z = _temp_WH.mutable_data() + z_offset * hidden_size; - OpDataType* w_h_o = _temp_WH.mutable_data() + o_offset * hidden_size; - - int frame_per_block = hidden_size <= 1024 ? hidden_size : 1024; - - if (param.gate_activity == Active_sigmoid - && param.h_activity == Active_tanh) { - cal_one_kernel_sigmoid_tanh_modi_cudnn_formula - << < batch_size, frame_per_block, 0, _ctx.get_compute_stream() >> > - (w_x_r, w_x_z, w_x_o, w_h_r, w_h_z, w_h_o - , b_r, b_z, b_o, hidden_size, hidden_out, hidden_in); - } else if (param.gate_activity == Active_sigmoid_fluid - && param.h_activity == Active_tanh) { - cal_one_kernel_paddlesigmoid_tanh_cudnn_formula - << < batch_size, frame_per_block, 0, _ctx.get_compute_stream() >> > - (w_x_r, w_x_z, w_x_o, w_h_r, w_h_z, w_h_o - , b_r, b_z, b_o, hidden_size, hidden_out, hidden_in); - } else if (param.gate_activity == Active_sigmoid_fluid - && param.h_activity == Active_relu) { - cal_one_kernel_paddlesigmoid_relu_cudnn_formula - << < batch_size, frame_per_block, 0, _ctx.get_compute_stream() >> > - (w_x_r, w_x_z, w_x_o, w_h_r, w_h_z, w_h_o - , b_r, b_z, b_o, hidden_size, hidden_out, hidden_in); - } else { - LOG(ERROR) << "not support active function"; + if (_temp_zero.valid_size() < batch_size * _hidden_size) { + utils::try_expand_tensor(_temp_zero,batch_size * _hidden_size); + CUDA_CHECK(cudaMemsetAsync(_temp_zero.mutable_data(), 0, + sizeof(OpDataType)*batch_size * _hidden_size, + _ctx->get_compute_stream())); } - - } - - if (isHW2Seq) { - seq2hw(outputs, inputs, param, hidden_size, dout_data); - outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); - } - return SaberSuccess; - -} -template<> -SaberStatus SaberGru::dispatch(\ - const std::vector& inputs, - std::vector& outputs, - GruParam & param) { - if (param.formula == 
GRU_CUDNN) { - LOG(ERROR) << "saber cudnn formula not support reverse yet"; - if (param.is_reverse) { - LOG(ERROR) << "saber cudnn formula not support reverse yet"; - - } - return gru_cudnn(inputs, outputs, param); + h = static_cast(_temp_zero.data()); + } else { + h = static_cast(inputs[1]->data()); } - // LOG(INFO)<<"gru_paddle"; - DataTensor_in* x = inputs[0]; - std::vector offset=x->get_seq_offset(); - const InDataType* x_data = x->data(); - const InDataType* h; - DataTensor_out* dout = outputs[0]; - OutDataType* dout_data = dout->mutable_data(); - - //TODO:check shape first - const OpTensor* b = param.bias(); - - int batch_size = offset.size() - 1; //x->get_seq_offset().size()-1; - int sequence = x->num(); - int hidden_size = b->valid_size() / 3; - bool isHW2Seq=offset.size()>2; - int o_offset = 0; - int r_offset = 1; - int z_offset = 2; - -// CHECK_EQ(w_h2h->height(), hidden_size) << "w_h2h->height()==batch_size"; -// CHECK_EQ(w_h2h->width(), hidden_size * 3) << "w_h2h->width()==hidden_size*3"; -// -// CHECK_EQ(w_i2h->height(), word_size) << "w_i2h->height()==word_size"; -// CHECK_EQ(w_i2h->width(), hidden_size * 3) << "w_i2h->width()==hidden_size*3"; - if (isHW2Seq) { - x_data = hw2seq(inputs, param, _word_size, hidden_size, sequence); -// batch_size = inputs[0]->get_seq_offset().size() - 1; + for (int word_id = 0; word_id < emit_length; word_id++) { + int real_word_id = word_id; + int last_word_id = word_id - 1; - if (x_data != x->data()) { - dout_data = _temp_tensor_out.mutable_data(); + if (param.is_reverse && batch_size == 1) { + real_word_id = emit_length - word_id - 1; + last_word_id = real_word_id + 1; } - } - - Shape shape_WX(sequence, batch_size, 3, hidden_size); - _temp_WX.try_expand_size(shape_WX); - - Shape shape_WH(1, batch_size, 2, hidden_size); - _temp_WH.try_expand_size(shape_WH); - - anakin_NV_gemm(_cublas_handle, false, false, sequence * batch_size, 3 * hidden_size, - _word_size, 1.0, x_data, _weights_i2h.data(), 0.0, 
_temp_WX.mutable_data()); - - const OpDataType* b_r = b->data() + r_offset * hidden_size; - const OpDataType* b_z = b->data() + z_offset * hidden_size; - const OpDataType* b_o = b->data() + o_offset * hidden_size; - - if (inputs.size() == 1) { - CUDA_CHECK(cudaMemsetAsync(dout_data, 0, sizeof(OutDataType)*batch_size * hidden_size, - _ctx.get_compute_stream())); - h = dout_data; - } else { - h = inputs[1]->data(); - } - for (int seq = 0; seq < sequence; ++seq) { - int realseq = seq; - int last_seq = realseq - 1; + int emit_word_id_start = emit_offset_vec[real_word_id]; + int emit_word_id_end = emit_offset_vec[real_word_id + 1]; + int emit_word_length = emit_word_id_end - emit_word_id_start; - if (param.is_reverse) { -// DLOG(INFO)<<"reverse gru"; - realseq = sequence - 1 - seq; - last_seq = realseq + 1; - } + const OpDataType* hidden_in; + OpDataType* hidden_out = dout_data + emit_offset_vec[real_word_id] * _hidden_size; - const OutDataType* hidden_in; - OutDataType* hidden_out = dout_data + realseq * batch_size * hidden_size; - - if (seq == 0) { + if (word_id == 0) { hidden_in = h; } else { - hidden_in = dout_data + last_seq * batch_size * hidden_size; + hidden_in = dout_data + emit_offset_vec[last_word_id] * _hidden_size; } - - anakin_NV_gemm(_cublas_handle, false, false, batch_size, - 2 * hidden_size, hidden_size, 1.0, hidden_in, - _weights_h2h.data() + hidden_size * hidden_size, 0.0, _temp_WH.mutable_data()); - - - OutDataType* w_x_r = _temp_WX.mutable_data() + r_offset * hidden_size - + realseq * batch_size * hidden_size * 3; - OutDataType* w_x_z = _temp_WX.mutable_data() + z_offset * hidden_size - + realseq * batch_size * hidden_size * 3; - OutDataType* w_x_o = _temp_WX.mutable_data() + o_offset * hidden_size - + realseq * batch_size * hidden_size * 3; - - OutDataType* w_h_r = _temp_WH.mutable_data() + 0 * hidden_size; - OutDataType* w_h_z = _temp_WH.mutable_data() + 1 * hidden_size; - const OpDataType * w_o = _weights_h2h.data(); - - CHECK_LE(hidden_size, 
1024) << "now not support hidden size > 1024 for paddle formula"; - - int frame_per_block = hidden_size <= 1024 ? hidden_size : 1024; - - // DLOG(INFO) << "act = " << param._gate_activity << "," << param._h_activity; - - if (param.gate_activity == Active_sigmoid - && param.h_activity == Active_tanh) { - cal_one_kernel_sigmoid_tanh_paddle_formula - <<< batch_size, frame_per_block, sizeof(OutDataType)*hidden_size - , _ctx.get_compute_stream()>>>( - w_x_r, w_x_z, w_x_o, w_h_r, w_h_z, w_o - , b_r, b_z, b_o, hidden_size, hidden_out, hidden_in); - - } else if (param.gate_activity == Active_sigmoid_fluid - && param.h_activity == Active_relu) { - cal_one_kernel_paddlesigmoid_relu_paddle_formula - << < batch_size, frame_per_block, sizeof(OutDataType)*hidden_size - , _ctx.get_compute_stream() >> > - (w_x_r, w_x_z, w_x_o, w_h_r, w_h_z, w_o - , b_r, b_z, b_o, hidden_size, hidden_out, hidden_in); - - } else { - LOG(ERROR) << "not support active function"; + OpDataType* w_x_r = static_cast(_temp_wx.mutable_data()) + r_offset * _hidden_size + + emit_word_id_start * _hidden_size * 3; + OpDataType* w_x_z = static_cast(_temp_wx.mutable_data()) + z_offset * _hidden_size + + emit_word_id_start * _hidden_size * 3; + OpDataType* w_x_o = static_cast(_temp_wx.mutable_data()) + o_offset * _hidden_size + + emit_word_id_start * _hidden_size * 3; + + if(param.formula==GRU_ORIGIN) { + OpDataType* w_h_r = static_cast(_temp_wh.mutable_data()) + 0 * _hidden_size; + OpDataType* w_h_z = static_cast(_temp_wh.mutable_data()) + 1 * _hidden_size; + + _gemm_wh_2(emit_word_length, 2 * _hidden_size, _hidden_size, 1.f, hidden_in, 0.f, + weights_h2h + _hidden_size * _hidden_size, static_cast( _temp_wh.mutable_data()), + _ctx->get_compute_stream()); + + const OpDataType *w_o = weights_h2h; + + const int block_dim = 512; + const int grid_dim = round_up(emit_word_length * _hidden_size, block_dim); + + cal_reset_kernel << < grid_dim, block_dim, 0 + , _ctx->get_compute_stream() >> > ( + w_x_r, w_h_r + , b_r, 
_hidden_size, emit_word_length, hidden_out, hidden_in, param.gate_activity); + + _gemm_wh_o(emit_word_length, _hidden_size, _hidden_size, 1.f, hidden_out, 0.f, w_o, + static_cast(_temp_whr.mutable_data()), _ctx->get_compute_stream()); + + cal_final_kernel << < grid_dim, block_dim, 0 + , _ctx->get_compute_stream() >> > ( + w_x_z, w_x_o, w_h_z, b_z, b_o, _hidden_size, emit_word_length, hidden_out, hidden_in, static_cast(_temp_whr.data()), + param.gate_activity, param.h_activity); + } else{ + OpDataType* w_h_r = static_cast(_temp_wh.mutable_data()) + r_offset * _hidden_size; + OpDataType* w_h_z = static_cast(_temp_wh.mutable_data()) + z_offset * _hidden_size; + OpDataType* w_h_o = static_cast(_temp_wh.mutable_data()) + o_offset * _hidden_size; + + _gemm_wh_2(emit_word_length, 3 * _hidden_size, _hidden_size, 1.f, hidden_in, 0.f, + static_cast(_temp_weights_h2h.data()), static_cast( _temp_wh.mutable_data()), + _ctx->get_compute_stream()); + + const OpDataType *w_o = weights_h2h; + + const int block_dim = 512; + const int grid_dim = round_up(emit_word_length * _hidden_size, block_dim); + cal_cudnn_kernel<< < grid_dim, block_dim, 0 + , _ctx->get_compute_stream() >> >( w_x_r, w_x_z, w_x_o, + w_h_r, w_h_z, w_h_o,b_r, b_z, b_o,_hidden_size, emit_word_length, hidden_out, hidden_in); } + } - if (isHW2Seq) { - seq2hw(outputs, inputs, param, hidden_size, dout_data); + if (is_batched) { + _seq_util.sorted_seq_2_seq(static_cast(_temp_tensor_out.data()), static_cast(dout->mutable_data()), _hidden_size, + _ctx->get_compute_stream()); } + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); return SaberSuccess; } - +template class SaberGru; +DEFINE_OP_TEMPLATE(SaberGru, GruParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberGru, GruParam, NV, AK_INT8); } } diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_im2sequence.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_im2sequence.cu index 9c7e63f5a..dc101f1e2 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_im2sequence.cu +++ 
b/saber/funcs/impl/cuda/base/cuda_c/saber_im2sequence.cu @@ -135,21 +135,15 @@ __global__ void ker_im2sequence_fwd_shared(Dtype * out_data, \ } -template -SaberStatus SaberIm2Sequence::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - Im2SequenceParam& param) { +template +SaberStatus SaberIm2Sequence::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + Im2SequenceParam& param) { - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = outputs[0]->valid_size(); int out_n = outputs[0]->num(); int out_c = outputs[0]->channel(); @@ -158,14 +152,18 @@ SaberStatus SaberIm2Sequenceheight(); int in_w = inputs[0]->width(); int num_threads = out_n * c; - std::vector seq_offset; + std::vectoroffset(n+1); + std::vector> seq_offset; + seq_offset.push_back(offset); int per_seq_len = out_n / n; for (int i = 0; i < n; i++) { - seq_offset.push_back(i * per_seq_len); + seq_offset[0].push_back(i * per_seq_len); } - seq_offset.push_back(n * per_seq_len); + seq_offset[0].push_back(n * per_seq_len); outputs[0]->set_seq_offset(seq_offset); - //LOG(INFO)<<"im2sequence out shape"<< outputs[0]->num()<<"c"<channel()<<"h"<height()<<"w"<width(); + +// LOG(INFO)<<"im2sequence out shape"<<" n: " \ + << outputs[0]->num()<<" c: "<channel()<<" h:"<height()<<" w:"<width(); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { if (0) { @@ -183,7 +181,7 @@ SaberStatus SaberIm2Sequence\ - <<>>(\ + <<>>(\ out_data, in_data, \ n, c, in_h, in_w,\ _output_height, _output_width,\ @@ -199,6 +197,7 @@ SaberStatus SaberIm2Sequence -SaberStatus SaberLayerNorm::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - 
LayerNormParam ¶m) { +template +SaberStatus SaberLayerNorm::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + LayerNormParam ¶m) { - cudaStream_t stream = this->_ctx.get_compute_stream(); + + cudaStream_t stream = this->_ctx->get_compute_stream(); int total_size = inputs[0]->valid_size(); - const float* src = inputs[0]->data(); - float* dst = outputs[0]->mutable_data(); - float* mean_ptr = _mean.mutable_data(); - float* std_ptr = _std.mutable_data(); + const OpDataType* src = (const OpDataType*)inputs[0]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); + OpDataType* mean_ptr = (OpDataType*)_mean.mutable_data(); + OpDataType* std_ptr = (OpDataType*)_std.mutable_data(); - const float* scale_ptr = param.scale_weights()->data(); - const float* bias_ptr = param.bias_weights()->data(); + const OpDataType* scale_ptr = (const OpDataType*)param.scale_weights()->data(); + const OpDataType* bias_ptr = (const OpDataType*)param.bias_weights()->data(); - const size_t share_mem_size = CUDA_NUM_THREADS * sizeof(float); + const size_t share_mem_size = CUDA_NUM_THREADS * sizeof(OpDataType); //! get mean - reduce_mean\ + reduce_mean\ <<<_outer_size, CUDA_NUM_THREADS, share_mem_size, stream>>>\ (total_size, _inner_size, src, mean_ptr); //! 
get std - reduce_std\ + reduce_std\ <<<_outer_size, CUDA_NUM_THREADS, share_mem_size, stream>>>\ (total_size, _inner_size, param.eps, src, mean_ptr, std_ptr); if (_flag_scale) { if (_flag_bias) { - normalize_with_scale_bias_kernel\ + normalize_with_scale_bias_kernel\ <<>>\ (total_size, _inner_size, mean_ptr, std_ptr, scale_ptr, bias_ptr, src, dst); } else { - normalize_with_scale_bias_kernel\ + normalize_with_scale_bias_kernel\ <<>>\ (total_size, _inner_size, mean_ptr, std_ptr, scale_ptr, bias_ptr, src, dst); } } else { if (_flag_bias) { - normalize_with_scale_bias_kernel\ + normalize_with_scale_bias_kernel\ <<>>\ (total_size, _inner_size, mean_ptr, std_ptr, scale_ptr, bias_ptr, src, dst); } else { - normalize_with_scale_bias_kernel\ + normalize_with_scale_bias_kernel\ <<>>\ (total_size, _inner_size, mean_ptr, std_ptr, scale_ptr, bias_ptr, src, dst); } @@ -222,7 +223,8 @@ SaberStatus SaberLayerNorm:: return SaberSuccess; } - +DEFINE_OP_TEMPLATE(SaberLayerNorm, LayerNormParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberLayerNorm, LayerNormParam, NV, AK_INT8); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_lrn.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_lrn.cu index 5ff0c4a74..f4d094f24 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_lrn.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_lrn.cu @@ -52,21 +52,15 @@ __global__ void ker_cross_map_region_norm_fwd(Dtype * out_data, \ } } -template -SaberStatus SaberLrn::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - LrnParam& param) { +template +SaberStatus SaberLrn::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + LrnParam& param) { - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + OpDataType* out_data = 
(OpDataType*)outputs[0]->mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int out_n = outputs[0]->num(); int out_c = outputs[0]->channel(); int out_h = outputs[0]->height(); @@ -85,5 +79,7 @@ SaberStatus SaberLrn +__global__ void cal_lstm_kernel_batch_with_peephole_anyactivate( + const Dtype* w_x, const Dtype* b_i, const Dtype* b_f, const Dtype* b_c, const Dtype* b_o, + const Dtype* w_ci, const Dtype* w_cf, const Dtype* w_co, Dtype* cell,const int hidden_size, + const int aligned_hidden_size,const int batch_size,const int word_start_id, + const ActiveType gate_activity, const ActiveType cell_activity,const ActiveType candidate_activity,Dtype* output + ) { + + const int thread_id = blockIdx.x*blockDim.x+threadIdx.x; + const int batch_id = thread_id/aligned_hidden_size; + const int tid=thread_id%aligned_hidden_size; + if (tid < hidden_size && batch_id(gate_activity); + Dtype(*cell_act)(const Dtype)=Activate_inner(cell_activity); + Dtype(*candi_act)(const Dtype)=Activate_inner(candidate_activity); + + const int emit_wx_offset = (word_start_id + batch_id) * hidden_size * 4; + const Dtype* w_x_i = w_x + emit_wx_offset; + const Dtype* w_x_f = w_x_i + hidden_size ; + const Dtype* w_x_c = w_x_f + hidden_size; + const Dtype* w_x_o = w_x_c + hidden_size; + + + Dtype* gate_h_p = output + batch_id * hidden_size; + Dtype* gate_c_p = cell + batch_id * hidden_size; + + const Dtype c_1 = gate_c_p[tid]; + const Dtype gate_i = gat_act(w_x_i[tid] + b_i[tid] + w_ci[tid] * c_1); + const Dtype gate_f = gat_act(w_x_f[tid] + b_f[tid] + w_cf[tid] * c_1); + + const Dtype gate_c_s = cell_act(w_x_c[tid] + b_c[tid]); + const Dtype gate_c = gate_f * c_1 + gate_i * gate_c_s; + const Dtype gate_o = gat_act(w_x_o[tid] + b_o[tid] + gate_c * w_co[tid]); + gate_c_p[tid] = gate_c; + gate_h_p[tid] = gate_o * candi_act(gate_c); + } +} + +template +__global__ void cal_lstm_kernel_batch_without_peephole_anyactivate( + const Dtype* w_x,const Dtype* b_i, const Dtype* b_f, const 
Dtype* b_c, const Dtype* b_o, Dtype* cell, + const int hidden_size, const int aligned_hidden_size,const int batch_size,const int word_start_id, const ActiveType gate_activity,const ActiveType cell_activity,const ActiveType candidate_activity, + Dtype* output) { + + const int thread_id = blockIdx.x*blockDim.x+threadIdx.x; + const int batch_id = thread_id/aligned_hidden_size; + const int tid=thread_id%aligned_hidden_size; + if (tid < hidden_size && batch_id(gate_activity); + Dtype(*cell_act)(const Dtype)=Activate_inner(cell_activity); + Dtype(*candi_act)(const Dtype)=Activate_inner(candidate_activity); + + const int emit_wx_offset = (word_start_id + batch_id) * hidden_size * 4; + const Dtype* w_x_i = w_x + emit_wx_offset; + const Dtype* w_x_f = w_x_i + hidden_size ; + const Dtype* w_x_c = w_x_f + hidden_size; + const Dtype* w_x_o = w_x_c + hidden_size; + + + Dtype* gate_h_p = output + batch_id * hidden_size; + Dtype* gate_c_p = cell + batch_id * hidden_size; + + const Dtype c_1 = gate_c_p[tid]; + const Dtype gate_i = gat_act(w_x_i[tid] + b_i[tid]); + const Dtype gate_f = gat_act(w_x_f[tid] + b_f[tid]); + + const Dtype gate_c_s = cell_act(w_x_c[tid] + b_c[tid]); + const Dtype gate_c = gate_f * c_1 + gate_i * gate_c_s; + const Dtype gate_o = gat_act(w_x_o[tid] + b_o[tid]); + gate_c_p[tid] = gate_c; + gate_h_p[tid] = gate_o * candi_act(gate_c); +// printf("tid = %d, f = %f, i = %f, o = %f, hout = %f, w_x_i = %f, c_i = %f,c_out = %f, batch_id = %d\n",tid,gate_f,gate_i,gate_o,gate_h_p[tid],w_x_i[tid],c_1,gate_c,batch_id); + } +} + + +template +__global__ void cal_lstm_kernel_batch_with_peephole( + const Dtype* w_x, const Dtype* b_i, const Dtype* b_f, const Dtype* b_c, const Dtype* b_o, + const Dtype* w_ci, const Dtype* w_cf, const Dtype* w_co, Dtype* cell,const int hidden_size, + const int aligned_hidden_size,const int batch_size, const int word_start_id, + Dtype* output) { + + + const int thread_id = blockIdx.x*blockDim.x+threadIdx.x; + const int batch_id = 
thread_id/aligned_hidden_size; + const int tid=thread_id%aligned_hidden_size; + if (tid < hidden_size && batch_id +__global__ void cal_lstm_kernel_batch_without_peephole( + const Dtype* w_x,const Dtype* b_i, const Dtype* b_f, const Dtype* b_c, const Dtype* b_o, Dtype* cell, + const int hidden_size, const int aligned_hidden_size,const int batch_size,const int word_start_id, Dtype* output) { + + const int thread_id = blockIdx.x*blockDim.x+threadIdx.x; + const int batch_id = thread_id/aligned_hidden_size; + const int tid=thread_id%aligned_hidden_size; + if (tid < hidden_size && batch_id +SaberStatus +SaberLstm::dispatch_batch( + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param) { + Tensor* x = inputs[0]; + std::vector offset_vec = x->get_seq_offset()[x->get_seq_offset().size()-1]; + int seq_sum = x->num(); + int batch_size = offset_vec.size() - 1; + const OpDataType* x_data = (const OpDataType*)x->data(); + + const OpDataType *weight_h = (const OpDataType *)(param.weight()->data())+4*_hidden_size*_word_size; + const OpDataType *weight_w = (const OpDataType *)param.weight()->data(); + const OpDataType *bias = (const OpDataType *)param.bias()->data(); + const OpDataType *weight_peephole = (const OpDataType *)(param.bias()->data())+4*_hidden_size; + const OpDataType* h_init = nullptr; + const OpDataType* inner_x = (const OpDataType *)inputs[0]->data(); + OpDataType* inner_h_out = (OpDataType *)outputs[0]->mutable_data(); + OpDataType* inner_cell = nullptr; + + + std::vector emit_offset_vec; + int emit_length = 0; + utils::try_expand_tensor(_temp_map_dev,seq_sum); + bool transform = _seq_util.get_sorted_map(offset_vec, emit_offset_vec, emit_length, + _ctx->get_compute_stream()); + + if (inputs.size() > 1) { + h_init = (const OpDataType *)inputs[1]->data(); + utils::try_expand_tensor(_init_hidden,batch_size * _hidden_size); + h_init = (const OpDataType *)_init_hidden.data(); + } else if (param.init_hidden() != nullptr) { 
+ h_init = (const OpDataType *)param.init_hidden()->data(); + //FIXME:is it correct? + } else { + if (_temp_zero.valid_size() < batch_size * _hidden_size) { + utils::try_expand_tensor(_temp_zero,batch_size * _hidden_size); + CUDA_CHECK(cudaMemsetAsync(_temp_zero.mutable_data(), 0, + sizeof(OpDataType)*batch_size * _hidden_size, + _ctx->get_compute_stream())); + } + h_init = (const OpDataType *)_temp_zero.data(); + } + + utils::try_expand_tensor(_temp_wx,seq_sum * 4 * _hidden_size); + utils::try_expand_tensor(_temp_wh,batch_size * 4 * _hidden_size); + utils::try_expand_tensor(_temp_out,seq_sum * _hidden_size * param.num_direction); + utils::try_expand_tensor(_temp_cell,batch_size * _hidden_size); + + if (transform) { + utils::try_expand_tensor(_temp_x,seq_sum * _word_size); + _seq_util.seq_2_sorted_seq(x_data, (OpDataType *)_temp_x.mutable_data(), _word_size, _ctx->get_compute_stream()); + + inner_h_out = (OpDataType *)_temp_out.mutable_data(); + inner_x = (OpDataType *)_temp_x.mutable_data(); + + if (inputs.size() > 1 || param.init_hidden() != nullptr) { + CHECK(false) << "not support inner_h_init != nullptr"; + } + } + + + inner_cell = (OpDataType *)_temp_cell.mutable_data(); + CUDA_CHECK(cudaMemsetAsync(inner_cell, 0, sizeof(OpDataType)*batch_size * _hidden_size, + _ctx->get_compute_stream())); + + OpDataType* temp_wh = (OpDataType *)_temp_wh.mutable_data(); + OpDataType* temp_wx = (OpDataType *)_temp_wx.mutable_data(); + + _gemm_wx(seq_sum, 4 * _hidden_size, _word_size, 1.0, inner_x, 0.0, weight_w, temp_wx, + _ctx->get_compute_stream()); + + + const int i_offset = 0; + const int f_offset = 1; + const int c_offset = 2; + const int o_offset = 3; + const OpDataType* b_i = bias + i_offset * _hidden_size; + const OpDataType* b_f = bias + f_offset * _hidden_size; + const OpDataType* b_c = bias + c_offset * _hidden_size; + const OpDataType* b_o = bias + o_offset * _hidden_size; + const OpDataType* w_ci = nullptr; + const OpDataType* w_cf =nullptr; + const OpDataType* 
w_co =nullptr; + if(param.with_peephole){ + w_ci = weight_peephole + 0 * _hidden_size; + w_cf = weight_peephole + 1 * _hidden_size; + w_co = weight_peephole + 2 * _hidden_size; + } + + + for (int word_id = 0; word_id < emit_length; word_id++) { + int real_word_id = word_id; + int last_word_id = word_id - 1; + + if (param.is_reverse && batch_size == 1) { + real_word_id = emit_length - word_id - 1; + last_word_id = real_word_id + 1; + } + + int emit_word_id_start = emit_offset_vec[real_word_id]; + int emit_word_id_end = emit_offset_vec[real_word_id + 1]; + int emit_word_length = emit_word_id_end - emit_word_id_start; + const OpDataType* hin; + + if (word_id == 0) { + hin = h_init; + } else { + hin = inner_h_out + emit_offset_vec[last_word_id] * _hidden_size; + } + +// DLOG(INFO) << "word_id = " << word_id << ",emit_start = " << emit_word_id_start << ",emit_end=" <get_compute_stream()); + + + + const int block_dim=512; + const int grid_dim=round_up(emit_word_length*_aligned_hidden_size,block_dim); + + + if (param.gate_activity == Active_sigmoid && param.cell_activity == Active_tanh + && param.candidate_activity == Active_tanh) { + if (param.with_peephole) { + + cal_lstm_kernel_batch_with_peephole << get_compute_stream() >> > + (temp_wx, b_i,b_f,b_c,b_o, w_ci,w_cf,w_co, inner_cell, _hidden_size,_aligned_hidden_size,emit_word_length, emit_word_id_start, hout); + } else { + cal_lstm_kernel_batch_without_peephole << < grid_dim, block_dim , 0 + , _ctx->get_compute_stream() >> > + (temp_wx, b_i,b_f,b_c,b_o, inner_cell, _hidden_size, _aligned_hidden_size,emit_word_length,emit_word_id_start, hout); + } + } else { + if (param.with_peephole) { + cal_lstm_kernel_batch_with_peephole_anyactivate << < grid_dim, block_dim , 0 + , _ctx->get_compute_stream() >> > + (temp_wx, b_i, b_f, b_c, b_o, w_ci, w_cf, w_co, inner_cell, _hidden_size, _aligned_hidden_size,emit_word_length,emit_word_id_start, param.gate_activity, + param.cell_activity, param.candidate_activity, hout); + } else{ + 
cal_lstm_kernel_batch_without_peephole_anyactivate << < grid_dim, block_dim , 0 + , _ctx->get_compute_stream() >> > + (temp_wx, b_i, b_f, b_c, b_o, inner_cell, _hidden_size,_aligned_hidden_size,emit_word_length, emit_word_id_start, param.gate_activity, + param.cell_activity, param.candidate_activity, hout); + } + } + } + + if (transform) { + _seq_util.sorted_seq_2_seq((const OpDataType *)_temp_out.data(), (OpDataType *)outputs[0]->mutable_data(), _hidden_size, + _ctx->get_compute_stream()); + } + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; + +}; +//TODO:complate dispatch_once +template<> +SaberStatus +SaberLstm::dispatch_once( + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param) { + + return SaberSuccess; +}; + +template<> +SaberStatus +SaberLstm::dispatch( + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param) { + CHECK_EQ(inputs.size(),1)<<"only support input size = 1"; + CHECK_EQ(outputs.size(),1)<<"only support outputs size = 1"; + CHECK_EQ(param.init_hidden()==nullptr, true )<<"only support param.init_hidden() == nullptr"; + CHECK_EQ(param.num_layers,1)<<"only support param.num_layers==1"; + return dispatch_batch(inputs, outputs, param); + +} +DEFINE_OP_TEMPLATE(SaberLstm, LstmParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberLstm, LstmParam, NV, AK_INT8); +} +} + diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_mvn.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_mvn.cu index 49558316a..ab61b058c 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_mvn.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_mvn.cu @@ -197,21 +197,15 @@ __global__ void sum(const Dtype* in_data, } } -template -SaberStatus SaberMvn::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - MvnParam& param) { +template +SaberStatus SaberMvn::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + MvnParam& param) { - 
cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - const InDataType * in_data = inputs[0]->data(); - OutDataType * out_data = outputs[0]->mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + const OpDataType * in_data = (const OpDataType*)inputs[0]->data(); + OpDataType * out_data = (OpDataType*)outputs[0]->mutable_data(); int num = inputs[0]->num() * inputs[0]->channel(); int inner_dim = inputs[0]->height() * inputs[0]->width(); if (param.across_channels) { @@ -221,10 +215,10 @@ SaberStatus SaberMvn<<>>(\ in_data, num, inner_dim, mean, sd); @@ -239,6 +233,7 @@ SaberStatus SaberMvn -__global__ void normalize_kernel_no_across_spatial(const int size_in_channel, const int channels, \ - const Dtype* scale, const Dtype* bottom_data, Dtype* top_data, const float eps, const int p){ - - CUDA_KERNEL_LOOP(index, size_in_channel){ +__global__ void normalize_kernel_no_across_spatial(const int size_in_channel, const int n,\ +const int channels,const Dtype* scale, const Dtype* bottom_data, Dtype* top_data, const float eps, const int p){ + CUDA_KERNEL_LOOP(index, size_in_channel*n){ float sqr_sum = 0.f; + int num_index=index/size_in_channel; + int index_in_channel=index%size_in_channel; + int data_index=num_index*channels*size_in_channel+index_in_channel; for (int i = 0; i < channels; ++i) { if (p == 1) { - sqr_sum += fabsf(bottom_data[index + i * size_in_channel]); + sqr_sum += fabsf(bottom_data[data_index + i * size_in_channel]); } else { - sqr_sum += bottom_data[index + i * size_in_channel] * \ - bottom_data[index + i * size_in_channel]; + sqr_sum += bottom_data[data_index + i * size_in_channel] * \ + bottom_data[data_index + i * size_in_channel]; } } float norm; if (p == 1) { norm = 1.f / (sqr_sum + eps); } else { - norm = 1.f / (sqrtf(sqr_sum) + eps); + norm = 1.f / sqrtf(sqr_sum+ eps); } + for (int i = 0; i < channels; ++i) { if (has_scale) { if (shared) { - top_data[index + i * size_in_channel] = \ - bottom_data[index + i * 
size_in_channel] * scale[0] * norm; + top_data[data_index + i * size_in_channel] = \ + bottom_data[data_index + i * size_in_channel] * scale[0]*norm; } else { - top_data[index + i * size_in_channel] = \ - bottom_data[index + i * size_in_channel] * scale[i] * norm; + top_data[data_index + i * size_in_channel] = \ + bottom_data[data_index + i * size_in_channel] * scale[i]*norm; } } else { - top_data[index + i * size_in_channel] = \ - bottom_data[index + i * size_in_channel] * norm; + top_data[data_index + i * size_in_channel] = \ + bottom_data[data_index + i * size_in_channel] * norm; } } @@ -223,34 +226,36 @@ __global__ void normalize_compute_norm_kernel(int n, int inner_size, \ } template <> -SaberStatus SaberNormalize::dispatch(\ +SaberStatus SaberNormalize::dispatch(\ const std::vector& inputs, \ std::vector& outputs, \ - NormalizeParam ¶m) { - cudaStream_t stream = this->_ctx.get_compute_stream(); - const float* src = inputs[0]->data(); - float* dst = outputs[0]->mutable_data(); + NormalizeParam ¶m) { + cudaStream_t stream = this->_ctx->get_compute_stream(); + const float* src = static_cast(inputs[0]->data()); + float* dst = static_cast(outputs[0]->mutable_data()); if (!param.across_spatial) { + int num=inputs[0]->num(); int size_in_channel = inputs[0]->width() * inputs[0]->height(); + int thread_num=size_in_channel*num; int channel = inputs[0]->channel(); if (param.has_scale) { if (param.channel_shared) { normalize_kernel_no_across_spatial \ - <<>>\ - (size_in_channel, channel, param.scale->data(), src, dst, param.eps, param.p); + <<>>\ + (size_in_channel,num, channel, static_cast(param.scale->data()), src, dst, param.eps, param.p); } else { normalize_kernel_no_across_spatial \ - <<>>\ - (size_in_channel, channel, param.scale->data(), src, dst, param.eps, param.p); + <<>>\ + (size_in_channel,num, channel, static_cast(param.scale->data()), src, dst, param.eps, param.p); } } else { normalize_kernel_no_across_spatial \ - <<>>\ - (size_in_channel, channel, nullptr, 
src, dst, param.eps, param.p); + <<>>\ + (size_in_channel, num,channel, nullptr, src, dst, param.eps, param.p); } } else { - float* norm_reduce_ptr = _norm_reduce.mutable_data(); + float* norm_reduce_ptr = static_cast(_norm_reduce.mutable_data()); const size_t share_mem_size = CUDA_NUM_THREADS * sizeof(float); //! compute sum across C * H * W or H * W int blockx = CUDA_NUM_THREADS; @@ -293,7 +298,7 @@ SaberStatus SaberNormalize:: pw = 1.f; } gpu_pow_reverse<<>>\ - (_norm_size, _norm_reduce.data(), _norm_reduce.mutable_data(), pw, eps); + (_norm_size, static_cast(_norm_reduce.data()), static_cast(_norm_reduce.mutable_data()), pw, eps); //! compute output with scale if (param.has_scale) { @@ -301,24 +306,25 @@ SaberStatus SaberNormalize:: if (param.channel_shared) { normalize_with_scale_kernel\ <<>>\ - (_size, _compute_size, _channel_stride, _channels, _norm_reduce.data(), \ - param.scale->data(), inputs[0]->data(), outputs[0]->mutable_data()); + (_size, _compute_size, _channel_stride, _channels, static_cast(_norm_reduce.data()), \ + static_cast(param.scale->data()), static_cast(inputs[0]->data()), static_cast(outputs[0]->mutable_data())); } else {//! scale is diffs across channel normalize_with_scale_kernel\ <<>>\ - (_size, _compute_size, _channel_stride, _channels, _norm_reduce.data(), \ - param.scale->data(), inputs[0]->data(), outputs[0]->mutable_data()); + (_size, _compute_size, _channel_stride, _channels, static_cast(_norm_reduce.data()), \ + static_cast(param.scale->data()), static_cast(inputs[0]->data()), static_cast(outputs[0]->mutable_data())); } } else { //! 
without scale normalize_kernel<<>>\ - (_size, _compute_size, _norm_reduce.data(), \ - inputs[0]->data(), outputs[0]->mutable_data()); + (_size, _compute_size, static_cast(_norm_reduce.data()), \ + static_cast(inputs[0]->data()), static_cast(outputs[0]->mutable_data())); } #endif } return SaberSuccess; } - +DEFINE_OP_TEMPLATE(SaberNormalize, NormalizeParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberNormalize, NormalizeParam, NV, AK_INT8); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_pad.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_pad.cu index bb52ab7fc..41920bb2e 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_pad.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_pad.cu @@ -32,21 +32,15 @@ __global__ void ker_pad_fwd(Dtype * out_data, \ } } -template -SaberStatus SaberPad::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - PadParam& param) { +template +SaberStatus SaberPad::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + PadParam& param) { - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + const dtype* in_data = static_cast(inputs[0]->data()); + dtype* out_data = static_cast(outputs[0]->mutable_data()); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = inputs[0]->valid_size(); int in_n = inputs[0]->num(); int in_c = inputs[0]->channel(); @@ -55,8 +49,8 @@ SaberStatus SaberPadvalid_size(); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - cudaMemsetAsync(out_data, 0, out_size * sizeof(OpDataType), cuda_stream); - ker_pad_fwd\ + cudaMemsetAsync(out_data, 0, out_size * sizeof(dtype), cuda_stream); + ker_pad_fwd\ <<>>(\ out_data + _img_offset, in_data, _in_n_stride, \ _in_c_stride, _in_h_stride, _in_w_stride,\ @@ -66,6 +60,7 @@ SaberStatus SaberPad -SaberStatus SaberPermute::dispatch(\ - const std::vector& inputs, 
\ - std::vector& outputs, \ - PermuteParam& param) { +template <> +SaberStatus SaberPermute::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + PermuteParam& param) { - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - const InDataType * in_data = inputs[0]->data(); - OutDataType * out_data = outputs[0]->mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + const float* in_data =static_cast(inputs[0]->data()); + float* out_data = static_cast(outputs[0]->mutable_data()); int count = outputs[0]->valid_size(); - const int * permute_order = _permute_order.data(); - const int * new_steps = _out_steps.data(); - const int * old_steps = _in_steps.data(); - const int * out_valid_shape = _out_valid_shape.data(); + const int* permute_order = static_cast(_permute_order.data()); + const int* new_steps = static_cast(_out_steps.data()); + const int* old_steps = static_cast(_in_steps.data()); + const int* out_valid_shape = static_cast(_out_valid_shape.data()); std::vector permute_order_nhwc_to_nchw = {0, 3, 1, 2}; - PermuteParam param_nhwc_to_nchw(permute_order_nhwc_to_nchw); + PermuteParam param_nhwc_to_nchw(permute_order_nhwc_to_nchw); std::vector permute_order_nchw_to_nhwc = {0, 2, 3, 1}; - PermuteParam param_nchw_to_nhwc(permute_order_nchw_to_nhwc); + PermuteParam param_nchw_to_nhwc(permute_order_nchw_to_nhwc); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { if (_need_permute) { if (inputs[0]->num() == 1 && inputs[0]->width() == 3 && param == param_nhwc_to_nchw) { int out_w = outputs[0]->width() * outputs[0]->height(); int out_h = outputs[0]->channel(); - ker_permute_fwd_transpose\ + ker_permute_fwd_transpose\ <<>>(\ out_data, out_h, out_w, in_data); } else if (inputs[0]->num() == 1 && param == param_nchw_to_nhwc) { @@ -192,16 +186,17 @@ SaberStatus SaberPermutechannel(); dim3 block_size(TRANS_BLOCK_SIZE, TRANS_BLOCK_SIZE); dim3 grid_size((out_h + TRANS_BLOCK_SIZE - 1) / TRANS_BLOCK_SIZE, 
(out_w + TRANS_BLOCK_SIZE - 1) / TRANS_BLOCK_SIZE); - ker_transpose\ + ker_transpose\ <<>>(\ out_data, out_h, out_w, in_data); } else { - ker_permute_fwd\ + ker_permute_fwd\ <<>>(\ out_data, _num_axes, count, permute_order, \ new_steps, old_steps, in_data); } } else { + outputs[0]->copy_from(*inputs[0]); //outputs[0]->share_from(inputs[0]); } } else { @@ -210,7 +205,7 @@ SaberStatus SaberPermutewidth() * outputs[0]->height(); int out_h = outputs[0]->channel(); - ker_permute_fwd_transpose\ + ker_permute_fwd_transpose\ <<>>(\ out_data, outputs[0]->num(), outputs[0]->channel(), \ outputs[0]->height(), outputs[0]->width(), @@ -219,23 +214,25 @@ SaberStatus SaberPermutenum() * inputs[0]->channel() + TRANS_BLOCK_SIZE - 1) / TRANS_BLOCK_SIZE, (inputs[0]->height() * inputs[0]->width() + TRANS_BLOCK_SIZE - 1) / TRANS_BLOCK_SIZE); - ker_nchw_to_nhwc\ + ker_nchw_to_nhwc\ <<>>(\ out_data, inputs[0]->num(), inputs[0]->channel(),\ inputs[0]->height(), inputs[0]->width(),\ new_steps, old_steps, in_data); } else { - ker_permute_fwd\ + ker_permute_fwd\ <<>>(\ out_data, _num_axes, count, permute_order, \ new_steps, old_steps, in_data); } } else { + outputs[0]->copy_from(*inputs[0]); //outputs[0]->share_from(inputs[0]); } } return SaberSuccess; } - +DEFINE_OP_TEMPLATE(SaberPermute, PermuteParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberPermute, PermuteParam, NV, AK_INT8); } } diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_permute_power.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_permute_power.cu index 64c7d476e..24ed86846 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_permute_power.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_permute_power.cu @@ -1,5 +1,4 @@ #include "saber/funcs/impl/cuda/saber_permute_power.h" -#include "cuda_fp16.h" namespace anakin { namespace saber { @@ -204,55 +203,49 @@ __global__ void ker_permute_scale_fwd(Dtype * out_data, const int num_axes,\ } } -template -SaberStatus SaberPermutePower::dispatch(\ - const std::vector& inputs, \ - 
std::vector& outputs, \ - PermutePowerParam& param) { +template <> +SaberStatus SaberPermutePower::\ + dispatch(const std::vector *>& inputs, \ + std::vector *>& outputs, \ + PermutePowerParam& param) { - const InDataType *in_data = inputs[0]->data(); - OutDataType *out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + const float *in_data = static_cast(inputs[0]->data()); + float *out_data = static_cast(outputs[0]->mutable_data()); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = outputs[0]->valid_size(); - const int * permute_order = _permute_order.data(); - const int * new_steps = _new_steps.data(); - const int * old_steps = _old_steps.data(); - const int * new_valid_shape = _out_valid_shape.data(); + const int * permute_order = static_cast(_permute_order.data()); + const int * new_steps = static_cast(_new_steps.data()); + const int * old_steps = static_cast(_old_steps.data()); + const int * new_valid_shape = static_cast(_out_valid_shape.data()); const float scale = param.has_power_param ? param.power_param.scale : 1.0f; const float shift = param.has_power_param ? param.power_param.shift : 0.0f; const float power = param.has_power_param ? 
param.power_param.power : 1.0f; std::vector permute_order_t = {0, 3, 1, 2}; - PermuteParam param_t(permute_order_t); + PermuteParam param_t(permute_order_t); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { if (inputs[0]->num() == 1 && inputs[0]->width() == 3 && param.permute_param == param_t && 1) { int out_w = outputs[0]->width() * outputs[0]->height(); int out_h = outputs[0]->channel(); if (power != 1.0f) { - ker_permute_power_fwd_transpose\ + ker_permute_power_fwd_transpose\ <<>>(\ out_data, out_h, out_w, scale, shift, power, in_data); } else { - ker_permute_scale_fwd_transpose\ + ker_permute_scale_fwd_transpose\ <<>>(\ out_data, out_h, out_w, scale, shift, in_data); } } else { if (power != 1.0f) { - ker_permute_power_fwd\ + ker_permute_power_fwd\ <<>>(\ out_data, _num_axes, count, permute_order, \ new_steps, old_steps, scale, shift, power, in_data); } else { - ker_permute_scale_fwd\ + ker_permute_scale_fwd\ <<>>(\ out_data, _num_axes, count, permute_order, \ new_steps, old_steps, @@ -270,7 +263,7 @@ SaberStatus SaberPermutePowerget_stride(); Shape in_stride = inputs[0]->get_stride(); if (power != 1.0f) { - ker_nhwc_to_nchw_power + ker_nhwc_to_nchw_power <<>>(out_data, \ out_n, out_c, \ out_h, out_w, \ @@ -281,7 +274,7 @@ SaberStatus SaberPermutePower + ker_nhwc_to_nchw_scale <<>>(out_data, \ out_n, out_c, \ out_h, out_w, \ @@ -295,7 +288,7 @@ SaberStatus SaberPermutePowervalid_size(); if (power != 1.0f) { - ker_permute_power_fwd\ + ker_permute_power_fwd\ <<>>(\ out_data, _num_axes, count, permute_order, \ new_steps, old_steps, @@ -303,7 +296,7 @@ SaberStatus SaberPermutePower\ + ker_permute_scale_fwd\ <<>>(\ out_data, _num_axes, count, permute_order, \ new_steps, old_steps, @@ -317,6 +310,7 @@ SaberStatus SaberPermutePower max_data) { - max_data = data;; + max_data = data; max_index = i * in_w + j; } } @@ -63,22 +63,16 @@ __global__ void ker_pool_with_index_fwd(Dtype * out_data, } } -template -SaberStatus SaberPoolingWithIndex::dispatch(\ - 
const std::vector& inputs, \ - std::vector& outputs, \ - PoolingParam& param) { +template +SaberStatus SaberPoolingWithIndex::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + PoolingParam& param) { - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - OutDataType* out_index = outputs[1]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + const dtype* in_data = static_cast(inputs[0]->data()); + dtype* out_data = static_cast(outputs[0]->mutable_data()); + dtype* out_index = static_cast(outputs[1]->mutable_data()); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = outputs[0]->valid_size(); int out_n = outputs[0]->num(); int out_c = outputs[0]->channel(); @@ -88,7 +82,7 @@ SaberStatus SaberPoolingWithIndexwidth(); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - ker_pool_with_index_fwd\ + ker_pool_with_index_fwd\ <<>>(\ out_data, out_index, in_data, \ _in_n_stride, _in_c_stride, \ @@ -100,10 +94,13 @@ SaberStatus SaberPoolingWithIndex -SaberStatus SaberPower::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - PowerParam& param) { +template <> +SaberStatus SaberPower::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + PowerParam& param) { - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + const float* in_data = static_cast(inputs[0] -> data()); + float* out_data = static_cast(outputs[0]->mutable_data()); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = outputs[0]->valid_size(); const float scale = param.scale; const float shift = param.shift; const float power = param.power; if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { if (power == 1) { - ker_scale_fwd\ + ker_scale_fwd\ <<>>(\ out_data, count, scale, shift, 
in_data); } else { - ker_power_fwd\ + ker_power_fwd\ <<>>(\ out_data, count, scale, shift, power, in_data); } } else { - const int* i_stride = _in_steps.data(); - const int* o_stride = _out_steps.data(); - const int* valid_shape = _out_valid_shape.data(); + const int* i_stride = static_cast(_in_steps.data()); + const int* o_stride = static_cast(_out_steps.data()); + const int* valid_shape =static_cast( _out_valid_shape.data()); if (power == 1) { - ker_scale_fwd\ + ker_scale_fwd\ <<>>(\ out_data, count, scale, shift, valid_shape, o_stride, i_stride, outputs[0]->dims(), in_data); } else { - ker_power_fwd\ + ker_power_fwd\ <<>>(\ out_data, count, scale, shift, power, valid_shape, o_stride, i_stride, outputs[0]->dims(), in_data); @@ -124,6 +118,7 @@ SaberStatus SaberPower -__global__ void prelu_shared_kernel(int n, const dtype* slope, const dtype* src, dtype* dst) { - CUDA_KERNEL_LOOP(idx, n) { - dst[idx] = src[idx] > 0 ? src[idx] : src[idx] * slope[0]; - } -} - -template -__global__ void prelu_kernel(int n, int channels, int inner_size, \ - const dtype* slope, const dtype* src, dtype* dst) { - - CUDA_KERNEL_LOOP(idx, n) { - int c = (idx / inner_size) % channels; - dst[idx] = src[idx] > 0 ? src[idx] : src[idx] * slope[c]; - } -} - -template -__global__ void prelu_shared_roi_kernel(int n, int dims, \ - const int* input_stride_real, const int* output_stride_real, const int* shape_valid, \ - const dtype* slope, const dtype* src, dtype* dst) { - - CUDA_KERNEL_LOOP(idx, n) { - int index = idx; - //! compute real data index - int input_real_index = 0; - int output_real_index = 0; - for (int i = dims - 1; i >= 0; i--) { - int x = index % shape_valid[i]; - input_real_index += x * input_stride_real[i]; - output_real_index += x * output_stride_real[i]; - index = index / shape_valid[i]; - } - dst[output_real_index] = src[input_real_index] > 0 ? 
src[input_real_index] : \ - src[input_real_index] * slope[0]; - } -} - -template -__global__ void prelu_roi_kernel(int n, int channels, int inner_size, int dims, \ - const int* input_stride_real, const int* output_stride_real, const int* shape_valid, \ - const dtype* slope, const dtype* src, dtype* dst) { - - CUDA_KERNEL_LOOP(idx, n) { - int index = idx; - //! compute real data index - int input_real_index = 0; - int output_real_index = 0; - for (int i = dims - 1; i >= 0; i--) { - int x = index % shape_valid[i]; - input_real_index += x * input_stride_real[i]; - output_real_index += x * output_stride_real[i]; - index = index / shape_valid[i]; - } - int c = (idx / inner_size) % channels; - dst[output_real_index] = src[input_real_index] > 0 ? src[input_real_index] : \ - src[input_real_index] * slope[c]; - } -} - - -template -SaberStatus SaberPrelu::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - PreluParam& param) { - cudaStream_t stream = this->_ctx.get_compute_stream(); - - const InDataType* src = inputs[0]->data(); - OutDataType* dst = outputs[0]->mutable_data(); - const int* valid_shape = _valid_shape.data(); - const int* input_stride = _input_stride.data(); - const int* output_stride = _output_stride.data(); - - if (_is_continue_buf) { - if (param.channel_shared) { - prelu_shared_kernel<<>>\ - (_size, param.slope->data(), src, dst); - } else { - prelu_kernel<<>>\ - (_size, _channels, _inner_size, param.slope->data(), src, dst); - } - } else { - if (param.channel_shared) { - prelu_shared_roi_kernel\ - <<>>\ - (_size, _dims, input_stride, output_stride, valid_shape, \ - param.slope->data(), src, dst); - } else { - prelu_roi_kernel\ - <<>>\ - (_size, _channels, _inner_size, _dims, input_stride, \ - output_stride, valid_shape, param.slope->data(), src, dst); - } - } - - return SaberSuccess; -} - -} //namespace anakin - -} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu 
b/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu index 2bfd667d5..035854a06 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu @@ -52,19 +52,21 @@ __global__ void resize_bilinear_2d_kernel(const int wout, const int hout, float w_10 = w_h1 * w_w0; float w_11 = w_h1 * w_w1; - int hl = src_h * src_stride_h; - int hh = h * src_stride_h; - int wl = src_w * src_stride_w; - int wh = w * src_stride_w; + for (int i = 0; i < num; ++i) { + int src_batch_idx = i * src_stride_batch; - int src_indexTL = hl + wl; - int src_indexTR = hl + wh; - int src_indexBL = hh + wl; - int src_indexBR = hh + wh; + int hl = src_h * src_stride_h; + int hh = h * src_stride_h; + int wl = src_w * src_stride_w; + int wh = w * src_stride_w; - int dst_index = dst_w * dst_stride_w + dst_h * dst_stride_h; + int src_indexTL = src_batch_idx + hl + wl; + int src_indexTR = src_batch_idx + hl + wh; + int src_indexBL = src_batch_idx + hh + wl; + int src_indexBR = src_batch_idx + hh + wh; + + int dst_index = i * dst_stride_batch + dst_w * dst_stride_w + dst_h * dst_stride_h; - for (int i = 0; i < num; ++i) { for (int j = 0; j < channels; ++j) { #if 0 dtype tl = (src_w < 0 || src_h < 0)? 0 : src[src_indexTL]; @@ -73,9 +75,9 @@ __global__ void resize_bilinear_2d_kernel(const int wout, const int hout, dtype br = (w > win || h > hin)? 0 : src[src_indexBR]; #else dtype tl = src[src_indexTL]; - dtype tr = w > win? 0 : src[src_indexTR];//w > win? 0 : - dtype bl = h > hin? 0 : src[src_indexBL];//h > hin? 0 : - dtype br = (w > win || h > hin)? 0 : src[src_indexBR];//(w > win || h > hin)? 0 : + dtype tr = w >= win? 0 : src[src_indexTR];//w > win? 0 : + dtype bl = h >= hin? 0 : src[src_indexBL];//h > hin? 0 : + dtype br = (w >= win || h >= hin)? 0 : src[src_indexBR];//(w > win || h > hin)? 
0 : #endif dst[dst_index] = static_cast(w_00 * tl + w_01 * tr + w_10 * bl + w_11 * br); src_indexBR += src_stride_c; @@ -89,18 +91,15 @@ __global__ void resize_bilinear_2d_kernel(const int wout, const int hout, } -template -SaberStatus SaberResize::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - ResizeParam& param) { - cudaStream_t stream = this->_ctx.get_compute_stream(); +template +SaberStatus SaberResize::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + ResizeParam& param) { + + CHECK_EQ(inputs[0]->get_dtype() == OpDtype && outputs[0]->get_dtype() == OpDtype, true) << \ + "input datatype, output datatype are not match to Op datatype"; + cudaStream_t stream = this->_ctx->get_compute_stream(); int w_out = outputs[0]->width(); int h_out = outputs[0]->height(); @@ -129,8 +128,6 @@ SaberStatus SaberResizedata(); - OutDataType* out_data = outputs[0]->mutable_data(); Shape src_real_shape; Shape dst_real_shape; if (inputs[0]->is_continue_mem()) { @@ -152,28 +149,20 @@ SaberStatus SaberResizecount(height_idx + 1, dims); int dst_stride_channel = dst_real_shape.count(channel_idx + 1);//outputs[0]->count(channel_idx + 1, dims); int dst_stride_batch = dst_real_shape.count(num_idx + 1);//outputs[0]->count(num_idx + 1, dims); - const InDataType* in_data_batch = in_data; - OutDataType* out_data_batch = out_data; - for (int i = 0; i < n_out; ++i) { - resize_bilinear_2d_kernel<<>>( - w_out, h_out, n_out, c_out, - dst_stride_w, dst_stride_h, dst_stride_channel, dst_stride_batch, - w_in, h_in, - src_stride_w, src_stride_h, src_stride_channel, src_stride_batch, - 1 / param.width_scale, 1 / param.height_scale, - in_data, out_data); - in_data_batch += src_stride_batch; - out_data_batch += dst_stride_batch; - } + resize_bilinear_2d_kernel<<>>( + w_out, h_out, n_out, c_out, + dst_stride_w, dst_stride_h, dst_stride_channel, dst_stride_batch, + w_in, h_in, + src_stride_w, src_stride_h, src_stride_channel, src_stride_batch, + 1 / 
param.width_scale, 1 / param.height_scale, + (const OpDataType*)inputs[0]->data(), (OpDataType*)outputs[0]->mutable_data()); + //outputs[0]->record_event(stream); return SaberSuccess; } -template class SaberResize; -template class SaberResize; -template class SaberResize; -template class SaberResize; -template class SaberResize; -template class SaberResize; +template class SaberResize; +template class SaberResize; +DEFINE_OP_TEMPLATE(SaberResize, ResizeParam, NV, AK_HALF); } //namespace anakin -} //namespace anakin +} //namespace diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_input.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_input.cu new file mode 100644 index 000000000..2c6674733 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_input.cu @@ -0,0 +1,90 @@ + +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/cuda/saber_reverse_input.h" +#include "saber/funcs/saber_util.h" + +namespace anakin { +namespace saber { + +template +SaberStatus SaberReverseInput::init(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + this->_ctx=&ctx; + for(int i=0;i()); + _offset_map_vec[i].set_dtype(AK_INT32); + _offset_map_cu_vec.push_back(*new OpTensor()); + _offset_map_cu_vec[i].set_dtype(AK_INT32); + } + + return create(inputs,outputs,param,ctx); +}; +template +SaberStatus SaberReverseInput::create(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + if(this->_ctx=&ctx){ + this->_ctx=&ctx; + } + return SaberSuccess; +}; + +static inline int round_up(int k, int c) { + return ((k + c - 1) / c) * c; +} + +template +__global__ static void ker_reverse_input(const Dtype* in,Dtype* out,int length,int* offset){ + int tid=blockIdx.x*blockDim.x+threadIdx.x; + if(tid +SaberStatus SaberReverseInput::dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m) { + int input_size=inputs.size(); + + cudaStream_t stream=this->_ctx->get_compute_stream(); + 
for(int input_id=0;input_id> offset_vec=inputs[input_id]->get_seq_offset(); + std::vector offset=offset_vec[offset_vec.size()-1]; + int word_sum=offset[offset.size()-1]; + utils::try_expand_tensor(_offset_map_vec[input_id],word_sum); + utils::try_expand_tensor(_offset_map_cu_vec[input_id],word_sum); + int* offset_map_ptr= static_cast(_offset_map_vec[input_id].mutable_data()); + int* offset_map_cu_ptr= static_cast(_offset_map_cu_vec[input_id].mutable_data()); + for(int sequence_id=0;sequence_id(inputs[input_id]->data()); + OpDataType* out=static_cast(outputs[input_id]->mutable_data()); + ker_reverse_input<<>>(in,out,word_sum,offset_map_cu_ptr); + } + + return SaberSuccess; + +}; + +template class SaberReverseInput; +template class SaberReverseInput; +template class SaberReverseInput; +template class SaberReverseInput; + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_sequence.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_sequence.cu new file mode 100644 index 000000000..eaea1548a --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_sequence.cu @@ -0,0 +1,94 @@ + +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/cuda/saber_reverse_sequence.h" +#include "saber/funcs/saber_util.h" + +namespace anakin { +namespace saber { + +template +SaberStatus SaberReverseSequence::init(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + this->_ctx=&ctx; + + return create(inputs,outputs,param,ctx); +}; +template +SaberStatus SaberReverseSequence::create(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + if(this->_ctx=&ctx){ + this->_ctx=&ctx; + } + int input_size=inputs.size(); + CHECK_EQ(input_size,1)<<"only support one input now"; + return SaberSuccess; +}; + +static inline int round_up(int k, int c) { + return ((k + c - 1) / c) * c; +} + +template +__global__ static void ker_reverse_sequence(const Dtype* in,Dtype* out,int 
length,int word_size,int* offset){ + int tid=blockIdx.x*blockDim.x+threadIdx.x; + if(tid +SaberStatus SaberReverseSequence::dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m) { + int input_size=inputs.size(); + CHECK_EQ(input_size,1)<<"only support one input now"; + + cudaStream_t stream=this->_ctx->get_compute_stream(); + std::vector> offset_vec=inputs[0]->get_seq_offset(); + std::vector offset=offset_vec[offset_vec.size()-1]; + + + int batch_size=offset.size()-1; + int word_size=inputs[0]->valid_shape()[1]; + int word_sum=offset[batch_size]; + + utils::try_expand_tensor(_offset_map,word_sum); + utils::try_expand_tensor(_offset_map_cu,word_sum); + int* offset_map_ptr= static_cast(_offset_map.mutable_data()); + int* offset_map_cu_ptr= static_cast(_offset_map_cu.mutable_data()); + + for (int i = 0; i < batch_size; i++) { + int seq_len = offset[i + 1] - offset[i]; + int start_word_id=offset[i]; + for (int j = 0; j < seq_len; j++) { + offset_map_ptr[start_word_id+seq_len-1-j]=start_word_id+j; + } + } + CUDA_CHECK(cudaMemcpyAsync(offset_map_cu_ptr,offset_map_ptr, sizeof(int)*word_sum,cudaMemcpyHostToDevice,stream)); + int tid_sum=word_sum*word_size; + int block_dim=256; + if(tid_sum(inputs[0]->data()); + OpDataType* out=static_cast(outputs[0]->mutable_data()); + ker_reverse_sequence<<>>(in,out,tid_sum,word_size,offset_map_cu_ptr); + + return SaberSuccess; + +}; + +template class SaberReverseSequence; +template class SaberReverseSequence; +template class SaberReverseSequence; +template class SaberReverseSequence; + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_roi_pool.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_roi_pool.cu deleted file mode 100644 index 720456936..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_roi_pool.cu +++ /dev/null @@ -1,123 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_roi_pool.h" -#include "cuda_fp16.h" -#include - -namespace anakin { - -namespace saber { - -template 
-__global__ void ker_roi_pool_fwd(Dtype * out_data, \ - Dtype* out_index, - const Dtype* in_data, - const Dtype* in_rois, - const int in_n_stride, - const int in_c_stride, - const int in_h_stride, - const int in_w_stride, - const int out_n_stride, - const int out_c_stride, - const int out_h_stride, - const int out_w_stride, - const Dtype spatial_scale, - const int in_n, - const int in_c, - const int in_h, - const int in_w, - const int roi_num, - const int roi_size, - const int out_h, - const int out_w, - const int num_threads) -{ - CUDA_KERNEL_LOOP(tid, num_threads){ - int n = (tid / out_n_stride); - int c = (tid / out_c_stride) % in_c; - int h = (tid / out_h_stride) % out_h; - int w = (tid / out_w_stride) % out_w; - const Dtype* cur_roi = in_rois + n * roi_size; - int roi_batch_id = cur_roi[0]; - int roi_start_w = round(cur_roi[1] * spatial_scale); - int roi_start_h = round(cur_roi[2] * spatial_scale); - int roi_end_w = round(cur_roi[3] * spatial_scale); - int roi_end_h = round(cur_roi[4] * spatial_scale); - int roi_width = roi_end_w - roi_start_w + 1; - int roi_height = roi_end_h - roi_start_h + 1; - Dtype pool_w_rate = roi_width / out_w; - Dtype pool_h_rate = roi_height / out_h; - - int h_start = static_cast(floor(static_cast(h) * pool_h_rate)); - int w_start = static_cast(floor(static_cast(w) * pool_w_rate)); - int h_end = static_cast(ceil(static_cast(h + 1) * pool_h_rate)); - int w_end = static_cast(ceil(static_cast(w + 1) * pool_w_rate)); - h_start = fminf(fmaxf(h_start + roi_start_h, 0), in_h); - h_end = fminf(fmaxf(h_end + roi_start_h, 0), in_h); - w_start = fminf(fmaxf(w_start + roi_start_w, 0), in_w); - w_end = fminf(fmaxf(w_end + roi_start_w, 0), in_w); - bool is_empty = (h_end <= h_start) || (w_end <= w_start); - Dtype max_val = is_empty ? 
0 : -FLT_MAX; - int max_idx = -1; - const Dtype* in_tmp = - in_data + roi_batch_id * in_n_stride + c * in_c_stride; - for (int h_id = h_start; h_id < h_end; ++h_id) { - for (int w_id = w_start; w_id < w_end; ++w_id) { - int input_data_index = h_id * in_h_stride + w_id * in_w_stride; - Dtype data = in_tmp[input_data_index]; - if (data > max_val) { - max_val = data; - max_idx = input_data_index; - } - } - } - out_data[tid] = max_val; - if (out_index) { - out_index[tid] = max_idx; - } - } -} - -template -SaberStatus SaberRoiPool::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - RoiPoolParam& param) { - - const InDataType* in_data = inputs[0]->data(); - const InDataType* in_rois = inputs[1]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - OutDataType* out_index = nullptr; - if (outputs.size() == 2) { - out_index = outputs[1]->mutable_data(); - } - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - int count = outputs[0]->valid_size(); - int out_n = outputs[0]->num(); - int out_c = outputs[0]->channel(); - int out_h = outputs[0]->height(); - int out_w = outputs[0]->width(); - int in_n = inputs[0]->num(); - int in_c = inputs[0]->channel(); - int in_h = inputs[0]->height(); - int in_w = inputs[0]->width(); - - if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - ker_roi_pool_fwd\ - <<>>(\ - out_data, out_index, in_data, in_rois, \ - _in_n_stride, _in_c_stride, _in_h_stride, _in_w_stride,\ - _out_n_stride, _out_c_stride, _out_h_stride, _out_w_stride,\ - param.spatial_scale, - in_n, in_c, in_h, in_w, - out_n, 5, out_h, out_w, count); - } - return SaberSuccess; -} - -} -} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu old mode 100644 new mode 100755 index 4c16e4150..454f0d481 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu @@ -36,32 +36,36 @@ __global__ void ker_scale_fwd(Dtype * 
out_data, template <> -SaberStatus SaberScale::dispatch( \ - const std::vector& inputs, - std::vector& outputs, - ScaleParam& param) { +SaberStatus SaberScale::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ScaleParam& param) { - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); auto in_data = inputs[0]->data(); auto out_data = outputs[0]->mutable_data(); const int count = inputs[0]->valid_size(); + if (inputs.size() > 1) { + _scale_dim = inputs[1]->valid_size(); + _inner_dim = count / _scale_dim; + } if (_scale_dim > 1 || inputs.size() > 1) { auto scale_data = inputs.size() > 1 ? inputs[1]->data() : _weight.data(); auto bias_data = param.bias_term ? _bias.data() : NULL; - ker_scale_fwd + ker_scale_fwd <<>>( - out_data, in_data, scale_data, bias_data, count, _scale_dim, _inner_dim); + (OpDataType*)out_data, (const OpDataType*)in_data, (const OpDataType*)scale_data, \ + (const OpDataType*)bias_data, count, _scale_dim, _inner_dim); } else { auto scale = param.scale_w[0]; - InDataType bias = 0; + OpDataType bias = 0; if (_bias_term) { bias = param.scale_b[0]; } - ker_scale_fwd + ker_scale_fwd <<>>( - out_data, in_data, scale, bias, count); + (OpDataType*)out_data, (const OpDataType*)in_data, scale, bias, count); } CUDA_POST_KERNEL_CHECK; diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_pool.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_pool.cu new file mode 100644 index 000000000..1af4c29e1 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_pool.cu @@ -0,0 +1,200 @@ +#include "core/common.h" +#include "saber/funcs/impl/cuda/saber_sequence_pool.h" +#include "saber/saber_funcs_param.h" +#include "cuda.h" +namespace anakin { +namespace saber { + +template +__global__ void seq_pool_average_kernel(Dtype* dst, const Dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size){ + int total = slice_size * batch_size; + 
CUDA_KERNEL_LOOP(tid, total){ + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = seq_offset[out_batch_id + 1] - seq_offset[out_batch_id]; + int in_offset = seq_offset[out_batch_id] * slice_size; + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for(int i = 0; i < in_slice_num; ++i){ + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum / in_slice_num; + } +} + +template +__global__ void seq_pool_sum_kernel(Dtype* dst, const Dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size){ + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total){ + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = seq_offset[out_batch_id + 1] - seq_offset[out_batch_id]; + int in_offset = seq_offset[out_batch_id] * slice_size; + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for(int i = 0; i < in_slice_num; ++i){ + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum; + } +} + +template +__global__ void seq_pool_sqrt_kernel(Dtype* dst, const Dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size){ + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total){ + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = seq_offset[out_batch_id + 1] - seq_offset[out_batch_id]; + int in_offset = seq_offset[out_batch_id] * slice_size; + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for(int i = 0; i < in_slice_num; ++i){ + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum * rsqrtf(in_slice_num); + } +} + +template +__global__ void seq_pool_max_kernel(Dtype* dst, const Dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size){ + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total){ + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + 
int in_slice_num = seq_offset[out_batch_id + 1] - seq_offset[out_batch_id]; + int in_offset = seq_offset[out_batch_id] * slice_size; + src_in += in_offset + out_id; + Dtype max = src_in[0]; + for (int i = 1; i < in_slice_num; ++i){ + Dtype val = src_in[i * slice_size]; + if (val > max){ + max = val; + } + } + dst[out_batch_id * slice_size + out_id] = max; + } +} + +template +void seq_pool_average(dtype* dst, const dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size, Context* ctx) { + seq_pool_average_kernel<<>>\ + (dst,src_in,batch_size,seq_offset,slice_size); +} + +template +void seq_pool_sum(dtype* dst, const dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size, Context* ctx) { + seq_pool_sum_kernel<<>>\ + (dst,src_in,batch_size,seq_offset,slice_size); +} + +template +void seq_pool_sqrt(dtype* dst, const dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size, Context* ctx) { + seq_pool_sqrt_kernel<<>>\ + (dst,src_in,batch_size,seq_offset,slice_size); +} + +template +void seq_pool_max(dtype* dst, const dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size, Context* ctx) { + seq_pool_max_kernel<<>>\ + (dst,src_in,batch_size,seq_offset,slice_size); +} + +template +void seq_pool_first(dtype* dst, const dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size, Context* ctx) { + for (int i = 0; i < batch_size; ++i){ + int in_slice_num = seq_offset[i]; + CUDA_CHECK(cudaMemcpyAsync(dst + i * slice_size, src_in + in_slice_num * slice_size, \ + sizeof(dtype) * slice_size, cudaMemcpyDeviceToDevice, ctx->get_compute_stream())); + } +} + +template +void seq_pool_last(dtype* dst, const dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size, Context* ctx) { + for (int i = 0; i < batch_size; ++i){ + int in_slice_num = seq_offset[i+1]; + CUDA_CHECK(cudaMemcpyAsync(dst + i * slice_size, src_in + (in_slice_num-1) * 
slice_size, \ + sizeof(dtype) * slice_size, cudaMemcpyDeviceToDevice, ctx->get_compute_stream())); + } +} + +template +void seq_pool_unknow(dtype* dst, const dtype* src_in,const int batch_size, + const int* seq_offset, const int slice_size, Context* ctx) { + LOG(ERROR) << " UNKNOWN seq pool type"; +} + +template +SaberStatus SaberSequencePool::init( + const std::vector& inputs, + std::vector& outputs, + SequencePoolParam& param, Context& ctx) { + + this->_ctx = &ctx; + kernel_direct_map = { + {Sequence_pool_unknow, seq_pool_unknow}, + {Sequence_pool_average, seq_pool_average}, + {Sequence_pool_sum, seq_pool_sum}, + {Sequence_pool_sqrt, seq_pool_sqrt}, + {Sequence_pool_max, seq_pool_max}, + {Sequence_pool_last, seq_pool_last}, + {Sequence_pool_first, seq_pool_first}, + }; + return create(inputs, outputs, param, ctx); + +} + +template +SaberStatus SaberSequencePool::dispatch( + const std::vector& inputs, + std::vector& outputs, + SequencePoolParam& param) { + + CHECK_EQ(inputs[0]->channel(), outputs[0]->channel()); + CHECK_EQ(inputs[0]->height(), outputs[0]->height()); + CHECK_EQ(inputs[0]->width(), outputs[0]->width()); + + std::vector seq_offset = inputs[0]->get_seq_offset()[0]; + int slice_size = outputs[0]->channel() + * outputs[0]->height() + * outputs[0]->width(); + DataType_in* dst_ptr = (DataType_in*)outputs[0]->mutable_data(); + const DataType_out* src_ptr = (const DataType_out*)inputs[0]->data(); + int batch_size = seq_offset.size()-1; + Tensor seq_offset_D; + seq_offset_D.re_alloc(Shape({1, 1, 1, (int)seq_offset.size()}), AK_INT32); + CUDA_CHECK(cudaMemcpyAsync(seq_offset_D.mutable_data(), seq_offset.data(), \ + sizeof(int) * seq_offset.size(),cudaMemcpyHostToDevice,this->_ctx->get_compute_stream())); + if(param.sequence_pool_type == Sequence_pool_first || param.sequence_pool_type == Sequence_pool_last){ + kernel_direct_map[param.sequence_pool_type](dst_ptr, src_ptr, batch_size, (const int*)seq_offset.data(), slice_size, this->_ctx); + } + else{ + 
kernel_direct_map[param.sequence_pool_type](dst_ptr, src_ptr, batch_size, (const int*)seq_offset_D.data(), slice_size, this->_ctx); + } + + std::vector offset_new(batch_size + 1); + + for (int i = 0; i <= batch_size; ++i) { + offset_new[i] = i; + } + std::vector> voffset_new; + voffset_new.push_back(offset_new); + outputs[0]->set_seq_offset(voffset_new); + return SaberSuccess; + +} + +template class SaberSequencePool; + +} +} // namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_slice.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_slice.cu index 3391e7fcb..f98e306ae 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_slice.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_slice.cu @@ -20,47 +20,28 @@ __global__ void slice_impl_cuda(const int nthreads, const dtype* in_data, } -template -SaberStatus SaberSlice::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - SliceParam& param) { +template +SaberStatus SaberSlice::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + SliceParam& param) { - cudaStream_t stream = this->_ctx.get_compute_stream(); + cudaStream_t stream = this->_ctx->get_compute_stream(); //! inputs only has one tensor Shape shape_in = inputs[0]->valid_shape(); int output_size = outputs.size(); -#if 0 //! shared buffer - outputs[0]->share_sub_buffer(*inputs[0], outputs[0]->valid_shape(), \ - inputs[0]->offset()); - for (int i = 1; i < output_size; ++i) { - Shape offset = inputs[0]->offset(); - offset[param.axis] += param.slice_points[i - 1]; - outputs[i]->share_sub_buffer(*inputs[0], outputs[i]->valid_shape(), offset); - } - -#endif - -#if 1 //! deep copy - //! 
if output only has one tensor, then shared the memory buffer if (output_size == 1) { outputs[0]->share_from(*inputs[0]); return SaberSuccess; } int offset_slice_axis = 0; - const OpDataType* in_data = inputs[0]->data(); + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); const int in_slice_axis_size = shape_in[param.axis]; for (int i = 0; i < output_size; ++i) { - OpDataType* out_data = outputs[i]->mutable_data(); + OpDataType* out_data = (OpDataType*)outputs[i]->mutable_data(); const int out_slice_axis_size = outputs[i]->valid_shape()[param.axis]; const int out_slice_size = out_slice_axis_size * _slice_size; const int nthreads = out_slice_size * _slice_num; @@ -68,13 +49,12 @@ SaberStatus SaberSlicerecord_event(stream); } -#endif return SaberSuccess; } - +DEFINE_OP_TEMPLATE(SaberSlice, SliceParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSlice, SliceParam, NV, AK_INT8); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu old mode 100644 new mode 100755 index 653cf9efb..3dc92608f --- a/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu @@ -21,7 +21,7 @@ __global__ void softmax_max_kernel(int total_size, const dtype* in_data, dtype* for (int i = 0; i < axis_size; ++i) { max_data = in_data[real_index] > max_data? in_data[real_index] : max_data; real_index += inner_num; - } + } out_data[idx] = max_data; } } @@ -206,19 +206,13 @@ __global__ void sharemem_softmax_kernel(int total_size, \ } } - //! subtract - #pragma unroll - for (int i = 0; i < axis_size; ++i) { - data[i * blocksize] -= max_data; - } - - //! summarize + //! 
subtract then summarize dtype sum = 0; #pragma unroll for (int i = 0; i < axis_size; ++i) { //dtype *dt = &data[i][thread_idx]; dtype *dt = data + i * blocksize; - *dt = expf(*dt); + *dt = expf(*dt - max_data); sum += *dt; } @@ -268,7 +262,7 @@ __global__ void sharemem_softmax_roi_kernel(int total_size, \ for (int i = 0; i < axis_size; ++i) { data[i * blocksize] = in_data[input_real_index]; input_real_index += input_stride_real[softmax_axis]; - } + } //! get maximum value in softmax channel dtype max_data = data[0]; @@ -280,19 +274,13 @@ __global__ void sharemem_softmax_roi_kernel(int total_size, \ } } - //! subtract - #pragma unroll - for (int i = 0; i < axis_size; ++i) { - data[i * blocksize] -= max_data; - } - - //! summarize + //! subtract then summarize dtype sum = 0; #pragma unroll for (int i = 0; i < axis_size; ++i) { //dtype *dt = &data[i][thread_idx]; dtype *dt = data + i * blocksize; - *dt = expf(*dt); + *dt = expf(*dt - max_data); sum += *dt; } @@ -305,51 +293,45 @@ __global__ void sharemem_softmax_roi_kernel(int total_size, \ } } -template -SaberStatus SaberSoftmax::dispatch(\ +template +SaberStatus SaberSoftmax::dispatch(\ const std::vector& inputs, \ std::vector& outputs, \ - SoftmaxParam& param) { + SoftmaxParam& param) { - cudaStream_t stream = this->_ctx.get_compute_stream(); + cudaStream_t stream = this->_ctx->get_compute_stream(); //! 
inputs only has one tensor int total_threads = this->_inner_num * this->_outer_num; - const InDataType* data_in = inputs[0]->data(); - InDataType* data_out = outputs[0]->mutable_data(); - InDataType* max_data = this->_max_data.mutable_data(); - InDataType* sum_data = this->_sum_data.mutable_data(); - const int* valid_shape = _valid_shape.data(); - const int* input_stride = _input_stride.data(); - const int* output_stride = _output_stride.data(); + const OpDataType* data_in = (const OpDataType* )inputs[0]->data(); + OpDataType* data_out = (OpDataType*)outputs[0]->mutable_data(); + OpDataType* max_data = (OpDataType*)this->_max_data.mutable_data(); + OpDataType* sum_data = (OpDataType*)this->_sum_data.mutable_data(); + const int* valid_shape = (const int*)_valid_shape.data(); + const int* input_stride = (const int*)_input_stride.data(); + const int* output_stride = (const int*)_output_stride.data(); if (_is_continue_buf) { //! softmax kernel without roi if (this->_axis_size <= _max_dimsize){ - int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(InDataType); - sharemem_softmax_kernel\ + int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(OpDataType); + sharemem_softmax_kernel\ <<>>( total_threads, data_in, data_out, this->_inner_num, this->_outer_num, this->_axis_size); } else { //! firstly, get maximum data - InDataType min_data = std::numeric_limits::min(); - softmax_max_kernel\ + OpDataType min_data = std::numeric_limits::min(); + softmax_max_kernel\ <<>>( total_threads, data_in, max_data, min_data, \ this->_inner_num, this->_outer_num, this->_axis_size); //! then, compute exp and sum data - softmax_sub_exp_sum_kernel + softmax_sub_exp_sum_kernel <<>>( total_threads, data_in, data_out, max_data, sum_data, \ this->_inner_num, this->_outer_num, this->_axis_size); //! 
lastly, compute divided output - softmax_divid_output_kernel\ + softmax_divid_output_kernel\ <<>>( total_threads, data_out, sum_data, \ this->_inner_num, this->_outer_num, this->_axis_size); @@ -357,28 +339,28 @@ SaberStatus SaberSoftmax_axis_size <= _max_dimsize){ - int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(InDataType); - sharemem_softmax_roi_kernel\ + int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(OpDataType); + sharemem_softmax_roi_kernel\ <<>>( total_threads, data_in, data_out, input_stride, output_stride, valid_shape, \ param.axis, _axis_size, _dims); } else { //! firstly, get maximum data - InDataType min_data = std::numeric_limits::min(); - softmax_max_roi_kernel\ + OpDataType min_data = std::numeric_limits::min(); + softmax_max_roi_kernel\ <<>>( total_threads, data_in, max_data, min_data, \ input_stride, output_stride, valid_shape, \ param.axis, _axis_size, _dims); //! then, compute exp and sum data - softmax_sub_exp_sum_roi_kernel + softmax_sub_exp_sum_roi_kernel <<>>( total_threads, data_in, data_out, max_data, sum_data, \ input_stride, output_stride, valid_shape, \ param.axis, _axis_size, _dims); //! 
lastly, compute divided output - softmax_divid_output_roi_kernel\ + softmax_divid_output_roi_kernel\ <<>>( total_threads, data_out, sum_data, \ input_stride, output_stride, valid_shape, \ @@ -389,7 +371,8 @@ SaberStatus SaberSoftmaxrecord_event(stream); return SaberSuccess; } - +DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, NV, AK_INT8); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_spp.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_spp.cu deleted file mode 100644 index 27688d65b..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_spp.cu +++ /dev/null @@ -1,66 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_spp.h" -#include "cuda_fp16.h" - -namespace anakin { - -namespace saber { - -#if 0 -template -__global__ void ker_concat_fwd(Dtype* out_data, const Dtype* in_data, - const int n, - const int w, - const int n_stride, const int nthreads) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n_id = index / w; - const int w_id = index % w; - const int out_index = n_id * n_stride + w_id; - out_data[out_index] = in_data[index]; - } -} - -template -SaberStatus SaberSpp::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ - SPPParam& param) { - - const InDataType* in_data = inputs[0]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - int count = outputs[0]->valid_size(); - int out_n = outputs[0]->num(); - int out_c = outputs[0]->channel(); - int out_h = outputs[0]->height(); - int out_w = outputs[0]->width(); - - if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - std::vector pool_outputs; - pool_outputs.resize(1); - for (int i = 0; i < param.pyramid_height; i++) { - pool_outputs[0] = _pooling_output[i]; - (*_pooling[i])(inputs, pool_outputs, _pooling_param[i], this->_ctx); - int valid_size = pool_outputs[0]->valid_size(); - int offset = (pow(4, i) - 
1) / 3; - ker_concat_fwd<<>>( - out_data + offset, - pool_outputs[0]->data(), - pool_outputs[0]->num() * pool_outputs[0]->channel(), - pool_outputs[0]->height() * pool_outputs[0]->width(), - outputs[0]->width(), - valid_size); - } - } - - return SaberSuccess; -} -#endif -} //namespace saber - -} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_transpose.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_transpose.cu old mode 100644 new mode 100755 index e01362dab..302cd4a03 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_transpose.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_transpose.cu @@ -38,19 +38,13 @@ __global__ void transpose_tile_2d_if(dtype *odata, const dtype *idata, int num, } } -template -SaberStatus SaberTranspose::dispatch(\ +template +SaberStatus SaberTranspose::dispatch(\ const std::vector& inputs,\ std::vector& outputs, \ - TransposeParam& param) { + TransposeParam& param) { - cudaStream_t stream = this->_ctx.get_compute_stream(); + cudaStream_t stream = this->_ctx->get_compute_stream(); int w_out = outputs[0]->width(); int h_out = outputs[0]->height(); @@ -81,14 +75,15 @@ SaberStatus SaberTransposedata(); - OutDataType* out_data = outputs[0]->mutable_data(); + const InDataType* in_data = (const InDataType*)inputs[0]->data(); + OutDataType* out_data = (OutDataType*)outputs[0]->mutable_data(); transpose_tile_2d_if<<>>(out_data, in_data, n_in, c_in, h_in, w_in); return SaberSuccess; } - +DEFINE_OP_TEMPLATE(SaberTranspose, TransposeParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberTranspose, TransposeParam, NV, AK_INT8); }//namespace saber }//namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_unpool.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_unpool.cu old mode 100644 new mode 100755 index f3e2f5536..036b613ac --- a/saber/funcs/impl/cuda/base/cuda_c/saber_unpool.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_unpool.cu @@ -17,7 +17,7 @@ __global__ void ker_unpool_max_fwd(Dtype * out_data, 
\ const int num_threads) { CUDA_KERNEL_LOOP(tid, num_threads){ - int n = (tid / in_n_stride) % in_n; + int n = tid / in_n_stride; int c = (tid / in_c_stride) % in_c; int out_offset = n * out_n_stride + c * out_c_stride; int index = in_max_index[tid]; @@ -25,22 +25,16 @@ __global__ void ker_unpool_max_fwd(Dtype * out_data, \ } } -template -SaberStatus SaberUnpool::dispatch(\ - const std::vector *>& inputs, \ - std::vector *>& outputs,\ - PoolingParam& param) { +template +SaberStatus SaberUnpool::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs,\ + PoolingParam& param) { - const InDataType* in_data = inputs[0]->data(); - const OutDataType* in_max_index = inputs[1]->data(); - OutDataType* out_data = outputs[0]->mutable_data(); - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); + const InDataType* in_data = (const InDataType*)inputs[0]->data(); + const OutDataType* in_max_index = (const OutDataType*)inputs[1]->data(); + OutDataType* out_data = (OutDataType*)outputs[0]->mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = inputs[0]->valid_size(); int in_n = inputs[0]->num(); int in_c = inputs[0]->channel(); diff --git a/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu b/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu index 68e102765..0fd14e90d 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu @@ -1,22 +1,25 @@ #include "saber/core/tensor_op.h" +#include "anakin_config.h" #include namespace anakin{ namespace saber{ +#ifdef USE_CUDA + template -__global__ void set_device_data(Dtype* data_ptr, Dtype value, int size){ - CUDA_KERNEL_LOOP(index, size){ +__global__ void set_device_data(Dtype* data_ptr, Dtype value, long long size) { + CUDA_KERNEL_LOOP(index, size) { data_ptr[index] = value; } } template -__global__ void print_device_data(const Dtype* data_ptr, int size, int width){ - for (int i = 0; i < size; i++){ - 
printf("%.2f ", static_cast(data_ptr[i])); - if ((i + 1) % width == 0){ +__global__ void print_device_data(const Dtype* data_ptr, long long size, int width) { + for (int i = 0; i < size; i++) { + printf("%.6f ", static_cast(data_ptr[i])); + if ((i + 1) % width == 0) { printf("\n"); } } @@ -24,31 +27,15 @@ __global__ void print_device_data(const Dtype* data_ptr, int size, int width){ } template -__global__ void cuda_cvt_data(const float* src, Dtype* dst, Dtype scale, int size){ - CUDA_KERNEL_LOOP(index, size){ - dst[index] = static_cast(src[index] * scale); +__global__ void cuda_cvt_data(const float* src, Dtype* dst, Dtype vstart, Dtype scale, int size) { + CUDA_KERNEL_LOOP(index, size) { + dst[index] = static_cast(vstart + src[index] * scale); } } -template -void fill_tensor_device_const(Tensor_t& tensor, \ - typename Tensor_t::Dtype value, \ - typename Tensor_t::API::stream_t stream){ - - typedef typename Tensor_t::Dtype Dtype; - Dtype* data_ptr = static_cast(tensor.get_buf()->get_data_mutable()); - int size = tensor.size(); - set_device_data<<>>(data_ptr, value, size); - CUDA_POST_KERNEL_CHECK; -}; - - -template -void fill_tensor_device_rand(Tensor_t& tensor, typename Tensor_t::API::stream_t stream) { - - typedef typename Tensor_t::Dtype Dtype; - Dtype* data_ptr = static_cast(tensor.get_buf()->get_data_mutable()); - int size = tensor.size(); +template +void fill_tensor_device_rand_impl(Dtype* data_ptr, long long size, + typename Tensor::API::stream_t stream) { float* data_f; cudaMalloc(&data_f, size * sizeof(float)); @@ -60,21 +47,17 @@ void fill_tensor_device_rand(Tensor_t& tensor, typename Tensor_t::API::stream_t CHECK_EQ(curandDestroyGenerator(gen), CURAND_STATUS_SUCCESS); Dtype scale = std::numeric_limits::max(); - - cuda_cvt_data<<>>(data_f, data_ptr, scale, size); + Dtype z = 0; + cuda_cvt_data<<>>(data_f, data_ptr, z, scale, size); cudaDeviceSynchronize(); cudaFree(data_f); CUDA_POST_KERNEL_CHECK; }; -template -void fill_tensor_device_rand(Tensor_t& 
tensor, typename Tensor_t::Dtype vstart, \ - typename Tensor_t::Dtype vend, typename Tensor_t::API::stream_t stream) { - - typedef typename Tensor_t::Dtype Dtype; - Dtype* data_ptr = static_cast(tensor.get_buf()->get_data_mutable()); - int size = tensor.size(); +template +void fill_tensor_device_rand_impl2(Dtype* data_ptr, Dtype vstart, \ + Dtype vend, long long size , typename Tensor::API::stream_t stream) { float* data_f; cudaMalloc(&data_f, size * sizeof(float)); @@ -86,276 +69,165 @@ void fill_tensor_device_rand(Tensor_t& tensor, typename Tensor_t::Dtype vstart, CHECK_EQ(curandDestroyGenerator(gen), CURAND_STATUS_SUCCESS); Dtype scale = vend - vstart; - - cuda_cvt_data<<>>(data_f, data_ptr, scale, size); + cuda_cvt_data<<>>(data_f, data_ptr, vstart, scale, size); cudaDeviceSynchronize(); cudaFree(data_f); CUDA_POST_KERNEL_CHECK; }; -template -void print_tensor_device(Tensor_t& tensor, typename Tensor_t::API::stream_t stream){ - - typedef typename Tensor_t::Dtype Dtype; - LOG(INFO) << "device tensor size: " << tensor.size(); - const Dtype* data_ptr = static_cast(tensor.get_buf()->get_data()); - int size = tensor.size(); - print_device_data<<<1, 1, 0, stream>>>(data_ptr, size, tensor.width()); - cudaDeviceSynchronize(); - CUDA_POST_KERNEL_CHECK; -}; - -#define FILL_TENSOR_NV(type, layout) \ - template void fill_tensor_device_const>\ - (Tensor& tensor, DataTrait::dtype value, \ - typename TargetWrapper::stream_t stream); \ - template void fill_tensor_device_rand>\ - (Tensor& tensor, typename TargetWrapper::stream_t stream); \ - template void fill_tensor_device_rand>\ - (Tensor& tensor, DataTrait::dtype vstart, \ - DataTrait::dtype vend, typename TargetWrapper::stream_t stream); \ - template void print_tensor_device>\ - (Tensor& tensor, typename TargetWrapper::stream_t stream); - -FILL_TENSOR_NV(AK_FLOAT, NCHW); -FILL_TENSOR_NV(AK_FLOAT, NHWC); -FILL_TENSOR_NV(AK_FLOAT, NHW); -FILL_TENSOR_NV(AK_FLOAT, NW); -FILL_TENSOR_NV(AK_FLOAT, HW); -FILL_TENSOR_NV(AK_FLOAT, 
W); - -FILL_TENSOR_NV(AK_INT8, NCHW); -FILL_TENSOR_NV(AK_INT8, NHWC); -FILL_TENSOR_NV(AK_INT8, NHW); -FILL_TENSOR_NV(AK_INT8, NW); -FILL_TENSOR_NV(AK_INT8, HW); -FILL_TENSOR_NV(AK_INT8, W); - -// INT8 NCHW_C4 -template void fill_tensor_device_const>(Tensor& tensor, \ - char value, typename TargetWrapper::stream_t stream); -template void fill_tensor_device_rand>(Tensor& tensor, \ - typename TargetWrapper::stream_t stream); - -template <> -void print_tensor_device>(Tensor& tensor, \ - typename TargetWrapper::stream_t stream) { - - typedef typename Tensor::Dtype Dtype; - LOG(INFO) << "device tensor size: " << tensor.size(); - const Dtype* data_ptr = (const Dtype*)tensor.get_buf()->get_data(); - int size = tensor.size(); - print_device_data<<<1, 1, 0, stream>>>(data_ptr, size, tensor.width() * 4); - CUDA_POST_KERNEL_CHECK; -}; - -// use BLOCKCOUNT and THREADNUM -__global__ -void int8nchwc4_fp32nchw(float* out_data, const char* in_data, - int valid_num, int valid_channel_4, int valid_height, int valid_width, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, - float* scale, int count) { - - float load0, load1, load2, load3; - int gid = threadIdx.x + blockIdx.x * blockDim.x; - - int read_w = (gid) % valid_width; - int read_h = (gid / (in_h_stride)) % valid_height; - int read_c = (gid / (in_c_stride)) % valid_channel_4; - int read_n = (gid / (in_n_stride)) % valid_num; - int scale_index = read_c << 2; - - int in_offset = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w; - - int out_offset = read_n * out_n_stride - + read_c * (out_c_stride << 2) - + read_h * out_h_stride - + read_w * out_w_stride; - - if (gid < count) { - - char4 readin = __ldg(&((const char4*)in_data)[in_offset]); - - load0 = static_cast(readin.x); - load1 = static_cast(readin.y); - load2 = static_cast(readin.z); - load3 = static_cast(readin.w); - - out_data[out_offset] = load0 * 
scale[scale_index]; out_offset += out_c_stride; - out_data[out_offset] = load1 * scale[scale_index + 1]; out_offset += out_c_stride; - out_data[out_offset] = load2 * scale[scale_index + 2]; out_offset += out_c_stride; - out_data[out_offset] = load3 * scale[scale_index + 3]; +template<> +void fill_tensor_const(Tensor& tensor, float value, typename Tensor::API::stream_t stream) { + long long size = tensor.size(); + void* dio = tensor.mutable_data(); + DataType type = tensor.get_dtype(); + switch (type) { + case AK_UINT8: set_device_data<<>>((unsigned char*)dio, static_cast(value), size); break; + case AK_INT8: set_device_data<<>>((char*)dio, static_cast(value), size); break; + case AK_INT16: set_device_data<<>>((short*)dio, static_cast(value), size); break; + case AK_UINT16: set_device_data<<>>((unsigned short*)dio, static_cast(value), size); break; + case AK_HALF: set_device_data<<>>((short*)dio, static_cast(value), size); break; + case AK_UINT32: set_device_data<<>>((unsigned int*)dio, static_cast(value), size); break; + case AK_INT32: set_device_data<<>>((int*)dio, static_cast(value), size); break; + case AK_FLOAT: set_device_data<<>>((float*)dio, static_cast(value), size); break; + case AK_DOUBLE: set_device_data<<>>((double*)dio, static_cast(value), size); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; } } template<> -SaberStatus DataTensorTransformHelper::transform, Tensor >( - Tensor &out_tensor, - const Tensor &in_tensor, Context ctx){ - - Shape out_stride = out_tensor.get_stride(); - - Shape in_shape = in_tensor.valid_shape(); - Shape out_shape = out_tensor.valid_shape(); - - int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3]; - - const char * in_data = in_tensor.data(); - float * out_data = out_tensor.mutable_data(); - - cudaStream_t cuda_stream = ctx.get_compute_stream(); - int8nchwc4_fp32nchw<<>>(out_data, in_data, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - in_shape[1] * in_shape[2] * 
in_shape[3], - in_shape[2] * in_shape[3], - in_shape[3], 1, - out_stride[0], out_stride[1], out_stride[2], out_stride[3], - _weight_scale, count); - - return SaberSuccess; +void fill_tensor_rand(Tensor& tensor, typename Tensor::API::stream_t stream) { + long long size = tensor.size(); + void* dio = tensor.mutable_data(); + DataType type = tensor.get_dtype(); + switch (type) { + case AK_UINT8: fill_tensor_device_rand_impl((unsigned char*)dio, size, stream); break; + case AK_INT8: fill_tensor_device_rand_impl((char*)dio, size, stream); break; + case AK_INT16: fill_tensor_device_rand_impl((short*)dio, size, stream); break; + case AK_UINT16: fill_tensor_device_rand_impl((unsigned short*)dio, size, stream); break; + case AK_UINT32: fill_tensor_device_rand_impl((unsigned int*)dio, size, stream); break; + case AK_INT32: fill_tensor_device_rand_impl((int*)dio, size, stream); break; + case AK_HALF: fill_tensor_device_rand_impl((short*)dio, size, stream); break; + case AK_FLOAT: fill_tensor_device_rand_impl((float*)dio, size, stream); break; + case AK_DOUBLE: fill_tensor_device_rand_impl((double*)dio, size, stream); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; + } } -__global__ -void transform_nchw_2_c4(char* out_data, const float* in_data, - int valid_num, int valid_channel_4, int valid_height, int valid_width, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, - float scale, - int count) { - - int load0, load1, load2, load3; - int gid = threadIdx.x + blockIdx.x * blockDim.x; - - int write_w = (gid) % valid_width; - int write_h = (gid / (out_h_stride)) % valid_height; - int write_c = (gid / (out_c_stride)) % valid_channel_4; - int write_n = (gid / (out_n_stride)) % valid_num; - - int in_offset = write_n * in_n_stride - + write_c * (in_c_stride << 2) - + write_h * in_h_stride - + write_w * in_w_stride; - - int out_offset = write_n * out_n_stride - + 
write_c * out_c_stride - + write_h * out_h_stride - + write_w; - - if (gid < count) { - - char4 write; - - load0 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); - write.x = static_cast(load0); - - in_offset += in_c_stride; - load1 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); - write.y = static_cast(load1); - - in_offset += in_c_stride; - load2 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); - write.z = static_cast(load2); - - in_offset += in_c_stride; - load3 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); - write.w = static_cast(load3); - - ((char4*)out_data)[out_offset] = write; - +template<> +void fill_tensor_rand(Tensor& tensor, float vstart, float vend, typename Tensor::API::stream_t stream) { + long long size = tensor.size(); + void* dio = tensor.mutable_data(); + DataType type = tensor.get_dtype(); + switch (type) { + case AK_UINT8: fill_tensor_device_rand_impl2((unsigned char*)dio, static_cast(vstart), + static_cast(vend), size, stream); break; + case AK_INT8: fill_tensor_device_rand_impl2((char*)dio, static_cast(vstart), static_cast(vend), size, stream); break; + case AK_INT16: fill_tensor_device_rand_impl2((short*)dio, static_cast(vstart), static_cast(vend), size, stream); break; + case AK_UINT16: fill_tensor_device_rand_impl2((unsigned short*)dio, static_cast(vstart), + static_cast(vend), size, stream); break; + case AK_UINT32: fill_tensor_device_rand_impl2((unsigned int*)dio, static_cast(vstart), + static_cast(vend), size, stream); break; + case AK_INT32: fill_tensor_device_rand_impl2((int*)dio, static_cast(vstart), static_cast(vend), size, stream); break; + case AK_HALF: fill_tensor_device_rand_impl2((short*)dio, static_cast(vstart), static_cast(vend), size, stream); break; + case AK_FLOAT: fill_tensor_device_rand_impl2((float*)dio, static_cast(vstart), static_cast(vend), size, stream); break; + case AK_DOUBLE: fill_tensor_device_rand_impl2((double*)dio, static_cast(vstart), static_cast(vend), size, stream); break; + default: 
LOG(FATAL) << "data type: " << type << " is unsupported now"; } } template<> -SaberStatus DataTensorTransformHelper::transform, Tensor >( - Tensor &out_tensor, - const Tensor &in_tensor, Context ctx){ - - const float * in_data = in_tensor.data(); - char * out_data = out_tensor.mutable_data(); - - Shape in_stride = in_tensor.get_stride(); - - Shape in_shape = in_tensor.valid_shape(); - Shape out_shape = out_tensor.valid_shape(); - int count = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]; - - cudaStream_t cuda_stream = ctx.get_compute_stream(); - transform_nchw_2_c4<<>>(out_data, in_data, - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - in_stride[0], in_stride[1], in_stride[2], in_stride[3], - out_shape[1] * out_shape[2] * out_shape[3], - out_shape[2] * out_shape[3], out_shape[3], 1, - (1.f / _in_scale), count); - - return SaberSuccess; +void print_tensor(Tensor& tensor, typename Tensor::API::stream_t stream) { + LOG(INFO) << "device tensor data:" << tensor.size(); + const void* data_ptr = tensor.data(); + long long size = tensor.size(); + int width = tensor.width(); + DataType type = tensor.get_dtype(); + switch(type) { + case AK_UINT8: print_device_data<<<1, 1, + 0, stream>>>((const unsigned char*)data_ptr, size, width); break; + case AK_INT8: print_device_data<<<1, 1, + 0, stream>>>((const char*)data_ptr, size, width); break; + case AK_UINT16: print_device_data<<<1, 1, + 0, stream>>>((const unsigned short*)data_ptr, size, width); break; + case AK_HALF: print_device_data<<<1, 1, + 0, stream>>>((const short*)data_ptr, size, width); break; + case AK_UINT32: print_device_data<<<1, 1, + 0, stream>>>((const unsigned int*)data_ptr, size, width); break; + case AK_INT32: print_device_data<<<1, 1, + 0, stream>>>((const int*)data_ptr, size, width); break; + case AK_FLOAT: print_device_data<<<1, 1, + 0, stream>>>((const float*)data_ptr, size, width); break; + case AK_DOUBLE: print_device_data<<<1, 1, + 0, stream>>>((const double*)data_ptr, size, 
width); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; + } + cudaDeviceSynchronize(); + CUDA_POST_KERNEL_CHECK; + printf("\n"); } -__global__ void transform_nchw_2_nchw(float * out_data, - const float* in_data, const int count, - int in_n, int in_c, int in_h, int in_w, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n, int out_c, int out_h, int out_w, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, - float *scale) { - CUDA_KERNEL_LOOP(tid, count){ - int read_w = tid % in_w; - int read_h = (tid / (in_w)) % in_h; - int read_c = (tid / (in_h * in_w)) % in_c; - int read_n = (tid / (in_c * in_h * in_w)) % in_n; - - int write_w = tid % out_w; - int write_h = (tid / (out_w)) % out_h; - int write_c = (tid / (out_h * out_w)) % out_c; - int write_n = (tid / (out_c * out_h * out_w)) % out_n; - - int in_idx = read_n * in_n_stride - + read_c * in_c_stride - + read_h * in_h_stride - + read_w * in_w_stride; - - int out_idx = write_n * out_n_stride - + write_c * out_c_stride - + write_h * out_h_stride - + write_w * out_w_stride; - - float in_var = in_data[in_idx]; - float in_scale = scale[read_c]; - out_data[out_idx] = in_var * in_scale; +template<> +void print_tensor_valid(Tensor& tensor, typename Tensor::API::stream_t stream) { + LOG(INFO) << "device tensor data:" << tensor.valid_size(); + const void* data_ptr = (const void*)((const char*)tensor.data() + tensor.data_offset() * type_length(tensor.get_dtype())); + long long size = tensor.valid_size(); + int width = tensor.width(); + DataType type = tensor.get_dtype(); + if (tensor.is_continue_mem()) { + switch(type) { + case AK_UINT8: print_device_data<<<1, 1, + 0, stream>>>((const unsigned char*)data_ptr, size, width); break; + case AK_INT8: print_device_data<<<1, 1, + 0, stream>>>((const char*)data_ptr, size, width); break; + case AK_UINT16: print_device_data<<<1, 1, + 0, stream>>>((const unsigned short*)data_ptr, size, width); break; + 
case AK_HALF: print_device_data<<<1, 1, + 0, stream>>>((const short*)data_ptr, size, width); break; + case AK_UINT32: print_device_data<<<1, 1, + 0, stream>>>((const unsigned int*)data_ptr, size, width); break; + case AK_INT32: print_device_data<<<1, 1, + 0, stream>>>((const int*)data_ptr, size, width); break; + case AK_FLOAT: print_device_data<<<1, 1, + 0, stream>>>((const float*)data_ptr, size, width); break; + case AK_DOUBLE: print_device_data<<<1, 1, + 0, stream>>>((const double*)data_ptr, size, width); break; + default: LOG(FATAL) << "data type: " << type << " is unsupported now"; + } + cudaDeviceSynchronize(); + CUDA_POST_KERNEL_CHECK; + printf("\n"); + } else { + Tensor tvalid(tensor.valid_shape()); + tvalid.copy_from(tensor); + print_tensor(tvalid, stream); } } -template<> -SaberStatus DataTensorTransformHelper::transform, Tensor >( - Tensor &out_tensor, - const Tensor &in_tensor, Context ctx){ - - Shape in_shape = in_tensor.valid_shape(); - Shape out_shape = out_tensor.valid_shape(); - - Shape stride_in = in_tensor.get_stride(); - Shape stride_out = out_tensor.get_stride(); - const float *in_data = (const float*)in_tensor.data(); - float *out_data = (float*)out_tensor.mutable_data(); - const int count = in_tensor.valid_size(); - cudaStream_t cuda_stream = ctx.get_compute_stream(); - - transform_nchw_2_nchw - <<>>( - out_data, in_data, count, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - stride_in[0], stride_in[1], stride_in[2], stride_in[3], - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - stride_out[0], stride_out[1], stride_out[2], stride_out[3], - _weight_scale); +template<> +double tensor_mean_value(Tensor& tensor, typename Tensor::API::stream_t stream) { + Tensor tvalid(tensor.valid_shape()); + tvalid.copy_from(tensor); + return tensor_mean_value(tvalid, stream); +} - return SaberSuccess; +template<> +double tensor_mean_value_valid(Tensor& tensor, typename Tensor::API::stream_t stream) { + Tensor tvalid(tensor.valid_shape()); + 
tvalid.copy_from(tensor); + return tensor_mean_value(tvalid, stream); } +#endif } //namespace saber diff --git a/saber/funcs/impl/cuda/base/cuda_c/vender_fc.cu b/saber/funcs/impl/cuda/base/cuda_c/vender_fc.cu deleted file mode 100644 index 40369bd89..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/vender_fc.cu +++ /dev/null @@ -1,126 +0,0 @@ -#include "saber/funcs/impl/cuda/vender_fc.h" - - -namespace anakin{ - -namespace saber{ - -template -void anakin_NV_gemv(cublasHandle_t handle, const bool TransA, \ - const int M, const int N, \ - const dtype alpha, const dtype* A,\ - const dtype* x, const dtype beta,\ - dtype* y); - -template <> -void anakin_NV_gemv(cublasHandle_t handle, const bool TransA, \ - const int M, const int N, const float alpha, const float* A, const float* x, \ - const float beta, float* y) { - cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(handle, cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); - } - -template <> -void anakin_NV_gemv(cublasHandle_t handle, const bool TransA, \ - const int M, const int N, \ - const char alpha, const char* A,\ - const char* x, const char beta,\ - char* y) { - LOG(FATAL) << "int8 gemv is not implemented"; -} - -template -void anakin_NV_gemm(cublasHandle_t handle, const bool TransA, - const bool TransB, const int M, const int N, const int K, - const dtype alpha, const dtype* A, const dtype* B, const dtype beta, - dtype* C); - -template <> -void anakin_NV_gemm(cublasHandle_t handle, const bool TransA, - const bool TransB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - // Note that cublas follows fortran order. - int lda = (!TransA/* == CblasNoTrans*/) ? K : M; - int ldb = (!TransB/* == CblasNoTrans*/) ? N : K; - cublasOperation_t cuTransA = - (!TransA/* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (!TransB/* == CblasNoTrans*/) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(handle, cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -template <> -void anakin_NV_gemm(cublasHandle_t handle, const bool TransA, - const bool TransB, const int M, const int N, const int K, - const char alpha, const char* A, const char* B, const char beta, - char* C) { - LOG(FATAL) << "int8 gemm is not implemented"; -} - -template -__global__ void add_bias(int n, int output_size, const dtype* bias, dtype* dout) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int bias_index = index % output_size; - if (index < n) { - //printf("index: %d, bias_index: %d, val_in: %.2f\n", index, bias_index, bias[bias_index]); - dout[index] = dout[index] + bias[bias_index]; - } -} - -template -SaberStatus VenderFc::dispatch( - const std::vector& inputs, - std::vector& outputs, - FcParam& param) { - - cudaStream_t stream = this->_ctx.get_compute_stream(); - - const InDataType* din = inputs[0]->data(); - OutDataType* dout = outputs[0]->mutable_data(); - const OpDataType* weight = param.weights->data(); - const InDataType* bias = nullptr; - bool bias_term = param.bias != nullptr; - //dim3 grid(CUDA_GET_BLOCKS(param.num_output), _M); - if (bias_term) { - bias = param.bias->data(); - } - - if (_M == 1 && _K > 50000) { - anakin_NV_gemv(_handle, false, _N, _K, (InDataType)1, weight, din, \ - (InDataType)0, dout); - } else { - anakin_NV_gemm(_handle, false, !_flag_trans_weights, \ - _M, _N, _K, (InDataType)1, din, weight, (InDataType)0, dout); - } - if (bias_term) { - int total_size = _M * _N; - add_bias<<>>\ - (total_size, _N, bias, dout); - } - return SaberSuccess; -} - -/* -#define INSTANCE_FC(datatype, layouttype) \ - template SaberStatus CublasFc::dispatch( \ - const std::vector::ioTensor *> inputs, \ - std::vector::ioTensor *> outputs, \ - FcParam::ioTensor> ¶m); - -INSTANCE_FC(AK_FLOAT, NCHW); -INSTANCE_FC(AK_INT8, NCHW); -INSTANCE_FC(AK_FLOAT, NHWC); -INSTANCE_FC(AK_INT8, NHWC); -*/ -} 
//namespace anakin - -} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_funcs.h b/saber/funcs/impl/cuda/base/cuda_funcs.h index 1adeda9ed..3e703d568 100644 --- a/saber/funcs/impl/cuda/base/cuda_funcs.h +++ b/saber/funcs/impl/cuda/base/cuda_funcs.h @@ -1,3 +1,18 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include #include "saber/core/tensor.h" diff --git a/saber/funcs/impl/cuda/base/sass/sm_50/libanakin_saber_sass_sm_50.a b/saber/funcs/impl/cuda/base/sass/sm_50/libanakin_saber_sass_sm_50.a index 939650583..3b9e416a3 100644 Binary files a/saber/funcs/impl/cuda/base/sass/sm_50/libanakin_saber_sass_sm_50.a and b/saber/funcs/impl/cuda/base/sass/sm_50/libanakin_saber_sass_sm_50.a differ diff --git a/saber/funcs/impl/cuda/base/sass/sm_61/libanakin_saber_sass_sm_61.a b/saber/funcs/impl/cuda/base/sass/sm_61/libanakin_saber_sass_sm_61.a index 1775b9165..2d0cb7053 100644 Binary files a/saber/funcs/impl/cuda/base/sass/sm_61/libanakin_saber_sass_sm_61.a and b/saber/funcs/impl/cuda/base/sass/sm_61/libanakin_saber_sass_sm_61.a differ diff --git a/saber/funcs/impl/cuda/base/sass_funcs.h b/saber/funcs/impl/cuda/base/sass_funcs.h index 3374cf6cc..6f03ab691 100644 --- a/saber/funcs/impl/cuda/base/sass_funcs.h +++ b/saber/funcs/impl/cuda/base/sass_funcs.h @@ -336,7 +336,7 @@ void direct_conv_bias_relu_Kdivis4(const DataType* src, template void direct_conv_bias_relu_Kindiv4(const DataType* src, - DataType* 
dst, + DataType* dst, const OpType* weight, const DataType* bias, int img_num, @@ -429,25 +429,23 @@ void direct_conv_bias_relu_maxpool2k2s0p_Kindiv4(const DataType* src, cudaStream_t cuda_stream); template -void scale_to_new_tensor_k4_s2_p1_decov (Tensor &new_weights_dev, - const Tensor *weight, - int in_channel, int out_channel) { - Tensor new_weights_h; - Tensor temp_weights; - new_weights_dev.reshape(weight->valid_shape()); +void scale_to_new_tensor_k4_s2_p1_deconv (Tensor *weight, int in_channel, int out_channel) { + Tensor new_weights_h; + Tensor temp_weights; +// new_weights_dev.reshape(weight->valid_shape()); new_weights_h.reshape(weight->valid_shape()); temp_weights.reshape(weight->valid_shape()); temp_weights.copy_from(*weight); int offset = in_channel * out_channel * k; - float* trans_w = new_weights_h.mutable_data(); + float* trans_w = (float*)new_weights_h.mutable_data(); scale_weight_deconv_w4x4(trans_w + 0 * offset, trans_w + 1 * offset, trans_w + 2 * offset, trans_w + 3 * offset, - temp_weights.data(), + static_cast(temp_weights.data()), in_channel, out_channel); - new_weights_dev.copy_from(new_weights_h); + weight->copy_from(new_weights_h); } void ker_deconv_implicit_gemm_k4_s2_p1_16x64( @@ -487,6 +485,16 @@ void ker_gemm_32x32x32_NN_vec_bias_relu(const int M, const int N, const int K, const float beta, const float* B, float* C, const float* bias, cudaStream_t cuda_stream); +void ker_gemm_32x32x32_NN_bias(const int M, const int N, const int K, + const float alpha, const float* A, + const float beta, const float* B, + float* C, const float* bias, cudaStream_t cuda_stream); + +void ker_gemm_32x32x32_NN_vec_bias(const int M, const int N, const int K, + const float alpha, const float* A, + const float beta, const float* B, + float* C, const float* bias, cudaStream_t cuda_stream); + template void ker_sgemm_nn(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, @@ -548,6 +556,58 @@ std::function +void conv_gemm_k1s1p0(int 
num, int in_stride, int out_stride, + float* out, const float* img, + const float* weights, int out_channel, + int in_channel, int img_h, int img_w, + const float* bias, cudaStream_t cuda_stream, + float a = 1.f, float b = 0.f) { + + float alpha = a; float beta = b; + int m = out_channel; + int k = in_channel; + int n = img_h * img_w; + if (ifVec(m, n, k, k, n, n)) { + if (with_relu) { + for (int i = 0; i < num; ++i) { + ker_gemm_32x32x32_NN_vec_bias_relu(m, n, k, + alpha, weights, + beta, img + i * in_stride, + out + i * out_stride, bias, + cuda_stream); + } + } else { + for (int i = 0; i < num; ++i) { + ker_gemm_32x32x32_NN_vec_bias(m, n, k, + alpha, weights, + beta, img + i * in_stride, + out + i * out_stride, bias, + cuda_stream); + } + } + } else { + if (with_relu) { + for (int i = 0; i < num; ++i) { + ker_gemm_32x32x32_NN_bias_relu(m, n, k, + alpha, weights, + beta, img + i * in_stride, + out + i * out_stride, bias, + cuda_stream); + } + } else { + for (int i = 0; i < num; ++i) { + ker_gemm_32x32x32_NN_bias(m, n, k, + alpha, weights, + beta, img + i * in_stride, + out + i * out_stride, bias, + cuda_stream); + } + } + } +} + } // namespace saber } // namespace anakin diff --git a/saber/funcs/impl/cuda/cuda_inline_activation.h b/saber/funcs/impl/cuda/cuda_inline_activation.h new file mode 100644 index 000000000..34815014c --- /dev/null +++ b/saber/funcs/impl/cuda/cuda_inline_activation.h @@ -0,0 +1,97 @@ + +#ifndef SABER_FUNCS_IMPL_CUDA_BASE_CUDA_C_CUDA_INLINE_ACTIVATION_H +#define SABER_FUNCS_IMPL_CUDA_BASE_CUDA_C_CUDA_INLINE_ACTIVATION_H + +#include "saber_types.h" +#include "cuda.h" + +#define SIGMOID_THRESHOLD_MIN_PADDLE -40.0 +#define SIGMOID_THRESHOLD_MAX_PADDLE 13.0 +#define EXP_MAX_INPUT_PADDLE 40.0 + +namespace anakin { + +namespace saber { + + +template +static inline __device__ Dtype +InValidAct(Dtype + a) { + printf("invalid act\n"); + return static_cast(0); +} + +template +static inline __device__ Dtype +Sigmoid(const Dtype a) { + return 
static_cast(1.0) / (static_cast(1.0) + expf(-a)); +} + + +template +static inline __device__ Dtype +Tanh(const Dtype a) { + Dtype tmp = static_cast(-2.0) * a; + return (static_cast(2.0) / (static_cast(1.0) + expf(tmp))) - static_cast(1.0); +} + +template +static inline __device__ Dtype +Identity(const Dtype a) { + return a; +} + +template +static inline __device__ Dtype +Relu(const Dtype a) { + return a > static_cast(0.0) ? a : static_cast(0.0); +} + +template +static inline __device__ Dtype +Sigmoid_fluid(const Dtype a) { + const Dtype min = SIGMOID_THRESHOLD_MIN_PADDLE; + const Dtype max = SIGMOID_THRESHOLD_MAX_PADDLE; + Dtype tmp = (a < min) ? min : ((a > max) ? max : a); + + return static_cast(1.0) / (static_cast(1.0) + expf(-tmp)); +} + +template +static inline __device__ Dtype +Tanh_fluid(const Dtype a) { + Dtype tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT_PADDLE) ? EXP_MAX_INPUT_PADDLE : tmp; + return (2.0 / (1.0 + expf(tmp))) - 1.0; +} + +static __device__ float (*act_funcs_cu[])(float) = {&InValidAct, &Sigmoid < float >, &Relu < float >, + &Tanh < float >, + &InValidAct, &InValidAct, + &Identity < float >, &Sigmoid_fluid < float >, + &Tanh_fluid < float > + }; + +static inline __device__ float activate_cuda_float(float x, ActiveType type_id) { + return act_funcs_cu[type_id](x); +} + +template +struct ACTIVATION { + typedef Dtype(*Act)(const Dtype); +}; + +template +__device__ inline typename ACTIVATION::Act Activate_inner(ActiveType type) { + static typename ACTIVATION::Act vec[7] = {&InValidAct, &Sigmoid < Dtype >, &Relu < Dtype >, + &Tanh < Dtype >, + &InValidAct, &InValidAct, + &Identity < Dtype > + }; + return vec[type]; +} + +} +} +#endif //SABER_FUNCS_IMPL_CUDA_BASE_CUDA_C_CUDA_INLINE_ACTIVATION_H diff --git a/saber/funcs/impl/cuda/cuda_utils.h b/saber/funcs/impl/cuda/cuda_utils.h new file mode 100644 index 000000000..ddae7c185 --- /dev/null +++ b/saber/funcs/impl/cuda/cuda_utils.h @@ -0,0 +1,190 @@ + +#ifndef 
SABER_FUNCS_IMPL_CUDA_BASE_CUDA_C_CUDA_UTILS_H +#define SABER_FUNCS_IMPL_CUDA_BASE_CUDA_C_CUDA_UTILS_H + +#include +#include +#include "core/common.h" +#include "core/tensor.h" +#include "cuda.h" +#include "saber_util.h" + +namespace anakin { + +namespace saber { + + + +template +extern void trans_map2out_cfunc(const Dtype* input, Dtype* output, int word_size, int seq_sum, + cudaStream_t stream, + int* dev_map_vec); + +template +extern void trans_map2in_cfunc(const Dtype* input, Dtype* output, int hidden_size, int seq_sum, + cudaStream_t stream, + int* dev_map_vec); + +class SeqSortedseqTranseUtil { + +public: + + SeqSortedseqTranseUtil(bool is_reverse = false, bool is_bi = false) + : _is_reverse(is_reverse), + _is_bi(is_bi), + _dev_map_vec(nullptr), + _dev_map_vec_length(0) + + {}; + + ~SeqSortedseqTranseUtil() { + if (_dev_map_vec != nullptr) { + CUDA_CHECK(cudaFree(_dev_map_vec)); + } + }; + + void print_vec(const float* in, int size, const char* perfix) { + for (int i = 0; i < size; i++) { + printf("[%s] %d = %f\n", perfix, i, in[i]); + } + } + void print_vec(const int* in, int size, const char* perfix) { + for (int i = 0; i < size; i++) { + printf("[%s] %d = %d\n", perfix, i, in[i]); + } + } + template + void seq_2_sorted_seq(const Dtype* input, Dtype* output, int word_size, cudaStream_t stream) { + int seq_sum = _map_vec.size(); +// print_vec(_map_vec.data(),_map_vec.size(),"map_vec"); + trans_map2out_cfunc(input, output, word_size, seq_sum, stream, _dev_map_vec); + } + template + void hidden_2_sorted_hidden(const Dtype* input, Dtype* output, int hidden_size) { + // _map_vec.resize(word_sum); + int batch_size = _length_index.size(); + // std::cout << "word_sum = " << word_sum << std::endl; + + for (int ori_word_id = 0; ori_word_id < batch_size; ++ori_word_id) { + //can param + int word_start = ori_word_id * hidden_size; + int maped_id = _length_index[ori_word_id]; + int maped_start = maped_id * hidden_size; + + for (int word_vec_offset = 0; word_vec_offset < 
hidden_size; ++word_vec_offset) { + // std::cout< "< class cudnnTypeWrapper; @@ -82,23 +106,24 @@ class cudnnTypeWrapper { return &v; } }; + template class TensorDescriptors { public: TensorDescriptors( size_t n, - const std::vector& dim, - const std::vector& stride) { + const std::vector>& dim, + const std::vector>& stride) { descs_.resize(n); - CHECK_EQ(dim.size(), stride.size()); + CHECK_EQ(dim.size(), stride.size()); for (auto i = 0; i < n; ++i) { CUDNN_CHECK(cudnnCreateTensorDescriptor(&descs_[i])); CUDNN_CHECK(cudnnSetTensorNdDescriptor( descs_[i], cudnnTypeWrapper::type, - dim.size(), - dim.data(), - stride.data())); + dim[i].size(), + dim[i].data(), + stride[i].data())); } } ~TensorDescriptors() { diff --git a/saber/funcs/impl/cuda/saber_activation.h b/saber/funcs/impl/cuda/saber_activation.h index 1405d6d6e..cb62040be 100644 --- a/saber/funcs/impl/cuda/saber_activation.h +++ b/saber/funcs/impl/cuda/saber_activation.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,54 +22,34 @@ namespace anakin{ namespace saber{ -template -class SaberActivation : \ +template +class SaberActivation : public ImplBase< - Tensor, - Tensor, - Tensor, - ActivationParam > > -{ + NV, OpDtype, + ActivationParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberActivation() - {} - + typedef typename DataTrait::Dtype OpDataType; + SaberActivation() = default; ~SaberActivation() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ActivationParam& param, Context& ctx) { - this->_ctx = ctx; + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ActivationParam& param, Context& ctx) { + this->_ctx = &ctx; return SaberSuccess; } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ActivationParam& param, Context &ctx) { + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ActivationParam& param, Context &ctx) { return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ActivationParam& param); - + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param); }; -//template class SaberActivation; - } } diff --git a/saber/funcs/impl/cuda/saber_argmax.h b/saber/funcs/impl/cuda/saber_argmax.h index 57af6f948..ff05e423e 100644 --- a/saber/funcs/impl/cuda/saber_argmax.h +++ b/saber/funcs/impl/cuda/saber_argmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ limitations under the License. 
*/ -#ifndef ANAKIN_SABER_FUNCS_IMPL_SABER_ARGMAX_H -#define ANAKIN_SABER_FUNCS_IMPL_SABER_ARGMAX_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ARGMAX_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ARGMAX_H #include "saber/funcs/impl/impl_argmax.h" @@ -22,70 +22,52 @@ namespace anakin{ namespace saber{ -template -class SaberArgmax : \ +template +class SaberArgmax : public ImplBase< - Tensor, - Tensor, - Tensor, - ArgmaxParam > > + NV, OpDtype, + ArgmaxParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; - SaberArgmax() - {} + SaberArgmax(){} - ~SaberArgmax() { + ~SaberArgmax() {} + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ArgmaxParam& param, + Context &ctx) { + return create(inputs, outputs, param, ctx);//SaberSuccess; } - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ArgmaxParam& param, - Context &ctx) { - this->_ctx = ctx; + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ArgmaxParam& param, + Context& ctx) { + this->_ctx = &ctx; if (!param.has_axis) { int inner_dim = inputs[0]->count(1, inputs[0]->dims()); int outer_dim = inputs[0]->num(); int block_num = CUDA_GET_BLOCKS(inner_dim); - _block_max_value.re_alloc(Shape(outer_dim, block_num, 1, 1)); - _block_max_index.re_alloc(Shape(outer_dim, block_num, 1, 1)); + _block_max_value.re_alloc(Shape({outer_dim, block_num, 1, 1}, Layout_NCHW), OpDtype); + _block_max_index.re_alloc(Shape({outer_dim, block_num, 1, 1}, Layout_NCHW), OpDtype); } return SaberSuccess; } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ArgmaxParam& param, - Context& ctx) { - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& 
inputs, - std::vector& outputs, - ArgmaxParam& param); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ArgmaxParam& param); private: - Tensor _block_max_value; - Tensor _block_max_index; + Tensor _block_max_value; + Tensor _block_max_index; }; -template class SaberArgmax; - } } -#endif //ANAKIN_SABER_FUNCS_IMPL_SABER_ARGMAX_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_SABER_ARGMAX_H diff --git a/saber/funcs/impl/cuda/saber_axpy.h b/saber/funcs/impl/cuda/saber_axpy.h index a3a1eac28..bb3bb885e 100644 --- a/saber/funcs/impl/cuda/saber_axpy.h +++ b/saber/funcs/impl/cuda/saber_axpy.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,27 +22,14 @@ namespace anakin{ namespace saber{ -template -class SaberAxpy : \ +template +class SaberAxpy : \ public ImplBase< - Tensor, - Tensor, - Tensor, - AxpyParam > > + NV, OpDtype, + AxpyParam> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberAxpy() {} @@ -51,28 +38,28 @@ class SaberAxpy& inputs, - std::vector& outputs, - AxpyParam& param, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + AxpyParam& param, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return SaberSuccess; } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - AxpyParam& param, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + AxpyParam& param, Context& ctx) { return SaberSuccess; } - virtual SaberStatus dispatch(const 
std::vector& inputs, - std::vector& outputs, - AxpyParam& param); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + AxpyParam& param); }; -template class SaberAxpy; +//template class SaberAxpy; } } diff --git a/saber/funcs/impl/cuda/saber_box_coder.h b/saber/funcs/impl/cuda/saber_box_coder.h deleted file mode 100644 index 19c722c07..000000000 --- a/saber/funcs/impl/cuda/saber_box_coder.h +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_CUDA_SABER_BOX_CODER_H -#define ANAKIN_SABER_FUNCS_CUDA_SABER_BOX_CODER_H - -#include "saber/funcs/impl/impl_box_coder.h" - - -namespace anakin{ - -namespace saber{ - -template -class SaberBoxCoder : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - BoxCoderParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberBoxCoder() = default; - ~SaberBoxCoder() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - BoxCoderParam& param, - Context &ctx) { - // get context - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - BoxCoderParam& param, - Context& ctx) { - - //! inputs[0]: prior boxes, dims = 2 {2, boxes * 4(xmin, ymin, xmax, ymax)}, with variance - //! inputs[1]: loc boxes, dims = 2 {N, boxes, } - Shape sh_bbox = inputs[0]->valid_shape(); - Shape sh_conf = inputs[1]->valid_shape(); - - CHECK_EQ(sh_conf[2], sh_bbox[1]) << \ - "Number of bboxes must match the number of scores per class."; - - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ArgmaxParam& param) { - LOG(FATAL) << "not impl yet"; - return SaberUnImplError; - } - - -}; -template class SaberBoxCoder; -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_CUDA_SABER_BOX_CODER_H \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_cast.h b/saber/funcs/impl/cuda/saber_cast.h index ad15fe589..c79530577 100644 --- a/saber/funcs/impl/cuda/saber_cast.h +++ b/saber/funcs/impl/cuda/saber_cast.h @@ -1,16 +1,17 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CAST_H #define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CAST_H @@ -21,27 +22,14 @@ namespace anakin{ namespace saber{ -template -class SaberCast : \ +template < DataType OpDtype> +class SaberCast : \ public ImplBase< - Tensor, - Tensor, - Tensor, - CastParam > > + NV, OpDtype, + CastParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberCast() {} @@ -50,26 +38,41 @@ class SaberCast& inputs, - std::vector& outputs, - CastParam& param, Context& ctx) { - this->_ctx = ctx; - return SaberSuccess; + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + CastParam& param, Context& ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - CastParam& param, Context &ctx) { + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + CastParam& param, Context &ctx) { + _inDtype = param.in_type; + _outDtype = param.out_type; + if(_inDtype != 1 && _inDtype !=5){// AK_FLOAT AK_INT32 + LOG(FATAL) << "Cast not impl other type: " << 
_inDtype; + } + if(_outDtype != 1 && _outDtype !=5){ + LOG(FATAL) << "Cast not impl other type: " << _outDtype; + } + CHECK_EQ(_inDtype, inputs[0]->get_dtype()) << "inputs data type should be same with param.in_type"; + CHECK_EQ(_outDtype, outputs[0]->get_dtype()) << "outputs data type should be same with param.out_type"; + return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - CastParam& param); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + CastParam& param); + + private: + int _inDtype; + int _outDtype; }; -template class SaberCast; -template class SaberCast; +//template class SaberCast; +//template class SaberCast; } diff --git a/saber/funcs/impl/cuda/saber_concat.h b/saber/funcs/impl/cuda/saber_concat.h index 0f9f19456..a774ce096 100644 --- a/saber/funcs/impl/cuda/saber_concat.h +++ b/saber/funcs/impl/cuda/saber_concat.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -21,43 +21,28 @@ namespace anakin{ namespace saber{ -template -class SaberConcat : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConcatParam > > +template +class SaberConcat : + public ImplBase> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberConcat() = default; ~SaberConcat() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConcatParam& param, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam& param, Context &ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConcatParam& param, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam& param, Context& ctx) { _num_concats = inputs[0]->count_valid(0, param.axis); @@ -65,15 +50,14 @@ class SaberConcat& inputs, - std::vector& outputs, - ConcatParam& param); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam& param); private: int _num_concats; int _concat_input_size; }; -template class SaberConcat; } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_conv.cpp b/saber/funcs/impl/cuda/saber_conv.cpp new file mode 100644 index 000000000..d379800d5 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_conv.cpp @@ -0,0 +1,235 @@ + +#include "saber/funcs/impl/cuda/saber_conv.h" +#include "saber/funcs/calibrate.h" +#include "saber_conv.h" + +namespace anakin { +namespace saber { + +template <> +SaberStatus SaberConv2D::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + 
return SaberSuccess; +} +template <> +SaberStatus SaberConv2D::\ + init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + this->_ctx = &ctx; + + if (param.stride_h == 1 && + param.stride_w == 1 && + param.weight()->height() == 3 && + param.weight()->width() == 3 && param.group == 1) { + if (param.activation_param.has_active) { + if (param.activation_param.active == Active_relu) { + dispatch_func = winograd_conv_relu; + } else { + _with_saber_act = true; + dispatch_func = winograd_conv; + } + } else { + dispatch_func = winograd_conv; + } + } else if (param.group == 1) { + const int K = param.weight()->num(); + if (K % 4 == 0) { + if (param.bias()->size() > 0 && !param.activation_param.has_active) { + dispatch_func = direct_conv_bias_Kdivis4; + } else if (param.bias()->valid_size() > 0 && param.activation_param.active == Active_relu) { + dispatch_func = direct_conv_bias_relu_Kdivis4; + } else { + if (param.activation_param.has_active) { + // NOT SUPPORT conv relu fusion + _with_saber_act = true; + } + dispatch_func = direct_conv_Kdivis4; + } + } else { + if (param.bias()->size() > 0 && !param.activation_param.has_active) { + dispatch_func = direct_conv_bias_Kindiv4; + } else if (param.bias()->valid_size() > 0 && param.activation_param.active == Active_relu) { + dispatch_func = direct_conv_bias_relu_Kindiv4; + } else { + if (param.activation_param.has_active) { + // NOT SUPPORT conv relu fusion + _with_saber_act = true; + } + dispatch_func = direct_conv_Kindiv4; + } + } + } else if (param.group == inputs[0]->channel() && param.group == outputs[0]->channel()) { + if (param.activation_param.has_active) { + if (param.activation_param.active == Active_relu) { + if (param.bias()->size() > 0) { + depthwise_func = saber_depthwise_conv_act; + } else { + depthwise_func = saber_depthwise_conv_act; + } + } else { + if (param.bias()->size() > 0) { + depthwise_func = saber_depthwise_conv_act; + } else { + depthwise_func = 
saber_depthwise_conv_act; + } + _with_saber_act = true; + } + } else if (param.bias()->size() > 0) { + depthwise_func = saber_depthwise_conv_act; + } else { + depthwise_func = saber_depthwise_conv_act; + } + } else { + return SaberUnImplError; + } + + _kernel_height = param.weight()->height(); + _kernel_width = param.weight()->width(); + _use_k1s1p0 = true; + _use_k1s1p0 = _use_k1s1p0 && (_kernel_height == 1); + _use_k1s1p0 = _use_k1s1p0 && (_kernel_width == 1); + _use_k1s1p0 = _use_k1s1p0 && (param.pad_h == 0); + _use_k1s1p0 = _use_k1s1p0 && (param.pad_w == 0); + _use_k1s1p0 = _use_k1s1p0 && (param.stride_h == 1); + _use_k1s1p0 = _use_k1s1p0 && (param.stride_w == 1); + _use_k1s1p0 = _use_k1s1p0 && (param.bias()->valid_size()>0); + _use_k1s1p0 = _use_k1s1p0 && (param.activation_param.active == Active_relu); + + if (_use_k1s1p0) { + return SaberSuccess; + } + + if (_with_saber_act) { + _saber_act = new SaberActivation; + _saber_act->init(outputs, outputs, param.activation_param, ctx); + } + if (!_extern_trans) { + conv_trans_weights( + *(param.mutable_weight()), param.stride_h, param.stride_w, param.group, + _in_place, &_weight_dev); + } + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberConv2D::\ + dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + //err code? 
+ Shape shape_in = inputs[0]->valid_shape(); + Shape shape_out = outputs[0]->valid_shape(); + const float* bias_data = nullptr; + int num = inputs[0]->num(); + int chin = inputs[0]->channel(); + int win = inputs[0]->width(); + int hin = inputs[0]->height(); + int chout = outputs[0]->channel(); + int wout = outputs[0]->width(); + int hout = outputs[0]->height(); + int in_stride = chin * win * hin; + int out_stride = chout * wout * hout; + if (param.bias()->size() > 0) { + bias_data = (const float*)param.bias()->data(); + } + if (param.group == inputs[0]->channel() && param.group == outputs[0]->channel()) { + depthwise_func((const float*)inputs[0]->data(), + (float*)outputs[0]->mutable_data(), + inputs[0]->num(), inputs[0]->channel(), + inputs[0]->height(), inputs[0]->width(), outputs[0]->height(), + outputs[0]->width(), _kernel_width, _kernel_height, param.stride_w, + param.stride_h, param.pad_w, param.pad_h, + (const OpDataType*)param.weight()->data(), (const float*)bias_data, + this->_ctx->get_compute_stream()); + } else if (_use_k1s1p0){ + if (param.activation_param.has_active) { + conv_gemm_k1s1p0(num, in_stride, out_stride, + (float*)outputs[0]->mutable_data(), + (const float*)inputs[0]->data(), + (const float*)param.weight()->data(), + chout, chin, hin, win, bias_data, + this->_ctx->get_compute_stream(), 1.f, 0.f); + } else { + conv_gemm_k1s1p0(num, in_stride, out_stride, + (float*)outputs[0]->mutable_data(), + (const float*)inputs[0]->data(), + (const float*)param.weight()->data(), + chout, chin, hin, win, bias_data, + this->_ctx->get_compute_stream(), 1.f, 0.f); + } + return SaberSuccess; + } else { + const float* weight_ptr = nullptr; + if (_in_place) { + weight_ptr = (const float *) param.weight()->data(); + } else { + weight_ptr = (const float *) _weight_dev.data(); + } + dispatch_func((const float *) inputs[0]->data(), + (float *) outputs[0]->mutable_data(), + weight_ptr, + bias_data, + inputs[0]->num(), + inputs[0]->channel(), + inputs[0]->height(), + 
inputs[0]->width(), + outputs[0]->channel(), + outputs[0]->height(), + outputs[0]->width(), + shape_in[1], + shape_in[2], + shape_in[3], + shape_out[1], + shape_out[2], + shape_out[3], + _kernel_height, + _kernel_width, + param.pad_h, + param.pad_w, + param.stride_h, + param.stride_w, + param.dilation_h, + param.dilation_w, + param.group, + param.alpha, + param.beta, + this->_ctx->get_compute_stream()); + } + + if (_with_saber_act) { + _saber_act->dispatch(outputs, outputs, param.activation_param); + } + CUDA_CHECK(cudaGetLastError()); + return SaberSuccess; +} + +template <> +SaberStatus SaberConv2D::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + return SaberSuccess; +} + +template <> +SaberStatus SaberConv2D::\ + init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + return SaberInvalidValue; +} + +template <> +SaberStatus SaberConv2D::\ + dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + + return SaberInvalidValue; +} +DEFINE_OP_TEMPLATE(SaberConv2D, ConvParam, NV, AK_HALF); +} +} diff --git a/saber/funcs/impl/cuda/saber_conv.h b/saber/funcs/impl/cuda/saber_conv.h index 5a5411b1b..6e185e283 100644 --- a/saber/funcs/impl/cuda/saber_conv.h +++ b/saber/funcs/impl/cuda/saber_conv.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV2D_H @@ -19,220 +19,98 @@ #include #include "saber/funcs/impl/impl_conv.h" #include "saber/funcs/impl/cuda/base/sass_funcs.h" +#include "saber/funcs/impl/cuda/saber_activation.h" #include "saber/funcs/funcs_utils.h" namespace anakin{ namespace saber{ -template -class SaberConv2D : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; +template +SaberStatus saber_depthwise_conv_act(const dtype* input, dtype* output, \ + int num, int cin, int hin, int win, int hout, int wout, \ + int kw, int kh, int stride_w, int stride_h, \ + int pad_h, int pad_w, const dtype* weights, const dtype* bias, \ + cudaStream_t stream); - SaberConv2D():_host_work_space(nullptr), _gpu_work_space(nullptr) - {} +template +class SaberConv2D : public ImplBase< + NV, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberConv2D() = default; ~SaberConv2D() { - if (_host_work_space) - { - free(_host_work_space); - } - if (_gpu_work_space) - { - cudaFree(_gpu_work_space); - } + delete _saber_act; } + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx); + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context &ctx) { - this->_ctx = ctx; - //This is an ugly impl for now - if (param.stride_h == 1 && - param.stride_w == 1 && - param.weight()->height() == 3 && - param.weight()->width() == 3 && 
param.group == 1) - { - //Update weights if need - Shape weight_shape = param.weight()->shape(); - Tensor new_weight; - new_weight.re_alloc(weight_shape); - new_weight.copy_from(*(param.weight())); - OpDataType *weight_data = new_weight.mutable_data(); - - int round_in_channel = i_align_up(inputs[0]->channel(),8); - int round_out_channel = i_align_up(param.weight()->num(),32); - - int weight4x4_size = round_in_channel * round_out_channel * 4 * 4; - _host_work_space = (OpDataType*)malloc(weight4x4_size * sizeof(OpDataType)); - CUDA_CHECK(cudaMalloc((void**)&_gpu_work_space, weight4x4_size*sizeof(OpDataType))); - transform_3x3_weight_2_4x4(weight_data, _host_work_space, param.weight()->num(), round_out_channel, inputs[0]->channel(), round_in_channel); - CUDA_CHECK(cudaMemcpy((void*)_gpu_work_space, - (void*)_host_work_space, - weight4x4_size * sizeof(OpDataType), - cudaMemcpyHostToDevice)); - dispatch_func = winograd_conv; - - } - else if (param.group == 1) - { - - int weight_size = (param.weight()->shape()).count(); - Tensor weight_host; - weight_host.re_alloc(param.weight()->shape()); - weight_host.copy_from(*(param.weight())); - const OpDataType *weight_data = weight_host.data(); - - _host_work_space = (OpDataType*)malloc(weight_size * sizeof(OpDataType)); - CUDA_CHECK(cudaMalloc((void**)&_gpu_work_space, weight_size * sizeof(OpDataType))); - - transpose_filter_KCRS_2_CRSK(weight_data, _host_work_space, \ - param.weight()->num(), \ - param.weight()->channel(), \ - param.weight()->height(), \ - param.weight()->width()); - CUDA_CHECK(cudaMemcpy( (void*)_gpu_work_space, \ - (void*)_host_work_space, \ - weight_size * sizeof(OpDataType), \ - cudaMemcpyHostToDevice )); - - const int K = param.weight()->num(); - if (K % 4 == 0) - { - if (param.bias()->size() > 0) - dispatch_func = direct_conv_bias_Kdivis4; - else - dispatch_func = direct_conv_Kdivis4; - } - else - { - if (param.bias()->size() > 0) - dispatch_func = direct_conv_bias_Kindiv4; - else - dispatch_func = 
direct_conv_Kindiv4; - } - } - else - { - return SaberUnImplError; - } - cudaDeviceSynchronize(); - return create(inputs, outputs, param, ctx); - - - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - return SaberSuccess; - } + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param); - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param){ - //err code? - Shape shape_in = inputs[0]->valid_shape(); - Shape shape_out = outputs[0]->valid_shape(); - //LOG(WARNING) << "shape in: " << shape_in[0] << ", " << shape_in[1] << ", " << shape_in[2] << ", " << shape_in[3]; - const OutDataType* bias_data = nullptr; - if (param.bias()->size() > 0) { - bias_data = param.bias()->data(); + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + if (target_weights.valid_size() > 0) { + conv_trans_weights(target_weights, stride_h, stride_w, group, true, nullptr); } - //LOG(WARNING) << "saber conv dispatch"; - //CUDA_CHECK(cudaGetLastError()); - //LOG(WARNING) << "saber conv check previous error"; - //LOG(WARNING) << "width = " << inputs[0]->width() << ", height = " << inputs[0]->height() << ", channel = " << inputs[0]->channel(); - //LOG(WARNING) << "kw = " << param.weight()->width() << ", kh = " << param.weight()->height(); - //LOG(WARNING) << "group = " << param.group << ", filter = " << outputs[0]->channel(); - - dispatch_func(inputs[0]->data(), outputs[0]->mutable_data(), - _gpu_work_space, - bias_data, - inputs[0]->num(), - inputs[0]->channel(), - inputs[0]->height(), - inputs[0]->width(), - outputs[0]->channel(), - outputs[0]->height(), - outputs[0]->width(), - shape_in[1], - shape_in[2], - shape_in[3], - shape_out[1], - shape_out[2], - shape_out[3], - param.weight()->height(), - param.weight()->width(), - param.pad_h, - param.pad_w, - param.stride_h, - param.stride_w, - 
param.dilation_h, - param.dilation_w, - param.group, - param.alpha, - param.beta, - this->_ctx.get_compute_stream()); - - CUDA_CHECK(cudaGetLastError()); + _extern_trans = true; + _in_place = true; return SaberSuccess; } private: - OpDataType* _host_work_space; - OpDataType* _gpu_work_space; - std::function dispatch_func; + bool _with_saber_act{false}; + bool _in_place{false}; + bool _use_k1s1p0{false}; + bool _extern_trans{false}; + Tensor _weight_dev; + SaberActivation *_saber_act{nullptr}; + int _kernel_height; + int _kernel_width; + std::function dispatch_func; + + std::function depthwise_func; }; -template class SaberConv2D; } } diff --git a/saber/funcs/impl/cuda/saber_conv_act.h b/saber/funcs/impl/cuda/saber_conv_act.h deleted file mode 100644 index a34e4449a..000000000 --- a/saber/funcs/impl/cuda/saber_conv_act.h +++ /dev/null @@ -1,400 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV2DACT_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV2DACT_H - -#include -#include "saber/funcs/impl/impl_conv_act.h" -#include "saber/funcs/impl/cuda/base/sass_funcs.h" -#include "saber/funcs/funcs_utils.h" - -namespace anakin{ - -namespace saber{ - -template -SaberStatus saber_depthwise_conv_act(const dtype* input, dtype* output, \ - int num, int cin, int hin, int win, int hout, int wout, \ - int kw, int kh, int stride_w, int stride_h, \ - int pad_h, int pad_w, const dtype* weights, const dtype* bias, \ - cudaStream_t stream); - -template -class SaberConv2DAct : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActiveParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberConv2DAct() - : _host_work_space(nullptr) - , _gpu_work_space(nullptr) - , _use_k1s1p0(false) - {} - - ~SaberConv2DAct() { - if (_host_work_space) - { - free(_host_work_space); - } - if (_gpu_work_space) - { - cudaFree(_gpu_work_space); - } - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx) { - - - return SaberSuccess; - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context &ctx) { - this->_ctx = ctx; - - _use_k1s1p0 = true; - _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.weight()->height() == 1); - _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.weight()->width() == 1); - _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.pad_h == 0); - _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.pad_w == 0); - _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.stride_h == 1); - _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.stride_w == 1); - _use_k1s1p0 = _use_k1s1p0 && (inputs[0]->num() == 1); - 
//This is an ugly impl for now - if (param.conv_param.group == inputs[0]->channel() && \ - param.conv_param.group == outputs[0]->channel()){ - - } else if (param.conv_param.stride_h == 1 && - param.conv_param.stride_w == 1 && - param.conv_param.weight()->height() == 3 && - param.conv_param.weight()->width() == 3 - &¶m.conv_param.group == 1) - { - //Update weights if need - Shape weight_shape = param.conv_param.weight()->shape(); - Tensor new_weight; - new_weight.re_alloc(weight_shape); - new_weight.copy_from(*(param.conv_param.weight())); - OpDataType *weight_data = new_weight.mutable_data(); - - int round_in_channel = i_align_up(inputs[0]->channel(),8); - int round_out_channel = i_align_up(param.conv_param.weight()->num(),32); - int weight4x4_size = round_in_channel * round_out_channel * 4 * 4; - _host_work_space = (OpDataType* )malloc(weight4x4_size * sizeof(OpDataType)); - CUDA_CHECK(cudaMalloc((void**)&_gpu_work_space, weight4x4_size * sizeof(OpDataType))); - transform_3x3_weight_2_4x4(weight_data, _host_work_space, param.conv_param.weight()->num(), round_out_channel, inputs[0]->channel(), round_in_channel); - CUDA_CHECK(cudaMemcpy((void*)_gpu_work_space, - (void*)_host_work_space, - weight4x4_size*sizeof(OpDataType), - cudaMemcpyHostToDevice)); - if (param.has_eltwise) { - dispatch_func_elt = winograd_conv_eltwise; - } else { - dispatch_func = winograd_conv_relu; - } - } - else if(param.conv_param.group == 1) - { - Shape weight_shape = param.conv_param.weight()->shape(); - Tensor new_weight; - new_weight.re_alloc(weight_shape); - new_weight.copy_from(*(param.conv_param.weight())); - OpDataType *weight_data = new_weight.mutable_data(); - - int weight_size = param.conv_param.weight()->shape().count(); - _host_work_space = (OpDataType* )malloc(weight_size * sizeof(OpDataType)); - CUDA_CHECK(cudaMalloc((void**)&_gpu_work_space, weight_size * sizeof(OpDataType))); - - //const OpDataType *weight_data = param.conv_param.weight()->data(); - 
transpose_filter_KCRS_2_CRSK(weight_data, _host_work_space, \ - param.conv_param.weight()->num(), \ - param.conv_param.weight()->channel(), \ - param.conv_param.weight()->height(), \ - param.conv_param.weight()->width()); - CUDA_CHECK(cudaMemcpy( (void*)_gpu_work_space, \ - (void*)_host_work_space, \ - weight_size * sizeof(OpDataType), \ - cudaMemcpyHostToDevice )); - - const int K = param.conv_param.weight()->num(); - if(K % 4 == 0) - { - if (param.conv_param.bias()->size() > 0) - dispatch_func = direct_conv_bias_relu_Kdivis4; - else - return SaberUnImplError; - } - else - { // TODO: would merge the bias(with/without) version - if (param.conv_param.bias()->size() > 0) - dispatch_func = direct_conv_bias_relu_Kindiv4; - else - return SaberUnImplError; - } - } - else{ - return SaberUnImplError; - } - cudaDeviceSynchronize(); - - return create(inputs, outputs, param, ctx); - //return SaberSuccess; - - } - - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param) { - - //err code? 
- Shape shape_in = inputs[0]->valid_shape(); - Shape shape_out = outputs[0]->valid_shape(); - const InDataType* bias_data; - if (param.conv_param.bias()->size() > 0) { - bias_data = param.conv_param.bias()->data(); - }else{ - bias_data = nullptr; - } - - int num = inputs[0]->num(); - int chin = inputs[0]->channel(); - int win = inputs[0]->width(); - int hin = inputs[0]->height(); - int chout = outputs[0]->channel(); - int wout = outputs[0]->width(); - int hout = outputs[0]->height(); - - int kw = param.conv_param.weight()->width(); - int kh = param.conv_param.weight()->height(); - //LOG(INFO) << "saber conv act"; - if (_use_k1s1p0) { -// LOG(INFO)<<"using k1s1p0"; - conv_gemm_k1s1p0(outputs[0]->mutable_data(), - inputs[0]->data(), - param.conv_param.weight()->data(), - chout, chin, hin, win, bias_data, - this->_ctx.get_compute_stream()); - return SaberSuccess; - } - if (param.conv_param.group == chin && param.conv_param.group == chout) { - - if (param.conv_param.bias()->size() > 0) { - if (param.has_active) { - saber_depthwise_conv_act(inputs[0]->data(), \ - outputs[0]->mutable_data(), num, chin, hin, win, hout, \ - wout, kw, kh, param.conv_param.stride_w, \ - param.conv_param.stride_h, param.conv_param.pad_w, param.conv_param.pad_h,\ - (const OpDataType*)param.conv_param.weight()->data(), bias_data, \ - this->_ctx.get_compute_stream()); - } else { - saber_depthwise_conv_act(inputs[0]->data(), \ - outputs[0]->mutable_data(), num, chin, hin, win, hout, \ - wout, kw, kh, param.conv_param.stride_w, \ - param.conv_param.stride_h, param.conv_param.pad_w, param.conv_param.pad_h,\ - (const OpDataType*)param.conv_param.weight()->data(), bias_data, \ - this->_ctx.get_compute_stream()); - } - - } else { - if (param.has_active) { - saber_depthwise_conv_act(inputs[0]->data(), \ - outputs[0]->mutable_data(), inputs[0]->num(), inputs[0]->channel(), \ - inputs[0]->height(), inputs[0]->width(), outputs[0]->height(), \ - outputs[0]->width(), param.conv_param.weight()->width(), \ - 
param.conv_param.weight()->height(), param.conv_param.stride_w, \ - param.conv_param.stride_h, param.conv_param.pad_w, param.conv_param.pad_h,\ - (const OpDataType*)param.conv_param.weight()->data(), bias_data, \ - this->_ctx.get_compute_stream()); - } else { - saber_depthwise_conv_act(inputs[0]->data(), \ - outputs[0]->mutable_data(), inputs[0]->num(), inputs[0]->channel(), \ - inputs[0]->height(), inputs[0]->width(), outputs[0]->height(), \ - outputs[0]->width(), param.conv_param.weight()->width(), \ - param.conv_param.weight()->height(), param.conv_param.stride_w, \ - param.conv_param.stride_h, param.conv_param.pad_w, param.conv_param.pad_h,\ - (const OpDataType*)param.conv_param.weight()->data(), bias_data, \ - this->_ctx.get_compute_stream()); - } - - } - - } else if (param.has_eltwise) { - //std::cout << "In dispatch_func_elt" << std::endl; - dispatch_func_elt(inputs[0]->data(), outputs[0]->mutable_data(), \ - _gpu_work_space, bias_data, num, chin, hin, win, \ - chout, hout, wout, - shape_in[1], - shape_in[2], - shape_in[3], - shape_out[1], - shape_out[2], - shape_out[3], - kh, kw, - param.conv_param.pad_h, - param.conv_param.pad_w, - param.conv_param.stride_h, - param.conv_param.stride_w, - param.conv_param.dilation_h, - param.conv_param.dilation_w, - param.conv_param.group, - param.conv_param.alpha, - param.conv_param.beta, - param.eltwise_param.operation, - this->_ctx.get_compute_stream()); - } else { - dispatch_func(inputs[0]->data(), outputs[0]->mutable_data(), \ - _gpu_work_space, bias_data, num, chin, hin, win, \ - chout, hout, wout, \ - shape_in[1], - shape_in[2], - shape_in[3], - shape_out[1], - shape_out[2], - shape_out[3], - param.conv_param.weight()->height(), - param.conv_param.weight()->width(), - param.conv_param.pad_h, - param.conv_param.pad_w, - param.conv_param.stride_h, - param.conv_param.stride_w, - param.conv_param.dilation_h, - param.conv_param.dilation_w, - param.conv_param.group, - param.conv_param.alpha, - param.conv_param.beta, - 
this->_ctx.get_compute_stream()); - } - - return SaberSuccess; - } - -private: - OpDataType* _host_work_space; - OpDataType* _gpu_work_space; - std::function dispatch_func; - - std::function dispatch_func_elt; - - bool _use_k1s1p0; - void conv_gemm_k1s1p0(float* out, const float* img, - const float* weights, int out_channel, - int in_channel, int img_h, int img_w, - const float* bias, cudaStream_t cuda_stream) { - float alpha = 1.0f; - float beta = 0.0f; - int m = out_channel; - int k = in_channel; - int n = img_h * img_w; - if (ifVec(m, n, k, k, n, n)) { - ker_gemm_32x32x32_NN_vec_bias_relu(m, n, k, - alpha, weights, - beta, img, - out, bias, - cuda_stream); - } else { - ker_gemm_32x32x32_NN_bias_relu(m, n, k, - alpha, weights, - beta, img, - out, bias, - cuda_stream); - } - } - -}; -template class SaberConv2DAct; -} - -} - - -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV2DACT_H diff --git a/saber/funcs/impl/cuda/saber_conv_act_pooling.h b/saber/funcs/impl/cuda/saber_conv_act_pooling.h deleted file mode 100644 index 5f7e75773..000000000 --- a/saber/funcs/impl/cuda/saber_conv_act_pooling.h +++ /dev/null @@ -1,260 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV2D_ACT_POOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV2D_ACT_POOLING_H - -#include -#include "saber/funcs/impl/impl_conv_act_pooling.h" -#include "saber/funcs/impl/cuda/base/sass_funcs.h" -#include "saber/funcs/funcs_utils.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberConv2DActPooling : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActivePoolingParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberConv2DActPooling() - : _host_work_space(nullptr) - , _gpu_work_space(nullptr) - {} - - ~SaberConv2DActPooling() { - if (_host_work_space) - { - free(_host_work_space); - } - if (_gpu_work_space) - { - cudaFree(_gpu_work_space); - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param, Context &ctx) { - this->_ctx = ctx; - if (_host_work_space) - { - free(_host_work_space); - } - if (_gpu_work_space) - { - cudaFree(_gpu_work_space); - } - - if (param.conv_param.stride_h == 1 && - param.conv_param.stride_w == 1 && - param.conv_param.weight()->height() == 3 && - param.conv_param.weight()->width() == 3 && - param.conv_param.group == 1) - { - //Update weights if need - Shape weight_shape = param.conv_param.weight()->shape(); - Tensor new_weight; - new_weight.re_alloc(weight_shape); - new_weight.copy_from(*(param.conv_param.weight())); - OpDataType *weight_data = new_weight.mutable_data(); - - int round_in_channel = i_align_up(inputs[0]->channel(),8); - int round_out_channel = i_align_up(param.conv_param.weight()->num(),32); - int weight4x4_size = round_in_channel * round_out_channel * 4 * 4; - _host_work_space = (OpDataType*)malloc(weight4x4_size * sizeof(OpDataType)); - 
CUDA_CHECK(cudaMalloc((void**)&_gpu_work_space, weight4x4_size*sizeof(OpDataType))); - transform_3x3_weight_2_4x4(weight_data, _host_work_space, param.conv_param.weight()->num(), round_out_channel, inputs[0]->channel(), round_in_channel); - CUDA_CHECK(cudaMemcpy((void*)_gpu_work_space, - (void*)_host_work_space, - weight4x4_size*sizeof(OpDataType), - cudaMemcpyHostToDevice)); - - dispatch_func = winograd_conv_relu_pooling; - - } - else if(param.conv_param.group == 1) - { - //Update weights if need - Shape weight_shape = param.conv_param.weight()->shape(); - Tensor new_weight; - new_weight.re_alloc(weight_shape); - new_weight.copy_from(*(param.conv_param.weight())); - OpDataType *weight_data = new_weight.mutable_data(); - - int weight_size = param.conv_param.weight()->shape().count(); - _host_work_space = (OpDataType*)malloc(weight_size * sizeof(OpDataType)); - CUDA_CHECK(cudaMalloc((void**)&_gpu_work_space, weight_size * sizeof(OpDataType))); - - //const OpDtype *weight_data = param.conv_param.weight()->data(); - transpose_filter_KCRS_2_CRSK(weight_data, _host_work_space, \ - param.conv_param.weight()->num(), \ - param.conv_param.weight()->channel(), \ - param.conv_param.weight()->height(), \ - param.conv_param.weight()->width()); - CUDA_CHECK(cudaMemcpy( (void*)_gpu_work_space, \ - (void*)_host_work_space, \ - weight_size * sizeof(OpDataType), \ - cudaMemcpyHostToDevice )); - - const int K = param.conv_param.weight()->num(); - if (K % 4 == 0) - { - if (param.conv_param.bias()->size() > 0) - dispatch_func = direct_conv_bias_relu_maxpool2k2s0p_Kdivis4; - else - return SaberUnImplError; - } - else - { // TODO: would merge the bias(with/without) version - if (param.conv_param.bias()->size() > 0) - dispatch_func = direct_conv_bias_relu_maxpool2k2s0p_Kindiv4; - else - return SaberUnImplError; - } - } - else - { - return SaberUnImplError; - } - cudaDeviceSynchronize(); - return create(inputs, outputs, param, ctx); - - } - - - virtual SaberStatus create(const std::vector& 
inputs, - std::vector& outputs, - ConvActivePoolingParam& param, Context& ctx) { - - int input_dim = inputs[0]->height(); // P - int kernel_exten = param.conv_param.dilation_h * (param.conv_param.weight()->height() - 1) + 1; - _conv_out_height = (input_dim + 2 * param.conv_param.pad_h - kernel_exten) - / param.conv_param.stride_h + 1; - - input_dim = inputs[0]->width(); // Q - kernel_exten = param.conv_param.dilation_w * (param.conv_param.weight()->width() - 1) + 1; - _conv_out_width = (input_dim + 2 * param.conv_param.pad_w - kernel_exten) - / param.conv_param.stride_w + 1; - - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param) - { - //cudaDeviceSynchronize(); - Shape shape_in = inputs[0]->valid_shape(); - Shape shape_out = outputs[0]->valid_shape(); - const InDataType* bias_data = nullptr; - if (param.conv_param.bias()->size() > 0) { - bias_data = param.conv_param.bias()->data(); - } - - dispatch_func(inputs[0]->data(), outputs[0]->mutable_data(), - _gpu_work_space, - bias_data, - inputs[0]->num(), - inputs[0]->channel(), - inputs[0]->height(), - inputs[0]->width(), - outputs[0]->channel(), - _conv_out_height, - _conv_out_width, - shape_in[1], - shape_in[2], - shape_in[3], - shape_out[1], - shape_out[2], - shape_out[3], - param.conv_param.weight()->height(), - param.conv_param.weight()->width(), - param.conv_param.pad_h, - param.conv_param.pad_w, - param.conv_param.stride_h, - param.conv_param.stride_w, - param.conv_param.dilation_h, - param.conv_param.dilation_w, - param.conv_param.group, - param.conv_param.alpha, - param.conv_param.beta, - this->_ctx.get_compute_stream()); - - CUDA_CHECK(cudaGetLastError()); - - - return SaberSuccess; - } -private: - OpDataType* _host_work_space; - OpDataType* _gpu_work_space; - int _conv_out_height; - int _conv_out_width; - std::function dispatch_func; - -}; -template class SaberConv2DActPooling; - -} - -} -#endif 
//ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_conv_eltwise.cpp b/saber/funcs/impl/cuda/saber_conv_eltwise.cpp new file mode 100644 index 000000000..25b193694 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_conv_eltwise.cpp @@ -0,0 +1,105 @@ + +#include "saber/funcs/impl/cuda/saber_conv.h" +#include "saber/funcs/impl/cuda/saber_conv_eltwise.h" +#include "saber/funcs/impl/cuda/base/sass_funcs.h" +#include "saber/funcs/calibrate.h" +#include "saber_conv_eltwise.h" + +namespace anakin { +namespace saber { + +template <> +SaberStatus SaberConvEltwise::\ + init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + + _ctx = &ctx; + _kernel_height = param.conv_param.weight()->height(); + _kernel_width = param.conv_param.weight()->width(); + + _use_k1s1p0 = true; + _use_k1s1p0 = _use_k1s1p0 && (_kernel_height == 1); + _use_k1s1p0 = _use_k1s1p0 && (_kernel_width == 1); + _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.pad_h == 0); + _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.pad_w == 0); + _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.stride_h == 1); + _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.stride_w == 1); + _use_k1s1p0 = _use_k1s1p0 && (param.conv_param.bias()->valid_size()>0); + _use_k1s1p0 = _use_k1s1p0 && (!param.conv_param.activation_param.has_active); + + if (_use_k1s1p0) { + return SaberSuccess; + } else { + return SaberUnImplError; + } + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberConvEltwise::\ + dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param) { + + const float* bias_data; + if (param.conv_param.bias()->size() > 0) { + bias_data = (const float*)param.conv_param.bias()->data(); + } else { + bias_data = nullptr; + } + Shape shape_in = inputs[0]->valid_shape(); + Shape shape_out = outputs[0]->valid_shape(); + int num = inputs[0]->num(); + int chin = inputs[0]->channel(); + int win = 
inputs[0]->width(); + int hin = inputs[0]->height(); + int chout = outputs[0]->channel(); + int wout = outputs[0]->width(); + int hout = outputs[0]->height(); + int in_stride = chin * win * hin; + int out_stride = chout * wout * hout; + if (_use_k1s1p0) { + if (param.eltwise_param.has_eltwise) { + if (param.eltwise_param.activation_param.has_active) { + conv_gemm_k1s1p0(num, in_stride, out_stride, + (float*)outputs[0]->mutable_data(), + (const float*)inputs[0]->data(), + (const float*)param.conv_param.weight()->data(), + chout, chin, hin, win, bias_data, + this->_ctx->get_compute_stream(),1.f, 1.f); + } else { + conv_gemm_k1s1p0(num, in_stride, out_stride, + (float*)outputs[0]->mutable_data(), + (const float*)inputs[0]->data(), + (const float*)param.conv_param.weight()->data(), + chout, chin, hin, win, bias_data, + this->_ctx->get_compute_stream(),1.f, 1.f); + } + } else { + if (param.conv_param.activation_param.has_active) { + conv_gemm_k1s1p0(num, in_stride, out_stride, + (float*)outputs[0]->mutable_data(), + (const float*)inputs[0]->data(), + (const float*)param.conv_param.weight()->data(), + chout, chin, hin, win, bias_data, + this->_ctx->get_compute_stream(), 1.f, 0.f); + } else { + conv_gemm_k1s1p0(num, in_stride, out_stride, + (float*)outputs[0]->mutable_data(), + (const float*)inputs[0]->data(), + (const float*)param.conv_param.weight()->data(), + chout, chin, hin, win, bias_data, + this->_ctx->get_compute_stream(), 1.f, 0.f); + } + } + return SaberSuccess; + } else { + return SaberUnImplError; + } + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(SaberConvEltwise, ConvEltwiseParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberConvEltwise, ConvEltwiseParam, NV, AK_INT8); +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_conv_eltwise.h b/saber/funcs/impl/cuda/saber_conv_eltwise.h index 240c9763a..9b6a2a66e 100644 --- a/saber/funcs/impl/cuda/saber_conv_eltwise.h +++ b/saber/funcs/impl/cuda/saber_conv_eltwise.h @@ -1,16 +1,16 @@ -/* Copyright 
(c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_ELTWISE_H @@ -19,174 +19,89 @@ #include #include "saber/funcs/impl/impl_conv_eltwise.h" #include "saber/funcs/impl/cuda/base/sass_funcs.h" +#include "saber/funcs/impl/cuda/saber_conv.h" #include "saber/funcs/funcs_utils.h" namespace anakin{ namespace saber{ -template -class SaberConv2DEltWise : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActiveParam > > -{ +template +class SaberConvEltwise : public ImplBase< + NV, OpDtype, ConvEltwiseParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberConv2DEltWise():_host_work_space(nullptr), _gpu_work_space(nullptr) - {} - - ~SaberConv2DEltWise() { - if (_host_work_space) - { - free(_host_work_space); - } - if (_gpu_work_space) - { - cudaFree(_gpu_work_space); - } - } + typedef typename DataTrait::Dtype OpDataType; - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context &ctx) { - this->_ctx = ctx; - //This is an ugly impl for now - if (param.conv_param.stride_h == 1 && - param.conv_param.stride_w == 1 && - param.conv_param.weight()->height() == 
3 && - param.conv_param.weight()->width() == 3) - { - //Update weights if need - Shape weight_shape = param.conv_param.weight()->shape(); - Tensor new_weight; - new_weight.re_alloc(weight_shape); - new_weight.copy_from(*(param.conv_param.weight())); - OpDataType *weight_data = new_weight.mutable_data(); - - int round_in_channel = i_align_up(inputs[0]->channel(),8); - int round_out_channel = i_align_up(param.conv_param.weight()->num(),32); - - int weight4x4_size = round_in_channel * round_out_channel * 4 * 4; - _host_work_space = (OpDataType*)malloc(weight4x4_size * sizeof(OpDataType)); - CUDA_CHECK(cudaMalloc((void**)&_gpu_work_space, weight4x4_size*sizeof(OpDataType))); - transform_3x3_weight_2_4x4(weight_data, _host_work_space, param.conv_param.weight()->num(), round_out_channel, inputs[0]->channel(), round_in_channel); - CUDA_CHECK(cudaMemcpy((void*)_gpu_work_space, - (void*)_host_work_space, - weight4x4_size*sizeof(OpDataType), - cudaMemcpyHostToDevice)); - - dispatch_func = winograd_conv_eltwise; - - }else{ - return SaberUnImplError; - } - cudaDeviceSynchronize(); - return create(inputs, outputs, param, ctx); - } + SaberConvEltwise() = default; + ~SaberConvEltwise() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context &ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx) { return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param){ - //err code? 
- Shape shape_in = inputs[0]->valid_shape(); - Shape shape_out = outputs[0]->valid_shape(); - const InDataType* bias_data = NULL; - if (param.conv_param.bias()->size() > 0) { - bias_data = param.conv_param.bias()->data(); - } - dispatch_func(inputs[0]->data(), outputs[0]->mutable_data(), - _gpu_work_space, - bias_data, - inputs[0]->num(), - inputs[0]->channel(), - inputs[0]->height(), - inputs[0]->width(), - outputs[0]->channel(), - outputs[0]->height(), - outputs[0]->width(), - shape_in[1], - shape_in[2], - shape_in[3], - shape_out[1], - shape_out[2], - shape_out[3], - param.conv_param.weight()->height(), - param.conv_param.weight()->width(), - param.conv_param.pad_h, - param.conv_param.pad_w, - param.conv_param.stride_h, - param.conv_param.stride_w, - param.conv_param.dilation_h, - param.conv_param.dilation_w, - param.conv_param.group, - param.conv_param.alpha, - param.conv_param.beta, - param.eltwise_param.operation, - this->_ctx.get_compute_stream()); - - CUDA_CHECK(cudaGetLastError()); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param); + + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + if (target_weights.valid_size() > 0) { + conv_trans_weights(target_weights, stride_h, stride_w, group, true, nullptr); + } + _extern_trans = true; + _in_place = true; return SaberSuccess; } private: - OpDataType* _host_work_space; - OpDataType* _gpu_work_space; - std::function dispatch_func; + bool _in_place{false}; + Tensor _weight_dev; + int _kernel_height; + int _kernel_width; + bool _use_k1s1p0{false}; + bool _use_k3{false}; + bool _extern_trans{false}; + std::function dispatch_func_elt; }; -template class SaberConv2DEltWise; } } -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_ELTWISE_H +#endif //ANAKIN_SABER_FUNCS_SABER_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_conv_pooling.cpp b/saber/funcs/impl/cuda/saber_conv_pooling.cpp new file mode 100644 index 
000000000..bdadcdb3e --- /dev/null +++ b/saber/funcs/impl/cuda/saber_conv_pooling.cpp @@ -0,0 +1,160 @@ + +#include "saber/funcs/impl/cuda/saber_conv_pooling.h" +#include "saber/funcs/impl/cuda/base/sass_funcs.h" +#include "saber/funcs/calibrate.h" +#include "saber_conv_eltwise.h" + +namespace anakin { +namespace saber { +// FP32 part +template <> +SaberStatus SaberConv2DPooling::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + _ctx = &ctx; + _inner_shape = conv_compute_shape(inputs[0]->valid_shape(), param.conv_param); + if (_use_k3p) { + dispatch_func = winograd_conv_relu_pooling; + } else if (_use_kp) { + const int K = param.conv_param.weight()->num(); + if (K % 4 == 0) { + dispatch_func = direct_conv_bias_relu_maxpool2k2s0p_Kdivis4; + } else { + dispatch_func = direct_conv_bias_relu_maxpool2k2s0p_Kindiv4; + } + } else { + _inner_tensor.reshape(_inner_shape); + _inner_tensor_v.resize(1); + _inner_tensor_v[0] = &_inner_tensor; + + _saber_conv.create(inputs, _inner_tensor_v, param.conv_param, ctx); + _vender_pool.create(_inner_tensor_v, outputs, param.pooling_param, ctx); + } + return SaberSuccess; +} + +template <> +SaberStatus SaberConv2DPooling:: +init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + + _ctx = &ctx; + _inner_shape = conv_compute_shape(inputs[0]->valid_shape(), param.conv_param); + _kernel_height = param.conv_param.weight()->height(); + _kernel_width = param.conv_param.weight()->width(); + +// _use_k3p = true; +// _use_k3p = _use_k3p && (param.conv_param.weight()->height() == 3); +// _use_k3p = _use_k3p && (param.conv_param.weight()->width() == 3); +// _use_k3p = _use_k3p && (param.conv_param.stride_h == 1); +// _use_k3p = _use_k3p && (param.conv_param.stride_w == 1); +// _use_k3p = _use_k3p && (param.conv_param.dilation_h == 1); +// _use_k3p = _use_k3p && (param.conv_param.dilation_w == 1); +// _use_k3p = _use_k3p && 
(param.conv_param.group == 1); +// _use_k3p = _use_k3p && (param.pooling_param.window_h == 2); +// _use_k3p = _use_k3p && (param.pooling_param.window_w == 2); +// _use_k3p = _use_k3p && (param.pooling_param.stride_h == 2); +// _use_k3p = _use_k3p && (param.pooling_param.stride_w == 2); +// _use_k3p = _use_k3p && (param.pooling_param.pad_h == 0); +// _use_k3p = _use_k3p && (param.pooling_param.pad_w == 0); +// _use_k3p = _use_k3p && (param.pooling_param.pooling_type == Pooling_max); + +// _use_kp = true; +// _use_kp = _use_kp && (param.conv_param.group == 1); +// _use_kp = _use_kp && (param.pooling_param.window_h == 2); +// _use_kp = _use_kp && (param.pooling_param.window_w == 2); +// _use_kp = _use_kp && (param.pooling_param.stride_h == 2); +// _use_kp = _use_kp && (param.pooling_param.stride_w == 2); +// _use_kp = _use_kp && (param.pooling_param.pad_h == 0); +// _use_kp = _use_kp && (param.pooling_param.pad_w == 0); +// _use_kp = _use_kp && (param.pooling_param.pooling_type == Pooling_max); +// _use_kp = _use_kp && (param.conv_param.bias()->valid_size() > 0); + if (_use_k3p || _use_kp) { + if (!_extern_trans) { + conv_trans_weights(*(param.conv_param.mutable_weight()), + param.conv_param.stride_h, + param.conv_param.stride_w, + param.conv_param.group, + _in_place, &_weight_dev); + } + } + if (_use_k3p) { + dispatch_func = winograd_conv_relu_pooling; + } else if (_use_kp) { + const int K = param.conv_param.weight()->num(); + if (K % 4 == 0) { + dispatch_func = direct_conv_bias_relu_maxpool2k2s0p_Kdivis4; + } else { + dispatch_func = direct_conv_bias_relu_maxpool2k2s0p_Kindiv4; + } + } else { + _inner_tensor.re_alloc(_inner_shape, AK_FLOAT); + _inner_tensor_v.resize(1); + _inner_tensor_v[0] = &_inner_tensor; + _saber_conv.init(inputs, _inner_tensor_v, param.conv_param, ctx); + _vender_pool.init(_inner_tensor_v, outputs, param.pooling_param, ctx); + } + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberConv2DPooling::dispatch( + const 
std::vector*>& inputs, + std::vector*>& outputs, + ConvPoolingParam& param) { + + Shape shape_in = inputs[0]->valid_shape(); + Shape shape_out = outputs[0]->valid_shape(); + const float* bias_data = nullptr; + const float* weight_data = nullptr; + if (param.conv_param.bias()->size() > 0) { + bias_data = (const float*)param.conv_param.bias()->data(); + } + if (!_in_place) { + weight_data = (const float*)_weight_dev.data(); + } else { + weight_data = (const float*)param.conv_param.weight()->data(); + } + if (_use_k3p || _use_kp) { + dispatch_func((const float*)inputs[0]->data(), (float*)outputs[0]->mutable_data(), + weight_data, + bias_data, + inputs[0]->num(), + inputs[0]->channel(), + inputs[0]->height(), + inputs[0]->width(), + outputs[0]->channel(), + _inner_shape.height(), + _inner_shape.width(), + shape_in[1], + shape_in[2], + shape_in[3], + shape_out[1], + shape_out[2], + shape_out[3], + _kernel_height, + _kernel_width, + param.conv_param.pad_h, + param.conv_param.pad_w, + param.conv_param.stride_h, + param.conv_param.stride_w, + param.conv_param.dilation_h, + param.conv_param.dilation_w, + param.conv_param.group, + param.conv_param.alpha, + param.conv_param.beta, + this->_ctx->get_compute_stream()); + } else { + _saber_conv.dispatch(inputs, _inner_tensor_v, param.conv_param); + _vender_pool.dispatch(_inner_tensor_v, outputs, param.pooling_param); + } + return SaberSuccess; +} + +template class SaberConv2DPooling; +DEFINE_OP_TEMPLATE(SaberConv2DPooling, ConvPoolingParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberConv2DPooling, ConvPoolingParam, NV, AK_INT8); +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_conv_pooling.h b/saber/funcs/impl/cuda/saber_conv_pooling.h new file mode 100644 index 000000000..c4ac53bd0 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_conv_pooling.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_POOLING_H + +#include +#include "saber/funcs/impl/impl_conv_pooling.h" +#include "saber/funcs/impl/cuda/saber_conv.h" +#include "saber/funcs/impl/cuda/vender_conv.h" +#include "saber/funcs/impl/cuda/vender_pooling.h" +#include "saber/funcs/impl/cuda/base/sass_funcs.h" +#include "saber/funcs/impl/cuda/saber_conv.h" +#include "saber/funcs/funcs_utils.h" + +namespace anakin { + +namespace saber { + +template +class SaberConv2DPooling : public ImplBase< + NV, OpDtype, ConvPoolingParam> { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberConv2DPooling() = default; + ~SaberConv2DPooling() = default; + + virtual SaberStatus init(const std::vector *> &inputs, + std::vector *> &outputs, + ConvPoolingParam ¶m, Context + &ctx); + + virtual SaberStatus create(const std::vector *> &inputs, + std::vector *> &outputs, + ConvPoolingParam ¶m, Context + &ctx); + + virtual SaberStatus dispatch(const std::vector *> &inputs, + std::vector *> &outputs, + ConvPoolingParam ¶m); + + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + if (target_weights.valid_size() > 0) { + conv_trans_weights(target_weights, stride_h, stride_w, group, true, nullptr); + } + _extern_trans = true; + _in_place = true; + return SaberSuccess; + } +private: + bool _use_k3p{false}; + bool _use_kp{false}; + bool 
_in_place{false}; + bool _extern_trans{false}; + Tensor _weight_dev; + VenderPooling _vender_pool; + SaberConv2D _saber_conv; + Shape _inner_shape; + Tensor _inner_tensor; + std::vector *> _inner_tensor_v; + int _kernel_height; + int _kernel_width; + + std::function dispatch_func; +}; +} + +} + + +#endif //ANAKIN_SABER_FUNCS_SABER_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_conv_upadding_padding.h b/saber/funcs/impl/cuda/saber_conv_upadding_padding.h new file mode 100644 index 000000000..e35dbd7cb --- /dev/null +++ b/saber/funcs/impl/cuda/saber_conv_upadding_padding.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_UPADDING_PADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_UPADDING_PADDING_H + +#include "saber/funcs/impl/impl_conv_unpadding_padding.h" +#include "saber/funcs/saber_util.h" +namespace anakin { + +namespace saber { + +template +class SaberConvUnpaddingPadding : \ + public ImplBase < + NV, OpDtype, + ConvUnpaddingPaddingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberConvUnpaddingPadding() + {} + + ~SaberConvUnpaddingPadding() { + + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvUnpaddingPaddingParam& param, + Context& ctx) { + this->_ctx = &ctx; + _width_offset_tensor.set_dtype(AK_INT32); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvUnpaddingPaddingParam& param, + Context& ctx) { + this->_ctx = &ctx; + std::vector width_vector=inputs[0]->get_seq_offset()[0]; + utils::try_expand_tensor(_width_offset_tensor,width_vector.size()); + CUDA_CHECK(cudaMemcpyAsync(_width_offset_tensor.mutable_data(),width_vector.data(), sizeof(int)*width_vector.size(),cudaMemcpyHostToDevice,this->_ctx->get_compute_stream())); + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvUnpaddingPaddingParam& param); + +private: + Tensor _width_offset_tensor; + +}; + + +} + +} + +#endif //ANAKIN_SABER_FUNCS_SABER_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_crf_decoding.h b/saber/funcs/impl/cuda/saber_crf_decoding.h new file mode 100644 index 000000000..dea0baefb --- /dev/null +++ b/saber/funcs/impl/cuda/saber_crf_decoding.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CRFDECODING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CRFDECODING_H + +#include "saber/funcs/impl/impl_crf_decoding.h" +#include "saber/saber_funcs_param.h" + +namespace anakin{ +namespace saber { + +template +class SaberCrfDecoding : public ImplBase< + NV, OpDtype, + CrfDecodingParam > +{ + public: + typedef typename DataTrait< NV, OpDtype>::Dtype OpDataType; + + SaberCrfDecoding() = default; + + ~SaberCrfDecoding() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam< NV> ¶m, + Context< NV> &ctx){ + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam< NV> ¶m, + Context< NV> &ctx){ + CHECK_EQ(inputs[0]->get_dtype(), OpDtype) << "inputs data type should be same with OpDtype"; + CHECK_EQ(outputs[0]->get_dtype(), OpDtype) << "outputs data type should be same with OpDtype"; + + _track.re_alloc(inputs[0]->valid_shape(), AK_INT32); + _alpha.re_alloc(inputs[0]->valid_shape(), OpDtype); + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam< NV> ¶m) override; +private: + Tensor< NV> _alpha; + Tensor< NV> _track; + Tensor< NV> _seq; + int _aligned_tag_num; +}; +} +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CRFDECODING_H \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_crop.h b/saber/funcs/impl/cuda/saber_crop.h index 04e7db21b..25f7ad7c7 
100644 --- a/saber/funcs/impl/cuda/saber_crop.h +++ b/saber/funcs/impl/cuda/saber_crop.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,27 +22,14 @@ namespace anakin{ namespace saber{ -template -class SaberCrop : \ +template +class SaberCrop : \ public ImplBase< - Tensor, - Tensor, - Tensor, - CropParam > > + NV, OpDtype, + CropParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberCrop() {} @@ -51,17 +38,17 @@ class SaberCrop& inputs, - std::vector& outputs, - CropParam& param, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + CropParam& param, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - CropParam& param, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + CropParam& param, Context& ctx) { Shape in_stride = inputs[0]->get_stride(); int in_n_index = inputs[0]->num_index(); @@ -101,9 +88,9 @@ class SaberCrop& inputs, - std::vector& outputs, - CropParam& param); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + CropParam& param); private: int _img_offset; @@ -117,7 +104,7 @@ class SaberCrop; +//template class SaberCrop; } } diff --git a/saber/funcs/impl/cuda/saber_ctc_align.h b/saber/funcs/impl/cuda/saber_ctc_align.h deleted file mode 100644 index d22b60ee1..000000000 --- a/saber/funcs/impl/cuda/saber_ctc_align.h +++ /dev/null @@ -1,88 +0,0 
@@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CTC_ALIGN_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CTC_ALIGN_H - -#include "saber/funcs/impl/impl_ctc_align.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberCtcAlign : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - CtcAlignParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberCtcAlign() - {} - - ~SaberCtcAlign() { - - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - CtcAlignParam& param, - Context &ctx) { - this->_ctx = ctx; - Shape offset_shape = {inputs[0]->num(), 1, 1, 1}; - _in_offset.re_alloc(offset_shape); - _out_offset.re_alloc(offset_shape); - return SaberSuccess; - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - CtcAlignParam& param, - Context& ctx) { - Shape offset_shape = {inputs[0]->get_seq_offset().size(), 1, 1, 1}; - _in_offset.reshape(offset_shape); - _out_offset.reshape(offset_shape); - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - CtcAlignParam& param); - -private: - Tensor _in_offset; - Tensor _out_offset; -}; - -template 
class SaberCtcAlign; -} - -} - -#endif //ANAKIN_SABER_FUNCS_SABER_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_deconv.h b/saber/funcs/impl/cuda/saber_deconv.h index e44443e5c..d9488017d 100644 --- a/saber/funcs/impl/cuda/saber_deconv.h +++ b/saber/funcs/impl/cuda/saber_deconv.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_DECONV_H @@ -23,70 +23,44 @@ namespace anakin{ namespace saber{ -template -class SaberDeconv2D : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvParam > > -{ +template +class SaberDeconv2D : + public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - SaberDeconv2D() :_use_k4_s2_p1(false) {} + SaberDeconv2D() = default; - ~SaberDeconv2D() {} + ~SaberDeconv2D() = default; - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); - this->_ctx = ctx; + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx); 
- return create(inputs, outputs, param, ctx); - } + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param); - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context &ctx) { - _use_k4_s2_p1 = true; - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.weight()->width()==4); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.weight()->height()==4); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.stride_h==2); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.stride_w==2); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.pad_h==1); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.pad_w==1); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.group==1); - if (_use_k4_s2_p1) { - int in_channel = inputs[0]->channel(); - int out_channel = outputs[0]->channel(); - scale_to_new_tensor_k4_s2_p1_decov<4>(new_weights_dev, - param.weight(), in_channel, out_channel); -// LOG(INFO)<<"scale weights finished!!"; - } - return SaberSuccess; + SaberStatus trans_weights(Tensor &target_weights, + Tensor &target_bias, + int in_channel, int out_channel, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int group) { + return SaberUnImplError; } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param); private: - bool _use_k4_s2_p1; - OpTensor new_weights_dev; + bool _use_k4_s2_p1{false}; + std::function _gemm_wx; + Tensor _workspace_tensor; }; -template class SaberDeconv2D; + } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_deconv_act.h b/saber/funcs/impl/cuda/saber_deconv_act.h deleted file mode 100644 index d6cb71f56..000000000 --- a/saber/funcs/impl/cuda/saber_deconv_act.h +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_DECONV_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_DECONV_ACT_H - -#include "saber/funcs/impl/impl_deconv_act.h" -#include "saber/funcs/impl/cuda/base/sass_funcs.h" - -namespace anakin { - -namespace saber { - -template -class SaberDeconv2DAct : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActiveParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberDeconv2DAct() : _use_k4_s2_p1(false) {} - - ~SaberDeconv2DAct() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx) { - - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context &ctx) { - _use_k4_s2_p1 = true; - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.conv_param.weight()->width()==4); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.conv_param.weight()->height()==4); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.conv_param.stride_h==2); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.conv_param.stride_w==2); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.conv_param.pad_h==1); - _use_k4_s2_p1 = _use_k4_s2_p1 && (param.conv_param.pad_w==1); - if (_use_k4_s2_p1) { - int in_channel = inputs[0]->channel(); - int out_channel = outputs[0]->channel(); - 
scale_to_new_tensor_k4_s2_p1_decov<4>(new_weights_dev, - param.conv_param.weight(), - in_channel, out_channel); -// LOG(INFO)<<"scale weights finished!!"; - } - //update_weights(param); - - return SaberSuccess; - - } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param); -private: - bool _use_k4_s2_p1; - OpTensor new_weights_dev; -}; -template class SaberDeconv2DAct; -} // namespace saber - -} // namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_DECONV_ACT_H diff --git a/saber/funcs/impl/cuda/saber_deformable_conv.h b/saber/funcs/impl/cuda/saber_deformable_conv.h deleted file mode 100644 index c8888464c..000000000 --- a/saber/funcs/impl/cuda/saber_deformable_conv.h +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_DEFORMABLE_CONV_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_DEFORMABLE_CONV_H - -#include "saber/funcs/impl/impl_deformable_conv.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberDeformableConv2D: \ - public ImplBase< - Tensor, \ - Tensor, \ - Tensor, \ - DeformableConvParam>> { - -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberDeformableConv2D() - : _handle(NULL) - , _conv_out_spatial_dim(0) - , _kernel_dim(0) - , _bottom_dim(0) - , _offset_dim(0) - , _col_offset(0) - , _output_offset(0) - , _kernel_offset(0) - {} - - ~SaberDeformableConv2D() { - if (_handle != NULL) { - CUBLAS_CHECK(cublasDestroy(_handle)); - } - } - - /** - * [Create description] Init all cudnn resource here - * @AuthorHTL - * @DateTime 2018-02-01T16:13:06+0800 - * @param inputs [description] - * @param outputs [description] - * @param conv_param [conv parameters] - */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - DeformableConvParam& param, Context& ctx) { - - // ---- init cudnn resources ---- - this->_ctx = ctx; - CUBLAS_CHECK(cublasCreate(&_handle)); - CUBLAS_CHECK(cublasSetStream(_handle, this->_ctx.get_compute_stream())); - - _kernel_dim = param.weight()->channel() - * param.weight()->height() - * param.weight()->width(); - - _bottom_dim = inputs[0]->channel() - * inputs[0]->height() - * inputs[0]->width(); - - _offset_dim = inputs[1]->channel() - * inputs[1]->height() - * inputs[1]->width(); - - Shape deform_col_buffer_shape = {1, _kernel_dim, outputs[0]->height(), outputs[0]->width()}; - _deform_col_buffer.re_alloc(deform_col_buffer_shape); - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - 
DeformableConvParam& param, Context& ctx) { - - if (!(ctx == this->_ctx)) { - this->_ctx = ctx; - if (_handle != NULL) { - CUBLAS_CHECK(cublasDestroy(_handle)); - } - CUBLAS_CHECK(cublasCreate(&_handle)); - CUBLAS_CHECK(cublasSetStream(_handle, this->_ctx.get_compute_stream())); - } - - int in_channel = inputs[0]->channel(); - int conv_out_channel = outputs[0]->channel(); - _conv_out_spatial_dim = outputs[0]->height() * outputs[0]->width(); - - _kernel_dim = param.weight()->channel() - * param.weight()->height() - * param.weight()->width(); - - _bottom_dim = inputs[0]->channel() - * inputs[0]->height() - * inputs[0]->width(); - - _offset_dim = inputs[1]->channel() - * inputs[1]->height() - * inputs[1]->width(); - - _col_offset = _kernel_dim * _conv_out_spatial_dim; - _output_offset = conv_out_channel * _conv_out_spatial_dim; - _kernel_offset = _kernel_dim * conv_out_channel; - - if ((outputs[0]->height() != _deform_col_buffer.height()) - || (outputs[0]->width() != _deform_col_buffer.width())) { - - Shape deform_col_buffer_shape = {1, _kernel_dim, outputs[0]->height(), outputs[0]->width()}; - _deform_col_buffer.reshape(deform_col_buffer_shape); - } - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - DeformableConvParam& param); - -private: - DataTensor_in _deform_col_buffer; - cublasHandle_t _handle; - - int _conv_out_spatial_dim; - int _kernel_dim; - int _bottom_dim; - int _offset_dim; - int _col_offset; - int _output_offset; - int _kernel_offset; -}; - - -} - -} -#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_detection_output.h b/saber/funcs/impl/cuda/saber_detection_output.h deleted file mode 100644 index 350a01716..000000000 --- a/saber/funcs/impl/cuda/saber_detection_output.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_CUDA_SABER_DETECTION_OUTPUT_H -#define ANAKIN_SABER_FUNCS_CUDA_SABER_DETECTION_OUTPUT_H - - -#include "saber/funcs/impl/impl_detection_output.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberDetectionOutput : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - DetectionOutputParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberDetectionOutput() = default; - ~SaberDetectionOutput() { - if (_bbox_cpu_data) { - fast_free(_bbox_cpu_data); - } - if (_conf_cpu_data) { - fast_free(_conf_cpu_data); - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - DetectionOutputParam& param, Context& ctx) { - // get context - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - DetectionOutputParam& param, Context &ctx) { - - //! inputs[0]: location map, dims = 4 {N, boxes * 4, 1, 1} - //! inputs[1]: confidence map, dims = 4 {N, classes * boxes, 1, 1} - //! 
inputs[2]: prior boxes, dims = 4 {1, 1, 2, boxes * 4(xmin, ymin, xmax, ymax)} - Shape sh_loc = inputs[0]->valid_shape(); - Shape sh_conf = inputs[1]->valid_shape(); - Shape sh_box = inputs[2]->valid_shape(); - //! shape {1, 1, 2, boxes * 4(xmin, ymin, xmax, ymax)}, boxes = size / 2 / 4 - //! layout must be 4 dims, the priors is in the last dim - _num_priors = sh_box[3] / 4; - int num = inputs[0]->num(); - if (param.class_num == 0) { - _num_classes = inputs[1]->valid_size() / (num * _num_priors); - } else { - _num_classes = param.class_num; - } - if (param.share_location) { - _num_loc_classes = 1; - } else { - _num_loc_classes = _num_classes; - _bbox_permute.reshape(sh_loc); - } - - _bbox_preds.reshape(sh_loc); - _conf_permute.reshape(sh_conf); - - CHECK_EQ(_num_priors * _num_loc_classes * 4, sh_loc[1]) << \ - "Number of priors must match number of location predictions."; - CHECK_EQ(_num_priors * _num_classes, sh_conf[1]) << \ - "Number of priors must match number of confidence predictions."; - - if (_conf_cpu_data != nullptr) { - fast_free(_conf_cpu_data); - } - if (_bbox_cpu_data != nullptr) { - fast_free(_bbox_cpu_data); - } - _conf_cpu_data = (InDataType*)fast_malloc(sizeof(InDataType) * sh_conf.count()); - _bbox_cpu_data = (InDataType*)fast_malloc(sizeof(InDataType) * sh_loc.count()); - - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - DetectionOutputParam& param); - - -private: - int _num_classes; - int _num_loc_classes; - int _num_priors; - DataTensor_in _bbox_preds; - DataTensor_in _bbox_permute; - DataTensor_in _conf_permute; - InDataType* _bbox_cpu_data{nullptr}; - InDataType* _conf_cpu_data{nullptr}; -}; -template class SaberDetectionOutput; -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_CUDA_SABER_DETECTION_OUTPUT_H diff --git a/saber/funcs/impl/cuda/saber_eltwise.h b/saber/funcs/impl/cuda/saber_eltwise.h index 8c63676b8..4e302192e 100644 --- 
a/saber/funcs/impl/cuda/saber_eltwise.h +++ b/saber/funcs/impl/cuda/saber_eltwise.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,68 +17,76 @@ #define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ELTWISE_H #include "saber/funcs/impl/impl_eltwise.h" - +#include "saber/funcs/impl/cuda/saber_activation.h" namespace anakin { namespace saber { -template -class SaberEltwise:\ -public ImplBase< - Tensor, - Tensor, - Tensor, - EltwiseParam>> { +template +class SaberEltwise: + public ImplBase < + NV, OpDtype, + EltwiseParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; SaberEltwise() {} ~SaberEltwise() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - EltwiseParam ¶m, - Context &ctx) { + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + EltwiseParam& param, + Context& ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; + CHECK_GE(outputs.size(), 1) << "outputs size has to == 1"; + CHECK_GE(inputs.size(), 2) << "input size has to >= 2"; + CHECK(!(inputs.size() > 2 + && param.operation == Eltwise_sum)) << + "not support input size>2 and operation==Eltwise_sum, size = " << inputs.size() << ",activation = " + << param.operation; + _with_relu = param.has_eltwise && param.activation_param.active == Active_relu; + _other_activation = param.has_eltwise && param.activation_param.active != Active_relu + && param.activation_param.active != Active_unknow; + + if (_other_activation) { + SABER_CHECK(_saber_activation.init(inputs, outputs, 
param.activation_param, ctx)); + } + return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - EltwiseParam ¶m, - Context &ctx) { - this->_ctx = ctx; - if ((param.operation == Eltwise_max) && (outputs.size() == 1)) { - _max_idx.reshape(inputs[0]->shape()); + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + EltwiseParam& param, + Context& ctx) { + this->_ctx = &ctx; + + if (_other_activation) { + SABER_CHECK(_saber_activation.create(inputs, outputs, param.activation_param, ctx)); } + return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - EltwiseParam ¶m) override; + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + EltwiseParam& param) override; private: - OpTensor _max_idx; + // Tensor _max_idx; + bool _with_relu; + bool _other_activation; + SaberActivation _saber_activation; + }; -template class SaberEltwise; + } //namespace saber } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ELTWISE_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ELTWISE_H diff --git a/saber/funcs/impl/cuda/saber_eltwise_act.h b/saber/funcs/impl/cuda/saber_eltwise_act.h deleted file mode 100644 index 57e8ff20b..000000000 --- a/saber/funcs/impl/cuda/saber_eltwise_act.h +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ELTWISE_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ELTWISE_ACT_H - -#include "saber/funcs/impl/impl_eltwise_act.h" - -namespace anakin { - -namespace saber { - -template -class SaberEltwiseActive:\ - public ImplBase< - Tensor,\ - Tensor, - Tensor, - EltwiseActiveParam>> { -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberEltwiseActive() {} - - ~SaberEltwiseActive() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m, - Context &ctx) { - // get context - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m, - Context &ctx) { - - this->_ctx = ctx; - EltwiseParam &elt_param = param.eltwise_param; - if ((elt_param.operation == Eltwise_max) && (outputs.size() == 1)) { - _max_idx.reshape(inputs[0]->shape()); - } - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m) override; - -private: - OpTensor _max_idx; -}; - -template class SaberEltwiseActive; - -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ELTWISE_ACT_H \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_embedding.h b/saber/funcs/impl/cuda/saber_embedding.h index 98926213c..81bbbe45b 100644 --- a/saber/funcs/impl/cuda/saber_embedding.h +++ b/saber/funcs/impl/cuda/saber_embedding.h @@ -1,9 +1,7 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software @@ -22,56 +20,36 @@ namespace anakin{ namespace saber{ -template -class SaberEmbedding : \ +template +class SaberEmbedding : public ImplBase< - Tensor, - Tensor, - Tensor, - EmbeddingParam > > -{ + NV, OpDtype, + EmbeddingParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberEmbedding() - {} - + typedef typename DataTrait::Dtype OpDataType; + SaberEmbedding() {} ~SaberEmbedding() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - EmbeddingParam& param, Context& ctx) { - this->_ctx = ctx; - + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + EmbeddingParam& param, Context& ctx) { + this->_ctx = &ctx; return SaberSuccess; } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - EmbeddingParam& param, Context &ctx) { - this->_ctx = ctx; + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + EmbeddingParam& param, Context &ctx) { return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - EmbeddingParam& param); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam& param); }; -//template class SaberEmbedding; } } -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_EMBEDDING_H +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_EMBEDDING_H \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_fc.h b/saber/funcs/impl/cuda/saber_fc.h index 038a7d2e7..1a8214eb4 
100644 --- a/saber/funcs/impl/cuda/saber_fc.h +++ b/saber/funcs/impl/cuda/saber_fc.h @@ -1,9 +1,7 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software @@ -23,44 +21,29 @@ namespace anakin{ namespace saber{ -template -class SaberFc: \ - public ImplBase< - Tensor, \ - Tensor, \ - Tensor, \ - FcParam>> { +template +class SaberFc: public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberFc() = default; ~SaberFc() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - FcParam& param, Context& ctx){ + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, Context& ctx){ // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - FcParam& param, Context& ctx){ + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, Context& ctx){ - if (!(ctx == this->_ctx)) { - this->_ctx = ctx; + if (!(&ctx == this->_ctx)) { + this->_ctx = &ctx; } Shape shape_out = inputs[0]->valid_shape(); @@ -78,9 +61,9 @@ class SaberFc& inputs, - std::vector& outputs, - FcParam& param); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param); private: @@ -94,9 +77,8 @@ class SaberFc _kernel; }; 
-template class SaberFc; } //namespace saber } //namespace anakin -#endif +#endif \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_gemm.cpp b/saber/funcs/impl/cuda/saber_gemm.cpp new file mode 100644 index 000000000..a510866a1 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_gemm.cpp @@ -0,0 +1,45 @@ + +#include "saber/funcs/impl/cuda/saber_gemm.h" + +namespace anakin { +namespace saber { + +template <> +SaberStatus Gemm::init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx) { + + if (!(ctx == this->_ctx)) { + this->_ctx = ctx; + } + _m = m; + _n = n; + _k = k; + _kernel =saber_find_fast_sass_gemm(trans_a, trans_b, _m, _n, _k); + return SaberSuccess; +} +template <> +SaberStatus Gemm::dispatch(const float alpha, const float beta, + const float* ptr_a, const float* ptr_b, float* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_b != nullptr); + CHECK(ptr_c != nullptr); + cudaStream_t cuda_stream = _ctx.get_compute_stream(); + _kernel(_m, _n, _k, alpha, ptr_a, beta, ptr_b, ptr_c, cuda_stream); + return SaberSuccess; +} +template <> +SaberStatus Gemm::init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx) { + return SaberUnImplError; +} +template <> +SaberStatus Gemm::dispatch(const float alpha, const float beta, + const char* ptr_a, const char* ptr_b, float* ptr_c) { + + return SaberUnImplError; +} + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_gemm.h b/saber/funcs/impl/cuda/saber_gemm.h new file mode 100644 index 000000000..847a1aac6 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_gemm.h @@ -0,0 +1,38 @@ + +#ifndef SABER_FUNCS_IMPL_CUDA_SABER_GEMM_H +#define SABER_FUNCS_IMPL_CUDA_SABER_GEMM_H + +#include "saber/funcs/gemm.h" +#include "saber/funcs/impl/cuda/base/sass_funcs.h" +namespace anakin { +namespace saber { + +template +class Gemm { +public: + Gemm() = default; + ~Gemm() {} + + SaberStatus init(const bool trans_a, const 
bool trans_b, + const int m, const int n, const int k, + Context ctx); + + SaberStatus dispatch(const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c); + +private: + int _m{-1}; + int _n{-1}; + int _k{-1}; + std::function _kernel; + Context _ctx; +}; + +} +} + +#endif \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_gru.h b/saber/funcs/impl/cuda/saber_gru.h index d22c21206..2c14e850e 100644 --- a/saber/funcs/impl/cuda/saber_gru.h +++ b/saber/funcs/impl/cuda/saber_gru.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,140 +16,147 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GRU_H #define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GRU_H #include "saber/funcs/impl/impl_gru.h" - +#include "saber/funcs/impl/cuda/base/sass_funcs.h" +#include "cuda_utils.h" namespace anakin { namespace saber { -template -class SaberGru:\ - public ImplBase< - Tensor, \ - Tensor, \ - Tensor, \ - GruParam>> { -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; +template +class SaberGru: public ImplBase < + NV, OpDtype,GruParam > { + +public: + typedef typename DataTrait::Dtype OpDataType; + typedef Tensor OpTensor; SaberGru() {} ~SaberGru() { - if (_cublas_handle != NULL) { - CUBLAS_CHECK(cublasDestroy(_cublas_handle)); - } - } - - virtual SaberStatus init(const std::vector& inputs, \ - std::vector& outputs, \ - GruParam & gru_param, Context& ctx) { - - this->_ctx = ctx; - CUBLAS_CHECK(cublasCreate(&_cublas_handle)); - CUBLAS_CHECK(cublasSetStream(_cublas_handle, this->_ctx.get_compute_stream())); - 
if(gru_param.formula==GRU_ORIGIN) { - _hidden_size = gru_param.bias()->valid_size() / 3; - - int weights_bias_size = _hidden_size * 3; - int weights_h2h_size = _hidden_size * _hidden_size * 3; - int weights_i2h_size = gru_param.weight()->valid_size() - weights_h2h_size; - - _word_size = weights_i2h_size / _hidden_size / 3; - _weights_i2h.try_expand_size(weights_i2h_size); - _weights_h2h.try_expand_size(weights_h2h_size); - _weights_bias.try_expand_size(weights_bias_size); - - int size_data_type = sizeof(InDataType); -// memcpy(_weights_i2h.mutable_data(), gru_param.weight()->data(), -// size_data_type * weights_i2h_size); -// memcpy(_weights_h2h.mutable_data(), gru_param.weight()->data() + weights_i2h_size, -// size_data_type * weights_h2h_size); -// memcpy(_weights_bias.mutable_data(), gru_param.bias()->data(), -// size_data_type * weights_bias_size); - - CUDA_CHECK(cudaMemcpy(_weights_i2h.mutable_data(), gru_param.weight()->data(), size_data_type * weights_i2h_size - ,cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(_weights_h2h.mutable_data(), gru_param.weight()->data() + weights_i2h_size, - size_data_type * weights_h2h_size,cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(_weights_bias.mutable_data(), gru_param.bias()->data(), size_data_type * weights_bias_size - ,cudaMemcpyDeviceToDevice)); - } - return create(inputs, outputs, gru_param, ctx); + } - virtual SaberStatus create(const std::vector& inputs, \ - std::vector& outputs, \ - GruParam& gru_param, Context& ctx) { + virtual SaberStatus init(const std::vector& inputs, \ + std::vector& outputs, \ + GruParam & param, Context& ctx) { + + this->_ctx = &ctx; + + CHECK(param.init_hidden() == nullptr)<< "only support init_hidden == null now"; + + _hidden_size = param.bias()->valid_size() / 3; + + int weights_h2h_size = _hidden_size * _hidden_size * 3; + int weights_i2h_size = param.weight()->valid_size() - weights_h2h_size; + + _word_size = weights_i2h_size / _hidden_size / 3; + + _seq_util = 
SeqSortedseqTranseUtil(param.is_reverse); + if(param.formula==GRU_CUDNN){ + + const OpDataType* ori_weights_ptr= static_cast(param.weight()->data())+weights_i2h_size; + utils::try_expand_tensor(_temp_weights_h2h,weights_h2h_size); + Tensor temp_weights_h2h_ori; + Tensor temp_weights_h2h_swarp; + utils::try_expand_tensor(temp_weights_h2h_swarp,weights_h2h_size); + utils::try_expand_tensor(temp_weights_h2h_ori,weights_h2h_size); + CUDA_CHECK(cudaMemcpyAsync(temp_weights_h2h_ori.data(),ori_weights_ptr, sizeof(OpDataType)*weights_h2h_size,cudaMemcpyDeviceToHost,this->_ctx->get_compute_stream())); + cudaDeviceSynchronize(); - if (!(ctx == this->_ctx)) { - if (_cublas_handle != NULL) { - CUBLAS_CHECK(cublasDestroy(_cublas_handle)); + float* temp_tensor_ptr= static_cast(temp_weights_h2h_swarp.mutable_data()); + memcpy(temp_tensor_ptr, static_cast(temp_weights_h2h_ori.data()), + sizeof(OpDataType) * _hidden_size*_hidden_size); + + float* rz_temp_tensor_ptr=temp_tensor_ptr+_hidden_size*_hidden_size; + const float* rz_weights_tensor_ptr=static_cast(temp_weights_h2h_ori.data()) +_hidden_size*_hidden_size; + for(int row=0;row<_hidden_size;row++){ + for(int block=0;block<2;block++) { + int block_offset=block*_hidden_size; + for (int cow = 0; cow < _hidden_size; cow++) { + rz_temp_tensor_ptr[block*_hidden_size*_hidden_size+row*_hidden_size+cow]=rz_weights_tensor_ptr[row*(2*_hidden_size)+cow+block_offset]; + } + } + } + + float* orz_temp_tensor_ptr=temp_tensor_ptr; + float* orz_weights_tensor_ptr=static_cast(temp_weights_h2h_ori.mutable_data()); + for(int row=0;row<_hidden_size;row++){ + for(int block=0;block<3;block++) { + int block_offset=block*_hidden_size; + for (int cow = 0; cow < _hidden_size; cow++) { + orz_weights_tensor_ptr[row*(3*_hidden_size)+cow+block_offset]=orz_temp_tensor_ptr[block*_hidden_size*_hidden_size+row*_hidden_size+cow]; + } + } } - this->_ctx = ctx; + _temp_weights_h2h.copy_from(temp_weights_h2h_ori); + cudaDeviceSynchronize(); - cudaStream_t cuda_stream; 
- cuda_stream = ctx.get_compute_stream(); - CUBLAS_CHECK(cublasCreate(&_cublas_handle)); - CUBLAS_CHECK(cublasSetStream(_cublas_handle, cuda_stream)); } + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, \ + std::vector& outputs, \ + GruParam& param, Context& ctx) { + + if (!(&ctx == this->_ctx)) { + this->_ctx = &ctx; + } + std::vector> offset_vec=inputs[0]->get_seq_offset(); + std::vector offset=offset_vec[offset_vec.size()-1]; + int batch_size = offset.size()-1; + int sequence = inputs[0]->num(); + _gemm_wx = saber_find_fast_sass_gemm(false, false, sequence, 3 * _hidden_size, + _word_size); + _gemm_wh_2 = saber_find_fast_sass_gemm(false, false, batch_size, 2 * _hidden_size, _hidden_size); + + _gemm_wh_o = saber_find_fast_sass_gemm(false, false, batch_size, 1 * _hidden_size, _hidden_size); return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - GruParam & param); + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + GruParam & param); private: - cublasHandle_t _cublas_handle; -/** - * for hw2seq - */ - Tensor _temp_tensor_in; - Tensor _temp_tensor_out; - Tensor _temp_WX; - Tensor _temp_WH; - - Tensor _temp_vector_offset; - Tensor _temp_map_host; - Tensor _temp_map_dev; + /** + * for hw2seq + */ + OpTensor _temp_tensor_in; + OpTensor _temp_tensor_out; + OpTensor _temp_wx; + OpTensor _temp_wh; + OpTensor _temp_whr; + + OpTensor _temp_zero; + + OpTensor _temp_weights_h2h; + + OpTensor _temp_vector_offset; + OpTensor _temp_map_host; + OpTensor _temp_map_dev; + + SeqSortedseqTranseUtil _seq_util; int _word_size; int _hidden_size; - OpTensor _weights_i2h; - OpTensor _weights_h2h; - OpTensor _weights_bias; - - void seq2hw(std::vector outputs, std::vector inputs, - GruParam& param, int hidden_size,void* real_temp_out); -/** - * dim2 input to seq,batch,wordsize - * @param inputs - * @param param - * @param word_size - * @param sequence 
- * @param out_sequence - * @param ctx - * @return sequence length - */ - const InDataType* hw2seq(std::vector inputs, GruParam& param, - int word_size, int hiddensize, int& sequence_len); - - SaberStatus gru_cudnn(const std::vector inputs, - std::vector outputs, - GruParam& param); + + std::function _gemm_wx; + + std::function _gemm_wh_2; + + std::function _gemm_wh_o; + }; diff --git a/saber/funcs/impl/cuda/saber_im2sequence.h b/saber/funcs/impl/cuda/saber_im2sequence.h index 1a377a1cc..44ad52c42 100644 --- a/saber/funcs/impl/cuda/saber_im2sequence.h +++ b/saber/funcs/impl/cuda/saber_im2sequence.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,44 +22,28 @@ namespace anakin{ namespace saber{ -template -class SaberIm2Sequence:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - Im2SequenceParam>> { +template +class SaberIm2Sequence:\ + public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberIm2Sequence() {} ~SaberIm2Sequence() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - Im2SequenceParam ¶m, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + Im2SequenceParam ¶m, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - Im2SequenceParam ¶m, + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + Im2SequenceParam ¶m, Context &ctx) { int 
input_height = inputs[0]->height(); // P _kernel_exten_h = param.dilation_h * (param.window_h - 1) + 1; @@ -74,9 +58,9 @@ class SaberIm2Sequence& inputs, - std::vector& outputs, - Im2SequenceParam ¶m); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + Im2SequenceParam ¶m); private: int _output_height; @@ -85,7 +69,7 @@ class SaberIm2Sequence; +template class SaberIm2Sequence; } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_layer_norm.h b/saber/funcs/impl/cuda/saber_layer_norm.h index 8cd20612c..41114b329 100644 --- a/saber/funcs/impl/cuda/saber_layer_norm.h +++ b/saber/funcs/impl/cuda/saber_layer_norm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,50 +22,33 @@ namespace anakin{ namespace saber{ -template - class SaberLayerNorm:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - LayerNormParam>> { +template +class SaberLayerNorm:public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberLayerNorm() = default; ~SaberLayerNorm() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - LayerNormParam ¶m, + virtual SaberStatus init(const std::vector* >& inputs, + std::vector* >& outputs, + LayerNormParam ¶m, Context &ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - LayerNormParam ¶m, + virtual SaberStatus create(const std::vector* >& inputs, + std::vector* >& 
outputs, + LayerNormParam ¶m, Context &ctx) { //Shape sh_in = inputs[0]->valid_shape(); _inner_size = inputs[0]->count_valid(param.axis, inputs[0]->dims()); _outer_size = inputs[0]->count_valid(0, param.axis); - Shape sh = Shape::zero(inputs[0]->dims()); + Shape sh({0, 0, 0, 0}); for (int i = 0; i < sh.dims(); ++i) { sh[i] = 1; } @@ -84,22 +67,23 @@ template & inputs, - std::vector& outputs, - LayerNormParam ¶m); + virtual SaberStatus dispatch(const std::vector* >& inputs, + std::vector* >& outputs, + LayerNormParam ¶m); private: - OpTensor _mean; - OpTensor _std; + Tensor _mean; + Tensor _std; int _inner_size; int _outer_size; bool _flag_scale{true}; bool _flag_bias{true}; }; -template class SaberLayerNorm; +template class SaberLayerNorm; } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_lrn.h b/saber/funcs/impl/cuda/saber_lrn.h index 312528768..075fc75df 100644 --- a/saber/funcs/impl/cuda/saber_lrn.h +++ b/saber/funcs/impl/cuda/saber_lrn.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,44 +22,29 @@ namespace anakin{ namespace saber{ -template -class SaberLrn:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - LrnParam>> { +template +class SaberLrn: public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberLrn() {} ~SaberLrn() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - LrnParam ¶m, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + LrnParam ¶m, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - LrnParam &crop_param, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + LrnParam &crop_param, Context &ctx) { + Shape temp = outputs[0]->valid_shape(); + Shape temp_in = inputs[0]->valid_shape(); Shape out_stride = outputs[0]->get_stride(); Shape in_stride = inputs[0]->get_stride(); int in_n_index = inputs[0]->num_index(); @@ -81,9 +66,9 @@ class SaberLrn& inputs, - std::vector& outputs, - LrnParam ¶m); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + LrnParam ¶m); private: int _in_n_stride; @@ -96,7 +81,7 @@ class SaberLrn; +template class SaberLrn; } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_lstm.h b/saber/funcs/impl/cuda/saber_lstm.h new file mode 100644 index 000000000..5806e51b7 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_lstm.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LSTM_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LSTM_H +#include "saber/funcs/impl/impl_lstm.h" +#include "saber/funcs/impl/cuda/base/sass_funcs.h" +#include "cuda_utils.h" +namespace anakin { + +namespace saber { + +static int round_up(int k, int c) { + return ((k + c - 1) / c) * c; +} + +template +class SaberLstm: public ImplBase < + NV, OpDtype,LstmParam > { + +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberLstm() {} + ~SaberLstm() { + + } + + virtual SaberStatus init(const std::vector*>& inputs, \ + std::vector*>& outputs, \ + LstmParam & param, Context& ctx) { + + this->_ctx = &ctx; + if(param.with_peephole){ + _hidden_size=param.bias()->valid_size()/7; + }else{ + _hidden_size=param.bias()->valid_size()/4; + } + _word_size=(param.weight()->valid_size()-_hidden_size*_hidden_size*4)/_hidden_size/4; + //TODO:add round_up to saber_util + _aligned_hidden_size=round_up(_hidden_size,32); + + + _seq_util = SeqSortedseqTranseUtil(param.is_reverse); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, \ + std::vector*>& outputs, \ + LstmParam < NV >& param, Context& ctx) { + if (!(&ctx == this->_ctx)) { + this->_ctx = &ctx; + } + + std::vector> lod=inputs[0]->get_seq_offset(); + std::vector offset=lod[lod.size()-1]; + int batch_size = offset.size() - 1; + CHECK_GE(batch_size,1)<<"batchsize must >= 1"; + + int sequence = inputs[0]->num(); + _gemm_wx = saber_find_fast_sass_gemm(false, false, sequence, 4 * _hidden_size,_word_size); + _gemm_wh = 
saber_find_fast_sass_gemm(false, false, batch_size, 4 * _hidden_size, _hidden_size); + return SaberSuccess; + } + + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam & param); + +private: + int _word_size; + int _hidden_size; + int _aligned_hidden_size; + + Tensor _init_hidden; + + Tensor _temp_wx; + Tensor _temp_wh; + Tensor _temp_cell; + + Tensor _temp_x; + Tensor _temp_out; + Tensor _temp_h_init; + + + Tensor _temp_map_dev; + Tensor _temp_zero; + + std::function _gemm_wx; + + std::function _gemm_wh; + + SeqSortedseqTranseUtil _seq_util; + + SaberStatus + dispatch_batch( + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param); + + SaberStatus + dispatch_once( + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param); + +}; + + + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GRU_H \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_mat_mul.h b/saber/funcs/impl/cuda/saber_mat_mul.h index 0a4c1b80a..39aca0116 100644 --- a/saber/funcs/impl/cuda/saber_mat_mul.h +++ b/saber/funcs/impl/cuda/saber_mat_mul.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,67 +22,50 @@ namespace anakin{ namespace saber{ -template -class SaberMatMul:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - MatMulParam>> { +template +class SaberMatMul: public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberMatMul() {} ~SaberMatMul() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - MatMulParam ¶m, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + MatMulParam ¶m, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - MatMulParam ¶m, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + MatMulParam ¶m, Context &ctx) { - _kernel =saber_find_fast_sass_gemm(param._is_transpose_X, param._is_transpose_Y, param._M, param._N, param._K); + _kernel =saber_find_fast_sass_gemm(param._is_transpose_X, param._is_transpose_Y, param._m, param._n, param._k); return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - MatMulParam ¶m) + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + MatMulParam ¶m) { - cudaStream_t stream = this->_ctx.get_compute_stream(); - const InDataType* X = inputs[0]->data(); - const InDataType* Y = inputs[1]->data(); - OutDataType* out = outputs[0]->mutable_data(); + cudaStream_t stream = this->_ctx->get_compute_stream(); + const OpDataType* X = (const OpDataType*)inputs[0]->data(); + const OpDataType* Y = (const OpDataType*)inputs[1]->data(); + OpDataType* out = (OpDataType*)outputs[0]->mutable_data(); //should add batch gemm here - for (int b = 
0; b < param._B; b++) + for (int b = 0; b < param._b; b++) { - _kernel(param._M, param._N, param._K, 1.f, - X + b * param._M * param._K, + _kernel(param._m, param._n, param._k, 1.f, + X + b * param._m * param._k, 0.f, - Y + b * param._K * param._N, - out + b * param._M * param._N, stream); + Y + b * param._k * param._n, + out + b * param._m * param._n, stream); } return SaberSuccess; } @@ -94,7 +77,7 @@ class SaberMatMul _kernel; }; -template class SaberMatMul; +template class SaberMatMul; } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_multiclass_nms.cpp b/saber/funcs/impl/cuda/saber_multiclass_nms.cpp deleted file mode 100644 index da353cba7..000000000 --- a/saber/funcs/impl/cuda/saber_multiclass_nms.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_multiclass_nms.h" -#include "saber/funcs/impl/detection_helper.h" -namespace anakin { - -namespace saber { - -template -SaberStatus SaberMultiClassNMS ::dispatch(const std::vector& inputs, - std::vector& outputs, - MultiClassNMSParam& param) { - - cudaStream_t stream = this->_ctx.get_compute_stream(); - - DataTensor_in* t_loc = inputs[0]; - DataTensor_in* t_conf = inputs[1]; - int class_num = t_conf->valid_shape()[1]; - const int num = t_loc->num(); - - const InDataType* loc_data = t_loc->data(); - const InDataType* conf_data = t_conf->data(); - - - CUDA_CHECK(cudaMemcpyAsync(_bbox_cpu_data, loc_data, \ - t_loc->valid_size() * sizeof(InDataType), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(_conf_cpu_data, conf_data, \ - t_conf->valid_size() * sizeof(InDataType), cudaMemcpyDeviceToHost, stream)); - - std::vector result; - - nms_detect(_bbox_cpu_data, _conf_cpu_data, result, num, class_num, _num_priors, param.background_id, - \ - param.keep_top_k, param.nms_top_k, param.conf_thresh, param.nms_thresh, param.nms_eta, true); - - if (result.size() == 0) { - result.resize(7); - - for (int i = 0; i < 7; ++i) { - result[i] = (InDataType) - 1; - } - - 
outputs[0]->reshape({1, 7}); - } else { - outputs[0]->reshape({result.size() / 7, 7}); - } - - CUDA_CHECK(cudaMemcpyAsync(outputs[0]->mutable_data(), result.data(), \ - result.size() * sizeof(InDataType), cudaMemcpyHostToDevice, stream)); - - return SaberSuccess; -} -template class SaberMultiClassNMS; -template class SaberMultiClassNMS; - -} //namespace anakin - -} //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_multiclass_nms.h b/saber/funcs/impl/cuda/saber_multiclass_nms.h deleted file mode 100644 index 71c55d13f..000000000 --- a/saber/funcs/impl/cuda/saber_multiclass_nms.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_CUDA_SABER_MULTICLASS_NMS_H -#define ANAKIN_SABER_FUNCS_CUDA_SABER_MULTICLASS_NMS_H - -#include "saber/funcs/impl/impl_multiclass_nms.h" -#include "saber/core/tensor.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberMultiClassNMS : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - MultiClassNMSParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberMultiClassNMS() = default; - ~SaberMultiClassNMS() { - if (_bbox_cpu_data) { - fast_free(_bbox_cpu_data); - } - if (_conf_cpu_data) { - fast_free(_conf_cpu_data); - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - MultiClassNMSParam& param, Context& ctx) { - // get context - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - MultiClassNMSParam& param, Context &ctx) { - - //! inputs[0]: bbox map, dims = 3 {N, boxes, 4(xmin, ymin, xmax, ymax)} - //! inputs[1]: score map, dims = 3 {N, classes, boxes} - Shape sh_bbox = inputs[0]->valid_shape(); - Shape sh_conf = inputs[1]->valid_shape(); - - //! 
layout must be 3 dims, the priors(number of boxes) is in the second dim - _num_priors = sh_bbox[1]; - - CHECK_EQ(sh_conf[2], sh_bbox[1]) << \ - "Number of bboxes must match the number of scores per class."; - - if (_conf_cpu_data != nullptr) { - fast_free(_conf_cpu_data); - _conf_cpu_data = nullptr; - } - - if (_bbox_cpu_data != nullptr) { - fast_free(_bbox_cpu_data); - _bbox_cpu_data = nullptr; - } - _conf_cpu_data = (InDataType*)fast_malloc(sizeof(InDataType) * sh_conf.count()); - _bbox_cpu_data = (InDataType*)fast_malloc(sizeof(InDataType) * sh_bbox.count()); - - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - MultiClassNMSParam& param); - -private: - int _num_priors; - InDataType* _bbox_cpu_data{nullptr}; - InDataType* _conf_cpu_data{nullptr}; -}; -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_CUDA_SABER_MULTICLASS_NMS_H diff --git a/saber/funcs/impl/cuda/saber_mvn.h b/saber/funcs/impl/cuda/saber_mvn.h index 97dd6f808..9e36f2287 100644 --- a/saber/funcs/impl/cuda/saber_mvn.h +++ b/saber/funcs/impl/cuda/saber_mvn.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,51 +22,34 @@ namespace anakin{ namespace saber{ -template -class SaberMvn:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - MvnParam>> { +template +class SaberMvn: public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberMvn() {} ~SaberMvn() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - MvnParam ¶m, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + MvnParam ¶m, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - MvnParam ¶m, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + MvnParam ¶m, Context &ctx) { int num = inputs[0]->num() * inputs[0]->channel(); if (param.across_channels) { num = inputs[0]->num(); } - Shape shape = Shape::zero(inputs[0]->dims()); + Shape shape = inputs[0]->valid_shape(); for (int i = 0; i < shape.size(); i++) { shape[i] = 1; } @@ -78,17 +61,17 @@ class SaberMvn& inputs, - std::vector& outputs, - MvnParam ¶m); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + MvnParam ¶m); private: - OpTensor _mean; - OpTensor _sd; + Tensor _mean; + Tensor _sd; }; -template class SaberMvn; +template class SaberMvn; } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_normalize.h b/saber/funcs/impl/cuda/saber_normalize.h index 423310927..037d992b5 100644 --- a/saber/funcs/impl/cuda/saber_normalize.h +++ b/saber/funcs/impl/cuda/saber_normalize.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,44 +22,37 @@ namespace anakin{ namespace saber{ -template - class SaberNormalize:\ +template + class SaberNormalize: public ImplBase< - Tensor, - Tensor, - Tensor, - NormalizeParam>> { + NV,OpDtype, + NormalizeParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + //typedef typename DataTensor_in::Dtype InDataType; + //typedef typename DataTensor_out::Dtype OutDataType; + //typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberNormalize() = default; ~SaberNormalize() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - NormalizeParam ¶m, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + NormalizeParam ¶m, Context &ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - NormalizeParam ¶m, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + NormalizeParam ¶m, Context &ctx) { // compute norm size int channel_index = inputs[0]->channel_index(); @@ -84,7 +77,7 @@ template count_valid(channel_index + 1, _dims); _compute_size = _size / _norm_size; - Shape sh_norm{1, 1, 1, _norm_size}; + Shape sh_norm({1, 1, 1, _norm_size}); _norm_reduce.reshape(sh_norm); _is_continue_buf = outputs[0]->is_continue_mem() && inputs[0]->is_continue_mem(); @@ -93,7 +86,7 @@ template get_stride(); //! 
re_alloc device memory - Shape sh{1, 1, 1, _dims}; + Shape sh({1, 1, 1, _dims}); _valid_shape.reshape(sh); _input_stride.reshape(sh); _output_stride.reshape(sh); @@ -108,13 +101,14 @@ template & inputs, - std::vector& outputs, - NormalizeParam ¶m); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + NormalizeParam ¶m); private: - Tensor _norm_reduce; + //Tensor _norm_reduce; + Tensor _norm_reduce; int _size; int _norm_size; int _compute_size; @@ -122,14 +116,16 @@ template _input_stride; - Tensor _output_stride; - Tensor _valid_shape; + //todo: + Tensor _input_stride; + Tensor _output_stride; + Tensor _valid_shape; + bool _is_continue_buf{true}; }; -template class SaberNormalize; +//template class SaberNormalize; } //namespace saber } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_NORMALIZE_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_NORMALIZE_H diff --git a/saber/funcs/impl/cuda/saber_pad.h b/saber/funcs/impl/cuda/saber_pad.h index 58b32bab1..09b4348d3 100644 --- a/saber/funcs/impl/cuda/saber_pad.h +++ b/saber/funcs/impl/cuda/saber_pad.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,50 +17,37 @@ #define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PAD_H #include "saber/funcs/impl/impl_pad.h" +#include "saber/core/data_traits.h" namespace anakin{ namespace saber{ -template -class SaberPad:\ +template +class SaberPad:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PadParam>> { + NV, OpDtype, + PadParam> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait :: Dtype dtype; SaberPad() {} ~SaberPad() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PadParam ¶m, + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PadParam ¶m, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PadParam ¶m, + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PadParam ¶m, Context &ctx) { CHECK_EQ(2, param.pad_c.size()); CHECK_EQ(2, param.pad_h.size()); @@ -90,8 +77,8 @@ class SaberPad& inputs, \ - std::vector& outputs, PadParam ¶m); + virtual SaberStatus dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs, PadParam ¶m); private: int _img_offset; int _in_n_stride; @@ -104,10 +91,10 @@ class SaberPad; +template class SaberPad; } //namespace saber } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PAD_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PAD_H diff --git a/saber/funcs/impl/cuda/saber_permute.cpp b/saber/funcs/impl/cuda/saber_permute.cpp new file mode 100644 index 000000000..29a792011 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_permute.cpp @@ -0,0 +1,65 @@ +#include "saber/funcs/impl/cuda/saber_permute.h" + +#include 
"saber/funcs/impl/impl_permute.h" + +namespace anakin{ +namespace saber{ +template class SaberPermute; + +template <> +SaberStatus SaberPermute::\ + create(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, Context &ctx) { + + Shape order_shape({_num_axes, 1, 1, 1}); + _in_steps.reshape(order_shape); + _out_steps.reshape(order_shape); + _out_valid_shape.reshape(order_shape); + + Shape in_stride = inputs[0]->get_stride(); + Shape out_stride = outputs[0]->get_stride(); + + cudaMemcpy(_in_steps.mutable_data(), &in_stride[0], + sizeof(int) * _in_steps.size(), cudaMemcpyHostToDevice); + cudaMemcpy(_out_steps.mutable_data(), &out_stride[0], + sizeof(int) * _out_steps.size(), cudaMemcpyHostToDevice); + cudaMemcpy(_out_valid_shape.mutable_data(), &((outputs[0]->valid_shape())[0]), + sizeof(int) * _out_valid_shape.size(), cudaMemcpyHostToDevice); + return SaberSuccess; +} + +template <> +SaberStatus SaberPermute::\ + init(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, Context &ctx) { + this->_ctx = &ctx; + _num_axes = inputs[0]->valid_shape().size(); + for (int i = 0; i < _num_axes; i++) { + if (std::find(_order_dims.begin(), _order_dims.end(), + param.order[i]) == _order_dims.end()) { + _order_dims.push_back(param.order[i]); + } + } + + CHECK_EQ(_num_axes, _order_dims.size()); + + // set _need_permute + _need_permute = false; + for (int i = 0; i < _num_axes; ++i) { + if (param.order[i] != i) { + _need_permute = true; + break; + } + } + Shape order_shape({_num_axes, 1, 1, 1}); + _permute_order.reshape(order_shape); + cudaMemcpy(_permute_order.mutable_data(), &(param.order[0]), + sizeof(int) * _permute_order.size(), cudaMemcpyHostToDevice); + return create(inputs, outputs, param, ctx); +} + +} //namespace saber + +} //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_permute.h b/saber/funcs/impl/cuda/saber_permute.h index 6b4dee69b..0ccdf92a9 100644 --- a/saber/funcs/impl/cuda/saber_permute.h +++ 
b/saber/funcs/impl/cuda/saber_permute.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,101 +22,41 @@ namespace anakin{ namespace saber{ -template -class SaberPermute:\ +template +class SaberPermute:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PermuteParam>> { + NV, + OpDtype, + PermuteParam> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; SaberPermute() {} ~SaberPermute() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PermuteParam ¶m, - Context &ctx) { - this->_ctx = ctx; - _num_axes = inputs[0]->valid_shape().size(); - for (int i = 0; i < _num_axes; i++) { - if (std::find(_order_dims.begin(), _order_dims.end(), - param.order[i]) == _order_dims.end()) { - _order_dims.push_back(param.order[i]); - } - } - - CHECK_EQ(_num_axes, _order_dims.size()); - - // set _need_permute - _need_permute = false; - for (int i = 0; i < _num_axes; ++i) { - if (param.order[i] != i) { - _need_permute = true; - break; - } - } - Shape order_shape = {_num_axes, 1, 1, 1}; - _permute_order.reshape(order_shape); - cudaMemcpy(_permute_order.mutable_data(), &(param.order[0]), - sizeof(int) * _permute_order.size(), cudaMemcpyHostToDevice); - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PermuteParam ¶m, - Context &ctx) { - - Shape order_shape = {_num_axes, 1, 1, 1}; - _in_steps.reshape(order_shape); - _out_steps.reshape(order_shape); - _out_valid_shape.reshape(order_shape); - - Shape in_stride = inputs[0]->get_stride(); - Shape out_stride = 
outputs[0]->get_stride(); - - cudaMemcpy(_in_steps.mutable_data(), &in_stride[0], - sizeof(int) * _in_steps.size(), cudaMemcpyHostToDevice); - cudaMemcpy(_out_steps.mutable_data(), &out_stride[0], - sizeof(int) * _out_steps.size(), cudaMemcpyHostToDevice); - cudaMemcpy(_out_valid_shape.mutable_data(), &((outputs[0]->valid_shape())[0]), - sizeof(int) * _out_valid_shape.size(), cudaMemcpyHostToDevice); - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - PermuteParam ¶m); + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, + Context &ctx); + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, + Context &ctx); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m); private: int _num_axes; bool _need_permute; std::vector _order_dims; - Tensor _permute_order; - Tensor _in_steps; - Tensor _out_steps; - Tensor _out_valid_shape; + Tensor _permute_order; + Tensor _in_steps; + Tensor _out_steps; + Tensor _out_valid_shape; }; -template class SaberPermute; - } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_permute_power.h b/saber/funcs/impl/cuda/saber_permute_power.h index 20c54a79f..d0f0b8207 100644 --- a/saber/funcs/impl/cuda/saber_permute_power.h +++ b/saber/funcs/impl/cuda/saber_permute_power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,50 +22,35 @@ namespace anakin{ namespace saber{ -template -class SaberPermutePower:\ +template +class SaberPermutePower:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PermutePowerParam>> { + NV, + OpDtype, + PermutePowerParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; SaberPermutePower() {} ~SaberPermutePower() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PermutePowerParam ¶m, + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam ¶m, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PermutePowerParam ¶m, + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam ¶m, Context &ctx) { _num_axes = inputs[0]->shape().size(); - PermuteParam permute_param = param.permute_param; + PermuteParam permute_param = param.permute_param; for (int i = 0; i < _num_axes; i++) { - if (std::find(_order_dims.begin(), _order_dims.end(), permute_param.order[i]) \ - == _order_dims.end()) { + if (std::find(_order_dims.begin(), _order_dims.end(), permute_param.order[i]) == _order_dims.end()) { _order_dims.push_back(permute_param.order[i]); } } @@ -79,12 +64,12 @@ class SaberPermutePowerget_stride(); Shape out_stride = outputs[0]->get_stride(); Shape out_valid_shape = outputs[0]->valid_shape(); @@ -99,24 +84,24 @@ class SaberPermutePower& inputs, - std::vector& outputs, - PermutePowerParam &permute_param); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam &permute_param); private: int _num_axes; bool _need_permute; std::vector _order_dims; - Tensor 
_permute_order; - Tensor _out_valid_shape; - Tensor _old_steps; - Tensor _new_steps; + Tensor _permute_order; + Tensor _out_valid_shape; + Tensor _old_steps; + Tensor _new_steps; }; -template class SaberPermutePower; +template class SaberPermutePower; } //namespace saber } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PERMUTE_POWER_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PERMUTE_POWER_H diff --git a/saber/funcs/impl/cuda/saber_pooling.h b/saber/funcs/impl/cuda/saber_pooling.h index ccbc36d6c..b99fbf829 100644 --- a/saber/funcs/impl/cuda/saber_pooling.h +++ b/saber/funcs/impl/cuda/saber_pooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,8 +16,9 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POOLING_H #define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POOLING_H -#include "anakin_config.h" #include + +#include "anakin_config.h" #include "saber/funcs/impl/impl_base.h" #include "saber/core/tensor.h" #include "saber/core/context.h" @@ -26,65 +27,45 @@ namespace anakin{ namespace saber{ -template -class SaberPooling:\ +template +class SaberPooling:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam>> { - -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberPooling() - {} - - ~SaberPooling() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m, - Context &ctx) override { - - //std::cout<<"SaberPooling init!!"<& inputs, - std::vector& outputs, - PoolingParam ¶m, - Context &ctx) override { - - //std::cout<<"SaberPooling create!!"<& 
inputs, - std::vector& outputs, - PoolingParam ¶m) { - - //std::cout<<"Saber Dispatch!!!!!!"<> { + + public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + + SaberPooling(){} + + ~SaberPooling() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + PoolingParam ¶m, + Context &ctx) override { + + return SaberUnImplError; + + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + PoolingParam ¶m, + Context &ctx) override { + + return SaberUnImplError; + + } + + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + PoolingParam ¶m) { + + return SaberUnImplError; + } }; } diff --git a/saber/funcs/impl/cuda/saber_pooling_with_index.h b/saber/funcs/impl/cuda/saber_pooling_with_index.h index c269a9186..08116a816 100644 --- a/saber/funcs/impl/cuda/saber_pooling_with_index.h +++ b/saber/funcs/impl/cuda/saber_pooling_with_index.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,44 +22,30 @@ namespace anakin{ namespace saber{ -template -class SaberPoolingWithIndex:\ +template +class SaberPoolingWithIndex:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam>> { + NV, OpDtype, + PoolingParam> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait :: Dtype dtype; SaberPoolingWithIndex() {} ~SaberPoolingWithIndex() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m, \ + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, \ Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PoolingParam &power_param, + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam &power_param, Context &ctx) { Shape out_stride = outputs[0]->get_stride(); Shape in_stride = inputs[0]->get_stride(); @@ -82,9 +68,9 @@ class SaberPoolingWithIndex& inputs, - std::vector& outputs, - PoolingParam ¶m); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m); private: int _in_n_stride; @@ -97,10 +83,10 @@ class SaberPoolingWithIndex; +template class SaberPoolingWithIndex; } //namespace saber } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POOLING_WITH_INDEX_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POOLING_WITH_INDEX_H diff --git a/saber/funcs/impl/cuda/saber_power.h b/saber/funcs/impl/cuda/saber_power.h index b88e95f68..9b4c4a619 100644 --- a/saber/funcs/impl/cuda/saber_power.h +++ b/saber/funcs/impl/cuda/saber_power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. 
All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,48 +22,33 @@ namespace anakin{ namespace saber{ -template -class SaberPower:\ +template +class SaberPower:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PowerParam>> { + NV, OpDtype, + PowerParam> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; SaberPower() {} ~SaberPower() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PowerParam &power_param, + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PowerParam &power_param, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return SaberSuccess; } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PowerParam &power_param, + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PowerParam &power_param, Context &ctx) { - Shape shape = {inputs[0]->dims(), 1, 1, 1}; - _in_steps.re_alloc(shape); - _out_steps.re_alloc(shape); - _out_valid_shape.re_alloc(shape); + Shape shape({inputs[0]->dims(), 1, 1, 1}); + _in_steps.re_alloc(shape, OpDtype); + _out_steps.re_alloc(shape, OpDtype); + _out_valid_shape.re_alloc(shape, OpDtype); Shape in_stride = inputs[0]->get_stride(); Shape out_stride = outputs[0]->get_stride(); Shape out_valid_shape = outputs[0]->valid_shape(); @@ -73,20 +58,20 @@ class SaberPower& inputs, - std::vector& outputs, - PowerParam &power_param); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PowerParam &power_param); private: - Tensor _in_steps; - Tensor _out_steps; - Tensor _out_valid_shape; + Tensor _in_steps; + 
Tensor _out_steps; + Tensor _out_valid_shape; }; -template class SaberPower; +template class SaberPower; } //namespace saber } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POWER_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POWER_H diff --git a/saber/funcs/impl/cuda/saber_prelu.h b/saber/funcs/impl/cuda/saber_prelu.h deleted file mode 100644 index 39bebbbbd..000000000 --- a/saber/funcs/impl/cuda/saber_prelu.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PRELU_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PRELU_H - -#include "saber/funcs/impl/impl_prelu.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberPrelu:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - PreluParam>> { - -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberPrelu() = default; - ~SaberPrelu() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PreluParam ¶m, - Context &ctx) { - // get context - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PreluParam ¶m, - Context &ctx) { - // compute inner and outer size - int channel_index = inputs[0]->channel_index(); - _dims = inputs[0]->dims(); - _size = inputs[0]->valid_size(); - _channels = inputs[0]->channel(); - _inner_size = inputs[0]->count_valid(channel_index + 1, _dims); - _outer_size = inputs[0]->count_valid(0, channel_index); - if (!param.channel_shared) { - CHECK_EQ(_channels, param.slope->valid_size()) << \ - "slope data size must = channels"; - } - _is_continue_buf = outputs[0]->is_continue_mem() && inputs[0]->is_continue_mem(); - if (!_is_continue_buf) { - Shape sh_input_real_stride = inputs[0]->get_stride(); - Shape sh_output_real_stride = outputs[0]->get_stride(); - - //! 
re_alloc device memory - Shape sh{1, 1, 1, _dims}; - _valid_shape.reshape(sh); - _input_stride.reshape(sh); - _output_stride.reshape(sh); - - CUDA_CHECK(cudaMemcpy(_valid_shape.mutable_data(), inputs[0]->valid_shape().data(), \ - sizeof(int) * _dims, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(_input_stride.mutable_data(), sh_input_real_stride.data(), \ - sizeof(int) * _dims, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(_output_stride.mutable_data(), sh_output_real_stride.data(), \ - sizeof(int) * _dims, cudaMemcpyHostToDevice)); - } - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - PreluParam ¶m); - -private: - int _size; - int _inner_size; - int _outer_size; - int _channels; - int _dims; - Tensor _input_stride; - Tensor _output_stride; - Tensor _valid_shape; - bool _is_continue_buf{true}; -}; - - -template class SaberPrelu; -/*t -emplate class SaberPrelu; -template class SaberPrelu; -template class SaberPrelu; -template class SaberPrelu; -template class SaberPrelu; -*/ -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PRELU_H \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_priorbox.h b/saber/funcs/impl/cuda/saber_priorbox.h deleted file mode 100644 index e732ae0e6..000000000 --- a/saber/funcs/impl/cuda/saber_priorbox.h +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_CUDA_SABER_PRIORBOX_H -#define ANAKIN_SABER_FUNCS_CUDA_SABER_PRIORBOX_H - -#include "saber/funcs/impl/impl_priorbox.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberPriorBox:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - PriorBoxParam>> { -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberPriorBox() = default; - ~SaberPriorBox() { - if (_output_host != nullptr) { - fast_free(_output_host); - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PriorBoxParam ¶m, - Context &ctx) { - // get context - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PriorBoxParam ¶m, - Context &ctx){ - - if (_output_host != nullptr) { - fast_free(_output_host); - _output_host = nullptr; - } - _output_host = (float*)fast_malloc(sizeof(float) * outputs[0]->valid_size()); - - const int width = inputs[0]->width(); - const int height = inputs[0]->height(); - int img_width = param.img_w; - int img_height = param.img_h; - if (img_width == 0 || img_height == 0) { - img_width = inputs[1]->width(); - img_height = inputs[1]->height(); - } - - float step_w = param.step_w; - float step_h = param.step_h; - if (step_w == 0 || step_h == 0) { - step_w = static_cast(img_width) / width; - step_h = static_cast(img_height) / height; - } - float offset = param.offset; - - int channel_size = height * width * param.prior_num * 4; - int idx = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - float center_x = (w + offset) * step_w; - float center_y = (h + offset) * step_h; - 
float box_width; - float box_height; - for (int s = 0; s < param.min_size.size(); ++s) { - int min_size = param.min_size[s]; - //! first prior: aspect_ratio = 1, size = min_size - box_width = box_height = min_size; - //! xmin - _output_host[idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - _output_host[idx++] = (center_y - box_height / 2.f) / img_height; - //! xmax - _output_host[idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - _output_host[idx++] = (center_y + box_height / 2.f) / img_height; - - if (param.max_size.size() > 0) { - - int max_size = param.max_size[s]; - //! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) - box_width = box_height = sqrtf(min_size * max_size); - //! xmin - _output_host[idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - _output_host[idx++] = (center_y - box_height / 2.f) / img_height; - //! xmax - _output_host[idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - _output_host[idx++] = (center_y + box_height / 2.f) / img_height; - } - - //! rest of priors - for (int r = 0; r < param.aspect_ratio.size(); ++r) { - float ar = param.aspect_ratio[r]; - if (fabs(ar - 1.) < 1e-6) { - continue; - } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); - //! xmin - _output_host[idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - _output_host[idx++] = (center_y - box_height / 2.f) / img_height; - //! xmax - _output_host[idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - _output_host[idx++] = (center_y + box_height / 2.f) / img_height; - } - } - } - } - //! clip the prior's coordidate such that it is within [0, 1] - if (param.is_clip) { - for (int d = 0; d < channel_size; ++d) { - _output_host[d] = std::min(std::max(_output_host[d], 0.f), 1.f); - } - } - //! set the variance. 
- - float* ptr = _output_host + channel_size; - int count = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - for (int i = 0; i < param.prior_num; ++i) { - for (int j = 0; j < 4; ++j) { - ptr[count] = param.variance[j]; - ++count; - } - } - } - } - - SABER_CHECK(_output_nv.reshape(outputs[0]->valid_shape())); - CUDA_CHECK(cudaMemcpy(_output_nv.mutable_data(), _output_host, \ - channel_size * 2 * sizeof(float), cudaMemcpyHostToDevice)); - - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - PriorBoxParam ¶m){ - cudaStream_t stream = this->_ctx.get_compute_stream(); - CUDA_CHECK(cudaMemcpyAsync(outputs[0]->mutable_data(), _output_nv.data(), \ - outputs[0]->valid_size() * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - return SaberSuccess; - } - -private: - float* _output_host{nullptr}; - Tensor _output_nv; -}; - -template class SaberPriorBox; - -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_CUDA_SABER_PRIORBOX_H diff --git a/saber/funcs/impl/cuda/saber_resize.h b/saber/funcs/impl/cuda/saber_resize.h old mode 100644 new mode 100755 index fa8da8614..25cc537f3 --- a/saber/funcs/impl/cuda/saber_resize.h +++ b/saber/funcs/impl/cuda/saber_resize.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,52 +22,35 @@ namespace anakin{ namespace saber{ -template -class SaberResize:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - ResizeParam>> { +template +class SaberResize: + public ImplBase> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - + typedef typename DataTrait::Dtype OpDataType; SaberResize() = default; ~SaberResize() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ResizeParam ¶m, + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ResizeParam ¶m, Context &ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ResizeParam ¶m, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ResizeParam ¶m, Context &ctx) { // do nothing return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ResizeParam ¶m); + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ResizeParam ¶m); }; diff --git a/saber/funcs/impl/cuda/saber_reverse_input.h b/saber/funcs/impl/cuda/saber_reverse_input.h new file mode 100644 index 000000000..7c37ef7af --- /dev/null +++ b/saber/funcs/impl/cuda/saber_reverse_input.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REVERSE_INPUT_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REVERSE_INPUT_H + +#include "saber/funcs/impl/impl_reverse_input.h" +#include "saber/saber_funcs_param.h" + +namespace anakin { +namespace saber { + + +template +class SaberReverseInput : public ImplBase < + NV, + OpDtype, + EmptyParam> { +public: + typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; + + SaberReverseInput() {} + + ~SaberReverseInput() { + + } + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + EmptyParam& param, + Context& ctx) override; + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + EmptyParam& param, + Context& ctx) override; + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam& param) override; + + +private: + std::vector _offset_map_cu_vec; + std::vector> _offset_map_vec; + +}; + + +} +} + +#endif diff --git a/saber/funcs/impl/cuda/saber_reverse_sequence.h b/saber/funcs/impl/cuda/saber_reverse_sequence.h new file mode 100644 index 000000000..e39a84d3c --- /dev/null +++ b/saber/funcs/impl/cuda/saber_reverse_sequence.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REVERSE_SEQUENCE_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REVERSE_SEQUENCE_H + +#include "saber/funcs/impl/impl_reverse_sequence.h" +#include "saber/saber_funcs_param.h" + +namespace anakin { +namespace saber { + + +template +class SaberReverseSequence : public ImplBase < + NV, + OpDtype, + EmptyParam> { +public: + typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; + + SaberReverseSequence() {} + + ~SaberReverseSequence() { + + } + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + EmptyParam& param, + Context& ctx) override; + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + EmptyParam& param, + Context& ctx) override; + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam& param) override; + + +private: + + OpTensor _offset_map_cu; + Tensor _offset_map; +}; + + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REVERSE_SEQUENCE_H diff --git a/saber/funcs/impl/cuda/saber_roi_pool.h b/saber/funcs/impl/cuda/saber_roi_pool.h deleted file mode 100644 index ac5cbf507..000000000 --- a/saber/funcs/impl/cuda/saber_roi_pool.h +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ROI_POOL_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ROI_POOL_H - -#include "saber/funcs/impl/impl_roi_pooling.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberRoiPool:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - RoiPoolParam>> { - -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - SaberRoiPool() - {} - - ~SaberRoiPool() { - - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - RoiPoolParam ¶m, - Context &ctx) { - this->_ctx = ctx; - Shape out_stride = outputs[0]->get_stride(); - Shape in_stride = inputs[0]->get_stride(); - int in_n_index = inputs[0]->num_index(); - int in_c_index = inputs[0]->channel_index(); - int in_h_index = inputs[0]->height_index(); - int in_w_index = inputs[0]->width_index(); - int out_n_index = outputs[0]->num_index(); - int out_c_index = outputs[0]->channel_index(); - int out_h_index = outputs[0]->height_index(); - int out_w_index = outputs[0]->width_index(); - _in_n_stride = in_stride[in_n_index]; - _in_c_stride = in_stride[in_c_index]; - _in_h_stride = in_stride[in_h_index]; - _in_w_stride = in_stride[in_w_index]; - _out_n_stride = out_stride[out_n_index]; - _out_c_stride = out_stride[out_c_index]; - _out_h_stride = out_stride[out_h_index]; - _out_w_stride = out_stride[out_w_index]; - - return SaberSuccess; - } - - virtual 
SaberStatus create(const std::vector& inputs, - std::vector& outputs, - RoiPoolParam ¶m, - Context &ctx) { - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - RoiPoolParam ¶m); - -private: - int _in_n_stride; - int _in_c_stride; - int _in_h_stride; - int _in_w_stride; - int _out_n_stride; - int _out_c_stride; - int _out_h_stride; - int _out_w_stride; -}; -template class SaberRoiPool; -} - -} - -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ROI_POOL_H diff --git a/saber/funcs/impl/cuda/saber_scale.h b/saber/funcs/impl/cuda/saber_scale.h old mode 100644 new mode 100755 index 4b4fe46e3..fd20c76a9 --- a/saber/funcs/impl/cuda/saber_scale.h +++ b/saber/funcs/impl/cuda/saber_scale.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,47 +22,33 @@ namespace anakin{ namespace saber{ -template -class SaberScale : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ScaleParam > > +template +class SaberScale: + public ImplBase> { + public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberScale() {} ~SaberScale() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ScaleParam& param, Context& ctx) { - this->_ctx = ctx; + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ScaleParam& param, Context& ctx) { + this->_ctx = &ctx; _axis = (param.num_axes == 0) ? 0 : param.axis; _num_axes = param.num_axes >= 0 ? 
param.num_axes : inputs[0]->shape().dims() - _axis; _bias_term = param.bias_term; if (param.scale_w.size() > 0) { - _weight.re_alloc({param.scale_w.size(), 1, 1, 1}); + _weight.re_alloc(Shape({param.scale_w.size(), 1, 1, 1}), OpDtype); cudaMemcpy(_weight.mutable_data(), ¶m.scale_w[0], sizeof(OpDataType) * param.scale_w.size(), cudaMemcpyHostToDevice); } if (param.bias_term) { - _bias.re_alloc({param.scale_b.size(), 1, 1, 1}); + _bias.re_alloc(Shape({param.scale_b.size(), 1, 1, 1}), OpDtype); cudaMemcpy(_bias.mutable_data(), ¶m.scale_b[0], sizeof(OpDataType) * param.scale_w.size(), cudaMemcpyHostToDevice); } @@ -70,10 +56,10 @@ class SaberScale& inputs, - std::vector& outputs, - ScaleParam& param, Context &ctx) { - this->_ctx = ctx; + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ScaleParam& param, Context &ctx) { + this->_ctx = &ctx; _inner_dim = inputs[0]->count(_axis + _num_axes, inputs[0]->shape().dims()); _scale_dim = inputs[0]->count(_axis, _axis + _num_axes); if (inputs.size() == 1) { @@ -82,17 +68,17 @@ class SaberScale& inputs, - std::vector& outputs, - ScaleParam& param); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ScaleParam& param); private: int _axis; int _num_axes; bool _bias_term; int _inner_dim; int _scale_dim; - OpTensor _weight; - OpTensor _bias; + Tensor _weight; + Tensor _bias; }; //template class SaberScale; diff --git a/saber/funcs/impl/cuda/saber_sequence_pool.h b/saber/funcs/impl/cuda/saber_sequence_pool.h new file mode 100644 index 000000000..9f41f6884 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_sequence_pool.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_POOL_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_POOL_H + +#include "saber/funcs/impl/impl_sequence_pool.h" +#include "saber/saber_funcs_param.h" +#include +#include + +namespace anakin { +namespace saber { + +template +class SaberSequencePool : + public ImplBase < NV, OpDtype, SequencePoolParam > { + +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTrait::Dtype DataType_in; + typedef typename DataTrait::Dtype DataType_out; + typedef typename DataTrait::Dtype DataType_op; + + SaberSequencePool() = default; + + ~SaberSequencePool() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + SequencePoolParam& param, + Context& ctx) override; + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + SequencePoolParam& param, + Context& ctx) { + return SaberSuccess; + + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + SequencePoolParam& param) override; +private: + typedef std::function* ctx)> seq_pool_direct_kernel; + std::map kernel_direct_map; + + +}; +} +} + +#endif diff --git a/saber/funcs/impl/cuda/saber_slice.h b/saber/funcs/impl/cuda/saber_slice.h index 078f3432f..ac8be965b 100644 --- a/saber/funcs/impl/cuda/saber_slice.h +++ b/saber/funcs/impl/cuda/saber_slice.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,44 +22,29 @@ namespace anakin{ namespace saber{ -template -class SaberSlice:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - SliceParam>> { +template +class SaberSlice: + public ImplBase> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberSlice() = default; ~SaberSlice() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - SliceParam ¶m, + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SliceParam ¶m, Context &ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - SliceParam ¶m, + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SliceParam ¶m, Context &ctx) { _slice_num = inputs[0]->count_valid(0, param.axis); @@ -67,16 +52,16 @@ class SaberSlice& inputs, - std::vector& outputs, - SliceParam ¶m); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SliceParam ¶m); private: int _slice_num; int _slice_size; }; -template class SaberSlice; +template class SaberSlice; } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_softmax.h b/saber/funcs/impl/cuda/saber_softmax.h index 972369ba2..46108cb6e 100644 --- a/saber/funcs/impl/cuda/saber_softmax.h +++ b/saber/funcs/impl/cuda/saber_softmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,28 +22,16 @@ namespace anakin{ namespace saber{ -template -class SaberSoftmax : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - SoftmaxParam > > +template +class SaberSoftmax: + public ImplBase> { public: typedef TargetWrapper API; - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; SaberSoftmax() = default; @@ -58,16 +46,16 @@ class SaberSoftmax& inputs, std::vector& outputs, - SoftmaxParam& param, Context& ctx) { + SoftmaxParam& param, Context& ctx) { //! get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - SoftmaxParam& param, Context& ctx) { + SoftmaxParam& param, Context& ctx) { //! compute size Shape shape_in = inputs[0]->valid_shape(); Shape shape_out = outputs[0]->valid_shape(); @@ -79,9 +67,9 @@ class SaberSoftmax _max_dimsize){ //! re_alloc device memory _max_data.reshape(sh_tmp); @@ -96,7 +84,7 @@ class SaberSoftmaxget_stride(); //! 
re_alloc device memory - Shape sh{1, 1, 1, _dims}; + Shape sh({1, 1, 1, _dims}); _valid_shape.reshape(sh); _input_stride.reshape(sh); _output_stride.reshape(sh); @@ -113,7 +101,7 @@ class SaberSoftmax& inputs, std::vector& outputs, - SoftmaxParam& param); + SoftmaxParam& param); private: @@ -125,14 +113,14 @@ class SaberSoftmax _input_stride; - Tensor _output_stride; - Tensor _valid_shape; + Tensor _input_stride; + Tensor _output_stride; + Tensor _valid_shape; - Tensor _max_data; - Tensor _sum_data; + Tensor _max_data; + Tensor _sum_data; }; -template class SaberSoftmax; +template class SaberSoftmax; } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_spp.h b/saber/funcs/impl/cuda/saber_spp.h deleted file mode 100644 index a80eca1fc..000000000 --- a/saber/funcs/impl/cuda/saber_spp.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SPP_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SPP_H -#include "saber/funcs/pooling.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/impl_spp.h" - -namespace anakin{ - -namespace saber{ -#if 0 -template -class SaberSpp:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - SPPParam>> { - -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - typedef Pooling Pooling_t; - - SaberSpp() - {} - - ~SaberSpp() { - for (auto pool : _pooling) { - delete pool; - pool = nullptr; - } - for (auto out : _pooling_output) { - delete out; - out = nullptr; - } - - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - SPPParam ¶m, - Context &ctx) { - this->_ctx = ctx; - _pooling.clear(); - _pooling_output.clear(); - _pooling_param.clear(); - _pooling_output.resize(param.pyramid_height); - _pooling.resize(param.pyramid_height); - _pooling_param.resize(param.pyramid_height); - int out_w_index = outputs[0]->width_index(); - int out_h_index = outputs[0]->height_index(); - for (int i = 0; i < param.pyramid_height; i++) { - int num_bins = pow(2, i); - int window_h = std::ceil(inputs[0]->height() / static_cast(num_bins)); - int window_w = std::ceil(inputs[0]->width() / static_cast(num_bins)); - int pad_h = (window_h * num_bins - inputs[0]->height() + 1) / 2; - int pad_w = (window_w * num_bins - inputs[0]->width() + 1) / 2; - PoolingParam pool_param(window_h, window_w, pad_h, pad_w - , window_h, window_w, param.pool_type); - - Shape valid_shape = outputs[0]->valid_shape(); - valid_shape[out_w_index] = pow(2, i); - valid_shape[out_h_index] = pow(2, i); - _pooling[i] = new Pooling_t(); - _pooling_output[i] = new DataTensor_out(valid_shape); - std::vector pool_outputs = {_pooling_output[i]}; - 
_pooling[i]->compute_output_shape(inputs, pool_outputs, pool_param); - _pooling[i]->init(inputs, pool_outputs, pool_param, SPECIFY, VENDER_IMPL, ctx); - _pooling_param[i] = pool_param; - - } - - return SaberSuccess; - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - SPPParam ¶m, - Context &ctx) { - return SaberSuccess; - } - - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - SPPParam ¶m); - -private: - std::vector _pooling; - std::vector> _pooling_param; - std::vector _pooling_output; -}; -template class SaberSpp; -#endif -} - -} - -#endif //ANAKIN_SABER_FUNCS_SABER_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_transpose.h b/saber/funcs/impl/cuda/saber_transpose.h old mode 100644 new mode 100755 index 7239e1252..a1ba2f452 --- a/saber/funcs/impl/cuda/saber_transpose.h +++ b/saber/funcs/impl/cuda/saber_transpose.h @@ -21,28 +21,18 @@ namespace anakin { namespace saber { -template -class SaberTranspose:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - TransposeParam>> { +template +class SaberTranspose: + public ImplBase> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype InDataType; + typedef typename DataTrait::Dtype OutDataType; + typedef typename DataTrait::Dtype OpDataType; SaberTranspose() = default; @@ -50,19 +40,19 @@ class SaberTranspose& inputs, std::vector& outputs, - TransposeParam ¶m, + TransposeParam ¶m, Context &ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param,ctx); } virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - TransposeParam ¶m, + 
TransposeParam ¶m, Context &ctx) { - if (!(ctx == this->_ctx)) { - this->_ctx = ctx; + if (!(&ctx == this->_ctx)) { + this->_ctx = &ctx; } // do nothing return SaberSuccess; @@ -70,11 +60,11 @@ class SaberTranspose& inputs, std::vector& outputs, - TransposeParam ¶m); + TransposeParam ¶m); }; -template class SaberTranspose; +template class SaberTranspose; } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_unpool.h b/saber/funcs/impl/cuda/saber_unpool.h old mode 100644 new mode 100755 index c69bdb59d..33c0cdbf9 --- a/saber/funcs/impl/cuda/saber_unpool.h +++ b/saber/funcs/impl/cuda/saber_unpool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,28 +22,18 @@ namespace anakin{ namespace saber{ -template -class SaberUnpool:\ - public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam>> { +template +class SaberUnpool:\ + public ImplBase> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype InDataType; + typedef typename DataTrait::Dtype OutDataType; + typedef typename DataTrait::Dtype OpDataType; SaberUnpool() {} @@ -54,15 +44,15 @@ class SaberUnpool& inputs, std::vector& outputs, - PoolingParam ¶m, + PoolingParam ¶m, Context &ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - PoolingParam ¶m, + PoolingParam ¶m, Context &ctx) { Shape out_stride = outputs[0]->get_stride(); 
Shape in_stride = inputs[0]->get_stride(); @@ -79,7 +69,7 @@ class SaberUnpool& inputs, std::vector& outputs, - PoolingParam ¶m); + PoolingParam ¶m); private: int _in_n_stride; @@ -87,7 +77,7 @@ class SaberUnpool; +template class SaberUnpool; } } diff --git a/saber/funcs/impl/cuda/vender_activation.h b/saber/funcs/impl/cuda/vender_activation.h index ee9cb0bc6..d60ded642 100644 --- a/saber/funcs/impl/cuda/vender_activation.h +++ b/saber/funcs/impl/cuda/vender_activation.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,35 +13,21 @@ limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H -#define ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_ACTIVATION_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_ACTIVATION_H #include "saber/funcs/impl/impl_activation.h" #include "saber/funcs/impl/cuda/cudnn_helper.h" namespace anakin { namespace saber { -template -class VenderActivation : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ActivationParam > > -{ +template +class VenderActivation : public ImplBase< + NV, OpDtype, ActivationParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; + typedef typename DataTrait::Dtype InDataType; + typedef typename DataTrait::Dtype OutDataType; VenderActivation() : _handle(NULL), _active_descs(NULL), _input_descs(NULL), _output_descs(NULL) {} @@ -61,11 +47,11 @@ class VenderActivation& inputs, - std::vector& outputs, - ActivationParam& param, Context& ctx) { + virtual SaberStatus init(const std::vector *>& 
inputs, + std::vector *>& outputs, + ActivationParam& param, Context& ctx) { - this->_ctx = ctx; + this->_ctx = &ctx; cudaStream_t cuda_stream; cuda_stream = ctx.get_compute_stream(); @@ -81,15 +67,17 @@ class VenderActivation& inputs, - std::vector& outputs, - ActivationParam& param, Context& ctx) { - - if (!(ctx == this->_ctx)) { + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ActivationParam& param, Context& ctx) { + if (param.active == Active_prelu || param.active == Active_stanh) { + return SaberUnImplError; + } + if (!(&ctx == this->_ctx)) { if (_handle != NULL) { CUDNN_CHECK(cudnnDestroy(_handle)); } - this->_ctx = ctx; + this->_ctx = &ctx; cudaStream_t cuda_stream; cuda_stream = ctx.get_compute_stream(); @@ -127,10 +115,13 @@ class VenderActivation& inputs, - std::vector& outputs, - ActivationParam& param) { + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ActivationParam& param) { + if (param.active == Active_prelu || param.active == Active_stanh) { + return SaberUnImplError; + } const InDataType *in_data = (const InDataType *) inputs[0]->data(); OutDataType *out_data = (OutDataType *) outputs[0]->mutable_data(); @@ -149,8 +140,8 @@ class VenderActivation; +template class VenderActivation; } } -#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H +#endif //ANAKIN_SABER_FUNCS_VENDER_ACTIVATION_H diff --git a/saber/funcs/impl/cuda/vender_conv.cpp b/saber/funcs/impl/cuda/vender_conv.cpp index 4e7c80ded..fdf5f6b66 100644 --- a/saber/funcs/impl/cuda/vender_conv.cpp +++ b/saber/funcs/impl/cuda/vender_conv.cpp @@ -1,26 +1,24 @@ #include "saber/funcs/impl/cuda/vender_conv.h" -#include "cuda_fp16.h" +#include "saber/funcs/impl/cuda/cudnn_helper.h" +#include "saber/funcs/calibrate.h" namespace anakin { namespace saber { +// FP32 part template <> -SaberStatus VenderConv2D::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - - if (!(ctx == 
this->_ctx)) { +SaberStatus VenderConv2D::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + if (&ctx != this->_ctx) { if (_handle != NULL) { CUDNN_CHECK(cudnnDestroy(_handle)); } - - this->_ctx = ctx; - + this->_ctx = &ctx; cudaStream_t cuda_stream; cuda_stream = ctx.get_compute_stream(); - CUDNN_CHECK(cudnnCreate(&_handle)); CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); } @@ -32,43 +30,38 @@ SaberStatus VenderConv2D::\ int output_channel = outputs[0]->channel(); int output_height = outputs[0]->height(); int output_width = outputs[0]->width(); - int kernel_h = param.weight()->height(); int kernel_w = param.weight()->width(); - int filter_dim_a[] = {output_channel, - input_channel / param.group, - kernel_h, kernel_w - }; + input_channel / param.group, kernel_h, kernel_w}; cudnn::setNDFilterDesc(&_filter_desc, - param.weight()->dims(), filter_dim_a, CUDNN_TENSOR_NCHW); + param.weight()->dims(), filter_dim_a, CUDNN_TENSOR_NCHW); Shape in_stride = inputs[0]->get_stride(); Shape out_stride = outputs[0]->get_stride(); int dim_a[] = {input_num, input_channel, - input_height, input_width - }; - + input_height, input_width}; int dim_b[] = {input_num, output_channel, - output_height, output_width - }; + output_height, output_width}; - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); + cudnn::setTensorNdDesc(&_input_descs, + inputs[0]->dims(), dim_a, &in_stride[0]); - cudnn::setTensorNdDesc(&_output_descs, - outputs[0]->dims(), dim_b, &out_stride[0]); + cudnn::setTensorNdDesc(&_output_descs, + outputs[0]->dims(), dim_b, &out_stride[0]); int pad_a[] = {param.pad_h, param.pad_w}; int filter_stride_a[] = {param.stride_h, param.stride_w}; int dilation_a[] = {param.dilation_h, param.dilation_w}; cudnn::setConvolutionNdDesc(&_conv_descs, - inputs[0]->dims() - 2, pad_a, - filter_stride_a, dilation_a); - + inputs[0]->dims() - 2, pad_a, + filter_stride_a, dilation_a); + 
if(param.activation_param.has_active && param.activation_param.active == Active_relu) { + cudnn::set_activation_des(&_active_descs, param.activation_param.active); + } // true: use tensor core // false: disable tensor core cudnn::set_math_type(&_conv_descs, _use_tensor_core); @@ -79,23 +72,20 @@ SaberStatus VenderConv2D::\ if (param.group == inputs[0]->channel() && inputs[0]->channel() == outputs[0]->channel()) { _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } else { - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _preference, _workspace_limit_bytes, &_fwd_algo)); + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( + _handle, _input_descs, _filter_desc, _conv_descs, _output_descs, + _preference, _workspace_limit_bytes, &_fwd_algo)); } - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - + CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + _handle, _input_descs, _filter_desc, _conv_descs, _output_descs, + _fwd_algo, &_workspace_fwd_sizes)); if (_workspace_fwd_sizes > _workspaceSizeInBytes) { _workspaceSizeInBytes = _workspace_fwd_sizes; - if (_workspaceData != NULL) { cudaFree(_workspaceData); } - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); _workspace = reinterpret_cast(_workspaceData); } @@ -103,427 +93,135 @@ SaberStatus VenderConv2D::\ if (param.bias()->size() > 0) { int dim_bias[] = {1, output_channel, 1, 1}; int stride_bias[] = {output_channel, 1, 1, 1}; - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); - } - - return SaberSuccess; -} -template <> -SaberStatus VenderConv2D::\ - dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param) { - - const InDataType* in_data = (const InDataType*)inputs[0]->data(); - OutDataType* out_data = (OutDataType*)outputs[0]->mutable_data(); - - const float* weight_data = (const float*) 
param.weight()->data(); - - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - - if (param.bias()->size() > 0) { - - // add up bias. - const float* bias_data = (const float*)param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data)); - + 4, dim_bias, stride_bias); } - return SaberSuccess; } template <> -SaberStatus VenderConv2D::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); +SaberStatus VenderConv2D::\ + init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + + // ---- init cudnn resources ---- + _workspaceSizeInBytes = 0; + _workspaceData = NULL; + _workspace_fwd_sizes = 0; + + this->_ctx = &ctx; + // ---- get cuda resources ---- + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + _workspace = NULL; + int in_channels = inputs[0]->channel(); + // ---- create cudnn Descs ---- + cudnn::createFilterDesc(&_filter_desc); + cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_output_descs); + cudnn::createConvolutionDesc(&_conv_descs); + + if (param.activation_param.has_active) { + if (param.activation_param.active == Active_relu) { + cudnn::create_activation_des(&_active_descs); + } else { + _with_saber_act = true; } - - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); } - - int input_num = 
inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - int in_size = inputs[0]->valid_size(); - int out_size = outputs[0]->valid_size(); - - // ====== int8 conv, the input channel must be a multiple of 4 - CHECK_EQ(input_channel % 4, 0); - - int kernel_h = param.weight()->height(); - int kernel_w = param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel, - kernel_h, kernel_w - }; - - CUDNN_CHECK(cudnnSetFilterNdDescriptor(_filter_desc, CUDNN_DATA_INT8, - CUDNN_TENSOR_NHWC, - param.weight()->dims(), filter_dim_a)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_input_descs, - CUDNN_TENSOR_NHWC, - CUDNN_DATA_INT8, - input_num, input_channel, input_height, input_width)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_output_descs, - CUDNN_TENSOR_NHWC, - CUDNN_DATA_INT8, - input_num, output_channel, output_height, output_width)); - // ===================================================================== - - // for int8 - // These part is used to describe origin data layout; - Shape in_stride = inputs[0]->get_stride(); - Shape out_stride = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - input_height, input_width - }; - - int dim_b[] = {input_num, output_channel, - output_height, output_width - }; - - cudnn::setTensorNdDesc(&_input_nchw_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); - - cudnn::setTensorNdDesc(&_output_nchw_descs, - outputs[0]->dims(), dim_b, &out_stride[0]); - // ======= - int pad_a[] = {param.pad_h, param.pad_w}; - int filter_stride_a[] = {param.stride_h, param.stride_w}; - int dilation_a[] = {param.dilation_h, param.dilation_w}; - - cudnn::setConvolutionNdDesc(&_conv_descs, - inputs[0]->dims() - 2, pad_a, - filter_stride_a, dilation_a); - - // true: use tensor core - // false: disable 
tensor core - cudnn::set_group_count(&_conv_descs, param.group); - - // Get fastest implement of cudnn - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; - - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - if (param.bias()->size() > 0) { - int dim_bias[] = {1, output_channel, 1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); + cudnn::createTensorDesc(&_bias_desc); } - - if (x8_data_size < in_size) { - x8_data_size = in_size; - - if (x8_data != NULL) { - CUDA_CHECK(cudaFree(x8_data)); - } - - CUDA_CHECK(cudaMalloc(&x8_data, - sizeof(char) * x8_data_size)); + cudnnCreateTensorDescriptor(&_input_nchw_descs); + cudnnCreateTensorDescriptor(&_output_nchw_descs); + if (_with_saber_act) { + _saber_act = new SaberActivation; + _saber_act->init(outputs, outputs, param.activation_param, ctx); } - - if (y8_data_size < out_size) { - y8_data_size = out_size; - - if (y8_data != NULL) { - CUDA_CHECK(cudaFree(y8_data)); - } - - CUDA_CHECK(cudaMalloc(&y8_data, sizeof(char) * y8_data_size)); - } - - return SaberSuccess; + return create(inputs, outputs, param, ctx); } template <> -SaberStatus VenderConv2D::\ - dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param) { - - const void* in_data = (const void*)inputs[0]->data(); - void* out_data = (void*)outputs[0]->mutable_data(); - - // scale data for int8 - float scale = 1.f; - float scale_1 = 1 / scale; - - // int8 tensor transoform - CUDNN_CHECK(cudnnTransformTensor(_handle, - &scale, - _input_nchw_descs, in_data, - 
cudnn::cudnnTypeWrapper::kZero(), - _input_descs, x8_data)); - - const void* weight_data = (const void*) param.weight()->data(); - - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, x8_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, y8_data - )); - - if (param.bias()->size() > 0) { - - // add up bias. - const void* bias_data = (const void*)param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, y8_data)); - } - - // int8 tensor transoform - CUDNN_CHECK(cudnnTransformTensor(_handle, - &scale_1, - _output_descs, y8_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_nchw_descs, out_data)); - - return SaberSuccess; +SaberStatus VenderConv2D::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param) { -} - -template <> -SaberStatus VenderConv2D::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - CHECK_EQ(inputs[0]->dims(), 5); - CHECK_EQ(inputs[0]->shape()[4], 4); - CHECK_EQ(outputs[0]->dims(), 5); - CHECK_EQ(outputs[0]->shape()[4], 4); - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - int in_size = inputs[0]->valid_size(); - int out_size = outputs[0]->valid_size(); - - // ====== int8 conv, the input 
channel must be a multiple of 4 - CHECK_EQ(input_channel % 4, 0); - - int kernel_h = param.weight()->height(); - int kernel_w = param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel, - kernel_h, kernel_w - }; - - CUDNN_CHECK(cudnnSetFilterNdDescriptor(_filter_desc, CUDNN_DATA_INT8x4, - CUDNN_TENSOR_NCHW_VECT_C, - 4, filter_dim_a)); - - - // not supported stride in nchw_vect_c - - // Shape in_stride = inputs[0]->get_stride(); - // Shape out_stride = outputs[0]->get_stride(); - - // std::cout<<"in_stride"; - // for (auto i : in_stride) { - // std::cout<<", "<(&_conv_descs, - 2, pad_a, - filter_stride_a, dilation_a); - - // true: use tensor core - // false: disable tensor core - cudnn::set_group_count(&_conv_descs, param.group); - - // Get fastest implement of cudnn - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; + const float* in_data = (const float*)inputs[0]->data(); + float* out_data = (float*)outputs[0]->mutable_data(); + const float* weight_data = (const float*) param.weight()->data(); - if (_workspaceData != NULL) { - cudaFree(_workspaceData); + if (param.activation_param.has_active && param.activation_param.active == Active_relu) { + if (param.bias()->size() > 0) { + const float * bias_data = (const float*)param.bias()->data(); + CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle, + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + _filter_desc, weight_data, + _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, + &_beta, + _output_descs, out_data, + _bias_desc, bias_data, + _active_descs, _output_descs, out_data)); + } else { + CUDNN_CHECK(cudnnConvolutionForward(_handle, + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + 
_filter_desc, weight_data, + _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, + &_beta, + _output_descs, out_data)); + + CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, + cudnn::cudnnTypeWrapper::kOne(), + _output_descs, out_data, + &_beta, + _output_descs, out_data)); } - - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.bias()->size() > 0) { - LOG(INFO) << "cudnn not support nchw_vect_c add Tensor, " - "bias is not supported in this layout"; - - return SaberUnImplError; - // int dim_bias[] = {1, output_channel, 1, 1}; - // int stride_bias[] = {output_channel, 1, 1, 1}; - // - // CUDNN_CHECK(cudnnSetTensor4dDescriptor(_bias_desc, - // CUDNN_TENSOR_NCHW_VECT_C, - // CUDNN_DATA_INT8x4, - // 1, output_channel, 1, 1)); - } - - return SaberSuccess; -} - -template <> -SaberStatus VenderConv2D:: \ - dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param) { - - const void* in_data = (const void*)inputs[0]->data(); - void* out_data = (void*)outputs[0]->mutable_data(); - - const void* weight_data = (const void*) param.weight()->data(); - - if (param.bias()->size() > 0) { - LOG(INFO) << "cudnn not support nchw_vect_c add Tensor, " - "bias is not supported in this layout"; - - return SaberUnImplError; - - // CUDNN_CHECK(cudnnConvolutionForward(_handle, - // cudnn::cudnnTypeWrapper::kOne(), - // _input_descs, in_data, - // _filter_desc, weight_data, - // _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - // cudnn::cudnnTypeWrapper::kZero(), - // _output_descs, out_data)); - // - // const void * bias_data = (const void*)param.bias()->data(); - // - // CUDNN_CHECK(cudnnAddTensor(_handle, - // cudnn::cudnnTypeWrapper::kOne(), - // _bias_desc, bias_data, - // cudnn::cudnnTypeWrapper::kOne(), - // _output_descs, out_data)); - } else { CUDNN_CHECK(cudnnConvolutionForward(_handle, cudnn::cudnnTypeWrapper::kOne(), _input_descs, in_data, _filter_desc, weight_data, 
_conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), + &_beta, _output_descs, out_data)); - } + if (param.bias()->size() > 0) { + // add up bias. + const float *bias_data = (const float *) param.bias()->data(); + CUDNN_CHECK(cudnnAddTensor(_handle, + cudnn::cudnnTypeWrapper::kOne(), + _bias_desc, bias_data, + cudnn::cudnnTypeWrapper::kOne(), + _output_descs, out_data)); + } + } + if (_with_saber_act) { + _saber_act->dispatch(outputs, outputs, param.activation_param); + } return SaberSuccess; - } +// INT8 part template <> -SaberStatus VenderConv2D::create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { +SaberStatus VenderConv2D::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { - CHECK_EQ(inputs[0]->dims(), 5); - CHECK_EQ(inputs[0]->shape()[4], 4); - - if (!(ctx == this->_ctx)) { + if (&ctx != this->_ctx) { if (_handle != NULL) { CUDNN_CHECK(cudnnDestroy(_handle)); } - this->_ctx = ctx; + this->_ctx = &ctx; cudaStream_t cuda_stream; cuda_stream = ctx.get_compute_stream(); - CUDNN_CHECK(cudnnCreate(&_handle)); CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); } @@ -546,8 +244,7 @@ SaberStatus VenderConv2D::c int filter_dim_a[] = {output_channel, input_channel, - kernel_h, kernel_w - }; + kernel_h, kernel_w}; CUDNN_CHECK(cudnnSetFilterNdDescriptor(_filter_desc, CUDNN_DATA_INT8x4, CUDNN_TENSOR_NCHW_VECT_C, @@ -556,20 +253,26 @@ SaberStatus VenderConv2D::c CUDNN_CHECK(cudnnSetTensor4dDescriptor(_input_descs, CUDNN_TENSOR_NCHW_VECT_C, CUDNN_DATA_INT8x4, - input_num, input_channel, input_height, input_width)); + input_num, input_channel, + input_height, input_width)); CUDNN_CHECK(cudnnSetTensor4dDescriptor(_output_descs, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, - input_num, output_channel, output_height, output_width)); + input_num, output_channel, + output_height, output_width)); int pad_a[] = {param.pad_h, param.pad_w}; int filter_stride_a[] 
= {param.stride_h, param.stride_w}; int dilation_a[] = {param.dilation_h, param.dilation_w}; cudnn::setConvolutionNdDesc(&_conv_descs, - 2, pad_a, - filter_stride_a, dilation_a); + 2, pad_a, + filter_stride_a, dilation_a); + + if(param.activation_param.has_active) { + cudnn::set_activation_des(&_active_descs, param.activation_param.active); + } // true: use tensor core // false: disable tensor core @@ -577,9 +280,9 @@ SaberStatus VenderConv2D::c // Get fastest implement of cudnn _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); + CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + _handle, _input_descs, _filter_desc, _conv_descs, _output_descs, + _fwd_algo, &_workspace_fwd_sizes)); if (_workspace_fwd_sizes > _workspaceSizeInBytes) { _workspaceSizeInBytes = _workspace_fwd_sizes; @@ -599,44 +302,176 @@ SaberStatus VenderConv2D::c CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_channel, 1, 1)); - // cudnn::setTensorNdDesc(&_bias_desc, - // 4, dim_bias, stride_bias); } return SaberSuccess; } template <> -SaberStatus VenderConv2D::dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param) { - - const void* in_data = (const void*)inputs[0]->data(); - void* out_data = (void*)outputs[0]->mutable_data(); - - const void* weight_data = (const void*) param.weight()->data(); - - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - - if (param.bias()-> size() > 0) { - // add up bias. 
- const void* bias_data = (const void*)param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data)); +SaberStatus VenderConv2D::\ + init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + + bool use_int8 = true; + use_int8 &= ((inputs[0]->channel() % 4) == 0); + use_int8 &= ((outputs[0]->channel() % 4) == 0); + // INT8 only support Active relu + use_int8 &= ((!param.activation_param.has_active) + || (param.activation_param.active == Active_relu)); + + if (!use_int8) { + return SaberInvalidValue; + } else { + // prepare int8 memory + Tensor weights_fp32_host; + Tensor weights_int8_host; + weights_fp32_host.re_alloc(param.weight()->valid_shape(), AK_FLOAT); + weights_int8_host.re_alloc(param.weight()->valid_shape(), AK_INT8); + int8_weights.re_alloc(param.weight()->valid_shape(), AK_INT8); + weights_int8_host.set_layout(Layout_NCHW_C4); + int8_weights.set_layout(Layout_NCHW_C4); + weights_fp32_host.copy_from(*param.weight()); + convert_weights_to_nchw_c4_host(weights_int8_host, weights_fp32_host, ctx); + int8_weights.copy_from(weights_int8_host); + int8_weights.set_scale(weights_int8_host.get_scale()); + + cudaMalloc(&weights_scale, sizeof(float) * int8_weights.get_scale().size()); + cudaMemcpy(weights_scale, &(int8_weights.get_scale()[0]), sizeof(float) * int8_weights.get_scale().size(), + cudaMemcpyHostToDevice); + } + // ---- init cudnn resources ---- + _workspaceSizeInBytes = 0; + _workspaceData = NULL; + _workspace_fwd_sizes = 0; + + this->_ctx = &ctx; + // ---- get cuda resources ---- + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + _workspace = NULL; + int in_channels = inputs[0]->channel(); + + // ---- create cudnn Descs ---- + cudnn::createFilterDesc(&_filter_desc); + 
cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_output_descs); + cudnn::createConvolutionDesc(&_conv_descs); + if (param.activation_param.has_active) { + cudnn::create_activation_des(&_active_descs); + } + if (param.bias()->size() > 0) { + cudnn::createTensorDesc(&_bias_desc); + if (use_int8) { + float in_scale; + if (inputs[0]->get_scale().size() == 1) { + in_scale = inputs[0]->get_scale()[0]; + } else { + LOG(FATAL) << "scale now support static calibrate only!!"; + } + Tensor bias_fp32_host; + Tensor bias_int32_host; + bias_fp32_host.re_alloc(param.bias()->valid_shape(), AK_FLOAT); + bias_int32_host.re_alloc(param.bias()->valid_shape(), AK_FLOAT); + int32_bias.re_alloc(param.bias()->valid_shape(), AK_FLOAT); + bias_fp32_host.copy_from(*param.bias()); + convert_bias_host(bias_int32_host, bias_fp32_host, in_scale, int8_weights.get_scale(), ctx); + int32_bias.copy_from(bias_int32_host); + } } - return SaberSuccess; + cudnnCreateTensorDescriptor(&_input_nchw_descs); + cudnnCreateTensorDescriptor(&_output_nchw_descs); + return create(inputs, outputs, param, ctx); } +template <> +SaberStatus VenderConv2D::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param) { + + const void* in_data; + void* out_data; + float in_scale = 0.f; + + if (inputs[0]->get_dtype() == AK_FLOAT) { + if (inputs[0]->get_scale().size() == 1) { + in_scale = inputs[0]->get_scale()[0]; + } else { + LOG(FATAL) << "scale now support static calibrate only!!"; + } + int8_input.re_alloc(inputs[0]->valid_shape(), AK_INT8); + int8_input.set_layout(Layout_NCHW_C4); + conv_calibrate_fp32_int8_c4(int8_input, *inputs[0], in_scale, *(this->_ctx)); + in_data = (const void *)int8_input.data(); + } else { + in_data = (const void*)inputs[0]->data(); + } + + out_data = (void*)outputs[0]->mutable_data(); + const void* weight_data = (const void*) int8_weights.data(); + + if (param.activation_param.has_active) { + if (param.bias()->valid_size() > 0) { + const void 
*bias_data = (const void *) int32_bias.data(); + CUDNN_CHECK(cudnnConvolutionBiasActivationForward( + _handle, cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, _filter_desc, weight_data, + _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, + cudnn::cudnnTypeWrapper::kZero(), + _output_descs, out_data, + _bias_desc, bias_data, + _active_descs, _output_descs, out_data)); + } else { + CUDNN_CHECK(cudnnConvolutionForward(_handle, + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + _filter_desc, weight_data, + _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, + cudnn::cudnnTypeWrapper::kZero(), + _output_descs, out_data)); + + CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, + cudnn::cudnnTypeWrapper::kOne(), + _output_descs, out_data, + cudnn::cudnnTypeWrapper::kZero(), + _output_descs, out_data)); + } + } else { + CUDNN_CHECK(cudnnConvolutionForward(_handle, + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + _filter_desc, weight_data, + _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, + cudnn::cudnnTypeWrapper::kZero(), + _output_descs, out_data)); + if (param.bias()->size() > 0) { + // add up bias. 
+ const void *bias_data = (const void *) int32_bias.data(); + CUDNN_CHECK(cudnnAddTensor(_handle, + cudnn::cudnnTypeWrapper::kOne(), + _bias_desc, bias_data, + cudnn::cudnnTypeWrapper::kOne(), + _output_descs, out_data)); + } + } + if (outputs[0]->get_dtype() == AK_FLOAT) { + conv_calibrate_int32_fp32( + *outputs[0], *outputs[0], in_scale, weights_scale, *_ctx); + } else if (outputs[0]->get_dtype() == AK_INT8) { + LOG(FATAL) << "not support output int8 now!!!"; + } + return SaberSuccess; +}; + +template class VenderConv2D; +template class VenderConv2D; +DEFINE_OP_TEMPLATE(VenderConv2D, ConvParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/vender_conv.h b/saber/funcs/impl/cuda/vender_conv.h index c7ccb88f3..a1dd1c594 100644 --- a/saber/funcs/impl/cuda/vender_conv.h +++ b/saber/funcs/impl/cuda/vender_conv.h @@ -1,50 +1,35 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV2D_H #define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV2D_H #include "saber/funcs/impl/impl_conv.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" +#include "saber/funcs/impl/cuda/saber_activation.h" #include namespace anakin{ namespace saber{ -template -class VenderConv2D : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvParam > > -{ +template +class VenderConv2D : public ImplBase< + NV, OpDtype, ConvParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; + VenderConv2D() : _handle(NULL) , _workspaceData(NULL) @@ -57,12 +42,10 @@ class VenderConv2D& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - - _workspace_fwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); - if (param.bias()->size() > 0) { - cudnn::createTensorDesc(&_bias_desc); - } - - cudnnCreateTensorDescriptor(&_input_nchw_descs); - cudnnCreateTensorDescriptor(&_output_nchw_descs); + //call 
cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param); - return create(inputs, outputs, param, ctx); + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx); - - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param); - + void set_beta(float beta) { + _beta = beta; + } private: cudnnHandle_t _handle; cudnnConvolutionFwdAlgo_t _fwd_algo; - cudnnTensorDescriptor_t _input_descs; cudnnTensorDescriptor_t _output_descs; cudnnTensorDescriptor_t _bias_desc; - cudnnFilterDescriptor_t _filter_desc; - cudnnConvolutionDescriptor_t _conv_descs; + // activation descriptor + cudnnActivationDescriptor_t _active_descs; + size_t _workspace_fwd_sizes; size_t _workspaceSizeInBytes; // size of underlying storage - void *_workspaceData; // underlying storage void *_workspace; // aliases into _workspaceData - const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; + const size_t _workspace_limit_bytes = 4 * 1024 * 1024; const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; // create transform descriptor cudnnTensorDescriptor_t _input_nchw_descs; cudnnTensorDescriptor_t _output_nchw_descs; - - void *x8_data; - void *y8_data; - - int x8_data_size; - int y8_data_size; + float _beta{0.f}; + bool _with_saber_act{false}; + SaberActivation *_saber_act{nullptr}; + float* weights_scale; + Tensor int8_weights; + Tensor int8_input; + Tensor int8_output; + Tensor int32_bias; }; diff --git a/saber/funcs/impl/cuda/vender_conv_act.cpp b/saber/funcs/impl/cuda/vender_conv_act.cpp deleted file mode 100644 index 33438c84d..000000000 --- a/saber/funcs/impl/cuda/vender_conv_act.cpp +++ /dev/null 
@@ -1,681 +0,0 @@ -#include "saber/funcs/impl/cuda/vender_conv_act.h" -#include "cuda_fp16.h" - -namespace anakin { -namespace saber { - -template <> -SaberStatus VenderConv2DAct::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx) { - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - - int kernel_h = param.conv_param.weight()->height(); - int kernel_w = param.conv_param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel / param.conv_param.group, - kernel_h, kernel_w}; - - cudnn::setNDFilterDesc(&_filter_desc, - param.conv_param.weight()->dims(), - filter_dim_a, CUDNN_TENSOR_NCHW); - - Shape in_stride = inputs[0]->get_stride(); - Shape out_stride = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - input_height, input_width}; - - int dim_b[] = {input_num, output_channel, - output_height, output_width}; - - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); - - cudnn::setTensorNdDesc(&_output_descs, - outputs[0]->dims(), dim_b, &out_stride[0]); - - int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w}; - int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w}; - int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w}; - - cudnn::setConvolutionNdDesc(&_conv_descs, - inputs[0]->dims() - 2, pad_a, - filter_stride_a, dilation_a); - // set activation descriptor - 
if(param.has_active) { - cudnn::set_activation_des(&_active_descs, param.activation_param.active); - } - - // true: use tensor core - // false: disable tensor core - cudnn::set_math_type(&_conv_descs, _use_tensor_core); - cudnn::set_group_count(&_conv_descs, param.conv_param.group); - - // Get fastest implement of cudnn - // set up algo and workspace size - if (param.conv_param.group == inputs[0]->channel() && \ - inputs[0]->channel() == outputs[0]->channel()) { - //CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - } else { - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(_handle, \ - _input_descs, _filter_desc, _conv_descs, _output_descs, \ - _preference, _workspace_limit_bytes, &_fwd_algo)); - } - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.conv_param.bias()->size()> 0) { - int dim_bias[] = {1, output_channel, 1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); - } - return SaberSuccess; -} -template <> -SaberStatus VenderConv2DAct::\ - dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param) { - - const InDataType *in_data = (const InDataType*)inputs[0]->data(); - InDataType *out_data = (InDataType*)outputs[0]->mutable_data(); - - const float *weight_data = (const float *) param.conv_param.weight()->data(); - if (param.has_active == false) { - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, - _workspace, 
_workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - - if (param.conv_param.bias()->size() > 0) { - - // add up bias. - const float * bias_data = (const float*)param.conv_param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data)); - - } - return SaberSuccess; - } - - if (param.conv_param.bias()->size() > 0) { - const float * bias_data = (const float*)param.conv_param.bias()->data(); - - CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data, - _bias_desc, bias_data, - _active_descs, _output_descs, out_data)); - - } else { - - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - - CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, out_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - } - return SaberSuccess; -} - -#if 0 -template <> -SaberStatus VenderConv2DAct::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx> &ctx) { - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = 
inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - int in_size = inputs[0]->valid_size(); - int out_size = outputs[0]->valid_size(); - - // ====== int8 conv, the input channel must be a multiple of 4 - CHECK_EQ(input_channel % 4, 0); - - int kernel_h = param.conv_param.weight()->height(); - int kernel_w = param.conv_param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel, - kernel_h, kernel_w}; - - CUDNN_CHECK(cudnnSetFilterNdDescriptor(_filter_desc, CUDNN_DATA_INT8x4, - CUDNN_TENSOR_NCHW_VECT_C, - param.conv_param.weight()->dims(), filter_dim_a)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_input_descs, - CUDNN_TENSOR_NCHW_VECT_C, - CUDNN_DATA_INT8x4, - input_num, input_channel, input_height, input_width)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_output_descs, - CUDNN_TENSOR_NCHW_VECT_C, - CUDNN_DATA_INT8x4, - input_num, output_channel, output_height, output_width)); - // ===================================================================== - - // for int8 - // These part is used to describe origin data layout; - Shape in_stride = inputs[0]->get_stride(); - Shape out_stride = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - input_height, input_width}; - - int dim_b[] = {input_num, output_channel, - output_height, output_width}; - - cudnn::setTensorNdDesc(&_input_nchw_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); - - cudnn::setTensorNdDesc(&_output_nchw_descs, - outputs[0]->dims(), dim_b, &out_stride[0]); - // ======= - int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w}; - int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w}; - int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w}; - - cudnn::setConvolutionNdDesc(&_conv_descs, - inputs[0]->dims() - 2, pad_a, - filter_stride_a, dilation_a); - // set activation descriptor - 
cudnn::set_activation_des(&_active_descs, param.activation_param.active); - - // true: use tensor core - // false: disable tensor core - cudnn::set_group_count(&_conv_descs, param.conv_param.group); - - // Get fastest implement of cudnn - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.conv_param.bias()->size() > 0) { - int dim_bias[] = {1, output_channel, 1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); - } - - if (x8_data_size < in_size){ - x8_data_size = in_size; - if (x8_data != NULL) { - CUDA_CHECK(cudaFree(x8_data)); - } - CUDA_CHECK(cudaMalloc(&x8_data, - sizeof(char) * x8_data_size)); - } - - if (y8_data_size < out_size){ - y8_data_size = out_size; - if (y8_data != NULL) { - CUDA_CHECK(cudaFree(y8_data)); - } - CUDA_CHECK(cudaMalloc(&y8_data, sizeof(char) * y8_data_size)); - } - return SaberSuccess; -} - -template <> -SaberStatus VenderConv2DAct::dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param) { - - const void *in_data = (const void*)inputs[0]->data(); - void *out_data = (void*)outputs[0]->mutable_data(); - - // scale data for int8 - float scale = 1.f; - float scale_1 = 1 / scale; - - // int8 tensor transoform - CUDNN_CHECK(cudnnTransformTensor(_handle, - &scale, - _input_nchw_descs, in_data, - cudnn::cudnnTypeWrapper::kZero(), - _input_descs, x8_data)); - - const void *weight_data = (const void *) param.conv_param.weight()->data(); - - if (param.conv_param.bias()->size() > 0) { - const float 
* bias_data = (const float*)param.conv_param.bias()->data(); - - CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, x8_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, y8_data, - _bias_desc, bias_data, - _active_descs, _output_descs, y8_data)); - } else { - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, x8_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, y8_data)); - - CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, y8_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, y8_data)); - } - // int8 tensor transoform - CUDNN_CHECK(cudnnTransformTensor(_handle, - &scale_1, - _output_descs, y8_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_nchw_descs, out_data)); - - return SaberSuccess; - -} - -template <> -SaberStatus VenderConv2DAct::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx> &ctx) { - CHECK_EQ(inputs[0]->dims(), 5); - CHECK_EQ(inputs[0]->shape()[4], 4); - CHECK_EQ(outputs[0]->dims(), 5); - CHECK_EQ(outputs[0]->shape()[4], 4); - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - int in_size = inputs[0]->valid_size(); - 
int out_size = outputs[0]->valid_size(); - - // ====== int8 conv, the input channel must be a multiple of 4 - CHECK_EQ(input_channel % 4, 0); - int kernel_h = param.conv_param.weight()->height(); - int kernel_w = param.conv_param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel, - kernel_h, kernel_w}; - - CUDNN_CHECK(cudnnSetFilterNdDescriptor(_filter_desc, CUDNN_DATA_INT8x4, - CUDNN_TENSOR_NCHW_VECT_C, - 4, filter_dim_a)); - -// Shape in_stride = inputs[0]->get_stride(); -// Shape out_stride = outputs[0]->get_stride(); - -// std::cout<<"in_stride"; -// for (auto i : in_stride) { -// std::cout<<", "<(&_conv_descs, - 2, pad_a, - filter_stride_a, dilation_a); - // set activation descriptor - cudnn::set_activation_des(&_active_descs, param.activation_param.active); - - // true: use tensor core - // false: disable tensor core - cudnn::set_group_count(&_conv_descs, param.conv_param.group); - - // Get fastest implement of cudnn - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.conv_param.bias()->size() > 0) { - int dim_bias[] = {1, output_channel, 1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); - } - return SaberSuccess; -} - -template <> -SaberStatus VenderConv2DAct::dispatch( - const std::vector inputs, - std::vector outputs, - ConvActiveParam ¶m) { - - const void *in_data = (const void*)inputs[0]->data(); - void *out_data = (void*)outputs[0]->mutable_data(); - const void *weight_data = (const void *) 
param.conv_param.weight()->data(); - - if (param.conv_param.bias()->size() > 0) { - const float * bias_data = (const float*)param.conv_param.bias()->data(); - - CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data, - _bias_desc, bias_data, - _active_descs, _output_descs, out_data)); - } else { - - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - - CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, out_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - } - - return SaberSuccess; - -} - -template <> -SaberStatus VenderConv2DAct::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx> &ctx) { - - CHECK_EQ(inputs[0]->dims(), 5); - CHECK_EQ(inputs[0]->shape()[4], 4); - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - int in_size = inputs[0]->valid_size(); - int out_size = outputs[0]->valid_size(); - - // ====== int8 conv, the input channel must be a multiple of 4 - CHECK_EQ(input_channel % 4, 0); - int 
kernel_h = param.conv_param.weight()->height(); - int kernel_w = param.conv_param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel, - kernel_h, kernel_w}; - - CUDNN_CHECK(cudnnSetFilterNdDescriptor(_filter_desc, CUDNN_DATA_INT8x4, - CUDNN_TENSOR_NCHW_VECT_C, - 4, filter_dim_a)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_input_descs, - CUDNN_TENSOR_NCHW_VECT_C, - CUDNN_DATA_INT8x4, - input_num, input_channel, input_height, input_width)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_output_descs, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - input_num, output_channel, output_height, output_width)); - - int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w}; - int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w}; - int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w}; - - cudnn::setConvolutionNdDesc(&_conv_descs, - 2, pad_a, - filter_stride_a, dilation_a); - // set activation descriptor - cudnn::set_activation_des(&_active_descs, param.activation_param.active); - - // true: use tensor core - // false: disable tensor core - cudnn::set_group_count(&_conv_descs, param.conv_param.group); - - // Get fastest implement of cudnn - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.conv_param.bias()->size() > 0) { - int dim_bias[] = {1, output_channel, 1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); - } - return SaberSuccess; -} - -template <> -SaberStatus 
VenderConv2DAct::\ - dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param) { - - const void *in_data = (const void*)inputs[0]->data(); - void *out_data = (void*)outputs[0]->mutable_data(); - const void *weight_data = (const void *) param.conv_param.weight()->data(); - - if (param.has_active == false) { - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - - if (param.conv_param.bias()->size() > 0) { - - // add up bias. - const float * bias_data = (const float*)param.conv_param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data)); - - } - return; - } - - if (param.conv_param.bias()->size() > 0) { - const float * bias_data = (const float*)param.conv_param.bias()->data(); - - CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data, - _bias_desc, bias_data, - _active_descs, _output_descs, out_data)); - } else { - - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - - CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, out_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - } - - return SaberSuccess; -} -#endif -} -} diff --git a/saber/funcs/impl/cuda/vender_conv_act.h b/saber/funcs/impl/cuda/vender_conv_act.h deleted 
file mode 100644 index 802445b2d..000000000 --- a/saber/funcs/impl/cuda/vender_conv_act.h +++ /dev/null @@ -1,206 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_H - -#include "saber/funcs/impl/impl_conv_act.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" -#include - -namespace anakin{ - -namespace saber{ - -template -class VenderConv2DAct : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActiveParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderConv2DAct() - : _handle(NULL) - , _workspaceData(NULL) - , _workspace(NULL) - , _conv_descs(NULL) - , _input_descs(NULL) - , _output_descs(NULL) - , _filter_desc(NULL) - , _workspace_fwd_sizes(0) - , _workspaceSizeInBytes(0) - , _fwd_algo((cudnnConvolutionFwdAlgo_t)0) - , _input_nchw_descs(NULL) - , _output_nchw_descs(NULL) - , _active_descs(NULL) - , _bias_desc(NULL) - , x8_data(NULL) - , y8_data(NULL) - , x8_data_size(0) - , y8_data_size(0) - {} - - ~VenderConv2DAct() { - - if (_conv_descs) { - CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); - } - if (_input_descs) { - 
CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); - } - if (_output_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); - } - if (_filter_desc) { - CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); - } - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - if (_input_nchw_descs != NULL) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_nchw_descs)); - } - if (_output_nchw_descs != NULL) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_nchw_descs)); - } - if (x8_data != NULL) { - CUDA_CHECK(cudaFree(x8_data)); - } - if (y8_data != NULL) { - CUDA_CHECK(cudaFree(y8_data)); - } - if (_active_descs) { - CUDNN_CHECK(cudnnDestroyActivationDescriptor(_active_descs)); - } - if (_bias_desc) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_bias_desc)); - } - } - - /** - * [Create description] Init all cudnn resource here - * @AuthorHTL - * @DateTime 2018-02-01T16:13:06+0800 - * @param inputs [description] - * @param outputs [description] - * @param conv_param [conv parameters] - */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx) { - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - - _workspace_fwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - cudnn::create_activation_des(&_active_descs); - - if (param.conv_param.bias()->size() > 0) { - cudnn::createTensorDesc(&_bias_desc); - } - - 
cudnnCreateTensorDescriptor(&_input_nchw_descs); - cudnnCreateTensorDescriptor(&_output_nchw_descs); - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx); - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param); -private: - cudnnHandle_t _handle; - cudnnConvolutionFwdAlgo_t _fwd_algo; - - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnTensorDescriptor_t _bias_desc; - - cudnnFilterDescriptor_t _filter_desc; - - cudnnConvolutionDescriptor_t _conv_descs; - - size_t _workspace_fwd_sizes; - size_t _workspaceSizeInBytes; // size of underlying storage - - void *_workspaceData; // underlying storage - void *_workspace; // aliases into workspaceData - - const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; - - // activation descriptor - cudnnActivationDescriptor_t _active_descs; - - // create transform descriptor - cudnnTensorDescriptor_t _input_nchw_descs; - cudnnTensorDescriptor_t _output_nchw_descs; - - void *x8_data; - void *y8_data; - - int x8_data_size; - int y8_data_size; -}; - - -} - -} -#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/cuda/vender_conv_act_pooling.cpp b/saber/funcs/impl/cuda/vender_conv_act_pooling.cpp deleted file mode 100644 index 4daea1528..000000000 --- a/saber/funcs/impl/cuda/vender_conv_act_pooling.cpp +++ /dev/null @@ -1,231 +0,0 @@ -#include "saber/funcs/impl/cuda/vender_conv_act_pooling.h" -#include "cuda_fp16.h" - -namespace anakin { -namespace saber { - -template <> -SaberStatus VenderConv2DActPooling::\ - create(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param, Context &ctx) { - - if (!(ctx == this->_ctx)) 
{ - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - { - _inner_shape = inputs[0]->shape(); - _inner_shape[0] = input_num; - _inner_shape[1] = param.conv_param.weight()->num(); - - int kernel_exten = param.conv_param.dilation_h * - (param.conv_param.weight()->height() - 1) + 1; - int output_dim = (input_height + 2 * param.conv_param.pad_h - kernel_exten) - / param.conv_param.stride_h + 1; - _inner_shape[2] = output_dim; - kernel_exten = param.conv_param.dilation_w * - (param.conv_param.weight()->width() - 1) + 1; - output_dim = (input_width + 2 * param.conv_param.pad_w - kernel_exten) - / param.conv_param.stride_w + 1; - _inner_shape[3] = output_dim; - _inner_tensor.re_alloc(_inner_shape); - } - - int kernel_h = param.conv_param.weight()->height(); - int kernel_w = param.conv_param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel / param.conv_param.group, - kernel_h, kernel_w}; - - cudnn::setNDFilterDesc(&_filter_desc, - param.conv_param.weight()->dims(), - filter_dim_a, CUDNN_TENSOR_NCHW); - - Shape in_stride = inputs[0]->get_stride(); - Shape inner_stride = _inner_tensor.get_stride(); - Shape out_stride = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - input_height, input_width}; - - int dim_inner[] = {_inner_shape[0], _inner_shape[1], - _inner_shape[2], _inner_shape[3]}; - - int dim_b[] = {input_num, output_channel, - output_height, output_width}; - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, 
&in_stride[0]); - cudnn::setTensorNdDesc(&_inner_descs, - 4, dim_inner, - &inner_stride[0]); - cudnn::setTensorNdDesc(&_output_descs, - outputs[0]->dims(), dim_b, &out_stride[0]); - int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w}; - int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w}; - int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w}; - - cudnn::setConvolutionNdDesc(&_conv_descs, - inputs[0]->dims() - 2, pad_a, - filter_stride_a, dilation_a); - // set activation descriptor - if (param.has_activation) { - cudnn::set_activation_des(&_active_descs, param.activation_param.active); - } - if (param.has_pooling) { - int windowHeight[] = {param.pooling_param.window_h, - param.pooling_param.window_w}; - int padding[] = {param.pooling_param.pad_h, - param.pooling_param.pad_w}; - int stride[] = {param.pooling_param.stride_h, - param.pooling_param.stride_w}; - - cudnn::set_nd_pooling_des(&_pooling_descs, - param.pooling_param.pooling_type, - _inner_tensor.dims() - 2, - windowHeight, - padding,stride); - } - // true: use tensor core - // false: disable tensor core - cudnn::set_math_type(&_conv_descs, _use_tensor_core); - cudnn::set_group_count(&_conv_descs, param.conv_param.group); - - // Get fastest implement of cudnn - // set up algo and workspace size - if (param.conv_param.group == inputs[0]->channel() && \ - inputs[0]->channel() == outputs[0]->channel()) { - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;//CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - } else { - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(_handle, \ - _input_descs, _filter_desc, _conv_descs, _inner_descs, \ - _preference, _workspace_limit_bytes, &_fwd_algo)); - } - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, - _input_descs, _filter_desc, - _conv_descs, _inner_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; - 
if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.conv_param.bias()->size()> 0) { - int dim_bias[] = {1, output_channel, 1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); - } - return SaberSuccess; -} -template <> -SaberStatus VenderConv2DActPooling::\ - dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param) { - - const InDataType *in_data = (const InDataType*)inputs[0]->data(); - InDataType *inner_data = (InDataType*)_inner_tensor.mutable_data(); - InDataType *out_data = (InDataType*)outputs[0]->mutable_data(); - - const float *weight_data = (const float *) param.conv_param.weight()->data(); - if (param.has_activation == false) { - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _inner_descs, inner_data - )); - if (param.conv_param.bias()->size() > 0) { - // add up bias. 
- const float * bias_data = (const float*)param.conv_param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _inner_descs, inner_data)); - } - CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs, - cudnn::cudnnTypeWrapper::kOne(), - _inner_descs, inner_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - return SaberSuccess; - } - - if (param.conv_param.bias()->size() > 0) { - const float * bias_data = (const float*)param.conv_param.bias()->data(); - CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, - _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _inner_descs, inner_data, - _bias_desc, bias_data, - _active_descs, _inner_descs, inner_data)); - - CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs, - cudnn::cudnnTypeWrapper::kOne(), - _inner_descs, inner_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - - } else { - - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, - _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _inner_descs, inner_data - )); - - CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _inner_descs, inner_data, - cudnn::cudnnTypeWrapper::kZero(), - _inner_descs, inner_data - )); - CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs, - cudnn::cudnnTypeWrapper::kOne(), - _inner_descs, inner_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - } - return SaberSuccess; -} -} -} diff --git a/saber/funcs/impl/cuda/vender_conv_act_pooling.h b/saber/funcs/impl/cuda/vender_conv_act_pooling.h deleted file mode 100644 index c766ee0fc..000000000 --- 
a/saber/funcs/impl/cuda/vender_conv_act_pooling.h +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ACT_POOLING_H - -#include "saber/funcs/impl/impl_conv_act_pooling.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" -#include - -namespace anakin{ - -namespace saber{ - -template -class VenderConv2DActPooling : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActivePoolingParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderConv2DActPooling() - : _handle(NULL) - , _workspaceData(NULL) - , _workspace(NULL) - , _conv_descs(NULL) - , _input_descs(NULL) - , _output_descs(NULL) - , _filter_desc(NULL) - , _inner_descs(NULL) - , _bias_desc(NULL) - , _pooling_descs(NULL) - , _active_descs(NULL) - , _workspace_fwd_sizes(0) - , _workspaceSizeInBytes(0) - , _fwd_algo((cudnnConvolutionFwdAlgo_t)0) - {} - ~VenderConv2DActPooling() { - - if (_conv_descs) { - CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); - } - if (_input_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); - } - if (_output_descs) { - 
CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); - } - if (_filter_desc) { - CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); - } - if (_handle) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - if (_workspaceData) { - CUDA_CHECK(cudaFree(_workspaceData)); - } - if (_active_descs) { - CUDNN_CHECK(cudnnDestroyActivationDescriptor(_active_descs)); - } - if (_inner_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_inner_descs)); - } - if (_bias_desc) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_bias_desc)); - } - if (_pooling_descs) { - CUDNN_CHECK(cudnnDestroyPoolingDescriptor(_pooling_descs)); - } - } - - /** - * [Create description] Init all cudnn resource here - * @AuthorHTL - * @DateTime 2018-02-01T16:13:06+0800 - * @param inputs [description] - * @param outputs [description] - * @param conv_param [conv parameters] - */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param, Context& ctx) { - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - - _workspace_fwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_inner_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - if (param.has_activation) { - cudnn::create_activation_des(&_active_descs); - } - if (param.has_pooling) { - cudnn::create_pooling_des(&_pooling_descs); - } - if (param.conv_param.bias()->size() > 0) { - cudnn::createTensorDesc(&_bias_desc); - } - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, 
- std::vector& outputs, - ConvActivePoolingParam& param, Context& ctx); - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam& param); -private: - cudnnHandle_t _handle; - cudnnConvolutionFwdAlgo_t _fwd_algo; - - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnTensorDescriptor_t _inner_descs; - cudnnTensorDescriptor_t _bias_desc; - - cudnnFilterDescriptor_t _filter_desc; - - cudnnConvolutionDescriptor_t _conv_descs; - cudnnPoolingDescriptor_t _pooling_descs; - - size_t _workspace_fwd_sizes; - size_t _workspaceSizeInBytes; // size of underlying storage - - void *_workspaceData; // underlying storage - void *_workspace; // aliases into workspaceData - - const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t _preference = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; - - // activation descriptor - cudnnActivationDescriptor_t _active_descs; - - Shape _inner_shape; - DataTensor_out _inner_tensor; -}; - - -} - -} -#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/cuda/vender_conv_eltwise.cpp b/saber/funcs/impl/cuda/vender_conv_eltwise.cpp new file mode 100644 index 000000000..527182478 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_conv_eltwise.cpp @@ -0,0 +1,55 @@ + +#include "saber/funcs/impl/cuda/vender_conv_eltwise.h" +#include "saber/funcs/impl/cuda/cudnn_helper.h" +#include "saber/funcs/calibrate.h" +#include "saber/funcs/impl/cuda/vender_conv.h" +#include "saber/core/tensor_op.h" + +namespace anakin { +namespace saber { + +// FP32 part +template <> +SaberStatus VenderConvEltwise::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + + _vender_conv.create(inputs, outputs, param.conv_param, ctx); + + return SaberSuccess; +} + +template <> +SaberStatus VenderConvEltwise:: + init(const 
std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + + if (param.eltwise_param.activation_param.has_active) { + return SaberUnImplError; + } + if (param.conv_param.activation_param.has_active) { + return SaberUnImplError; + } + + _vender_conv.init(inputs, outputs, param.conv_param, ctx); + _vender_conv.set_beta(1.f); + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderConvEltwise::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + + _vender_conv.dispatch(inputs, outputs, param.conv_param); + return SaberSuccess; +} + +template class VenderConvEltwise; +DEFINE_OP_TEMPLATE(VenderConvEltwise, ConvEltwiseParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(VenderConvEltwise, ConvEltwiseParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/vender_conv_eltwise.h b/saber/funcs/impl/cuda/vender_conv_eltwise.h new file mode 100644 index 000000000..c477478b4 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_conv_eltwise.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ELTWISE_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ELTWISE_H + +#include "saber/funcs/impl/impl_conv_eltwise.h" +#include "saber/funcs/impl/cuda/vender_conv.h" +#include "saber/funcs/funcs_utils.h" +#include + +namespace anakin{ + +namespace saber{ + +template +class VenderConvEltwise : public ImplBase< + NV, OpDtype, ConvEltwiseParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + VenderConvEltwise() {} + + ~VenderConvEltwise() {} + + /** + * [Create description] Init all cudnn resource here + * @AuthorHTL + * @DateTime 2018-02-01T16:13:06+0800 + * @param inputs [description] + * @param outputs [description] + * @param param [conv parameters] + */ + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param); + + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + + return SaberUnImplError; + } +private: + VenderConv2D _vender_conv; + +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/cuda/vender_conv_pooling.cpp b/saber/funcs/impl/cuda/vender_conv_pooling.cpp new file mode 100644 index 000000000..a3fad3a5a --- /dev/null +++ b/saber/funcs/impl/cuda/vender_conv_pooling.cpp @@ -0,0 +1,60 @@ + +#include "saber/funcs/impl/cuda/vender_conv_pooling.h" +#include "saber/funcs/impl/cuda/cudnn_helper.h" +#include "saber/funcs/calibrate.h" +#include "saber/funcs/impl/cuda/vender_conv.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/funcs_utils.h" + +namespace anakin { +namespace saber { + +// FP32 part +template <> +SaberStatus VenderConv2DPooling::\ + create(const 
std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + _ctx = &ctx; + _inner_shape = conv_compute_shape(inputs[0]->valid_shape(), param.conv_param); + _inner_tensor.reshape(_inner_shape); + _inner_tensor_v.resize(1); + _inner_tensor_v[0] = &_inner_tensor; + + _vender_conv.create(inputs, _inner_tensor_v, param.conv_param, ctx); + _vender_pool.create(_inner_tensor_v, outputs, param.pooling_param, ctx); + return SaberSuccess; +} + +template <> +SaberStatus VenderConv2DPooling:: + init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + + _ctx = &ctx; + _inner_shape = conv_compute_shape(inputs[0]->valid_shape(), param.conv_param); + _inner_tensor.re_alloc(_inner_shape, AK_FLOAT); + + _inner_tensor_v.resize(1); + _inner_tensor_v[0] = &_inner_tensor; + _vender_conv.init(inputs, _inner_tensor_v, param.conv_param, ctx); + _vender_pool.init(_inner_tensor_v, outputs, param.pooling_param, ctx); + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderConv2DPooling::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvPoolingParam& param) { + _vender_conv.dispatch(inputs, _inner_tensor_v, param.conv_param); + _vender_pool.dispatch(_inner_tensor_v, outputs, param.pooling_param); + return SaberSuccess; +} + +template class VenderConv2DPooling; +DEFINE_OP_TEMPLATE(VenderConv2DPooling, ConvPoolingParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(VenderConv2DPooling, ConvPoolingParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/vender_conv_pooling.h b/saber/funcs/impl/cuda/vender_conv_pooling.h new file mode 100644 index 000000000..795fd8742 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_conv_pooling.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_POOLING_H + +#include "saber/funcs/impl/impl_conv_pooling.h" +#include "saber/funcs/impl/cuda/vender_conv.h" +#include "saber/funcs/impl/cuda/vender_pooling.h" +#include "saber/funcs/funcs_utils.h" +#include + +namespace anakin{ + +namespace saber{ + +template +class VenderConv2DPooling : public ImplBase< + NV, OpDtype, ConvPoolingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + VenderConv2DPooling() {} + + ~VenderConv2DPooling() {} + + /** + * [Create description] Init all cudnn resource here + * @AuthorHTL + * @DateTime 2018-02-01T16:13:06+0800 + * @param inputs [description] + * @param outputs [description] + * @param param [conv parameters] + */ + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx); + + //call cudnnConvolutionForward here + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvPoolingParam& param); + + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } +private: + VenderPooling _vender_pool; + VenderConv2D _vender_conv; + Shape _inner_shape; + Tensor _inner_tensor; + std::vector *> _inner_tensor_v; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_CUDNN_CONV2D_H diff --git a/saber/funcs/impl/cuda/vender_deconv.cpp 
b/saber/funcs/impl/cuda/vender_deconv.cpp new file mode 100644 index 000000000..496321db0 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_deconv.cpp @@ -0,0 +1,189 @@ + +#include "saber/funcs/impl/cuda/vender_deconv.h" + +namespace anakin { +namespace saber { + +template <> +SaberStatus VenderDeconv2D::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + + if (!(&ctx == this->_ctx)) { + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + + this->_ctx = &ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + } + + int input_num = inputs[0]->num(); + int input_channel = inputs[0]->channel(); + int input_height = inputs[0]->height(); + int input_width = inputs[0]->width(); + + int output_channel = outputs[0]->channel(); + int output_height = outputs[0]->height(); + int output_width = outputs[0]->width(); + + int kernel_h = param.weight()->height(); + int kernel_w = param.weight()->width(); + + int filter_dim_a[] = {input_channel, output_channel / param.group, \ + kernel_h, kernel_w}; + + cudnn::setNDFilterDesc(&_filter_desc, + param.weight()->dims(), + filter_dim_a, CUDNN_TENSOR_NCHW); + + Shape in_stride = inputs[0]->get_stride(); + Shape out_stride = outputs[0]->get_stride(); + + int dim_a[] = {input_num, input_channel, + input_height, input_width}; + + int dim_b[] = {input_num, output_channel, + output_height, output_width}; + + cudnn::setTensorNdDesc(&_input_descs, + inputs[0]->dims(), dim_a, &in_stride[0]); + + cudnn::setTensorNdDesc(&_output_descs, + outputs[0]->dims(), dim_b, &out_stride[0]); + + int pad_a[] = {param.pad_h, param.pad_w}; + int stride_a[] = {param.stride_h, param.stride_w}; + int dilation_a[] = {param.dilation_h, param.dilation_w}; + + //cudnn::setConvolutionNdDesc(&_conv_descs, \ + inputs[0]->dims() - 2, pad_a, \ + stride_a, dilation_a); + 
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(_conv_descs, \ + pad_a[0], pad_a[1], \ + stride_a[0], stride_a[1], \ + dilation_a[0], dilation_a[1], \ + CUDNN_CROSS_CORRELATION, \ + cudnn::cudnnOpWrapper::type)); + + // true: use tensor core + // false: disable tensor core + cudnn::set_math_type(&_conv_descs, _use_tensor_core); + cudnn::set_group_count(&_conv_descs, param.group); + + // Get fastest implement of cudnn + // set up algo and workspace size + //CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(_handle, \ + _filter_desc, _input_descs, _conv_descs, _output_descs, \ + _preference, _workspace_limit_bytes, &_bwd_algo)); + _bwd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; + + CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(_handle, + _filter_desc, _input_descs, _conv_descs, _output_descs, + _bwd_algo, &_workspace_bwd_sizes)); + + if (_workspace_bwd_sizes > _workspaceSizeInBytes) { + _workspaceSizeInBytes = _workspace_bwd_sizes; + + if (workspaceData != NULL) { + CUDA_CHECK(cudaFree(workspaceData)); + } + + CUDA_CHECK(cudaMalloc(&workspaceData, _workspaceSizeInBytes)); + workspace = reinterpret_cast(workspaceData); + } + + if (param.bias()->size() > 0) { + int dim_bias[] = {1, output_channel, 1, 1}; + int stride_bias[] = {output_channel, 1, 1, 1}; + cudnn::setTensorNdDesc(&_bias_desc, + 4, dim_bias, stride_bias); + } + if (_use_saber_act) { + _saber_act.create(inputs, outputs, param.activation_param, ctx); + } + + return SaberSuccess; +} + +template <> +SaberStatus VenderDeconv2D::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + + // ---- init cudnn resources ---- + + _workspaceSizeInBytes = 0; + workspaceData = NULL; + + _workspace_bwd_sizes = 0; + + this->_ctx = &ctx; + // ---- get cuda resources ---- + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + workspace = NULL; + int in_channels = 
inputs[0]->channel(); + + // ---- create cudnn Descs ---- + cudnn::createFilterDesc(&_filter_desc); + cudnn::createTensorDesc >(&_input_descs); + cudnn::createTensorDesc >(&_output_descs); + cudnn::createConvolutionDesc(&_conv_descs); + cudnn::createTensorDesc(&_bias_desc); + if (param.activation_param.has_active) { + _saber_act.init(inputs, outputs, param.activation_param, ctx); + _use_saber_act = true; + } + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderDeconv2D::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + const float* input_data = (const float*)inputs[0]->data(); + float* output_data = (float*)outputs[0]->mutable_data(); + const float* weight_data = (const float*) param.weight()->data(); + + CUDNN_CHECK(cudnnConvolutionBackwardData(_handle, + cudnn::cudnnTypeWrapper::kOne(), + _filter_desc, weight_data, + _input_descs, input_data, + _conv_descs, _bwd_algo, workspace, _workspace_bwd_sizes, + cudnn::cudnnTypeWrapper::kZero(), + _output_descs, output_data)); + + if (param.bias()->valid_size() > 0) { + const float* bias_data; + bias_data = (const float*)param.bias()->data(); + CUDNN_CHECK(cudnnAddTensor( + _handle, + cudnn::cudnnTypeWrapper::kOne(), + _bias_desc, + bias_data, + cudnn::cudnnTypeWrapper::kOne(), + _output_descs, + output_data)); + } + if (_use_saber_act) { + _saber_act.dispatch(outputs, outputs, param.activation_param); + } + return SaberSuccess; +} + +template class VenderDeconv2D; +DEFINE_OP_TEMPLATE(VenderDeconv2D, ConvParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(VenderDeconv2D, ConvParam, NV, AK_INT8); + +} +} diff --git a/saber/funcs/impl/cuda/vender_deconv.h b/saber/funcs/impl/cuda/vender_deconv.h index 8b01ea301..3c0c184c8 100644 --- a/saber/funcs/impl/cuda/vender_deconv.h +++ b/saber/funcs/impl/cuda/vender_deconv.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_DECONV_H @@ -20,48 +20,34 @@ #include "saber/saber_funcs_param.h" #include "saber/funcs/impl/cuda/cudnn_helper.h" #include "saber/saber_types.h" +#include "saber/funcs/impl/cuda/saber_activation.h" namespace anakin { namespace saber { -template -class VenderDeconv2D : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvParam > > -{ +template +class VenderDeconv2D : \ + public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; + VenderDeconv2D() - : _handle(NULL) - , workspaceData(NULL) - , workspace(NULL) - , _conv_descs(NULL) - , _input_descs(NULL) - , _output_descs(NULL) - , _filter_desc(NULL) - , _bias_desc(NULL) - , _workspace_bwd_sizes(0) - , _workspaceSizeInBytes(0) - , _bwd_algo(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0) + : _handle(NULL) + , workspaceData(NULL) + , workspace(NULL) + , _conv_descs(NULL) + , _input_descs(NULL) + , _output_descs(NULL) + , _filter_desc(NULL) + , _bias_desc(NULL) + , _workspace_bwd_sizes(0) + , _workspaceSizeInBytes(0) + , _bwd_algo(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0) {} ~VenderDeconv2D() { - if (_conv_descs) { 
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); } @@ -93,198 +79,50 @@ class VenderDeconv2D& inputs, - std::vector& outputs, - ConvParam& param, Context& ctx) { - - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - workspaceData = NULL; - - _workspace_bwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - - cudnn::createTensorDesc(&_bias_desc); - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvParam& param, Context &ctx) { - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - //update_weights(param); - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - - int kernel_h = param.weight()->height(); - int kernel_w = param.weight()->width(); + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + ConvParam& param, Context& ctx); - int filter_dim_a[] = {input_channel, output_channel / param.group, \ - kernel_h, kernel_w}; - - cudnn::setNDFilterDesc(&_filter_desc, - param.weight()->dims(), - filter_dim_a, 
CUDNN_TENSOR_NCHW); - - Shape in_stride = inputs[0]->get_stride(); - Shape out_stride = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - input_height, input_width}; - - int dim_b[] = {input_num, output_channel, - output_height, output_width}; - - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); - - cudnn::setTensorNdDesc(&_output_descs, - outputs[0]->dims(), dim_b, &out_stride[0]); - - int pad_a[] = {param.pad_h, param.pad_w}; - int stride_a[] = {param.stride_h, param.stride_w}; - int dilation_a[] = {param.dilation_h, param.dilation_w}; - - //cudnn::setConvolutionNdDesc(&_conv_descs, \ - inputs[0]->dims() - 2, pad_a, \ - stride_a, dilation_a); - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(_conv_descs, \ - pad_a[0], pad_a[1], \ - stride_a[0], stride_a[1], \ - dilation_a[0], dilation_a[1], \ - CUDNN_CROSS_CORRELATION, \ - cudnn::cudnnOpWrapper::type)); - - // true: use tensor core - // false: disable tensor core - cudnn::set_math_type(&_conv_descs, _use_tensor_core); - cudnn::set_group_count(&_conv_descs, param.group); - - // Get fastest implement of cudnn - // set up algo and workspace size - //CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(_handle, \ - _filter_desc, _input_descs, _conv_descs, _output_descs, \ - _preference, _workspace_limit_bytes, &_bwd_algo)); - _bwd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; - - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(_handle, - _filter_desc, _input_descs, _conv_descs, _output_descs, - _bwd_algo, &_workspace_bwd_sizes)); - - if (_workspace_bwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_bwd_sizes; - if (workspaceData != NULL) { - CUDA_CHECK(cudaFree(workspaceData)); - } - CUDA_CHECK(cudaMalloc(&workspaceData, _workspaceSizeInBytes)); - workspace = reinterpret_cast(workspaceData); - } - - if (param.bias()->size() > 0){ - int dim_bias[] = {1, output_channel,1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - 
cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); - } - return SaberSuccess; - - } - void update_weights(ConvParam ¶m){}; + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + ConvParam& param, Context& ctx); //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvParam& param) { - const float* input_data = inputs[0]->data(); - float* output_data = outputs[0]->mutable_data(); - const float* weight_data = (const float*) param.weight()->data(); - - CUDNN_CHECK(cudnnConvolutionBackwardData(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _filter_desc, weight_data, - _input_descs, input_data, - _conv_descs, _bwd_algo, workspace, _workspace_bwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, output_data)); - - if (param.bias()->size() > 0) { - const float* bias_data; - bias_data = (const float*)param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor( - _handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, - bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, - output_data)); - } - - return SaberSuccess; + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + ConvParam& param); + + SaberStatus trans_weights(Tensor &target_weights, + Tensor &target_bias, + int in_channel, int out_channel, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int group) { + return SaberUnImplError; } private: + bool _use_saber_act{false}; + SaberActivation _saber_act; cudnnHandle_t _handle; cudnnConvolutionBwdDataAlgo_t _bwd_algo; cudnnTensorDescriptor_t _input_descs; cudnnTensorDescriptor_t _output_descs; cudnnTensorDescriptor_t _bias_desc; - cudnnFilterDescriptor_t _filter_desc; - cudnnConvolutionDescriptor_t _conv_descs; size_t _workspace_bwd_sizes; size_t _workspaceSizeInBytes; // size of underlying storage - void *workspaceData; // underlying storage - void *workspace; // aliases into 
workspaceData + void* workspaceData; // underlying storage + void* workspace; // aliases into workspaceData const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; + const size_t _workspace_limit_bytes = 4 * 1024 * 1024; const cudnnConvolutionBwdDataPreference_t _preference = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; }; -template class VenderDeconv2D; + } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/vender_deconv_act.h b/saber/funcs/impl/cuda/vender_deconv_act.h deleted file mode 100644 index 7325d4ea8..000000000 --- a/saber/funcs/impl/cuda/vender_deconv_act.h +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_DECONV_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_DECONV_ACT_H - -#include "saber/funcs/impl/impl_deconv_act.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" - -namespace anakin{ - -namespace saber{ - -template -class VenderDeconv2DAct : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActiveParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderDeconv2DAct() - : _handle(NULL) - , _workspaceData(NULL) - , _workspace(NULL) - , _conv_descs(NULL) - , _input_descs(NULL) - , _output_descs(NULL) - , _filter_desc(NULL) - , _bias_desc(NULL) - , _workspace_bwd_sizes(0) - , _workspaceSizeInBytes(0) - , _bwd_algo(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0) - , _active_descs(NULL) - {} - - ~VenderDeconv2DAct() { - - if (_conv_descs) { - CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(_conv_descs)); - } - if (_input_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); - } - if (_output_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); - } - if (_filter_desc) { - CUDNN_CHECK(cudnnDestroyFilterDescriptor(_filter_desc)); - } - if (_bias_desc) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_bias_desc)); - } - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - if (_workspaceData != NULL) { - CUDA_CHECK(cudaFree(_workspaceData)); - } - if (_active_descs != NULL) { - CUDNN_CHECK(cudnnDestroyActivationDescriptor(_active_descs)); - } - } - - /** - * [Create description] Init all cudnn resource here - * @AuthorHTL - * @DateTime 2018-02-01T16:13:06+0800 - * @param inputs [description] - * @param outputs [description] - * @param conv_param [conv parameters] - */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, 
Context& ctx) { - - // ---- init cudnn resources ---- - - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - - _workspace_bwd_sizes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - - cudnn::create_activation_des(&_active_descs); - cudnn::createTensorDesc(&_bias_desc); - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param, Context& ctx) { - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - //update_weights(param); - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - - int kernel_h = param.conv_param.weight()->height(); - int kernel_w = param.conv_param.weight()->width(); - - int filter_dim_a[] = {input_channel, output_channel / param.conv_param.group, \ - kernel_h, kernel_w}; - - cudnn::setNDFilterDesc(&_filter_desc, - param.conv_param.weight()->dims(), - filter_dim_a, CUDNN_TENSOR_NCHW); - - Shape in_stride = inputs[0]->get_stride(); - Shape out_stride = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - 
input_height, input_width}; - - int dim_b[] = {input_num, output_channel, - output_height, output_width}; - - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); - - cudnn::setTensorNdDesc(&_output_descs, - outputs[0]->dims(), dim_b, &out_stride[0]); - - int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w}; - int stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w}; - int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w}; - - //cudnn::setConvolutionNdDesc(&_conv_descs, \ - inputs[0]->dims() - 2, pad_a, \ - stride_a, dilation_a); - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(_conv_descs, \ - pad_a[0], pad_a[1], \ - stride_a[0], stride_a[1], \ - dilation_a[0], dilation_a[1], \ - CUDNN_CROSS_CORRELATION, \ - cudnn::cudnnOpWrapper::type)); - - cudnn::set_activation_des(&_active_descs, param.activation_param.active); - - // true: use tensor core - // false: disable tensor core - cudnn::set_math_type(&_conv_descs, _use_tensor_core); - cudnn::set_group_count(&_conv_descs, param.conv_param.group); - - // Get fastest implement of cudnn - // set up algo and workspace size - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(_handle, \ - _filter_desc, _input_descs, _conv_descs, _output_descs, \ - _preference, _workspace_limit_bytes, &_bwd_algo)); - - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( - _handle, - _filter_desc, _input_descs, _conv_descs, _output_descs, - _bwd_algo, &_workspace_bwd_sizes)); - - if (_workspace_bwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_bwd_sizes; - if (_workspaceData != NULL) { - CUDA_CHECK(cudaFree(_workspaceData)); - } - CUDA_CHECK(cudaMalloc(&_workspaceData, _workspaceSizeInBytes)); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.conv_param.bias()->size() > 0){ - int dim_bias[] = {1, output_channel,1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, 
stride_bias); - } - return SaberSuccess; - - } - void update_weights(ConvActiveParam ¶m){}; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam& param) { - const float* input_data = inputs[0]->data(); - float* output_data = outputs[0]->mutable_data(); - const float* weight_data = (const float*) param.conv_param.weight()->data(); - - CUDNN_CHECK(cudnnConvolutionBackwardData( - _handle, - cudnn::cudnnTypeWrapper::kOne(), - _filter_desc, weight_data, - _input_descs, input_data, - _conv_descs, _bwd_algo, _workspace, _workspace_bwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, output_data)); - - if (param.conv_param.bias()->size() > 0) { - const float* bias_data; - bias_data = (const float*)param.conv_param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor( - _handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, - bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, - output_data)); - } - - if (_active_descs) { - CUDNN_CHECK(cudnnActivationForward( - _handle, - _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, - output_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, - output_data)); - } - - return SaberSuccess; - } -private: - cudnnHandle_t _handle; - cudnnConvolutionBwdDataAlgo_t _bwd_algo; - - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnTensorDescriptor_t _bias_desc; - - cudnnFilterDescriptor_t _filter_desc; - - cudnnConvolutionDescriptor_t _conv_descs; - - cudnnActivationDescriptor_t _active_descs; - - size_t _workspace_bwd_sizes; - size_t _workspaceSizeInBytes; // size of underlying storage - - void *_workspaceData; // underlying storage - void *_workspace; // aliases into _workspaceData - const bool _use_tensor_core = true; - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; - const cudnnConvolutionBwdDataPreference_t _preference = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; -}; -template class VenderDeconv2DAct; -} -} 
- -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_DECONV_ACT_H \ No newline at end of file diff --git a/saber/funcs/impl/cuda/vender_fc.h b/saber/funcs/impl/cuda/vender_fc.h deleted file mode 100644 index 0ea154e46..000000000 --- a/saber/funcs/impl/cuda/vender_fc.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H -#define ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H - -#include "saber/funcs/impl/impl_fc.h" - -namespace anakin{ - -namespace saber{ - -template -class VenderFc: \ - public ImplBase< - Tensor, \ - Tensor, \ - Tensor, \ - FcParam>> { - -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderFc() = default; - ~VenderFc() { - if (_handle != nullptr) { - CUBLAS_CHECK(cublasDestroy(_handle)); - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - FcParam& param, Context& ctx){ - // get context - this->_ctx = ctx; - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUBLAS_CHECK(cublasCreate(&_handle)); - CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - 
FcParam& param, Context& ctx){ - - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUBLAS_CHECK(cublasDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - CUBLAS_CHECK(cublasCreate(&_handle)); - CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); - } - - Shape shape_out = inputs[0]->valid_shape(); - _M = inputs[0]->count_valid(0, param.axis); - _K = inputs[0]->count_valid(param.axis, inputs[0]->dims()); - _N = param.num_output; - if (_N <= 0) { - int weight_size = param.weights->valid_size(); - _N = weight_size / _K; - } - //! weights dims must be in h and w - _flag_trans_weights = param.is_transpose_weights; - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - FcParam& param); - - -private: - bool _flag_trans_weights{false}; - int _M; - int _K; - int _N; - cublasHandle_t _handle; - bool _is_continue_buf{true}; -}; - -template class VenderFc; -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_CUDA_CUBLAS_FC_H diff --git a/saber/funcs/impl/cuda/vender_gemm.cpp b/saber/funcs/impl/cuda/vender_gemm.cpp new file mode 100644 index 000000000..3d6995fca --- /dev/null +++ b/saber/funcs/impl/cuda/vender_gemm.cpp @@ -0,0 +1,133 @@ + +#include "saber/funcs/impl/cuda/vender_gemm.h" + +namespace anakin { +namespace saber { + +template<> +SaberStatus Gemm::init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx) { + + if (!(ctx == this->_ctx)) { + if (_handle != NULL) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } + this->_ctx = ctx; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); + } + + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + cu_trans_a = trans_a ? CUBLAS_OP_T: CUBLAS_OP_N; + cu_trans_b = trans_b ? 
CUBLAS_OP_T: CUBLAS_OP_N; + + return SaberSuccess; +} + +template<> +SaberStatus Gemm::dispatch( + const float alpha, const float beta, + const float* ptr_a, const float* ptr_b, float* ptr_c) { + + CHECK(ptr_a != nullptr); + CHECK(ptr_b != nullptr); + CHECK(ptr_c != nullptr); + CUBLAS_CHECK(cublasSgemm(_handle, cu_trans_b, cu_trans_a, + _n, _m, _k, &alpha, ptr_b, _ldb, ptr_a, + _lda, &beta, ptr_c, _ldc)); + return SaberSuccess; +} + +template<> +SaberStatus Gemm::init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx) { + if (!(ctx == this->_ctx)) { + if (_handle != NULL) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } + this->_ctx = ctx; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); + } + + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + cu_trans_a = trans_a ? CUBLAS_OP_T: CUBLAS_OP_N; + cu_trans_b = trans_b ? 
CUBLAS_OP_T: CUBLAS_OP_N; + + return SaberSuccess; +} + +template<> +SaberStatus Gemm::dispatch( + const float alpha, const float beta, + const char* ptr_a, const char* ptr_b, float* ptr_c) { + + CHECK(ptr_a != nullptr); + CHECK(ptr_b != nullptr); + CHECK(ptr_c != nullptr); + + CUBLAS_CHECK(cublasSgemmEx(_handle, cu_trans_b, cu_trans_a, + _n, _m, _k, &alpha, ptr_b, CUDA_R_8I, _ldb, ptr_a, + CUDA_R_8I, _lda, &beta, ptr_c, CUDA_R_32F, _ldc)); + return SaberSuccess; +} + +template<> +SaberStatus Gemv::init(const bool trans, const int m, const int n, + const int incx, const int incy, + Context ctx) { + + if (!(ctx == this->_ctx)) { + if (_handle != NULL) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } + this->_ctx = ctx; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); + } + + _lda = n; + CHECK_GT(m, 0); + CHECK_GT(n, 0); + CHECK_GT(incx, 0); + CHECK_GT(incy, 0); + _m = m; + _n = n; + _incx = incx; + _incy = incy; + _cu_trans = trans ? 
CUBLAS_OP_N: CUBLAS_OP_T; + + return SaberSuccess; +} + +template<> +SaberStatus Gemv::dispatch( + const float alpha, const float beta, + const float* a, const float* b, + float* c) { + + CHECK(a != nullptr); + CHECK(b != nullptr); + CHECK(c != nullptr); + CUBLAS_CHECK(cublasSgemv(_handle, _cu_trans, _n, _m, + &alpha, a, _lda, b, _incx, &beta, c, _incy)); + return SaberSuccess; +} + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/vender_gemm.h b/saber/funcs/impl/cuda/vender_gemm.h new file mode 100644 index 000000000..70e8e8078 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_gemm.h @@ -0,0 +1,70 @@ + +#ifndef SABER_FUNCS_IMPL_CUDA_VENDER_GEMM_H +#define SABER_FUNCS_IMPL_CUDA_VENDER_GEMM_H + +#include "saber/core/tensor.h" +#include "saber/funcs/gemm.h" + +namespace anakin { +namespace saber { + +template +class Gemm { + +public: + Gemm() = default; + ~Gemm() {} + + SaberStatus init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx); + + SaberStatus dispatch(const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c); + +private: + Context _ctx; + cublasHandle_t _handle{nullptr}; + cublasOperation_t cu_trans_a; + cublasOperation_t cu_trans_b; + int _m{-1}; + int _n{-1}; + int _k{-1}; + int _lda{-1}; + int _ldb{-1}; + int _ldc{-1}; +}; + +template +class Gemv { + +public: + Gemv() = default; + ~Gemv() {} + + SaberStatus init(const bool trans, const int m, const int n, + const int incx, const int incy, + Context ctx); + + SaberStatus dispatch(const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c); + +private: + Context _ctx; + cublasHandle_t _handle{nullptr}; + cublasOperation_t _cu_trans; + int _incx{-1}; + int _incy{-1}; + int _m{-1}; + int _n{-1}; + int _lda{-1}; +}; + +} +} + +#endif \ No newline at end of file diff --git a/saber/funcs/impl/cuda/vender_gru.cpp b/saber/funcs/impl/cuda/vender_gru.cpp index 
dff3ad217..1e8261a4a 100644 --- a/saber/funcs/impl/cuda/vender_gru.cpp +++ b/saber/funcs/impl/cuda/vender_gru.cpp @@ -1,178 +1,48 @@ #include "saber/funcs/impl/cuda/vender_gru.h" #include "cuda_fp16.h" +#include "debug.h" namespace anakin { namespace saber { - template <> -void VenderGru::\ - seq2hw(std::vector outputs, std::vector inputs, - GruParam& param, int hidden_size, - DataTensor& sequence, Context& ctx) { - DataTensor* din = inputs[0]; - DataTensor* dout = outputs[0]; - std::vector offset_vec = din->get_seq_offset(); - CHECK_GE(offset_vec.size(), 2) << "offset must >=2" ; - int batch_size = offset_vec.size() - 1; - int max_len = 0; - std::vector length_vec; - - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - max_len = max_len > len ? max_len : len; - length_vec.push_back(len); - } - - const DataDtype* orgin = sequence.data(); - DataDtype* target = dout->mutable_data(); - - int count = 0; - - for (int batch = 0; batch < batch_size; ++batch) { - for (int seq = 0; seq < length_vec[batch]; ++seq) { - const DataDtype* origin_i = orgin + (seq * batch_size + batch) * hidden_size; - DataDtype* target_i = target + (offset_vec[batch] + seq) * hidden_size; - count += hidden_size; - CUDA_CHECK(cudaMemcpyAsync(target_i, origin_i, sizeof(DataDtype)*hidden_size, - cudaMemcpyDeviceToDevice, ctx.get_data_stream())); - } - } - - CHECK_EQ(count, dout->valid_size()) << "output data size should be equal"; - cudaStreamSynchronize(ctx.get_data_stream()); -} - -template <> -void VenderGru::\ -hw2seq(std::vector inputs, GruParam& param, - int word_size, DataTensor& sequence, - DataTensor& out_sequence, Context& ctx) { - DataTensor* din = inputs[0]; - std::vector offset_vec = din->get_seq_offset(); - CHECK_GE(offset_vec.size(), 2) << "offset must >=2" ; - int batch_size = offset_vec.size() - 1; - int max_len = 0; - int hidden_size = param.bias()->valid_size() / 3; - std::vector length_vec; - - for (int i = 0; i < offset_vec.size() 
- 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - max_len = max_len > len ? max_len : len; - length_vec.push_back(len); - } - - Shape seq_shape(1, max_len, batch_size, word_size); - sequence.re_alloc(seq_shape); - - Shape seq_out_shape(1, max_len, batch_size, hidden_size); - out_sequence.re_alloc(seq_out_shape); - - if (batch_size == 1) { - sequence.copy_from(*din); - return; - } - - DataDtype* target = sequence.mutable_data(); - const DataDtype* origin = din->data(); - - DataTensor zero_tensor; - Shape zero_shape(1, 1, 1, word_size); - zero_tensor.re_alloc(zero_shape); - - DataDtype* zero_block = zero_tensor.mutable_data(); - //TODO:set all zero - CUDA_CHECK(cudaMemset(zero_block, 0, sizeof(DataDtype) * (word_size))); - - for (int batch = 0; batch < batch_size; ++batch) { - for (int seq = 0; seq < max_len; ++seq) { - DataDtype* target_i = target + (seq * batch_size + batch) * word_size; - - if (seq < length_vec[batch]) { - const DataDtype* origin_i = origin + (offset_vec[batch] + seq) * word_size; - CUDA_CHECK(cudaMemcpyAsync(target_i, origin_i, sizeof(DataDtype)*word_size, - cudaMemcpyDeviceToDevice, ctx.get_data_stream())); - } else { - CUDA_CHECK(cudaMemcpyAsync(target_i, zero_block, sizeof(DataDtype)*word_size, - cudaMemcpyDeviceToDevice, ctx.get_data_stream())); - } - } - } - - _xDesc.reset(new cudnn::TensorDescriptors( - max_len, - {batch_size, word_size, 1}, - {word_size, 1, 1})); - - _yDesc.reset(new cudnn::TensorDescriptors( - max_len, - {batch_size, hidden_size * param.num_direction, 1}, - {hidden_size * param.num_direction, 1, 1})); - - size_t new_workspace_size_in_bytes = 0; - CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - _handle, - _rnnDesc, - max_len, - _xDesc->descs(), - &new_workspace_size_in_bytes)); - - if (new_workspace_size_in_bytes > _workspace_size_in_bytes) { - _workspace_size_in_bytes = new_workspace_size_in_bytes; - _workspace_tensor.re_alloc(Shape(1, 1, 1, _workspace_size_in_bytes)); - } - - int dim[] = {param.num_layers * 
param.num_direction, batch_size, hidden_size}; - int stride[] = {batch_size * hidden_size, hidden_size, 1}; - - cudnn::setTensorNdDesc(&_hxDesc, - 3, dim, stride); - cudnn::setTensorNdDesc(&_cxDesc, - 3, dim, stride); - cudnn::setTensorNdDesc(&_hyDesc, - 3, dim, stride); - cudnn::setTensorNdDesc(&_cyDesc, - 3, dim, stride); - - cudaStreamSynchronize(ctx.get_data_stream()); -} - -template <> -void VenderGru::\ -set_grnn_params_region(GruParam& param, int wordSize) { - int hidden_size = param.bias()->valid_size() / 3; - const Op_dtype* w_ptr = param.weight()->data(); /*inpute weights*/ - const Op_dtype* i2h = w_ptr; /* new memory gate */ - const Op_dtype* i2h_r = w_ptr + 1 * wordSize * hidden_size; /* reset gate */ - const Op_dtype* i2h_z = w_ptr + 2 * wordSize * hidden_size; /* update gate */ - const Op_dtype* w_ptr_inner = w_ptr + 3 * wordSize * hidden_size; - const Op_dtype* h2h = w_ptr_inner; /* new memory gate */ - const Op_dtype* h2h_r = w_ptr_inner + 1 * hidden_size * hidden_size; /* reset gate */ - const Op_dtype* h2h_z = w_ptr_inner + 2 * hidden_size * hidden_size; /* update gate */ - - const Op_dtype* h = nullptr; - const Op_dtype* h_r = nullptr; - const Op_dtype* h_z = nullptr; +void VenderGru::\ +set_grnn_params_region(GruParam& param) { + const OpDataType* w_i2h_ptr = static_cast + (_inner_weight_i2h.data()); /*inpute weights*/ + const OpDataType* w_h2h_ptr = static_cast(_inner_weight_h2h.data()); + CHECK_NOTNULL(w_i2h_ptr) << "weights can`t be null"; + CHECK_NOTNULL(w_h2h_ptr) << "weights can`t be null"; + const OpDataType* i2h = w_i2h_ptr; /* new memory gate */ + const OpDataType* i2h_r = w_i2h_ptr + 1 * _word_size * _hidden_size; /* reset gate */ + const OpDataType* i2h_z = w_i2h_ptr + 2 * _word_size * _hidden_size; /* update gate */ + const OpDataType* h2h = w_h2h_ptr; /* new memory gate */ + const OpDataType* h2h_r = w_h2h_ptr + 1 * _hidden_size * _hidden_size; /* reset gate */ + const OpDataType* h2h_z = w_h2h_ptr + 2 * _hidden_size * _hidden_size; 
/* update gate */ + + const OpDataType* h = nullptr; + const OpDataType* h_r = nullptr; + const OpDataType* h_z = nullptr; if (param.bias() != nullptr) { - h = param.bias()->data(); - h_r = h + 1 * hidden_size; - h_z = h + 2 * hidden_size; + h = static_cast(param.bias()->data()); + h_r = h + 1 * _hidden_size; + h_z = h + 2 * _hidden_size; } - const Op_dtype* cudnnW[] = {i2h_r, i2h_z, i2h, h2h_r, h2h_z, h2h}; - const Op_dtype* cudnnB[] = {h_r, h_z, h, nullptr, nullptr, nullptr}; + const OpDataType* cudnnW[] = {i2h_r, i2h_z, i2h, h2h_r, h2h_z, h2h}; + const OpDataType* cudnnB[] = {h_r, h_z, h, nullptr, nullptr, nullptr}; for (int i = 0; i < _cudnn_gru_weights_layernum; i++) { - ParamsRegion& region = _inner_weight_region[i]; + cudnn::ParamsRegion& region = _inner_weight_region[i]; CUDA_CHECK(cudaMemcpy((void*)(region._offset), (void*)cudnnW[i], region._size, cudaMemcpyDeviceToDevice)); } for (int i = 0; i < _cudnn_gru_weights_layernum; i++) { - ParamsRegion& region_b = _inner_bias_region[i]; + cudnn::ParamsRegion& region_b = _inner_bias_region[i]; if (cudnnB[i] != nullptr) { CUDA_CHECK(cudaMemcpy((void*)(region_b._offset), (void*)cudnnB[i], @@ -185,8 +55,8 @@ set_grnn_params_region(GruParam& param, int wordSize) { } template <> -int VenderGru::\ -get_grnn_params_region(GruParam& param) { +int VenderGru::\ +get_grnn_params_region(GruParam& param) { int sum_size_of_weights_and_bias = 0; cudnnFilterDescriptor_t region_desc_handle = nullptr; CUDNN_CHECK(cudnnCreateFilterDescriptor(®ion_desc_handle)); @@ -233,10 +103,10 @@ get_grnn_params_region(GruParam& param) { &tensor_format, &nbDims, dims)); /* filterDimA[] */ - size_t size = dims[0] * dims[1] * dims[2] * sizeof(Op_dtype); + size_t size = dims[0] * dims[1] * dims[2] * sizeof(OpDataType); // LOG(INFO) << "size add "<& param) { } template<> -SaberStatus VenderGru::\ -create(const std::vector& inputs, - std::vector& outputs, - GruParam& gru_param, Context& ctx) { +SaberStatus VenderGru::\ +create(const std::vector& 
inputs, + std::vector& outputs, + GruParam& param, Context& ctx) { - if (!(ctx == this->_ctx)) { + if (!(&ctx == this->_ctx)) { if (_handle != NULL) { CUDNN_CHECK(cudnnDestroy(_handle)); } - this->_ctx = ctx; - + this->_ctx = &ctx; + LOG(INFO) << "update ctx"; cudaStream_t cuda_stream; cuda_stream = ctx.get_compute_stream(); @@ -270,148 +140,265 @@ create(const std::vector& inputs, CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); } - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - - int seqLength = input_channel;//C; - int batchSize = input_height;//H - size_t stateSize; - - cudnn::setRNNDesc(&_rnnDesc, _handle, _hidden_size, - gru_param.num_layers, _dropoutDesc, gru_param.num_direction, CUDNN_GRU); - - _xDesc.reset(new cudnn::TensorDescriptors( - seqLength, - {batchSize, _word_size, 1}, - {_word_size, 1, 1})); - - _yDesc.reset(new cudnn::TensorDescriptors( - seqLength, - {batchSize, _hidden_size * gru_param.num_direction, 1}, - {_hidden_size * gru_param.num_direction, 1, 1})); - - Shape in_dim = inputs[0]->shape(); - Shape in_stride = inputs[0]->get_stride(); - - Shape out_dim = outputs[0]->shape(); - Shape out_stride = outputs[0]->get_stride(); - - int dim[] = {gru_param.num_layers * gru_param.num_direction, batchSize, _hidden_size}; - int stride[] = {batchSize * _hidden_size, _hidden_size, 1}; - - cudnn::setTensorNdDesc(&_hxDesc, - 3, dim, stride); - cudnn::setTensorNdDesc(&_cxDesc, - 3, dim, stride); - cudnn::setTensorNdDesc(&_hyDesc, - 3, dim, stride); - cudnn::setTensorNdDesc(&_cyDesc, - 3, dim, stride); - - size_t weightsSize = 0; + std::vector> offset_vec = inputs[0]->get_seq_offset(); + std::vector offset = offset_vec[offset_vec.size() - 1]; + int batch_size = offset.size() - 1; + + + std::vector> 
xdim(batch_size); + std::vector> xstride(batch_size); + + for (int i = 0; i < batch_size; i++) { + int seq_len = offset[i + 1] - offset[i]; + xdim[i] = {seq_len, _word_size, 1}; + xstride[i] = {_word_size, 1, 1}; + } + + _xDesc.reset(new cudnn::TensorDescriptors(batch_size, xdim, xstride)); + + std::vector> ydim(batch_size); + std::vector> ystride(batch_size); + + for (int i = 0; i < batch_size; i++) { + int seq_len = offset[i + 1] - offset[i]; + ydim[i] = {seq_len, _hidden_size * param.num_direction, 1}; + ystride[i] = {_hidden_size * param.num_direction, 1, 1}; + } + + _yDesc.reset(new cudnn::TensorDescriptors(batch_size, ydim, ystride)); + + int dim[] = {param.num_layers * param.num_direction, batch_size, _hidden_size}; + int stride[] = {batch_size * _hidden_size, _hidden_size, 1}; + + cudnn::setTensorNdDesc(&_hxDesc, + 3, dim, stride); + cudnn::setTensorNdDesc(&_cxDesc, + 3, dim, stride); + cudnn::setTensorNdDesc(&_hyDesc, + 3, dim, stride); + cudnn::setTensorNdDesc(&_cyDesc, + 3, dim, stride); + + int max_seq_len = offset[1] - offset[0]; + CUDNN_CHECK(cudnnGetRNNWorkspaceSize( + _handle, + _rnnDesc, + batch_size, + _xDesc->descs(), + &_workspace_size_in_bytes)); + + //FIXME tryEXP + LOG(INFO) << "work space = " << _workspace_size_in_bytes << ",batch size = " << batch_size << + ",max = " << max_seq_len; + utils::try_expand_tensor(_workspace_tensor, _workspace_size_in_bytes); + + return SaberSuccess; +} + +template<> +void VenderGru::trans_akweights_2_cudnnweights(GruParam& param) { + int weights_i2h_size = _hidden_size * _word_size; + Tensor weight_i2h_host; + Tensor weight_i2h_host_target; + Tensor weight_trans_workspace; + Shape weight_i2h_host_shape({1, 1, 1, _hidden_size* _word_size * 3}); + weight_i2h_host.re_alloc(weight_i2h_host_shape, AK_FLOAT); + weight_i2h_host_target.re_alloc(weight_i2h_host_shape, AK_FLOAT); + weight_trans_workspace.re_alloc(weight_i2h_host_shape, AK_FLOAT); + _inner_weight_i2h.re_alloc(weight_i2h_host_shape, AK_FLOAT); + + 
CUDA_CHECK(cudaMemcpyAsync(weight_i2h_host.mutable_data(), param.weight()->data(), + sizeof(OpDataType)*_hidden_size * _word_size * 3, cudaMemcpyDeviceToHost, + this->_ctx->get_compute_stream())); + CUDA_CHECK(cudaDeviceSynchronize()); + + OpDataType* rz_temp_tensor_ptr = static_cast(weight_i2h_host_target.mutable_data()); + OpDataType* rz_weights_tensor_ptr = static_cast(weight_i2h_host.mutable_data()); + + for (int row = 0; row < _word_size; row++) { + for (int block = 0; block < 3; block++) { + int block_offset = block * _hidden_size; + + for (int cow = 0; cow < _hidden_size; cow++) { + rz_temp_tensor_ptr[block * _word_size * _hidden_size + row * _hidden_size + cow] = + rz_weights_tensor_ptr[row * (3 * _hidden_size) + cow + block_offset]; + } + } + } + + weight_trans_workspace.copy_from(weight_i2h_host_target); + const OpDataType* rz_weight_trans_workspace_ptr = static_cast + (weight_trans_workspace.data()); + + for (int i = 0; i < 3; i++) { + utils::transpose(rz_weight_trans_workspace_ptr + i * _hidden_size * _word_size, _word_size, + _hidden_size, + rz_temp_tensor_ptr + i * _hidden_size * _word_size); + } + + _inner_weight_i2h.copy_from(weight_i2h_host_target); + + Tensor weight_h2h_host; + Tensor weight_h2h_host_target; + Shape weight_h2h_host_shape({1, 1, 1, _hidden_size* _hidden_size * 3}); + _inner_weight_h2h.re_alloc(weight_h2h_host_shape, AK_FLOAT); + weight_h2h_host.re_alloc(weight_h2h_host_shape, AK_FLOAT); + weight_h2h_host_target.re_alloc(weight_h2h_host_shape, AK_FLOAT); + weight_trans_workspace.re_alloc(weight_h2h_host_shape, AK_FLOAT); + + CUDA_CHECK(cudaMemcpyAsync(weight_h2h_host.mutable_data(), + static_cast(param.weight()->data()) + weights_i2h_size, + sizeof(OpDataType)*_hidden_size * _hidden_size * 3, cudaMemcpyDeviceToHost, + this->_ctx->get_compute_stream())); + CUDA_CHECK(cudaDeviceSynchronize()); + + memcpy(weight_h2h_host_target.mutable_data(), weight_h2h_host.data(), + _hidden_size * _hidden_size * sizeof(OpDataType)); + 
rz_temp_tensor_ptr = static_cast(weight_h2h_host_target.mutable_data()) + _hidden_size + * _hidden_size; + rz_weights_tensor_ptr = static_cast(weight_h2h_host.mutable_data()) + _hidden_size * + _hidden_size; + + for (int row = 0; row < _hidden_size; row++) { + for (int block = 0; block < 2; block++) { + int block_offset = block * _hidden_size; + + for (int cow = 0; cow < _hidden_size; cow++) { + rz_temp_tensor_ptr[block * _hidden_size * _hidden_size + row * _hidden_size + cow] = + rz_weights_tensor_ptr[row * (2 * _hidden_size) + cow + block_offset]; + } + } + } + + weight_trans_workspace.copy_from(weight_h2h_host_target); + + rz_weight_trans_workspace_ptr = static_cast(weight_trans_workspace.data()); + rz_temp_tensor_ptr = static_cast(weight_h2h_host_target.mutable_data()); + + for (int i = 0; i < 3; i++) { + utils::transpose(rz_weight_trans_workspace_ptr + i * _hidden_size * _hidden_size, _hidden_size, + _hidden_size, + rz_temp_tensor_ptr + i * _hidden_size * _hidden_size); + } + + _inner_weight_h2h.copy_from(weight_h2h_host_target); + CUDA_CHECK(cudaDeviceSynchronize()); +} + + +template<> +SaberStatus VenderGru:: +init(const std::vector& inputs, + std::vector& outputs, + GruParam& param, + Context& ctx) { + + _hidden_size = param.bias()->valid_size() / 3; + int weights_bias_size = _hidden_size * 3; + int weights_h2h_size = _hidden_size * _hidden_size * 3; + int weights_i2h_size = param.weight()->valid_size() - weights_h2h_size; + _word_size = weights_i2h_size / _hidden_size / 3; + LOG(INFO) << "w:" << _word_size << "," << _hidden_size; + _workspace_size_in_bytes = 0; + + this->_ctx = &ctx; + // ---- get cuda resources ---- + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + + // ---- create cudnn Descs ---- + CUDNN_CHECK(cudnnCreateDropoutDescriptor(&_dropoutDesc)); + CUDNN_CHECK(cudnnCreateRNNDescriptor(&_rnnDesc)); + cudnn::setRNNDesc(&_rnnDesc, 
_handle, _hidden_size, + param.num_layers, _dropoutDesc, param.num_direction, CUDNN_GRU); + + cudnn::createTensorDesc(&_hxDesc); + cudnn::createTensorDesc(&_cxDesc); + cudnn::createTensorDesc(&_hyDesc); + cudnn::createTensorDesc(&_cyDesc); + cudnn::createFilterDesc(&_wDesc); + _workspace_tensor.set_dtype(AK_INT8); + + _xDesc.reset(new cudnn::TensorDescriptors( + 1, + {{1/*batch_size*/, _word_size, 1}}, + {{_word_size, 1, 1}})); + + size_t weightsSize = 999; CUDNN_CHECK(cudnnGetRNNParamsSize( _handle, _rnnDesc, _xDesc->descs()[0], &weightsSize, - cudnn::cudnnTypeWrapper::type)); - + cudnn::cudnnTypeWrapper::type)); const int dims[] = { - static_cast(weightsSize / sizeof(Op_dtype)), + static_cast(weightsSize) / sizeof(OpDataType), 1, 1 }; + LOG(INFO) << "::" << "," << weightsSize << "," << cudnn::cudnnTypeWrapper::type; + Shape weight_tensor_shape({1, 1, 1, weightsSize / sizeof(OpDataType)}); + _inner_weight.re_alloc(weight_tensor_shape, AK_FLOAT); + CUDNN_CHECK(cudnnSetFilterNdDescriptor( - _wDesc, cudnn::cudnnTypeWrapper::type, CUDNN_TENSOR_NCHW, 3, dims)); + _wDesc, cudnn::cudnnTypeWrapper::type, CUDNN_TENSOR_NCHW, 3, dims)); /** * in_weights is tensor of char not the opdata */ - Shape weight_tensor_shape(1, 1, 1, weightsSize / sizeof(Op_dtype)); - _inner_weight.re_alloc(weight_tensor_shape); - int sum_size_of_w = get_grnn_params_region(gru_param); + trans_akweights_2_cudnnweights(param); + + int sum_size_of_w = get_grnn_params_region(param); CHECK_EQ(sum_size_of_w, weightsSize) << "Compute param sum length must equal to that api get." 
; - set_grnn_params_region(gru_param, _word_size); - CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - _handle, - _rnnDesc, - seqLength, - _xDesc->descs(), - &_workspace_size_in_bytes)); + set_grnn_params_region(param); + + + return create(inputs, outputs, param, ctx); +}; - _workspace_tensor.re_alloc(Shape(1, 1, 1, _workspace_size_in_bytes)); - return SaberSuccess; -} template<> -SaberStatus VenderGru::\ -dispatch(const std::vector& inputs, - std::vector& outputs, - GruParam& param) { +SaberStatus VenderGru::\ +dispatch(const std::vector& inputs, + std::vector& outputs, + GruParam& param) { CHECK_GE(inputs.size(), 1) << "gru input vec size must >=1"; int input_channel = inputs[0]->channel(); - const DataDtype* in_data = inputs[0]->data(); - DataDtype* out_data = outputs[0]->mutable_data(); - const DataDtype* in_hidden_data = nullptr; + const OpDataType* in_data = static_cast(inputs[0]->data()); + OpDataType* out_data = static_cast(outputs[0]->mutable_data()); + const OpDataType* in_hidden_data = nullptr; + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); if (inputs.size() == 2) { in_hidden_data = inputs[1]->data(); } - bool isHW2Seq=inputs[0]->get_seq_offset().size()>2; - - if (isHW2Seq) { - DataTensor temp_tensor_in; - DataTensor temp_tensor_out; - hw2seq(inputs, param, _word_size, temp_tensor_in, temp_tensor_out, _ctx); - CUDNN_CHECK(cudnnRNNForwardInference(_handle, - _rnnDesc, - _xDesc->sizes(),//sequence - _xDesc->descs(), - temp_tensor_in.data(), - _hxDesc, - in_hidden_data, // hidden state of the network will be initialized to zero - _cxDesc, - nullptr, //the initial cell state of the network will be initialized to zero - _wDesc, - _inner_weight.data(), - _yDesc->descs(), - temp_tensor_out.mutable_data(), // Output GPU-raw-ptr - _hyDesc, - nullptr, // the final hidden state of the network will not be saved - _cyDesc, - nullptr, // the final cell state of the network will be not be saved - _workspace_tensor.mutable_data(), - _workspace_size_in_bytes)); - - 
seq2hw(outputs, inputs, param, _hidden_size, temp_tensor_out, _ctx); - outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); - - } else { - CUDNN_CHECK(cudnnRNNForwardInference(_handle, - _rnnDesc, - _xDesc->sizes(), - _xDesc->descs(), - in_data, - _hxDesc, - in_hidden_data, // hidden state of the network will be initialized to zero - _cxDesc, - nullptr, //the initial cell state of the network will be initialized to zero - _wDesc, - _inner_weight.data(), - _yDesc->descs(), - out_data, // Output GPU-raw-ptr - _hyDesc, - nullptr, // the final hidden state of the network will not be saved - _cyDesc, - nullptr, // the final cell state of the network will be not be saved - _workspace_tensor.mutable_data(), - _workspace_size_in_bytes)); - } + + LOG(INFO) << "run cudnn gru"; + CUDNN_CHECK(cudnnRNNForwardInference(_handle, + _rnnDesc, + _xDesc->sizes(), + _xDesc->descs(), + in_data, + _hxDesc, + in_hidden_data, // hidden state of the network will be initialized to zero + _cxDesc, + nullptr, //the initial cell state of the network will be initialized to zero + _wDesc, + _inner_weight.data(), + _yDesc->descs(), + out_data, // Output GPU-raw-ptr + _hyDesc, + nullptr, // the final hidden state of the network will not be saved + _cyDesc, + nullptr, // the final cell state of the network will be not be saved + _workspace_tensor.mutable_data(), + _workspace_size_in_bytes)); + return SaberSuccess; } diff --git a/saber/funcs/impl/cuda/vender_gru.h b/saber/funcs/impl/cuda/vender_gru.h index 15320ad40..6faefe5ef 100644 --- a/saber/funcs/impl/cuda/vender_gru.h +++ b/saber/funcs/impl/cuda/vender_gru.h @@ -12,72 +12,37 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_GRU_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_GRU_H - +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_GRU_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_GRU_H +#include #include "saber/funcs/impl/impl_gru.h" #include "saber/funcs/impl/cuda/cudnn_helper.h" -#include "saber/funcs/funcs_utils.h" -#include "saber/funcs/debug.h" -#include "cuda_fp16.h" - +#include "saber/funcs/impl/cuda/cuda_utils.h" +#include "saber/saber_funcs_param.h" namespace anakin { - namespace saber { - struct ParamsRegion { - - ParamsRegion():_offset(NULL), _size(0){}; - ParamsRegion(void *offset, size_t size):_offset(offset), _size(size){} - ~ParamsRegion(){} - ParamsRegion(const ParamsRegion &right): _offset(right._offset),_size(right._size){}; - - ParamsRegion &operator=(const ParamsRegion &right) { - _offset = right._offset; - _size=right._size; - return *this; - } - bool operator==(const ParamsRegion &right) { - bool comp_eq = true; - comp_eq = comp_eq && (_offset == right._offset); - comp_eq = comp_eq && (_size == right._size); - return comp_eq; - } - - void * _offset; - size_t _size; - }; - -template -class VenderGru: \ - public ImplBase< - Tensor, \ - Tensor, \ - Tensor, \ - GruParam>> { +template +class VenderGru: public ImplBase < + NV, OpDtype, GruParam > { public: - typedef Tensor DataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef typename DataTensor::Dtype DataDtype; - typedef typename OpTensor::Dtype Op_dtype; + typedef typename DataTrait::Dtype OpDataType; + typedef Tensor OpTensor; + + VenderGru() : _handle(NULL), _rnnDesc(NULL), _hxDesc(NULL), _cxDesc(NULL), _hyDesc(NULL), \ + _cyDesc(NULL), _wDesc(NULL), _dropoutDesc(NULL), _workspace_size_in_bytes(0) { - VenderGru() - : _handle(NULL), _rnnDesc(NULL), _hxDesc(NULL), _cxDesc(NULL), _hyDesc(NULL), \ - _cyDesc(NULL), _wDesc(NULL), _dropoutDesc(NULL), _workspace_size_in_bytes(0) {} + } ~VenderGru() { if (_handle != NULL) { 
CUDNN_CHECK(cudnnDestroy(_handle)); } - if(_dropoutDesc){ + + if (_dropoutDesc) { CUDNN_CHECK(cudnnDestroyDropoutDescriptor(_dropoutDesc)); } + if (_rnnDesc) { CUDNN_CHECK(cudnnDestroyRNNDescriptor(_rnnDesc)); } @@ -85,12 +50,15 @@ class VenderGru& inputs, - std::vector& outputs, - GruParam &gru_param, Context &ctx) { - - _workspace_size_in_bytes = 0; - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - - - // ---- create cudnn Descs ---- - CUDNN_CHECK(cudnnCreateDropoutDescriptor(&_dropoutDesc)); - CUDNN_CHECK(cudnnCreateRNNDescriptor(&_rnnDesc)); - - cudnn::createTensorDesc(&_hxDesc); - cudnn::createTensorDesc(&_cxDesc); - cudnn::createTensorDesc(&_hyDesc); - cudnn::createTensorDesc(&_cyDesc); + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + GruParam& param, + Context& ctx) override; - cudnn::createFilterDesc(&_wDesc); + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + GruParam& param, + Context& ctx) override; - if(_is_init_weights==false){ - _hidden_size = gru_param.bias()->valid_size() / 3; - - int weights_bias_size = _hidden_size * 3; - int weights_h2h_size = _hidden_size * _hidden_size * 3; - int weights_i2h_size = gru_param.weight()->valid_size() - weights_h2h_size; - _word_size = weights_i2h_size / _hidden_size / 3; - - Tensor inner_weights_before_host; - Shape weights_shape(1,1,1,gru_param.weight()->valid_size()); - inner_weights_before_host.re_alloc(weights_shape); - inner_weights_before_host.copy_from(*gru_param.weight()); - - const Op_dtype* weights_i2h=inner_weights_before_host.data(); - const Op_dtype* weights_h2h=weights_i2h+weights_i2h_size; - - - - - int temp_size=_hidden_size>_word_size?_hidden_size*_hidden_size:_word_size*_hidden_size; - Shape temp_tensor_shape(1,1,1,temp_size); - Tensor temp_tensor; - 
temp_tensor.re_alloc(temp_tensor_shape); - extract_matrix_from_matrix_in_leddim(weights_i2h,temp_tensor.mutable_data(),0,weights_i2h_size,_hidden_size*3,_hidden_size); - write_tensorfile(temp_tensor,"temp_tensor"); - } - - - - return create(inputs, outputs, gru_param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - GruParam &gru_param, Context &ctx); - - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - GruParam ¶m); + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + GruParam& param) override; private: - cudnnHandle_t _handle; -//! choose for lstm or gru or rnn - + //! choose for lstm or gru or rnn cudnnDropoutDescriptor_t _dropoutDesc; cudnnRNNDescriptor_t _rnnDesc; -//! gate desc have to be valid + //! gate desc have to be valid cudnnTensorDescriptor_t _hxDesc; cudnnTensorDescriptor_t _cxDesc; cudnnTensorDescriptor_t _hyDesc; @@ -187,41 +98,37 @@ class VenderGru> _xDesc; - std::unique_ptr> _yDesc; + //! input and output descs + std::unique_ptr> _xDesc; + std::unique_ptr> _yDesc; + int _word_size; int _hidden_size; - bool _is_init_weights=false; OpTensor _inner_weight; - std::vector _inner_weight_region; - std::vector _inner_bias_region; + OpTensor _inner_weight_i2h; + OpTensor _inner_weight_h2h; + std::vector _inner_weight_region; + std::vector _inner_bias_region; -//! workspace for cudnn - const size_t _workspace_limit_bytes = 64 * 1024 * 1024; + //! workspace for cudnn + const size_t _workspace_limit_bytes = 4 * 1024 * 1024; size_t _workspace_size_in_bytes; - Tensor _workspace_tensor; // underlying storage + OpTensor _workspace_tensor; // underlying storage -//! addition flag + //! addition flag const bool _use_tensor_core = true; -//! 
function to transform weights layout to fit cudnn standard - int get_grnn_params_region(GruParam ¶m) ; - void set_grnn_params_region(GruParam ¶m, int wordSize); - void hw2seq(std::vector inputs, GruParam& param, int word_size, - DataTensor &sequence, DataTensor &out_sequence, Context& ctx); - void seq2hw(std::vector outputs, std::vector inputs, - GruParam& param, int hidden_size, DataTensor &sequence, - Context& ctx); -}; - -template class VenderGru; + //! function to transform weights layout to fit cudnn standard + int get_grnn_params_region(GruParam& param) ; + void set_grnn_params_region(GruParam& param); + void trans_akweights_2_cudnnweights(GruParam& param); -} //namespace saber - -} //namespace anakin + SeqSortedseqTranseUtil _seq_utils; +}; -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_GRU_H +} // namespace saber +} // namespace anakin +#endif // ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_GRU_H diff --git a/saber/funcs/impl/cuda/vender_permute_power.cpp b/saber/funcs/impl/cuda/vender_permute_power.cpp new file mode 100644 index 000000000..987c13da8 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_permute_power.cpp @@ -0,0 +1,162 @@ + +#include "saber/funcs/impl/cuda/vender_permute_power.h" + +namespace anakin{ + +namespace saber{ +template class VenderPermutePower; + +template <> +SaberStatus VenderPermutePower::\ + create(const std::vector*>&inputs, + std::vector*>& outputs, + PermutePowerParam ¶m, Context &ctx) { + + if (!(&ctx == this->_ctx)) { + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + this->_ctx = &ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + } + int input_num = inputs[0]->num(); + int input_channel = inputs[0]->channel(); + int input_height = inputs[0]->height(); + int input_width = inputs[0]->width(); + + bool is_nhwc_to_nchw = param.permute_param.order == std::vector({0, 3, 1, 2}); + bool is_nchw_to_nhwc = 
param.permute_param.order == std::vector({0, 2, 3, 1}); + if (inputs[0]->shape() == inputs[0]->valid_shape()) { + if (is_nhwc_to_nchw) { + cudnn::setTensor4dDesc(&_input_descs, CUDNN_TENSOR_NHWC, + input_num, input_width, input_channel, input_height); + cudnn::setTensor4dDesc(&_output_descs, CUDNN_TENSOR_NCHW, + input_num, input_width, input_channel, input_height); + } else if (is_nchw_to_nhwc){ + cudnn::setTensor4dDesc(&_input_descs, CUDNN_TENSOR_NCHW, + input_num, input_channel, input_height, input_width); + cudnn::setTensor4dDesc(&_output_descs, CUDNN_TENSOR_NHWC, + input_num, input_channel, input_height, input_width); + } else { + //we only support nchw <----> nhwc({0, 3, 1, 2} and {0, 2, 3, 1}) + return SaberUnImplError; + } + } else { + Shape input_stride = inputs[0]->get_stride(); + Shape output_stride = outputs[0]->get_stride(); + int in_num = inputs[0]->num(); + int in_channel = inputs[0]->channel(); + int in_height = inputs[0]->height(); + int in_width = inputs[0]->width(); + int out_num = outputs[0]->num(); + int out_channel = outputs[0]->channel(); + int out_height = outputs[0]->height(); + int out_width = outputs[0]->width(); + int num_index = inputs[0]->num_index(); + int channel_index = inputs[0]->channel_index(); + int height_index = inputs[0]->height_index(); + int width_index = inputs[0]->width_index(); + if (is_nhwc_to_nchw) { + cudnn::setTensor4dDescEx(&_input_descs, + in_num, in_width, in_channel, in_height, + input_stride[num_index], + input_stride[width_index], + input_stride[channel_index], + input_stride[height_index] + ); + cudnn::setTensor4dDescEx(&_output_descs, + out_num, out_channel, out_height, out_width, + output_stride[num_index], + output_stride[channel_index], + output_stride[height_index], + output_stride[width_index] + ); + } else if (is_nchw_to_nhwc) { + cudnn::setTensor4dDescEx(&_input_descs, + in_num, in_channel, in_height, in_width, + input_stride[num_index], + input_stride[channel_index], + input_stride[height_index], + 
input_stride[width_index] + ); + cudnn::setTensor4dDescEx(&_output_descs, + out_num, out_width, out_channel, out_height, + output_stride[num_index], + output_stride[width_index], + output_stride[channel_index], + output_stride[height_index] + ); + } else { + //we only support nchw <----> nhwc({0, 3, 1, 2} and {0, 2, 3, 1}) + return SaberUnImplError; + } + } + return SaberSuccess; +} + +template <> +SaberStatus VenderPermutePower::\ + init(const std::vector *>& inputs, + std::vector *>& outputs, + PermutePowerParam ¶m, \ + Context &ctx) { + + this->_ctx = &ctx; + // ---- get cuda resources ---- + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + // ---- create cudnn Descs ---- + cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_output_descs); + + return create(inputs, outputs, param, ctx); +} + +//call cudnnConvolutionForward here +template <> +SaberStatus VenderPermutePower::\ + dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam ¶m) { + const float* input_data = (const float*)inputs[0]->data(); + float* output_data = (float*)outputs[0]->mutable_data(); + float scale = param.power_param.scale; + float shift = param.power_param.shift; + float power = param.power_param.power; + + bool is_nhwc_to_nchw = param.permute_param.order == std::vector({0, 3, 1, 2}); + bool is_nchw_to_nhwc = param.permute_param.order == std::vector({0, 2, 3, 1}); + + if (shift != 0.f || power != 1.f ) { + LOG(ERROR) << "cudnn permute does not support shift and power"; + return SaberUnImplError; + } else { + //we only support nchw<->nhwc({0, 3, 1, 2} and {0, 2, 3, 1}) + if (!(is_nhwc_to_nchw || is_nchw_to_nhwc)){ + LOG(ERROR) << "cudnn permute does not support this layout"; + return SaberUnImplError; + } + CUDNN_CHECK(cudnnTransformTensor(_handle, + (void*)(&scale), + _input_descs, input_data, + cudnn::cudnnTypeWrapper::kZero(), + 
_output_descs, output_data)); + } + + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(VenderPermutePower, PermutePowerParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(VenderPermutePower, PermutePowerParam, NV, AK_INT8); +} //namespace saber + +} //namespace anakin diff --git a/saber/funcs/impl/cuda/vender_permute_power.h b/saber/funcs/impl/cuda/vender_permute_power.h index e21cf57ea..8e395cded 100644 --- a/saber/funcs/impl/cuda/vender_permute_power.h +++ b/saber/funcs/impl/cuda/vender_permute_power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,28 +23,13 @@ namespace anakin{ namespace saber{ -template -class VenderPermutePower:\ +template +class VenderPermutePower:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PermutePowerParam>> { + NV, OpDtype, + PermutePowerParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; VenderPermutePower() : _handle(NULL) @@ -65,139 +50,24 @@ class VenderPermutePower& inputs, - std::vector& outputs, - PermutePowerParam ¶m, \ - Context &ctx) { - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - // ---- create cudnn Descs ---- - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector&inputs, - std::vector& outputs, - PermutePowerParam ¶m, Context &ctx) { + virtual SaberStatus init(const std::vector *>& inputs, + 
std::vector *>& outputs, + PermutePowerParam ¶m, \ + Context &ctx); - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - - bool is_nhwc_to_nchw = param.permute_param.order == std::vector({0, 3, 1, 2}); - if (inputs[0]->shape() == inputs[0]->valid_shape()) { - if (is_nhwc_to_nchw) { - cudnn::setTensor4dDesc(&_input_descs, CUDNN_TENSOR_NHWC, - input_num, input_width, input_channel, input_height); - cudnn::setTensor4dDesc(&_output_descs, CUDNN_TENSOR_NCHW, - input_num, input_width, input_channel, input_height); - } else { - cudnn::setTensor4dDesc(&_input_descs, CUDNN_TENSOR_NCHW, - input_num, input_channel, input_height, input_width); - cudnn::setTensor4dDesc(&_output_descs, CUDNN_TENSOR_NHWC, - input_num, input_channel, input_height, input_width); - } - } else { - Shape input_stride = inputs[0]->get_stride(); - Shape output_stride = outputs[0]->get_stride(); - int in_num = inputs[0]->num(); - int in_channel = inputs[0]->channel(); - int in_height = inputs[0]->height(); - int in_width = inputs[0]->width(); - int out_num = outputs[0]->num(); - int out_channel = outputs[0]->channel(); - int out_height = outputs[0]->height(); - int out_width = outputs[0]->width(); - int num_index = inputs[0]->num_index(); - int channel_index = inputs[0]->channel_index(); - int height_index = inputs[0]->height_index(); - int width_index = inputs[0]->width_index(); - if (is_nhwc_to_nchw) { - cudnn::setTensor4dDescEx(&_input_descs, - in_num, in_width, in_channel, in_height, - input_stride[num_index], - input_stride[width_index], - input_stride[channel_index], - input_stride[height_index] - ); - 
cudnn::setTensor4dDescEx(&_output_descs, - out_num, out_channel, out_height, out_width, - output_stride[num_index], - output_stride[channel_index], - output_stride[height_index], - output_stride[width_index] - ); - } else { - cudnn::setTensor4dDescEx(&_input_descs, - in_num, in_channel, in_height, in_width, - input_stride[num_index], - input_stride[channel_index], - input_stride[height_index], - input_stride[width_index] - ); - cudnn::setTensor4dDescEx(&_output_descs, - out_num, out_width, out_channel, out_height, - output_stride[num_index], - output_stride[width_index], - output_stride[channel_index], - output_stride[height_index] - ); - } - } - return SaberSuccess; - } + virtual SaberStatus create(const std::vector *>&inputs, + std::vector *>& outputs, + PermutePowerParam ¶m, Context &ctx); //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - PermutePowerParam ¶m) { - const InDataType* input_data = inputs[0]->data(); - OutDataType* output_data = outputs[0]->mutable_data(); - float scale = param.power_param.scale; - float shift = param.power_param.shift; - float power = param.power_param.power; - - if (shift != 0.f || power != 1.f) { - LOG(FATAL) << "cudnn permute does not support shift and power"; - } else { - CUDNN_CHECK(cudnnTransformTensor(_handle, - (void*)(&scale), - _input_descs, input_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, output_data)); - } - - return SaberSuccess; - } + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + PermutePowerParam ¶m); private: cudnnHandle_t _handle; cudnnTensorDescriptor_t _input_descs; cudnnTensorDescriptor_t _output_descs; const bool _use_tensor_core = true; }; -template class VenderPermutePower; } //namespace saber diff --git a/saber/funcs/impl/cuda/vender_pooling.cpp b/saber/funcs/impl/cuda/vender_pooling.cpp new file mode 100644 index 000000000..5a44229ad --- /dev/null +++ 
b/saber/funcs/impl/cuda/vender_pooling.cpp @@ -0,0 +1,105 @@ + +#include "saber/funcs/impl/cuda/vender_pooling.h" + +#include "saber/funcs/impl/impl_pooling.h" + +namespace anakin{ +namespace saber { + +template class VenderPooling; + +template <> +SaberStatus VenderPooling::\ + create(const std::vector& inputs, + std::vector& outputs, + PoolingParam &pooling_param, Context &ctx) { + if (!(&ctx == this->_ctx)) { + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + this->_ctx = &ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + } + + int input_num = inputs[0]->num(); + int input_channel = inputs[0]->channel(); + int input_height = inputs[0]->height(); + int input_width = inputs[0]->width(); + int output_channel = outputs[0]->channel(); + int output_height = outputs[0]->height(); + int output_width = outputs[0]->width(); + + Shape stride_in = inputs[0]->get_stride(); + Shape stride_out = outputs[0]->get_stride(); + + int dim_a[] = {input_num, input_channel, + input_height, input_width}; + + int dim_b[] = {input_num, output_channel, + output_height, output_width}; + + cudnn::setTensorNdDesc(&_input_descs, + inputs[0]->dims(), dim_a, &stride_in[0]); + + cudnn::setTensorNdDesc(&_output_descs, + outputs[0]->dims(), dim_b, &stride_out[0]); + + int windowHeight[] = {pooling_param.window_h, pooling_param.window_w}; + int padding[] = {pooling_param.pad_h, pooling_param.pad_w}; + + int stride[] = {pooling_param.stride_h, pooling_param.stride_w}; + + cudnn::set_nd_pooling_des(&_pooling_descs, pooling_param.pooling_type, + inputs[0]->dims() - 2, windowHeight, + padding,stride); + return SaberSuccess; +} + +template<> +SaberStatus VenderPooling :: \ + init(const std::vector& inputs, + std::vector& outputs, + PoolingParam &pooling_param, Context &ctx) { + + this->_ctx = &ctx; + + cudaStream_t cuda_stream; + cuda_stream = 
ctx.get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + cudnn::createTensorDesc(&_input_descs); + cudnn::createTensorDesc(&_output_descs); + + cudnn::create_pooling_des(&_pooling_descs); + + return create(inputs, outputs, pooling_param, ctx); +} + +template <> +SaberStatus VenderPooling::\ + dispatch(const std::vector& inputs, + std::vector& outputs, + PoolingParam ¶m) { + const float *in_data = (const float*)inputs[0]->data(); + float *out_data = (float*)outputs[0]->mutable_data(); + + CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs, + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + cudnn::cudnnTypeWrapper::kZero(), + _output_descs, out_data + )); + + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(VenderPooling, PoolingParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(VenderPooling, PoolingParam, NV, AK_INT8); +} //namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/cuda/vender_pooling.h b/saber/funcs/impl/cuda/vender_pooling.h index c1cb6b85d..201cc6bed 100644 --- a/saber/funcs/impl/cuda/vender_pooling.h +++ b/saber/funcs/impl/cuda/vender_pooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -23,140 +23,54 @@ namespace anakin{ namespace saber { -template -class VenderPooling:\ +template +class VenderPooling:\ public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam>> { -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - - VenderPooling() : _handle(NULL) {} - - ~VenderPooling() { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - if (_input_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); - } - if (_output_descs) { - CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); - } - if(_pooling_descs) { - cudnnDestroyPoolingDescriptor(_pooling_descs); - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PoolingParam &pooling_param, Context &ctx) { - - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - - cudnn::create_pooling_des(&_pooling_descs); - - return create(inputs, outputs, pooling_param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PoolingParam &pooling_param, Context &ctx) { - if (!(ctx == this->_ctx)) { + NV, + OpDtype, + PoolingParam> { + public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + + VenderPooling() : _handle(NULL) {} + + ~VenderPooling() { if (_handle != NULL) { CUDNN_CHECK(cudnnDestroy(_handle)); } - this->_ctx = ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + if (_input_descs) { + 
CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); + } + if (_output_descs) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); + } + if(_pooling_descs) { + cudnnDestroyPoolingDescriptor(_pooling_descs); + } } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - - Shape stride_in = inputs[0]->get_stride(); - Shape stride_out = outputs[0]->get_stride(); - - int dim_a[] = {input_num, input_channel, - input_height, input_width}; - - int dim_b[] = {input_num, output_channel, - output_height, output_width}; - - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &stride_in[0]); - - cudnn::setTensorNdDesc(&_output_descs, - outputs[0]->dims(), dim_b, &stride_out[0]); - - int windowHeight[] = {pooling_param.window_h, pooling_param.window_w}; - int padding[] = {pooling_param.pad_h, pooling_param.pad_w}; - - int stride[] = {pooling_param.stride_h, pooling_param.stride_w}; - - cudnn::set_nd_pooling_des(&_pooling_descs, pooling_param.pooling_type, - inputs[0]->dims() - 2, windowHeight, - padding,stride); - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m) { - const InDataType *in_data = inputs[0]->data(); - OutDataType *out_data = outputs[0]->mutable_data(); - - CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data - )); - - return SaberSuccess; - } - -private: - cudnnHandle_t _handle; - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - cudnnPoolingDescriptor_t _pooling_descs; - + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + PoolingParam 
&pooling_param, Context &ctx); + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + PoolingParam &pooling_param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + PoolingParam ¶m); + + private: + cudnnHandle_t _handle; + cudnnTensorDescriptor_t _input_descs; + cudnnTensorDescriptor_t _output_descs; + cudnnPoolingDescriptor_t _pooling_descs; + }; -template class VenderPooling; - } //namespace saber } // namespace anakin diff --git a/saber/funcs/impl/cuda/vender_softmax.cpp b/saber/funcs/impl/cuda/vender_softmax.cpp new file mode 100644 index 000000000..d021e55c1 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_softmax.cpp @@ -0,0 +1,101 @@ + +#include "vender_softmax.h" +namespace anakin { +namespace saber { +template <> +SaberStatus VenderSoftmax::create(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam& param, Context& ctx) { + + if (!inputs[0]->is_continue_mem() || !outputs[0]->is_continue_mem()) { + //! 
unsupported type for cudnn + return SaberInvalidValue; + } + + + Shape shape_in = inputs[0]->valid_shape(); + + if (!(ctx == this->_ctx)) { + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + + this->_ctx = &ctx; + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + } + + int outer_num = inputs[0]->count(0, param.axis); + int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims()); + + int N = outer_num; + int K = inputs[0]->valid_shape()[param.axis]; + int H = inner_num; + int W = 1; + + const int stride_w = 1; + const int stride_h = W * stride_w; + const int stride_c = H * stride_h; + const int stride_n = K * stride_c; + CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(_input_desc, \ + cudnn::cudnnOpWrapper::type, \ + N, K, H, W, stride_n, stride_c, stride_h, stride_w)); + CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(_output_desc, \ + cudnn::cudnnOpWrapper::type, \ + N, K, H, W, stride_n, stride_c, stride_h, stride_w)); + + _setup = true; + return SaberSuccess; +} + + /** + * \brief initial all cudnn resources here + * @param inputs + * @param outputs + * @param param + * @param ctx + */ +template <> +SaberStatus VenderSoftmax::init(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam& param, Context& ctx) { + + // ---- init cudnn resources ---- + + this->_ctx = &ctx; + // ---- get cuda resources ---- + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); + + // ---- create cudnn Descs ---- + cudnn::createTensorDesc(&_input_desc); + cudnn::createTensorDesc(&_output_desc); + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderSoftmax::dispatch(const std::vector& inputs, + std::vector& outputs, + SoftmaxParam& param) { + cudaStream_t stream = this->_ctx->get_compute_stream(); + const OpDataType* 
input_data = (const OpDataType*)inputs[0]->data(); + OpDataType* output_data = (OpDataType*)outputs[0]->mutable_data(); + CUDNN_CHECK(cudnnSoftmaxForward(_handle, CUDNN_SOFTMAX_ACCURATE, \ + CUDNN_SOFTMAX_MODE_CHANNEL, cudnn::cudnnTypeWrapper::kOne(), _input_desc, input_data, \ + cudnn::cudnnTypeWrapper::kZero(), _output_desc, output_data)); + //outputs[0]->record_event(stream); + return SaberSuccess; +} + + +DEFINE_OP_TEMPLATE(VenderSoftmax, SoftmaxParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(VenderSoftmax, SoftmaxParam, NV, AK_INT8); + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/vender_softmax.h b/saber/funcs/impl/cuda/vender_softmax.h index 349420974..863fa62c9 100644 --- a/saber/funcs/impl/cuda/vender_softmax.h +++ b/saber/funcs/impl/cuda/vender_softmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,27 +25,15 @@ namespace anakin{ namespace saber{ -template -class VenderSoftmax : \ - public ImplBase< - Tensor, - Tensor, - Tensor, - SoftmaxParam > > +template +class VenderSoftmax: + public ImplBase> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; VenderSoftmax(){ _handle = nullptr; @@ -72,87 +60,15 @@ class VenderSoftmax& inputs, std::vector& outputs, - SoftmaxParam& param, Context& ctx) { - - // ---- init cudnn resources ---- - - this->_ctx = ctx; - // ---- get cuda resources ---- - - cudaStream_t cuda_stream = this->_ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - // ---- create cudnn Descs ---- - cudnn::createTensorDesc(&_input_desc); - cudnn::createTensorDesc(&_output_desc); - - return create(inputs, outputs, param, ctx); - } + SoftmaxParam& param, Context& ctx) ; virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - SoftmaxParam& param, Context &ctx) { - - if (!inputs[0]->is_continue_mem() || !outputs[0]->is_continue_mem()) { - //! 
unsupported type for cudnn - return SaberInvalidValue; - } - - //CHECK_EQ(inputs[0]->shape() == inputs[0]->valid_shape(), true) << \ - "cudnn softmax does not support tensor with roi"; - //CHECK_EQ(outputs[0]->shape() == outputs[0]->valid_shape(), true) << \ - "cudnn softmax does not support tensor with roi"; - - Shape shape_in = inputs[0]->valid_shape(); - if (!(ctx == this->_ctx)) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = ctx; - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int outer_num = inputs[0]->count(0, param.axis); - int inner_num = inputs[0]->count(param.axis + 1, inputs[0]->dims()); - - int N = outer_num; - int K = inputs[0]->valid_shape()[param.axis]; - int H = inner_num; - int W = 1; - - const int stride_w = 1; - const int stride_h = W * stride_w; - const int stride_c = H * stride_h; - const int stride_n = K * stride_c; - CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(_input_desc, \ - cudnn::cudnnOpWrapper::type, \ - N, K, H, W, stride_n, stride_c, stride_h, stride_w)); - CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(_output_desc, \ - cudnn::cudnnOpWrapper::type, \ - N, K, H, W, stride_n, stride_c, stride_h, stride_w)); - - _setup = true; - return SaberSuccess; - } - + SoftmaxParam& param, Context &ctx); //call cudnnConvolutionForward here virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, - SoftmaxParam ¶m){ - cudaStream_t stream = this->_ctx.get_compute_stream(); - const InDataType* input_data = inputs[0]->data(); - InDataType * output_data = outputs[0]->mutable_data(); - CUDNN_CHECK(cudnnSoftmaxForward(_handle, CUDNN_SOFTMAX_ACCURATE, \ - CUDNN_SOFTMAX_MODE_CHANNEL, cudnn::cudnnTypeWrapper::kOne(), _input_desc, input_data, \ - cudnn::cudnnTypeWrapper::kZero(), _output_desc, output_data)); - //outputs[0]->record_event(stream); - return SaberSuccess; - } + SoftmaxParam ¶m); 
private: bool _setup{false}; @@ -161,6 +77,8 @@ class VenderSoftmax \ +//class SaberActivation : public ImplBase > {}; \ +//\ +//template \ +//class VenderActivation : public ImplBase< \ +// TargetType, OpDtype,\ +// ActivationParam > {}; + + } } diff --git a/saber/funcs/impl/impl_argmax.h b/saber/funcs/impl/impl_argmax.h index 747868366..f7477faae 100644 --- a/saber/funcs/impl/impl_argmax.h +++ b/saber/funcs/impl/impl_argmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_axpy.h b/saber/funcs/impl/impl_axpy.h index c25fb0c47..e4a10c52a 100644 --- a/saber/funcs/impl/impl_axpy.h +++ b/saber/funcs/impl/impl_axpy.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_base.h b/saber/funcs/impl/impl_base.h index 5b593f04f..91571e532 100644 --- a/saber/funcs/impl/impl_base.h +++ b/saber/funcs/impl/impl_base.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,47 +17,41 @@ #define ANAKIN_SABER_FUNCS_IMPL_BASE_IMPL_H #include "saber/core/context.h" +#include "saber/core/tensor.h" namespace anakin { namespace saber { -template +template class ImplBase { public: - typedef typename inTensor::targetType_t targetType_t; - //typedef typename inTensor::target_type in_target; - //typedef typename outTensor::target_type out_target; - ImplBase() { - } - - virtual ~ImplBase(){ - } + ImplBase() {} + virtual ~ImplBase() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - Param ¶m, Context &ctx) { + virtual SaberStatus init(const std::vector* >& inputs, + std::vector *>& outputs, + Param ¶m, Context &ctx) { return SaberUnImplError; } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - Param ¶m, Context &ctx) { + virtual SaberStatus create(const std::vector* >& inputs, + std::vector *>& outputs, + Param ¶m, Context &ctx) { return SaberUnImplError; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, + virtual SaberStatus dispatch(const std::vector* >& inputs, + std::vector *>& outputs, Param ¶m) { return SaberUnImplError; } protected: Param* _param; - Context _ctx; + Context* _ctx; }; } diff --git a/saber/funcs/impl/impl_batch_norm.h b/saber/funcs/impl/impl_batch_norm.h deleted file mode 100644 index 5a09220c7..000000000 --- a/saber/funcs/impl/impl_batch_norm.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H -#define ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H - -#include "saber/funcs/impl/impl_macro.h" -namespace anakin{ - -namespace saber{ - -DEFINE_OP_CLASS(BatchNorm, BatchnormParam); - -} -} - -#endif //ANAKIN_SABER_FUNCS_IMPL_BATCH_NORM_H diff --git a/saber/funcs/impl/impl_box_coder.h b/saber/funcs/impl/impl_box_coder.h index 84386a1b5..be58498fb 100644 --- a/saber/funcs/impl/impl_box_coder.h +++ b/saber/funcs/impl/impl_box_coder.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_cast.h b/saber/funcs/impl/impl_cast.h index 3e267a7f0..84282cfcb 100644 --- a/saber/funcs/impl/impl_cast.h +++ b/saber/funcs/impl/impl_cast.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_concat.h b/saber/funcs/impl/impl_concat.h index 4940989bc..c8b9270b2 100644 --- a/saber/funcs/impl/impl_concat.h +++ b/saber/funcs/impl/impl_concat.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_conv.h b/saber/funcs/impl/impl_conv.h index 1c32dd2ca..5d20df1fe 100644 --- a/saber/funcs/impl/impl_conv.h +++ b/saber/funcs/impl/impl_conv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -21,8 +21,28 @@ namespace anakin{ namespace saber{ - -DEFINE_OP_CLASS(Conv2D, ConvParam); +template +class SaberConv2D : public ImplBase< + TargetType, OpDtype, + ConvParam > { +public: + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } +}; + +template +class VenderConv2D : public ImplBase< + TargetType, OpDtype, + ConvParam > { +public: + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } +}; } } diff --git a/saber/funcs/impl/impl_conv_eltwise.h b/saber/funcs/impl/impl_conv_eltwise.h index 471862de2..b61316672 100644 --- a/saber/funcs/impl/impl_conv_eltwise.h +++ b/saber/funcs/impl/impl_conv_eltwise.h @@ -1,29 +1,47 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ -#ifndef ANAKIN_SABER_FUNCS_IMPL_CONV2DELTWISE_H -#define ANAKIN_SABER_FUNCS_IMPL_CONV2DELTWISE_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_CONV_ELTWISE_H +#define ANAKIN_SABER_FUNCS_IMPL_CONV_ELTWISE_H #include "saber/funcs/impl/impl_macro.h" namespace anakin{ namespace saber{ -DEFINE_OP_CLASS(Conv2DEltWise, ConvActiveParam); - +template +class SaberConvEltwise : public ImplBase< + TargetType, OpDtype, + ConvEltwiseParam > { + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } +}; + +template +class VenderConvEltwise : public ImplBase< + TargetType, OpDtype, + ConvEltwiseParam > { + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } +}; } } -#endif //ANAKIN_SABER_FUNCS_IMPL_CONV2DELTWISE_H +#endif //ANAKIN_SABER_FUNCS_IMPL_CONV_ELTWISE_H diff --git a/saber/funcs/impl/impl_conv_pooling.h b/saber/funcs/impl/impl_conv_pooling.h new file mode 100644 index 000000000..79e07429c --- /dev/null +++ b/saber/funcs/impl/impl_conv_pooling.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CONV_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_CONV_POOLING_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +template +class SaberConv2DPooling : public ImplBase< + TargetType, OpDtype, + ConvPoolingParam > { + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } +}; + +template +class VenderConv2DPooling : public ImplBase< + TargetType, OpDtype, + ConvPoolingParam > { + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } +}; + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_CONV_POOLING_H diff --git a/saber/funcs/impl/impl_conv_unpadding_padding.h b/saber/funcs/impl/impl_conv_unpadding_padding.h new file mode 100644 index 000000000..af727502c --- /dev/null +++ b/saber/funcs/impl/impl_conv_unpadding_padding.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_IMPL_CONV_UNPADDING_PADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_IMPL_CONV_UNPADDING_PADDING_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin { + +namespace saber { + +DEFINE_OP_CLASS(ConvUnpaddingPadding, ConvUnpaddingPaddingParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_IMPL_CONV_UNPADDING_PADDING_H diff --git a/saber/funcs/impl/impl_crf_decoding.h b/saber/funcs/impl/impl_crf_decoding.h index b4e770f3e..b05a16834 100644 --- a/saber/funcs/impl/impl_crf_decoding.h +++ b/saber/funcs/impl/impl_crf_decoding.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_crop.h b/saber/funcs/impl/impl_crop.h index d19c9d129..343eae793 100644 --- a/saber/funcs/impl/impl_crop.h +++ b/saber/funcs/impl/impl_crop.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_ctc_align.h b/saber/funcs/impl/impl_ctc_align.h index d31a757f0..a349b6210 100644 --- a/saber/funcs/impl/impl_ctc_align.h +++ b/saber/funcs/impl/impl_ctc_align.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_deconv.h b/saber/funcs/impl/impl_deconv.h index 3e4d7f931..efe1c9e8f 100644 --- a/saber/funcs/impl/impl_deconv.h +++ b/saber/funcs/impl/impl_deconv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,38 @@ namespace anakin{ namespace saber{ -DEFINE_OP_CLASS(Deconv2D, ConvParam); +template +class SaberDeconv2D : public ImplBase< + TargetType, OpDtype, + ConvParam > { +public: + SaberStatus trans_weights(Tensor &target_weights, + Tensor &target_bias, + int in_channel, int out_channel, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int group) { + return SaberUnImplError; + } +}; + +template +class VenderDeconv2D : public ImplBase< + TargetType, OpDtype, + ConvParam > { +public: + SaberStatus trans_weights(Tensor &target_weights, + Tensor &target_bias, + int in_channel, int out_channel, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int group) { + return SaberUnImplError; + } +}; } } diff --git a/saber/funcs/impl/impl_deformable_conv.h b/saber/funcs/impl/impl_deformable_conv.h index f71d3f91e..409e0ac26 100644 --- a/saber/funcs/impl/impl_deformable_conv.h +++ b/saber/funcs/impl/impl_deformable_conv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_detection_output.h b/saber/funcs/impl/impl_detection_output.h index fd1272233..1313f51ef 100644 --- a/saber/funcs/impl/impl_detection_output.h +++ b/saber/funcs/impl/impl_detection_output.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/saber/funcs/impl/impl_eltwise.h b/saber/funcs/impl/impl_eltwise.h index 24c344d6c..dfad1f6be 100644 --- a/saber/funcs/impl/impl_eltwise.h +++ b/saber/funcs/impl/impl_eltwise.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_eltwise_act.h b/saber/funcs/impl/impl_eltwise_act.h index 141ec699e..9637baee0 100644 --- a/saber/funcs/impl/impl_eltwise_act.h +++ b/saber/funcs/impl/impl_eltwise_act.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_embedding.h b/saber/funcs/impl/impl_embedding.h index 1ce4f3047..e3e8f1a15 100644 --- a/saber/funcs/impl/impl_embedding.h +++ b/saber/funcs/impl/impl_embedding.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_fc.h b/saber/funcs/impl/impl_fc.h index b28ede61c..bd3357a73 100644 --- a/saber/funcs/impl/impl_fc.h +++ b/saber/funcs/impl/impl_fc.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/saber/funcs/impl/impl_flatten.h b/saber/funcs/impl/impl_flatten.h index bb2a0eaa6..26fab0ed0 100644 --- a/saber/funcs/impl/impl_flatten.h +++ b/saber/funcs/impl/impl_flatten.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/saber/funcs/impl/impl_gru.h b/saber/funcs/impl/impl_gru.h index ccf488d15..7f769bc22 100644 --- a/saber/funcs/impl/impl_gru.h +++ b/saber/funcs/impl/impl_gru.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_im2sequence.h b/saber/funcs/impl/impl_im2sequence.h index ce7a6ab23..4baee9902 100644 --- a/saber/funcs/impl/impl_im2sequence.h +++ b/saber/funcs/impl/impl_im2sequence.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_layer_norm.h b/saber/funcs/impl/impl_layer_norm.h index da6ccf1d1..f98b1cf6e 100644 --- a/saber/funcs/impl/impl_layer_norm.h +++ b/saber/funcs/impl/impl_layer_norm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_lrn.h b/saber/funcs/impl/impl_lrn.h index 658ac4ed9..4376cce06 100644 --- a/saber/funcs/impl/impl_lrn.h +++ b/saber/funcs/impl/impl_lrn.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_prelu.h b/saber/funcs/impl/impl_lstm.h similarity index 64% rename from saber/funcs/impl/impl_prelu.h rename to saber/funcs/impl/impl_lstm.h index f65f26d6a..12c767a31 100644 --- a/saber/funcs/impl/impl_prelu.h +++ b/saber/funcs/impl/impl_lstm.h @@ -1,29 +1,28 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_PRELU_H -#define ANAKIN_SABER_FUNCS_IMPL_PRELU_H - -#include "saber/funcs/impl/impl_macro.h" -namespace anakin{ - -namespace saber{ - -DEFINE_OP_CLASS(Prelu, PreluParam); - -} -} - -#endif //ANAKIN_SABER_FUNCS_IMPL_PRELU_H +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_LSTM_H +#define ANAKIN_SABER_FUNCS_IMPL_LSTM_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin { +namespace saber { + +DEFINE_OP_CLASS(Lstm, LstmParam); + +} +} + +#endif // ANAKIN_SABER_FUNCS_IMPL_LSTM_H diff --git a/saber/funcs/impl/impl_macro.h b/saber/funcs/impl/impl_macro.h index c6e6fbfe0..bf75def33 100644 --- a/saber/funcs/impl/impl_macro.h +++ b/saber/funcs/impl/impl_macro.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -24,30 +24,29 @@ namespace saber{ #define DEFINE_OP_CLASS(class_name, param_name) \ template \ + DataType OpDtype = AK_FLOAT> \ class Saber##class_name : public ImplBase< \ - Tensor, \ - Tensor, \ - Tensor, \ - param_name > > {}; \ + TargetType, OpDtype,\ + param_name > {}; \ \ template \ + DataType OpDtype = AK_FLOAT> \ class Vender##class_name : public ImplBase< \ - Tensor, \ - Tensor, \ - Tensor, \ - param_name > > {}; + TargetType, OpDtype,\ + param_name > {}; +#define DEFINE_OP_TEMPLATE(op_name, op_param, op_target, op_dtype) \ +template<> SaberStatus op_name::create( \ + const std::vector *>& inputs, \ + std::vector *>& outputs, op_param ¶m, \ + Context &ctx) {return SaberUnImplError;} \ +template<> SaberStatus op_name::init( \ + const std::vector *>& inputs, \ + std::vector *>& outputs, op_param ¶m, \ + Context &ctx) {return SaberUnImplError;} \ +template<> SaberStatus op_name::dispatch( \ + const std::vector *>& inputs, \ + std::vector *>& outputs, op_param ¶m \ + ) {return SaberUnImplError;} +} } -} \ No newline at end of file diff --git a/saber/funcs/impl/impl_mat_mul.h b/saber/funcs/impl/impl_mat_mul.h index be8073b68..2809eb5a3 100644 --- a/saber/funcs/impl/impl_mat_mul.h +++ b/saber/funcs/impl/impl_mat_mul.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_multiclass_nms.h b/saber/funcs/impl/impl_multiclass_nms.h index 82894a819..b360454e8 100644 --- a/saber/funcs/impl/impl_multiclass_nms.h +++ b/saber/funcs/impl/impl_multiclass_nms.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/saber/funcs/impl/impl_mvn.h b/saber/funcs/impl/impl_mvn.h index 54f7e3001..d18c5b326 100644 --- a/saber/funcs/impl/impl_mvn.h +++ b/saber/funcs/impl/impl_mvn.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_normalize.h b/saber/funcs/impl/impl_normalize.h index 4f0b92660..828073efe 100644 --- a/saber/funcs/impl/impl_normalize.h +++ b/saber/funcs/impl/impl_normalize.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_pad.h b/saber/funcs/impl/impl_pad.h index f1a4c92d4..8ac0db533 100644 --- a/saber/funcs/impl/impl_pad.h +++ b/saber/funcs/impl/impl_pad.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_permute.h b/saber/funcs/impl/impl_permute.h index 675eecc20..09a6eac66 100644 --- a/saber/funcs/impl/impl_permute.h +++ b/saber/funcs/impl/impl_permute.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/saber/funcs/impl/impl_permute_power.h b/saber/funcs/impl/impl_permute_power.h index 81f6b57c1..ec486414e 100644 --- a/saber/funcs/impl/impl_permute_power.h +++ b/saber/funcs/impl/impl_permute_power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_pooling.h b/saber/funcs/impl/impl_pooling.h index b2b719515..aeb5ab914 100644 --- a/saber/funcs/impl/impl_pooling.h +++ b/saber/funcs/impl/impl_pooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_pooling_with_index.h b/saber/funcs/impl/impl_pooling_with_index.h index 911fc7e4f..d4c6bd84e 100644 --- a/saber/funcs/impl/impl_pooling_with_index.h +++ b/saber/funcs/impl/impl_pooling_with_index.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_power.h b/saber/funcs/impl/impl_power.h index 6e6e9a3eb..4129b04f1 100644 --- a/saber/funcs/impl/impl_power.h +++ b/saber/funcs/impl/impl_power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/saber/funcs/impl/impl_priorbox.h b/saber/funcs/impl/impl_priorbox.h index 94fc72d36..568610200 100644 --- a/saber/funcs/impl/impl_priorbox.h +++ b/saber/funcs/impl/impl_priorbox.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_reshape.h b/saber/funcs/impl/impl_reshape.h index 025d3cd64..65a433704 100644 --- a/saber/funcs/impl/impl_reshape.h +++ b/saber/funcs/impl/impl_reshape.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_resize.h b/saber/funcs/impl/impl_resize.h index d7aaae43a..5a978b63c 100644 --- a/saber/funcs/impl/impl_resize.h +++ b/saber/funcs/impl/impl_resize.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_deconv_act.h b/saber/funcs/impl/impl_reverse_input.h similarity index 67% rename from saber/funcs/impl/impl_deconv_act.h rename to saber/funcs/impl/impl_reverse_input.h index 590de04cb..fb45ce19e 100644 --- a/saber/funcs/impl/impl_deconv_act.h +++ b/saber/funcs/impl/impl_reverse_input.h @@ -1,29 +1,29 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_IMPL_DECONV2DACT_H -#define ANAKIN_SABER_FUNCS_IMPL_DECONV2DACT_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_REVERSE_INPUT_H +#define ANAKIN_SABER_FUNCS_IMPL_REVERSE_INPUT_H #include "saber/funcs/impl/impl_macro.h" namespace anakin{ namespace saber{ -DEFINE_OP_CLASS(Deconv2DAct, ConvActiveParam); +DEFINE_OP_CLASS(ReverseInput, EmptyParam); } } -#endif //ANAKIN_SABER_FUNCS_IMPL_DECONV2DACT_H +#endif //ANAKIN_SABER_FUNCS_IMPL_REVERSE_INPUT_H diff --git a/saber/funcs/impl/impl_conv_act_pooling.h b/saber/funcs/impl/impl_reverse_sequence.h similarity index 65% rename from saber/funcs/impl/impl_conv_act_pooling.h rename to saber/funcs/impl/impl_reverse_sequence.h index 11585988c..84f2fbd3c 100644 --- a/saber/funcs/impl/impl_conv_act_pooling.h +++ b/saber/funcs/impl/impl_reverse_sequence.h @@ -1,29 +1,30 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ -#ifndef ANAKIN_SABER_FUNCS_IMPL_CONVACTPOOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_CONVACTPOOLING_H + +#ifndef ANAKIN_SABER_FUNCS_IMPL_IMPL_REVERSE_SEQUENCE_H +#define ANAKIN_SABER_FUNCS_IMPL_IMPL_REVERSE_SEQUENCE_H #include "saber/funcs/impl/impl_macro.h" namespace anakin{ namespace saber{ -DEFINE_OP_CLASS(Conv2DActPooling, ConvActivePoolingParam); +DEFINE_OP_CLASS(ReverseSequence, EmptyParam); } } -#endif //ANAKIN_SABER_FUNCS_IMPL_CONVACTPOOLING_H +#endif //ANAKIN_SABER_FUNCS_IMPL_IMPL_REVERSE_SEQUENCE_H diff --git a/saber/funcs/impl/impl_roi_pooling.h b/saber/funcs/impl/impl_roi_pooling.h index 38882c5e1..c55956106 100644 --- a/saber/funcs/impl/impl_roi_pooling.h +++ b/saber/funcs/impl/impl_roi_pooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_scale.h b/saber/funcs/impl/impl_scale.h index f00897b7b..38f467f3f 100644 --- a/saber/funcs/impl/impl_scale.h +++ b/saber/funcs/impl/impl_scale.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_conv_act.h b/saber/funcs/impl/impl_sequence_conv.h similarity index 63% rename from saber/funcs/impl/impl_conv_act.h rename to saber/funcs/impl/impl_sequence_conv.h index f020e136e..636dfcced 100644 --- a/saber/funcs/impl/impl_conv_act.h +++ b/saber/funcs/impl/impl_sequence_conv.h @@ -1,29 +1,29 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_IMPL_CON2DACT_H -#define ANAKIN_SABER_FUNCS_IMPL_CON2DACT_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_SEQUENCECONV_H +#define ANAKIN_SABER_FUNCS_IMPL_SEQUENCECONV_H #include "saber/funcs/impl/impl_macro.h" -namespace anakin{ +namespace anakin { -namespace saber{ +namespace saber { -DEFINE_OP_CLASS(Conv2DAct, ConvActiveParam); +DEFINE_OP_CLASS(SequenceConv, SequenceConvParam); } } -#endif //ANAKIN_SABER_FUNCS_IMPL_CON2DACT_H +#endif //ANAKIN_SABER_FUNCS_IMPL_SEQUENCEPOOL_H diff --git a/saber/funcs/impl/impl_sequence_pool.h b/saber/funcs/impl/impl_sequence_pool.h index c5ceb5ad6..22730a5bd 100644 --- a/saber/funcs/impl/impl_sequence_pool.h +++ b/saber/funcs/impl/impl_sequence_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_slice.h b/saber/funcs/impl/impl_slice.h index be54facd7..d69561f5a 100644 --- a/saber/funcs/impl/impl_slice.h +++ b/saber/funcs/impl/impl_slice.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/saber/funcs/impl/impl_softmax.h b/saber/funcs/impl/impl_softmax.h index a91d0afbd..76aa68509 100644 --- a/saber/funcs/impl/impl_softmax.h +++ b/saber/funcs/impl/impl_softmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_spp.h b/saber/funcs/impl/impl_spp.h index 6466226ae..f708f05ef 100644 --- a/saber/funcs/impl/impl_spp.h +++ b/saber/funcs/impl/impl_spp.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_transpose.h b/saber/funcs/impl/impl_transpose.h index f5313f99e..22eb54c3f 100644 --- a/saber/funcs/impl/impl_transpose.h +++ b/saber/funcs/impl/impl_transpose.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/saber/funcs/impl/impl_unpool.h b/saber/funcs/impl/impl_unpool.h index a2f57abab..3a0f66432 100644 --- a/saber/funcs/impl/impl_unpool.h +++ b/saber/funcs/impl/impl_unpool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/saber/funcs/impl/x86/avx_mathfun.h b/saber/funcs/impl/x86/avx_mathfun.h deleted file mode 100644 index 8e698e746..000000000 --- a/saber/funcs/impl/x86/avx_mathfun.h +++ /dev/null @@ -1,735 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -/* - AVX implementation of sin, cos, sincos, exp and log - - Based on "sse_mathfun.h", by Julien Pommier - http://gruntthepeon.free.fr/ssemath/ - - Copyright (C) 2012 Giovanni Garberoglio - Interdisciplinary Laboratory for Computational Science (LISC) - Fondazione Bruno Kessler and University of Trento - via Sommarive, 18 - I-38123 Trento (Italy) - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. 
This notice may not be removed or altered from any source distribution. - - (this is the zlib license) -*/ - -#include - -/* yes I know, the top of this file is quite ugly */ -#define ALIGN32_BEG -#define ALIGN32_END __attribute__((aligned(32))) - -/* __m128 is ugly to write */ -typedef __m256 v8sf; // vector of 8 float (avx) -typedef __m256i v8si; // vector of 8 int (avx) -typedef __m128i v4si; // vector of 8 int (avx) - -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ - Val, Val, Val, Val} - -_PI32AVX_CONST(1, 1); -_PI32AVX_CONST(inv1, ~1); -_PI32AVX_CONST(2, 2); -_PI32AVX_CONST(4, 4); - -/* declare some AVX constants -- why can't I figure a better way to do that? */ -#define _PS256_CONST(Name, Val) \ - static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} -#define _PI32_CONST256(Name, Val) \ - static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} -#define _PS256_CONST_TYPE(Name, Type, Val) \ - static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} - -_PS256_CONST(1, 1.0f); -_PS256_CONST(0p5, 0.5f); -/* the smallest non denormalized float number */ -_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); -_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); -_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); - -_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); - -_PI32_CONST256(0, 0); -_PI32_CONST256(1, 1); -_PI32_CONST256(inv1, ~1); -_PI32_CONST256(2, 2); -_PI32_CONST256(4, 4); -_PI32_CONST256(0x7f, 0x7f); - -_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); -_PS256_CONST(cephes_log_p0, 7.0376836292E-2); -_PS256_CONST(cephes_log_p1, -1.1514610310E-1); -_PS256_CONST(cephes_log_p2, 1.1676998740E-1); -_PS256_CONST(cephes_log_p3, -1.2420140846E-1); -_PS256_CONST(cephes_log_p4, +1.4249322787E-1); 
-_PS256_CONST(cephes_log_p5, -1.6668057665E-1); -_PS256_CONST(cephes_log_p6, +2.0000714765E-1); -_PS256_CONST(cephes_log_p7, -2.4999993993E-1); -_PS256_CONST(cephes_log_p8, +3.3333331174E-1); -_PS256_CONST(cephes_log_q1, -2.12194440e-4); -_PS256_CONST(cephes_log_q2, 0.693359375); - -#ifndef __AVX2__ - -typedef union imm_xmm_union { - v8si imm; - v4si xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union u __attribute__((aligned(32))); \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union u __attribute__((aligned(32))); \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline v8si avx2_mm256_##fn(v8si x, int a) { \ - /* use SSE2 instruction to perform the bitop AVX2 */ \ - v4si x1, x2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, a); \ - x2 = _mm_##fn(x2, a); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -//#warning "Using SSE2 to perform AVX2 bitshift ops" -AVX2_BITOP_USING_SSE2(slli_epi32) -AVX2_BITOP_USING_SSE2(srli_epi32) - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ - /* use SSE2 instructions to perform the AVX2 integer operation */ \ - v4si x1, x2; \ - v4si y1, y2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -//#warning "Using SSE2 to perform AVX2 integer ops" -AVX2_INTOP_USING_SSE2(and_si128) -AVX2_INTOP_USING_SSE2(andnot_si128) -AVX2_INTOP_USING_SSE2(cmpeq_epi32) -AVX2_INTOP_USING_SSE2(sub_epi32) -AVX2_INTOP_USING_SSE2(add_epi32) -#define avx2_mm256_and_si256 avx2_mm256_and_si128 -#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128 -#else -#define avx2_mm256_slli_epi32 _mm256_slli_epi32 -#define avx2_mm256_srli_epi32 
_mm256_srli_epi32 -#define avx2_mm256_and_si256 _mm256_and_si256 -#define avx2_mm256_andnot_si256 _mm256_andnot_si256 -#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 -#define avx2_mm256_sub_epi32 _mm256_sub_epi32 -#define avx2_mm256_add_epi32 _mm256_add_epi32 -#endif /* __AVX2__ */ - -/* natural logarithm computed for 8 simultaneous float - return NaN for x <= 0 -*/ -v8sf log256_ps(v8sf x) { - v8si imm0; - v8sf one = *(v8sf *)_ps256_1; - - // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); - v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - - x = _mm256_max_ps( - x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ - - // can be done with AVX2 - imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); - - /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); - - // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); - v8sf e = _mm256_cvtepi32_ps(imm0); - - e = _mm256_add_ps(e, one); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); - v8sf tmp = _mm256_and_ps(x, mask); - x = _mm256_sub_ps(x, one); - e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); - x = _mm256_add_ps(x, tmp); - - v8sf z = _mm256_mul_ps(x, x); - - v8sf y = *(v8sf *)_ps256_cephes_log_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, 
*(v8sf *)_ps256_cephes_log_p6); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); - y = _mm256_mul_ps(y, x); - - y = _mm256_mul_ps(y, z); - - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); - y = _mm256_add_ps(y, tmp); - - tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); - y = _mm256_sub_ps(y, tmp); - - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); - x = _mm256_add_ps(x, y); - x = _mm256_add_ps(x, tmp); - x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN - return x; -} - -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); - -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); - -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); - -v8sf exp256_ps(v8sf x) { - v8sf tmp = _mm256_setzero_ps(), fx; - v8si imm0; - v8sf one = *(v8sf *)_ps256_1; - - x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); - - /* how to perform a floorf with SSE: just below */ - // imm0 = _mm256_cvttps_epi32(fx); - // tmp = _mm256_cvtepi32_ps(imm0); - - tmp = _mm256_floor_ps(fx); - - /* if greater, substract 1 */ - // v8sf mask = _mm256_cmpgt_ps(tmp, fx); - v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); - mask = _mm256_and_ps(mask, one); - fx = _mm256_sub_ps(tmp, mask); - - tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); - x = _mm256_sub_ps(x, tmp); - x = 
_mm256_sub_ps(x, z); - - z = _mm256_mul_ps(x, x); - - v8sf y = *(v8sf *)_ps256_cephes_exp_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); - y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, x); - y = _mm256_add_ps(y, one); - - /* build 2^n */ - imm0 = _mm256_cvttps_epi32(fx); - // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); - imm0 = avx2_mm256_slli_epi32(imm0, 23); - v8sf pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} - -_PS256_CONST(minus_cephes_DP1, -0.78515625); -_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS256_CONST(sincof_p0, -1.9515295891E-4); -_PS256_CONST(sincof_p1, 8.3321608736E-3); -_PS256_CONST(sincof_p2, -1.6666654611E-1); -_PS256_CONST(coscof_p0, 2.443315711809948E-005); -_PS256_CONST(coscof_p1, -1.388731625493765E-003); -_PS256_CONST(coscof_p2, 4.166664568298827E-002); -_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - -/* evaluation of 8 sines at onces using AVX intrisics - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. 
- -*/ -v8sf sin256_ps(v8sf x) { // any x - v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; - v8si imm0, imm2; - -#ifndef __AVX2__ - v4si imm0_1, imm0_2; - v4si imm2_1, imm2_2; -#endif - - sign_bit = x; - /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); - - /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); - -/* - Here we start a series of integer operations, which are in the - realm of AVX2. - If we don't have AVX, let's perform them using SSE2 directives -*/ - -#ifdef __AVX2__ - /* store the integer part of y in mm0 */ - imm2 = _mm256_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); - y = _mm256_cvtepi32_ps(imm2); - - /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); - imm0 = avx2_mm256_slli_epi32(imm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4 +SaberStatus JitAvx2Conv::check_conf( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + + ConvParam *conv_param = &(param); + const Tensor *weights = conv_param->weight(); + const Tensor *bias = conv_param->bias(); + const jit_conv_conf_t jcp = kernel->jcp; + Tensor *input = inputs[0]; + Tensor *output = outputs[0]; + + // check format +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value)) { +// LOG(ERROR) << "wrong format"; +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW) + || (outputs[0]->get_layout() != Layout_NCHW_C8) + || (weights->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + + // check param + bool param_ok = true + && jcp.t_pad == 
conv_param->pad_h + && jcp.l_pad == conv_param->pad_w + && jcp.stride_h == conv_param->stride_h + && jcp.stride_w == conv_param->stride_w + && jcp.dilate_h == conv_param->dilation_h + && jcp.dilate_w == conv_param->dilation_w; + + // check shape + bool shape_ok = true + && jcp.kh == weights->height() + && jcp.kw == weights->width() + && jcp.ngroups == 1 + && jcp.mb == input->num() + && jcp.ic == input->channel() + && jcp.ih == input->height() + && jcp.iw == input->width() + && jcp.oc == output->channel() + && jcp.oh == output->height() + && jcp.ow == output->width(); + + if (param_ok && shape_ok) { + return SaberSuccess; + } else { + LOG(INFO) << "param or shape changed, re-init kernel"; + return SaberNotInitialized; + } +} + +template <> +SaberStatus JitAvx2Conv::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context &ctx) { + + SaberStatus status = SaberSuccess; + ConvParam *conv_param = &(param); + ActivationParam *act_param = nullptr; + const Tensor *weights = conv_param->weight(); + Tensor *input = inputs[0]; + Tensor *output = outputs[0]; + + // check conf + if (kernel) { + status = check_conf(inputs, outputs, param); + if(status != SaberNotInitialized) { + return status; + } + } + + // init conf + conf.ngroups = 1; + conf.mb = input->num(); + conf.ic = input->channel(); + conf.ih = input->height(); + conf.iw = input->width(); + + conf.oc = output->channel(); + conf.oh = output->height(); + conf.ow = output->width(); + + conf.kh = weights->height(); + conf.kw = weights->width(); + conf.stride_h = conv_param->stride_h; + conf.stride_w = conv_param->stride_w; + conf.t_pad = conv_param->pad_h; + conf.l_pad = conv_param->pad_w; + conf.dilate_h = conv_param->dilation_h; + conf.dilate_w = conv_param->dilation_w; + conf.with_bias = (conv_param->bias()->valid_size() > 0); + + conf.with_relu = param.activation_param.has_active; + if (conf.with_relu) { + act_param = &(param.activation_param); + conf.relu_negative_slope = 
act_param->negative_slope; + } + status = jit_avx2_conv_act_kernel::init_conf(conf); + if (status == SaberSuccess) { + if (kernel != nullptr) { + delete kernel; + kernel = nullptr; + } + kernel = new jit_avx2_conv_act_kernel(this->conf); + } else { + return SaberUnImplError; + } + + // reorder weights + Tensor *weights_reorder = conv_param->mutable_weight(); + weights_internal.reset(new Tensor(weights_reorder->valid_shape())); + weight_reorder_OIhwi8o(*weights_reorder, *weights_internal); + + return SaberSuccess; +} + +template <> +SaberStatus JitAvx2Conv::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context &ctx) { + +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value)) { +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW) + || (outputs[0]->get_layout() != Layout_NCHW_C8) + || (param.weight()->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus JitAvx2Conv::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + + ConvParam *conv_param = &(param); + const Tensor *bias = conv_param->bias(); + + const float *ptr_src = reinterpret_cast(inputs[0]->data()); + const float *ptr_weights = reinterpret_cast(weights_internal->data()); + const float *ptr_bias = reinterpret_cast(bias->data()); + auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + + const auto &jcp = kernel->jcp; + + int ocb_work = utils::div_up(jcp.nb_oc, jcp.nb_oc_blocking); + // int gb_work = jcp.nb_g; + const size_t work_amount = jcp.mb * jcp.ngroups * ocb_work * jcp.oh; + // const size_t work_amount_dw = jcp.mb * gb_work * jcp.oh; + + auto ker = [&](const int ithr, const int nthr) { + size_t start{0}, end{0}; + utils::balance211(work_amount, nthr, ithr, start, end); + + int icbb = 0; + while (icbb < jcp.nb_ic) { + int icb_step 
= jcp.nb_ic_blocking; + int icb_step_rem = jcp.nb_ic - icbb; + if (icb_step_rem < jcp.nb_ic_blocking_max) { + icb_step = icb_step_rem; + } + + size_t n{0}, g{0}, ocbb{0}, oh{0}; + utils::nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + for (size_t iwork = start; iwork < end; ++iwork) { + int ocb = ocbb * jcp.nb_oc_blocking; + int ocb_num = jcp.nb_oc_blocking; + + for (int icb = icbb; icb < icbb + icb_step; ++icb) { + jit_conv_call_t par_conv; + + const int ij = oh * jcp.stride_h; + const int i_t_overflow = saber::utils::max(0, jcp.t_pad - ij); + const int i_b_overflow = saber::utils::max(jcp.ih, ij + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih; + + const size_t _oc = g * jcp.nb_oc + ocb; + const size_t _ic = g * jcp.nb_ic + icb; + + const int src_ic = jcp.ic == 3 ? 0 : _ic; + const int wgt_ic = jcp.ic == 3 ? 0 : icb; + const int ih = saber::utils::max(ij - jcp.t_pad + utils::div_up(i_t_overflow, + (jcp.dilate_h + 1)) * (jcp.dilate_h + 1), 0); + + par_conv.src = ptr_src + n * jcp.ic * jcp.iw * jcp.ih + src_ic * jcp.iw * jcp.ih + ih * jcp.iw; + par_conv.dst = ptr_dst + n * jcp.oc * jcp.ow * jcp.oh + _oc * jcp.ow * jcp.oh * 8 + oh * jcp.ow * 8; + + const int wh = utils::div_up(i_t_overflow, (jcp.dilate_h + 1)); + + par_conv.filt = ptr_weights + ocb * jcp.kh * jcp.kw * jcp.ic * 8 + wgt_ic * 8 + wh * jcp.kw * jcp.ic * 8; + + if (icb == 0) { + if (bias) { + par_conv.bias = ptr_bias + _oc * 8; + } + par_conv.flags |= 1 << 4; + } + + if (jcp.with_relu && icb + 1 == jcp.nb_ic) { + par_conv.flags |= 1 << 5; + } + + par_conv.oc_blocks = saber::utils::min(ocb + ocb_num, jcp.nb_oc) - ocb; + + par_conv.kw_padding = 0; + const int kh_padding = jcp.kh - + utils::div_up(i_t_overflow, (jcp.dilate_h + 1)) - + utils::div_up(i_b_overflow, (jcp.dilate_h + 1)); + par_conv.kh_padding = saber::utils::max(0, kh_padding); + kernel->jit_ker(&par_conv); + } + utils::nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, + oh, jcp.oh); + } + 
icbb += icb_step; + } + }; + +#pragma omp parallel + { + ker(omp_get_thread_num(), omp_get_num_threads()); + } + + return SaberSuccess; +} + +template class JitAvx2Conv; + + +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_avx2_conv.h b/saber/funcs/impl/x86/jit_avx2_conv.h new file mode 100644 index 000000000..9243dc0b1 --- /dev/null +++ b/saber/funcs/impl/x86/jit_avx2_conv.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_H + +#include + +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/core/tensor.h" +#include "saber/funcs/impl/x86/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.h" + +namespace anakin { +namespace saber { + +template +class JitAvx2Conv : public ImplBase< + X86, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + JitAvx2Conv() {kernel = nullptr;} + ~JitAvx2Conv() { + if (kernel) { + delete kernel; + } + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) override; + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) override; + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m) override; +private: + jit::jit_conv_conf_t conf; + jit::jit_avx2_conv_act_kernel *kernel = nullptr; + std::shared_ptr > weights_internal; + SaberStatus check_conf(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m); +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_H diff --git a/saber/funcs/impl/x86/jit_avx2_conv_act.cpp b/saber/funcs/impl/x86/jit_avx2_conv_act.cpp deleted file mode 100644 index 04773d61a..000000000 --- a/saber/funcs/impl/x86/jit_avx2_conv_act.cpp +++ /dev/null @@ -1,223 +0,0 @@ -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.h" -#include "saber/funcs/impl/x86/jit_avx2_conv_act.h" - -#include "x86_utils.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -using jit_conv_ker_t = void (*)(jit_conv_call_t *); - -inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t &p, - const void *src, const void *dst, const void 
*filt, const void *bias, - int channel, int kh_padding) { -#define PIPELINE(field) \ - do { \ - p.field = p.field ## _prf; \ - p.field ## _prf = field; \ - } while (0) - - PIPELINE(src); - PIPELINE(dst); - PIPELINE(filt); - PIPELINE(bias); - PIPELINE(channel); - PIPELINE(kh_padding); - - if (p.src) { - ker(&p); - } -} - -template -SaberStatus JitAvx2ConvAct::init( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, Context &ctx) { - this->_ctx = ctx; - ConvParam *conv_param = &(param.conv_param); - ActivationParam *act_param = &(param.activation_param); - - const opTensor *weights = conv_param->weight(); - Shape src_shape(inputs[0]->shape()); - Shape dst_shape(outputs[0]->shape()); - Shape weights_shape(weights->shape()); - - // const bool with_groups = false; - conf.ngroups = 1; - conf.mb = src_shape[0]; - conf.ic = src_shape[1]; - conf.ih = src_shape[2]; - conf.iw = src_shape[3]; - - conf.oc = dst_shape[1] * 8; - conf.oh = dst_shape[2]; - conf.ow = dst_shape[3]; - - conf.kh = weights_shape[2]; - conf.kw = weights_shape[3]; - conf.stride_h = conv_param -> stride_h; - conf.stride_w = conv_param -> stride_w; - conf.t_pad = conv_param -> pad_h; - conf.l_pad = conv_param -> pad_w; - conf.dilate_h = conv_param -> dilation_h; - conf.dilate_w = conv_param -> dilation_w; - conf.with_relu = param.has_active; - conf.with_bias = !(conv_param -> bias() == NULL); - - if (conf.with_relu) { - conf.relu_negative_slope = static_cast(act_param->negative_slope); - } - - if (!(std::is_same::value && - std::is_same::value && - std::is_same::value)) { - return SaberUnImplError; - } - - SaberStatus status = jit_avx2_conv_act_kernel::init_conf(conf); - if (status == SaberSuccess) { - return create(inputs, outputs, param, ctx); - } else { - return SaberUnImplError; - } -} -template -SaberStatus JitAvx2ConvAct::create( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, Context &ctx) { - kernel_ = new jit_avx2_conv_act_kernel(this->conf); - - 
ConvParam *conv_param = &(param.conv_param); - opTensor *weights = conv_param->mutable_weight(); - weights_internal.reset(new opTensor(weights->shape())); - weight_reorder_OIhwi8o(*weights, *weights_internal); - - return SaberSuccess; -} - -template -SaberStatus JitAvx2ConvAct::dispatch( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) { - - ConvParam *conv_param = &(param.conv_param); - const opTensor *bias = conv_param->bias(); - - const dtype *ptr_src = reinterpret_cast(inputs[0]->get_buf()->get_data()); - const dtype *ptr_weights = reinterpret_cast(weights_internal->get_buf()->get_data()); - const dtype *ptr_bias = reinterpret_cast(bias->get_buf()-> get_data()); - auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); - - const auto &jcp = kernel_->jcp; - - int ocb_work = utils::div_up(jcp.nb_oc, jcp.nb_oc_blocking); - // int gb_work = jcp.nb_g; - const size_t work_amount = jcp.mb * jcp.ngroups * ocb_work * jcp.oh; - // const size_t work_amount_dw = jcp.mb * gb_work * jcp.oh; - - auto ker = [&](const int ithr, const int nthr) { - size_t start{0}, end{0}; - utils::balance211(work_amount, nthr, ithr, start, end); - - int icbb = 0; - while (icbb < jcp.nb_ic) { - int icb_step = jcp.nb_ic_blocking; - int icb_step_rem = jcp.nb_ic - icbb; - if (icb_step_rem < jcp.nb_ic_blocking_max) { - icb_step = icb_step_rem; - } - - size_t n{0}, g{0}, ocbb{0}, oh{0}; - utils::nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); - for (size_t iwork = start; iwork < end; ++iwork) { - int ocb = ocbb * jcp.nb_oc_blocking; - int ocb_num = jcp.nb_oc_blocking; - - for (int icb = icbb; icb < icbb + icb_step; ++icb) { - jit_conv_call_t par_conv = {}; - - const int ij = oh * jcp.stride_h; - const int i_t_overflow = saber::utils::max(0, jcp.t_pad - ij); - const int i_b_overflow = saber::utils::max(jcp.ih, ij + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih; - - const size_t _oc = g * jcp.nb_oc + ocb; - const size_t _ic = 
g * jcp.nb_ic + icb; - - const int src_ic = jcp.ic == 3 ? 0 : _ic; - const int wgt_ic = jcp.ic == 3 ? 0 : icb; - const int ih = saber::utils::max(ij - jcp.t_pad + utils::div_up(i_t_overflow, - (jcp.dilate_h + 1)) * (jcp.dilate_h + 1), 0); - - par_conv.src = ptr_src + n * jcp.ic * jcp.iw * jcp.ih + src_ic * jcp.iw * jcp.ih + ih * jcp.iw; - par_conv.dst = ptr_dst + n * jcp.oc * jcp.ow * jcp.oh + _oc * jcp.ow * jcp.oh * 8 + oh * jcp.ow * 8; - - const int wh = utils::div_up(i_t_overflow, (jcp.dilate_h + 1)); - - par_conv.filt = ptr_weights + ocb * jcp.kh * jcp.kw * jcp.ic * 8 + wgt_ic * 8 + wh * jcp.kw * jcp.ic * 8; - - if (icb == 0) { - if (bias) { - par_conv.bias = ptr_bias + _oc * 8; - } - par_conv.flags |= 1 << 4; - } - - if (jcp.with_relu && icb + 1 == jcp.nb_ic) { - par_conv.flags |= 1 << 5; - } - - par_conv.oc_blocks = saber::utils::min(ocb + ocb_num, jcp.nb_oc) - ocb; - - par_conv.kw_padding = 0; - const int kh_padding = jcp.kh - - utils::div_up(i_t_overflow, (jcp.dilate_h + 1)) - - utils::div_up(i_b_overflow, (jcp.dilate_h + 1)); - par_conv.kh_padding = saber::utils::max(0, kh_padding); - kernel_->jit_ker(&par_conv); - } - utils::nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, - oh, jcp.oh); - } - icbb += icb_step; - } - }; - - #pragma omp parallel - { - ker(omp_get_thread_num(), omp_get_num_threads()); - } - - return SaberSuccess; -} - -template class JitAvx2ConvAct; -template class JitAvx2ConvAct; -template class JitAvx2ConvAct; - - -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_avx2_conv_act.h b/saber/funcs/impl/x86/jit_avx2_conv_act.h deleted file mode 100644 index a719b9245..000000000 --- a/saber/funcs/impl/x86/jit_avx2_conv_act.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_ACT_H - -#include - -#include "saber/saber_funcs_param.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/core/tensor.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.h" - -namespace anakin { -namespace saber { - -template -class JitAvx2ConvAct : public ImplBase, - Tensor, - Tensor, - ConvActiveParam>> { -public: - typedef Tensor inTensor; - typedef Tensor outTensor; - typedef Tensor opTensor; - typedef typename inTensor::Dtype dtype; - - JitAvx2ConvAct() {} - ~JitAvx2ConvAct() { delete kernel_; } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) override; -private: - jit::jit_conv_conf_t conf; - jit::jit_avx2_conv_act_kernel *kernel_; - std::shared_ptr weights_internal; -}; - -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_ACT_H diff --git a/saber/funcs/impl/x86/jit_avx512_conv.cpp b/saber/funcs/impl/x86/jit_avx512_conv.cpp new file mode 100644 index 000000000..e89c51b36 --- /dev/null +++ b/saber/funcs/impl/x86/jit_avx512_conv.cpp @@ -0,0 +1,347 @@ +#include + +#include "saber/funcs/impl/x86/jit_avx512_conv.h" +#include 
"saber/funcs/impl/x86/jit_call_conf.h" +#include "saber/funcs/impl/x86/x86_utils.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +using jit_conv_ker_t = void (*)(jit_conv_call_t *); + +inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t &p, + const void *src, const void *dst, const void *filt, const void *bias, + int channel, int kh_padding) { +#define PIPELINE(field) \ + do { \ + p.field = p.field ## _prf; \ + p.field ## _prf = field; \ + } while (0) + + PIPELINE(src); + PIPELINE(dst); + PIPELINE(filt); + PIPELINE(bias); + PIPELINE(channel); + PIPELINE(kh_padding); + + if (p.src) { + ker(&p); + } +} + + +template <> +SaberStatus JitAvx512Conv::check_conf( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + ConvParam *conv_param = &(param); + const Tensor *weights = conv_param->weight(); + const Tensor *bias = conv_param->bias(); + const jit_conv_conf_t jcp = kernel->jcp; + Tensor *input = inputs[0]; + Tensor *output = outputs[0]; + conf.is_1stconv = utils::one_of(input->channel(), 1, 3); + + // check format + if (conf.is_1stconv) { +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value )) { +// LOG(ERROR) << "1stconv wrong format "; +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW) + || (outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + } else { +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value)) { +// LOG(ERROR) << "wrong format "; +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW_C16) + || (outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + } + + // check param + bool param_ok = true + && jcp.t_pad == conv_param->pad_h + && jcp.l_pad == 
conv_param->pad_w + && jcp.stride_h == conv_param->stride_h + && jcp.stride_w == conv_param->stride_w + && jcp.dilate_h == conv_param->dilation_h + && jcp.dilate_w == conv_param->dilation_w; + + // check shape + bool shape_ok = true + && jcp.kh == weights->height() + && jcp.kw == weights->width() + && jcp.ngroups == 1 + && jcp.mb == input->num() + && jcp.ic == input->channel() + && jcp.ih == input->height() + && jcp.iw == input->width() + && jcp.oc == output->channel() + && jcp.oh == output->height() + && jcp.ow == output->width(); + + if (param_ok && shape_ok) { + return SaberSuccess; + } else { + LOG(INFO) << "param or shape changed, re-init kernel"; + return SaberNotInitialized; + } + +} + +template <> +SaberStatus JitAvx512Conv::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) { + SaberStatus status; + ConvParam *conv_param = &(param); + ActivationParam *act_param = nullptr; + const Tensor *weights = conv_param->weight(); + Tensor *output = outputs[0]; + Tensor *input = inputs[0]; + + // check conf + if (kernel) { + status = check_conf(inputs, outputs, param); + if(status != SaberNotInitialized) { + return status; + } + } + + // init conf + const bool with_groups = false; + conf.ngroups = with_groups ? 
weights->num() : 1; + + conf.mb = input->num(); + conf.ic = input->channel() / conf.ngroups; + conf.ih = input->height(); + conf.iw = input->width(); + + conf.oc = output->channel() / conf.ngroups; + conf.oh = output->height(); + conf.ow = output->width(); + + conf.kh = weights->height(); + conf.kw = weights->width(); + conf.stride_h = conv_param->stride_h; + conf.stride_w = conv_param->stride_w; + conf.t_pad = conv_param->pad_h; + conf.l_pad = conv_param->pad_w; + conf.dilate_h = conv_param->dilation_h; + conf.dilate_w = conv_param->dilation_w; + + conf.with_relu = param.activation_param.has_active; + if (conf.with_relu) { + act_param = &(param.activation_param); + conf.relu_negative_slope = static_cast(act_param->negative_slope); + } + conf.with_bias = (conv_param->bias() != NULL); + + status = jit_conv_kernel::init_conf(conf); + if (status == SaberSuccess) { + if (kernel != nullptr) { + delete kernel; + kernel = nullptr; + } + kernel = new jit_conv_kernel(conf); + } else { + return SaberUnImplError; + } + + // reorder weights + Tensor *weights_reorder = conv_param->mutable_weight(); + weights_internal.reset(new Tensor(weights_reorder->valid_shape())); + +// if (std::is_same::value) { + if (inputs[0]->get_layout() == Layout_NCHW) { + weight_reorder_OIhwi16o(*weights_reorder, *weights_internal); +// } else if (std::is_same::value) { + } else if (inputs[0]->get_layout() == Layout_NCHW_C16) { + weight_reorder_OIhw16i16o(*weights_reorder, *weights_internal); + } + + return SaberSuccess; +} + +template <> +SaberStatus JitAvx512Conv::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) { + Tensor *input = inputs[0]; + conf.is_1stconv = utils::one_of(input->channel(), 1, 3); + + if (conf.is_1stconv) { +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value )) { +// LOG(ERROR) << "data layout is not supported"; +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW) + || 
(outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + LOG(ERROR) << "data layout is not supported"; + return SaberUnImplError; + } + } else { +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value)) { +// LOG(ERROR) << "data layout is not supported"; +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW_C16) + || (outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + LOG(ERROR) << "data layout is not supported"; + return SaberUnImplError; + } + } + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus JitAvx512Conv::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + + ConvParam *conv_param = &(param); + const Tensor *bias = conv_param->bias(); + + const float *ptr_src = reinterpret_cast(inputs[0]->data()); + const float *ptr_weights = reinterpret_cast(weights_internal->data()); + const float *ptr_bias = reinterpret_cast(bias->data()); + auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + + const auto &jcp = kernel->jcp; + +#pragma omp parallel + { + int ithr = omp_get_thread_num(), nthr = omp_get_num_threads(); + int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; + int start, end, start_copy; + int work_amount = jcp.mb * jcp.ngroups * oc_chunks * jcp.oh; + utils::balance211(work_amount, nthr, ithr, start, end); + start_copy = start; + + jit_conv_call_t par_conv; + size_t src_h_stride = jcp.iw * jcp.ic_block; + size_t src_c_stride = jcp.ih * jcp.iw * jcp.ic_block; + size_t dst_h_stride = jcp.ow * jcp.oc_block; + size_t wht_h_stride = jcp.kw * jcp.ic_block * jcp.oc_block; + size_t wht_ic_stride = jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block; + + if (jcp.is_1stconv) { + src_h_stride = jcp.iw; + src_c_stride = jcp.ih * jcp.iw; + wht_ic_stride = jcp.oc_block; + } + + for (int icb_l2 = 0; icb_l2 < jcp.nb_ic; icb_l2 += jcp.nb_ic_L2) { + 
start = start_copy; + int n{0}, g{0}, occ{0}, oh_s{0}; + if (jcp.loop_order == conv_loop_order_t::loop_cgn) { + utils::nd_iterator_init(start, occ, oc_chunks, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + } + else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { + utils::nd_iterator_init(start, g, jcp.ngroups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); + } + + while (start < end) { + int ocb = occ * jcp.nb_oc_blocking; + int g_ocb = g * jcp.nb_oc + ocb; + int g_oc = g_ocb * jcp.oc_block; + int g_icb = g * jcp.nb_ic; + + int work_rem = end - start; + int ih_s = -jcp.t_pad + oh_s * jcp.stride_h; + int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem; + + size_t bias_blk_off = g_oc; + size_t dst_blk_off = n * jcp.oc * jcp.oh * jcp.ow + + (g_ocb * jcp.oh * jcp.ow + oh_s * jcp.ow) * jcp.oc_block; + size_t src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + + (g_icb + icb_l2) * jcp.ih * jcp.iw * jcp.ic_block + ih_s * jcp.iw * jcp.ic_block; + size_t weight_blk_off= ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block + + icb_l2 * jcp.kh * jcp.kw * jcp.oc_block * jcp.ic_block; + + if (jcp.is_1stconv) { + src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + ih_s * jcp.iw; + weight_blk_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block; + } + + auto bias_w = ptr_bias ? 
ptr_bias + bias_blk_off : 0; + auto dst_w = ptr_dst + dst_blk_off; + auto src_w = ptr_src + src_blk_off; + auto wht_w = ptr_weights + weight_blk_off; + + for (int icb = icb_l2; + icb < utils::min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2); ++icb) { + auto src_c = src_w; + auto dst_c = dst_w; + for (int oj = oh_s, ij = ih_s; + oj < oh_e; ++oj, ij += jcp.stride_h) { + + int i_t_overflow = -utils::min(0, ij); + int i_b_overflow = utils::max(jcp.ih, ij + jcp.kh) - jcp.ih; + int kh_padding = utils::max(0, jcp.kh - i_t_overflow - i_b_overflow); + + jit_conv_ker_pipeline(kernel->jit_ker, par_conv, + src_c + i_t_overflow * src_h_stride, + dst_c, wht_w + i_t_overflow * wht_h_stride, + bias_w, icb, kh_padding); + + src_c += src_h_stride * jcp.stride_h; + dst_c += dst_h_stride; + } + src_w += src_c_stride; + wht_w += wht_ic_stride; + } + + if (jcp.loop_order == conv_loop_order_t::loop_cgn) { + utils::nd_iterator_jump(start, end, + occ, oc_chunks, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + } else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { + utils::nd_iterator_jump(start, end, g, jcp.ngroups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); + } + } + } + + jit_conv_ker_pipeline(kernel->jit_ker, par_conv, + ptr_src, ptr_dst, ptr_weights, ptr_bias, 0, 0); + + } + + return SaberSuccess; +} + +template class JitAvx512Conv; + +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_avx512_conv.h b/saber/funcs/impl/x86/jit_avx512_conv.h new file mode 100644 index 000000000..0a8effffc --- /dev/null +++ b/saber/funcs/impl/x86/jit_avx512_conv.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV_H + +#include + +#include "saber/funcs/impl/impl_base.h" +#include "saber/core/tensor.h" +#include "saber/funcs/impl/x86/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h" +#include "saber/saber_funcs_param.h" + +namespace anakin{ +namespace saber{ + +using namespace jit; + +template +class JitAvx512Conv : public ImplBase< + X86, OpDtype, ConvParam > { +public: +typedef typename DataTrait::Dtype OpDataType; + + JitAvx512Conv() + : kernel(NULL) + {} + + ~JitAvx512Conv() { + if (kernel != NULL) { + delete kernel; + } + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) override; + +private: + jit::jit_conv_conf_t conf; + jit::jit_conv_kernel *kernel = nullptr; + std::shared_ptr > weights_internal; + SaberStatus check_conf(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m); +}; + + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV_H diff --git a/saber/funcs/impl/x86/jit_avx512_conv1x1.cpp b/saber/funcs/impl/x86/jit_avx512_conv1x1.cpp new file mode 100644 index 000000000..ea77ce27e --- /dev/null +++ b/saber/funcs/impl/x86/jit_avx512_conv1x1.cpp @@ -0,0 
+1,537 @@ +#include "saber/funcs/impl/x86/jit_call_conf.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include "saber/funcs/impl/x86/jit_avx512_conv1x1.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +inline void set_default_strides(jit_strides_t strides, const jit_dims_t dims, + int ndims, const int *perm = nullptr) { + int id_perm[JIT_TENSOR_MAX_DIMS] = { 0 }; + for (int i = 0; i < ndims; ++i) { + id_perm[i] = i; + } + + if (perm == nullptr) { + perm = id_perm; + } + + strides[perm[ndims - 1]] = 1; + for (int d = 1; d < ndims; ++d) { + const int prev_idx = perm[ndims - d]; + const int curr_idx = perm[ndims - 1 - d]; + + strides[curr_idx] = dims[curr_idx] == 0 + ? 1 + : strides[prev_idx] * utils::max(1, dims[prev_idx]); + } +} + +SaberStatus fill_contiguous_blocked(jit_dims_t md_dims, + const int ndims, const jit_dims_t block_dims, + const int perm[], jit_strides_t strides) { + int unrolled_dims[2 * JIT_TENSOR_MAX_DIMS]; + int unrolled_strides[2 * JIT_TENSOR_MAX_DIMS]; + for (int d = 0; d < ndims; ++d) { + unrolled_dims[d] = md_dims[d] / block_dims[d]; + unrolled_dims[ndims + d] = block_dims[d]; + } + set_default_strides(unrolled_strides, unrolled_dims, 2 * ndims, perm); + utils::array_copy(strides, &unrolled_strides[0], ndims); + return SaberSuccess; +} + +SaberStatus fill_nChw16c(jit_dims_t md_dims, int ndims, jit_strides_t strides) { + const jit_dims_t block_dims = { 1, 16, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, + 4, 5, 6, 7 }; + return fill_contiguous_blocked(md_dims, ndims, block_dims, perm, strides); +} + +SaberStatus fill_gOIhw16i16o(jit_dims_t md_dims, int ndims, jit_strides_t strides) { + const jit_dims_t block_dims = { 1, 16, 16, 1, 1 }; + const int perm[] = { + 0, 1, 2, 3, 4, + 5, 7, 6, 8, 9 }; + return fill_contiguous_blocked(md_dims, ndims, block_dims, perm, strides); +} + +SaberStatus fill_OIhw16i16o(jit_dims_t md_dims, int ndims, jit_strides_t strides) { + const jit_dims_t block_dims = { 16, 16, 1, 1 }; + const int 
perm[] = { + 0, 1, 2, 3, + 5, 4, 6, 7 }; + return fill_contiguous_blocked(md_dims, ndims, block_dims, perm, strides); +} + +int shape_to_jit_dim(jit_dims_t& md_dims, const Shape &shape) { + for (int i = 0; i < shape.dims(); i++) + md_dims[i] = shape[i]; + return shape.dims(); +} + +struct memory_block_t { + jit_dims_t md_dims; + jit_strides_t strides; + + template inline size_t blk_off(Args... args) { + return _blk_off(args...); + } + + template + inline size_t _blk_off() { + return 0; + } + + template + inline size_t _blk_off(T xc, Args ...args) { + constexpr int dc = ORIG_LEN - sizeof...(args)-1; + return size_t(xc) * strides[dc] + + _blk_off(args...); + } + + memory_block_t(LayoutType layout_type, Shape &shape) { + int ndims = 0; + if (layout_type == Layout_NCHW_C16) { + ndims = 4; + } + else if (layout_type == Layout_GOIHW16I16O) { + ndims = 5; + } + else if (layout_type == Layout_OIHW16I16O) { + ndims = 4; + } + + shape_to_jit_dim(md_dims, shape); + if (layout_type == Layout_NCHW_C16) { + fill_nChw16c(md_dims, ndims, strides); + } + else if (layout_type == Layout_GOIHW16I16O) { + fill_gOIhw16i16o(md_dims, ndims, strides); + } + else if (layout_type == Layout_OIHW16I16O) { + fill_OIhw16i16o(md_dims, ndims, strides); + } + } +}; + + +template <> +void JitAvx512Conv1x1::prepare_rtus() { + bool rtus_applicable = true && + (conf.stride_h != 1 || conf.stride_w != 1); + + rtus_applicable = rtus_applicable && + conf.t_pad == 0 && conf.l_pad == 0 && + conf.oh * conf.stride_h == conf.ih && + conf.ow * conf.stride_w == conf.iw; + + // LOG(ERROR) << "rtus applicable:" << rtus_applicable; + + if (rtus_applicable) { + this->reduce_src = true; + this->conf.stride_h = this->conf.stride_w = 1; + this->conf.ih = this->conf.oh; + this->conf.iw = this->conf.ow; + } + + return; +} + +template +void balance2D(U nthr, U ithr, T ny, T &ny_start, T &ny_end, + T nx, T &nx_start, T &nx_end, T nx_divider) { + const T grp_size = utils::div_up(nthr, nx_divider); + const T grp_count = 
utils::div_up(nthr, grp_size); + + T grp = ithr / grp_size; + T grp_ithr = ithr % grp_size; + T grp_nthr = grp_size; + T first_grps = nthr % grp_count; + if (first_grps > 0 && grp >= first_grps) { + ithr -= first_grps * grp_size; + grp_nthr--; + grp = ithr / grp_nthr + first_grps; + grp_ithr = ithr % grp_nthr; + } + utils::balance211(nx, grp_count, grp, nx_start, nx_end); + utils::balance211(ny, grp_nthr, grp_ithr, ny_start, ny_end); +} + + +template <> +SaberStatus JitAvx512Conv1x1::check_conf( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + + ConvParam *conv_param = &(param); + const Tensor *weights = conv_param->weight(); + const Tensor *bias = conv_param->bias(); + const jit_1x1_conv_conf_t jcp = kernel->jcp; + Tensor *input = inputs[0]; + Tensor *output = outputs[0]; + + // check format +// if (!(typeid(LayOutType_in) == typeid(NCHW_C16) && +// typeid(LayOutType_out) == typeid(NCHW_C16) && +// typeid(LayOutType_op) == typeid(NCHW))) { +// LOG(ERROR) << "wrong format"; +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW_C16) + || (outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + + // check param + bool param_ok = true + && jcp.t_pad == conv_param->pad_h + && jcp.l_pad == conv_param->pad_w + && jcp.stride_h == conv_param->stride_h + && jcp.stride_w == conv_param->stride_w; + + // check shape + bool shape_ok = true + && jcp.kh == weights->height() + && jcp.kw == weights->width() + && jcp.ngroups == 1 + && jcp.mb == input->num() + && jcp.ic == input->channel() + && jcp.ih == input->height() + && jcp.iw == input->width() + && jcp.oc == output->channel() + && jcp.oh == output->height() + && jcp.ow == output->width(); + + if (param_ok && shape_ok) { + return SaberSuccess; + } else { + LOG(INFO) << "param or shape changed, re-init kernel"; + return SaberNotInitialized; + } + +} + +template <> 
+SaberStatus JitAvx512Conv1x1::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context &ctx) { + ConvParam *conv_param = &(param); + ActivationParam *act_param = nullptr; + SaberStatus status; + const Tensor *weights = conv_param->weight(); + Tensor *input = inputs[0]; + Tensor *output = outputs[0]; + + // check conf + if (kernel) { + status = check_conf(inputs, outputs, param); + if(status != SaberNotInitialized) { + return status; + } + } + + // init conf + const bool with_groups = false; + conf.ngroups = with_groups ? weights->num() : 1; + + conf.mb = input->num(); + conf.ic = input->channel() / conf.ngroups; + conf.ih = input->height(); + conf.iw = input->width(); + + conf.oc = output->channel() / conf.ngroups; + conf.oh = output->height(); + conf.ow = output->width(); + + conf.kh = weights->height(); + conf.kw = weights->width(); + conf.stride_h = conv_param->stride_h; + conf.stride_w = conv_param->stride_w; + conf.t_pad = conv_param->pad_h; + conf.l_pad = conv_param->pad_w; + + conf.with_relu = param.activation_param.has_active; + if (conf.with_relu) { + act_param = &(param.activation_param); + conf.relu_negative_slope = static_cast(act_param->negative_slope); + } + conf.with_bias = !(conv_param->bias() == nullptr); + + conv_d.n = input->num(); + conv_d.ic = input->channel() / conf.ngroups; + conv_d.ih = input->height(); + conv_d.iw = input->width(); + conv_d.oc = output->channel() / conf.ngroups; + conv_d.oh = output->height(); + conv_d.ow = output->width(); + conv_d.t_pad = conv_param->pad_h; + conv_d.l_pad = conv_param->pad_w; + conv_d.stride_h = conv_param->stride_h; + conv_d.stride_w = conv_param->stride_w; + + prepare_rtus(); + + status = jit_avx512_common_1x1_conv_kernel::init_conf(conf, conv_d, omp_get_max_threads(), reduce_src); + if (status == SaberSuccess) { + if (kernel != nullptr) { + delete kernel; + kernel = nullptr; + } + kernel = new jit_avx512_common_1x1_conv_kernel(this->conf); + } else { + return 
SaberUnImplError; + } + + if (reduce_src) { + init_rtus_driver(&rtus_driver, conf, conv_d, ws_per_thread, &scratch); + } + + // reorder weights + Tensor *weights_reorder = conv_param->mutable_weight(); + weights_internal.reset(new Tensor(weights_reorder->valid_shape())); + weight_reorder_OIhw16i16o(*weights_reorder, *weights_internal); + + return SaberSuccess; +} + +template <> +SaberStatus JitAvx512Conv1x1::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context &ctx) { + +// if (!(typeid(LayOutType_in) == typeid(NCHW_C16) && +// typeid(LayOutType_out) == typeid(NCHW_C16) && +// typeid(LayOutType_op) == typeid(NCHW)) +// ) { +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW_C16) + || (outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus JitAvx512Conv1x1::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + + ConvParam *conv_param = &(param); +// ActivationParam *act_param = &(param.activation_param); + const Tensor *weights = conv_param->weight(); + const Tensor *bias = conv_param->bias(); + + const float *ptr_src = reinterpret_cast(inputs[0]->data()); + const float *ptr_bias = reinterpret_cast(bias->data()); + float *ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + const float *ptr_weights = reinterpret_cast(weights_internal->data()); + + const auto &jcp = kernel->jcp; + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + + const int stride_h = conv_param->stride_h; + const int stride_w = conv_param->stride_w; + const int pad_t = conv_param->pad_h; + const int pad_l = conv_param->pad_w; + Shape weights_shape = weights->valid_shape(); + memory_block_t weights_d(Layout_OIHW16I16O, weights_shape); + + Shape 
src_d_adjust(inputs[0]->valid_shape()); + src_d_adjust[1] *= 16; + Shape dst_d_adjust(outputs[0]->valid_shape()); + dst_d_adjust[1] *= 16; + memory_block_t dst_d(outputs[0]->get_layout(), dst_d_adjust); + memory_block_t src_d(inputs[0]->get_layout(), src_d_adjust); + + auto step = [](int default_step, int remaining, int tail_step) { + assert(default_step <= tail_step); + return remaining < tail_step ? remaining : default_step; + }; + +#pragma omp parallel + { + int ithr = omp_get_thread_num(), nthr = omp_get_num_threads(); + + jit_1x1_conv_call_t p; + + rtus_driver_t::call_params_t rp; + + const int nb_oc = jcp.nb_load; + const int nb_ic = jcp.nb_reduce; + const int nb_ic_blocking = jcp.nb_reduce_blocking; + const int os_block = jcp.bcast_block; + + int bcast_start{ 0 }, bcast_end{ 0 }, ocb_start{ 0 }, ocb_end{ 0 }; + balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, + jcp.nb_load, ocb_start, ocb_end, jcp.load_grp_count); + + auto init_bcast = [&](int iwork, int &n, int &g, int &bcast_step, + int &oh, int &ow, int &ih, int &iw) { + int osb{ 0 }; + nd_iterator_init(iwork, n, jcp.mb, g, jcp.ngroups, osb, + jcp.nb_bcast); + bcast_step = step(jcp.nb_bcast_blocking, jcp.nb_bcast - osb, + jcp.nb_bcast_blocking_max); + bcast_step = utils::min(bcast_step, bcast_end - iwork); + + const int os = osb * os_block; + oh = os / jcp.ow; + ow = os % jcp.ow; + + ih = utils::max(oh * stride_h - pad_t, 0); + iw = utils::max(ow * stride_w - pad_l, 0); + rp.iw_start = iw; + + p.bcast_dim = this_block_size(os, jcp.os, + bcast_step * os_block); + rp.os = p.bcast_dim; + }; + + auto init_load = [&](int ocb, int &load_step) { + load_step = step(jcp.nb_load_blocking, ocb_end - ocb, + jcp.nb_load_blocking_max); + p.load_dim = this_block_size(ocb * jcp.oc_block, + ocb_end * jcp.oc_block, load_step * jcp.oc_block); + }; + + auto init_reduce = [&](int icb) { + const int nb_ic_blocking_step = + utils::min(icb + nb_ic_blocking, nb_ic) - icb; + p.reduce_pos_flag = 0 + | (icb == 0 ? 
FLAG_REDUCE_FIRST : 0) + | (icb + nb_ic_blocking_step >= nb_ic + ? FLAG_REDUCE_LAST : 0); + + p.reduce_dim = this_block_size(icb * jcp.ic_block, + jcp.ic, nb_ic_blocking_step * jcp.ic_block); + rp.icb = p.reduce_dim / jcp.reduce_block; + }; + + auto inner_ker = [&](int ocb, int icb, int n, int g, int oh, int ow, + int ih, int iw) { + const int _ocb = g * nb_oc + ocb; + const size_t dst_off = dst_d.blk_off(n, _ocb, oh, ow); + + p.output_data = &ptr_dst[dst_off]; + p.bias_data = &ptr_bias[_ocb * jcp.oc_block]; + p.load_data = &ptr_weights[conv_param->group > 1 ? + weights_d.blk_off(g, ocb, icb) : + weights_d.blk_off(ocb, icb)]; + + const int _icb = g * nb_ic + icb; + + if (reduce_src) { + rp.ws = scratch + ithr * ws_per_thread + + _icb * jcp.is * jcp.ic_block; + if (ocb == ocb_start) { + rp.src = ptr_src + src_d.blk_off(n, _icb, ih, iw); + rtus_driver->ker_(&rp); + } + p.bcast_data = rp.ws; + } else { + p.bcast_data = ptr_src + src_d.blk_off(n, _icb, ih, iw); + } + + kernel->jit_ker(&p); + }; + + if (jcp.loop_order == loop_rlb) { + for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, load_step); + int iwork = bcast_start; + while (iwork < bcast_end) { + int n, g, bcast_step, oh, ow, ih, iw; + init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); + inner_ker(ocb, icb, n, g, oh, ow, ih, iw); + iwork += bcast_step; + } + ocb += load_step; + } + } + } else if (jcp.loop_order == loop_lbr) { + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, load_step); + int iwork = bcast_start; + while (iwork < bcast_end) { + int n, g, bcast_step, oh, ow, ih, iw; + init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); + for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + inner_ker(ocb, icb, n, g, oh, ow, ih, iw); + } + iwork += bcast_step; + } + ocb += load_step; + } + } else if (jcp.loop_order == loop_rbl) { + for (int icb = 0; 
icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + int iwork = bcast_start; + while (iwork < bcast_end) { + int n, g, bcast_step, oh, ow, ih, iw; + init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, load_step); + inner_ker(ocb, icb, n, g, oh, ow, ih, iw); + ocb += load_step; + } + iwork += bcast_step; + } + } + } else if (jcp.loop_order == loop_blr) { + int iwork = bcast_start; + while (iwork < bcast_end) { + int n, g, bcast_step, oh, ow, ih, iw; + init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, load_step); + for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + inner_ker(ocb, icb, n, g, oh, ow, ih, iw); + } + ocb += load_step; + } + iwork += bcast_step; + } + } + else { + assert(!"unsupported loop order"); + } + } + + return SaberSuccess; +} + +template class JitAvx512Conv1x1; + +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_avx512_conv1x1.h b/saber/funcs/impl/x86/jit_avx512_conv1x1.h new file mode 100644 index 000000000..0579e0b61 --- /dev/null +++ b/saber/funcs/impl/x86/jit_avx512_conv1x1.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV1X1_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV1X1_H + +#include +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/core/tensor.h" +#include "saber/funcs/impl/x86/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h" +#include "saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h" +#include "saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.h" + +#include "x86_utils.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +template +class JitAvx512Conv1x1 : public ImplBase< + X86, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + JitAvx512Conv1x1() + : conf(), + kernel(nullptr), rtus_driver(nullptr), + scratch(nullptr) + {} + + ~JitAvx512Conv1x1() { + if (kernel) { + delete kernel; + } + if (rtus_driver) { + delete rtus_driver; + } + if (scratch) { + zfree(scratch); + } + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m) override; +private: + conv_1x1_desc conv_d; + jit_1x1_conv_conf_t conf; + bool reduce_src; + jit_avx512_common_1x1_conv_kernel *kernel{nullptr}; + rtus_driver_t *rtus_driver; + size_t ws_per_thread; + OpDataType *scratch; + std::shared_ptr > weights_internal; + + void prepare_rtus(); + SaberStatus check_conf(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m); +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV1X1_H diff --git a/saber/funcs/impl/x86/jit_avx512_conv1x1_act.cpp b/saber/funcs/impl/x86/jit_avx512_conv1x1_act.cpp deleted file mode 100644 index fa05ccb0a..000000000 
--- a/saber/funcs/impl/x86/jit_avx512_conv1x1_act.cpp +++ /dev/null @@ -1,445 +0,0 @@ -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/x86_utils.h" -#include "saber/funcs/impl/x86/jit_avx512_conv1x1_act.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -inline void set_default_strides(jit_strides_t strides, const jit_dims_t dims, - int ndims, const int *perm = NULL) { - int id_perm[JIT_TENSOR_MAX_DIMS] = { 0 }; - for (int i = 0; i < ndims; ++i) - id_perm[i] = i; - if (perm == NULL) - perm = id_perm; - - strides[perm[ndims - 1]] = 1; - for (int d = 1; d < ndims; ++d) { - const int prev_idx = perm[ndims - d]; - const int curr_idx = perm[ndims - 1 - d]; - - strides[curr_idx] = dims[curr_idx] == 0 - ? 1 - : strides[prev_idx] * utils::max(1, dims[prev_idx]); - } -} - -SaberStatus fill_contiguous_blocked(jit_dims_t md_dims, - const int ndims, const jit_dims_t block_dims, - const int perm[], jit_strides_t strides) { - int unrolled_dims[2 * JIT_TENSOR_MAX_DIMS]; - int unrolled_strides[2 * JIT_TENSOR_MAX_DIMS]; - for (int d = 0; d < ndims; ++d) { - unrolled_dims[d] = md_dims[d] / block_dims[d]; - unrolled_dims[ndims + d] = block_dims[d]; - } - set_default_strides(unrolled_strides, unrolled_dims, 2 * ndims, perm); - utils::array_copy(strides, &unrolled_strides[0], ndims); - return SaberSuccess; -} - -SaberStatus fill_nChw16c(jit_dims_t md_dims, int ndims, jit_strides_t strides) { - const jit_dims_t block_dims = { 1, 16, 1, 1 }; - const int perm[] = { - 0, 1, 2, 3, - 4, 5, 6, 7 }; - return fill_contiguous_blocked(md_dims, ndims, block_dims, perm, strides); -} - -SaberStatus fill_gOIhw16i16o(jit_dims_t md_dims, int ndims, jit_strides_t strides) { - const jit_dims_t block_dims = { 1, 16, 16, 1, 1 }; - const int perm[] = { - 0, 1, 2, 3, 4, - 5, 7, 6, 8, 9 }; - return fill_contiguous_blocked(md_dims, ndims, block_dims, perm, strides); -} - -SaberStatus fill_OIhw16i16o(jit_dims_t md_dims, int ndims, jit_strides_t strides) { - 
const jit_dims_t block_dims = { 16, 16, 1, 1 }; - const int perm[] = { - 0, 1, 2, 3, - 5, 4, 6, 7 }; - return fill_contiguous_blocked(md_dims, ndims, block_dims, perm, strides); -} - -int shape_to_jit_dim(jit_dims_t& md_dims, const Shape &shape) -{ - for (int i = 0; i < shape.dims(); i++) - md_dims[i] = shape[i]; - return shape.dims(); -} - -template -struct memory_block_t -{ - jit_dims_t md_dims; - jit_strides_t strides; - template inline size_t blk_off(Args... args) { - return _blk_off(args...); - } - template - inline size_t _blk_off() { - return 0; - } - template - inline size_t _blk_off(T xc, Args ...args) { - constexpr int dc = ORIG_LEN - sizeof...(args)-1; - return size_t(xc) * strides[dc] - + _blk_off(args...); - } - memory_block_t(const Shape &shape) { - int ndims = 0; - if (typeid(LayoutType) == typeid(NCHW_C16)) - { - ndims = 4; - } - else if (typeid(LayoutType) == typeid(GOIHW16I16O)) - { - ndims = 5; - } - else if (typeid(LayoutType) == typeid(OIHW16I16O)) - { - ndims = 4; - } - //assert(shape.dims == ndims); - shape_to_jit_dim(md_dims, shape); - if (typeid(LayoutType) == typeid(NCHW_C16)) - { - fill_nChw16c(md_dims, ndims, strides); - } - else if (typeid(LayoutType) == typeid(GOIHW16I16O)) - { - fill_gOIhw16i16o(md_dims, ndims, strides); - } - else if (typeid(LayoutType) == typeid(OIHW16I16O)) - { - fill_OIhw16i16o(md_dims, ndims, strides); - } - } -}; - - -void rtus_prepare(reduce_to_unit_stride_t&rtus_, - conv_1x1_desc *conv_d) { - // Src Format = memory_format::nChw16c - bool rtus_applicable = true - && (conv_d->strides[0] != 1 || conv_d->strides[1] != 1); - for (int d = 2; d < 4; ++d) { - /* TODO: relax these conditions (by improving reducer) */ - rtus_applicable = rtus_applicable - && conv_d->padding[0][d - 2] == 0 - && conv_d->dst_d[d] * conv_d->strides[d - 2] == conv_d->src_d[d]; - } - if (rtus_applicable) { - rtus_.reduce_src_ = true; - rtus_.conv_d_ = conv_d; - rtus_.conv_d_->strides[0] = rtus_.conv_d_->strides[1] = 1; - 
utils::array_set(rtus_.conv_d_->padding[0], 0, 2); - utils::array_set(rtus_.conv_d_->padding[1], 0, 2); - int ic = rtus_.conv_d_->src_d[1]; - for (int i = 0; i < rtus_.conv_d_->dst_d_dims; i++) - { - rtus_.conv_d_->src_d[i] = rtus_.conv_d_->dst_d[i]; - } - rtus_.conv_d_->src_d[1] = ic; - fill_nChw16c(rtus_.conv_d_->src_d, 4, rtus_.src_dstrides); - } -} - - -template -void balance2D(U nthr, U ithr, T ny, T &ny_start, T &ny_end, - T nx, T &nx_start, T &nx_end, T nx_divider) -{ - const T grp_size = utils::div_up(nthr, nx_divider); - const T grp_count = utils::div_up(nthr, grp_size); - - T grp = ithr / grp_size; - T grp_ithr = ithr % grp_size; - T grp_nthr = grp_size; - T first_grps = nthr % grp_count; - if (first_grps > 0 && grp >= first_grps) { - ithr -= first_grps * grp_size; - grp_nthr--; - grp = ithr / grp_nthr + first_grps; - grp_ithr = ithr % grp_nthr; - } - utils::balance211(nx, grp_count, grp, nx_start, nx_end); - utils::balance211(ny, grp_nthr, grp_ithr, ny_start, ny_end); -} - -/* convolution forward */ -template -SaberStatus JitAvx512Conv1x1Act::init( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, Context &ctx) { - this->_ctx = ctx; - // TODO: type checking - // src = dst = nChw16c - // weight = group ? 
gOIhw16i16o : OIhw16i16o - - if (typeid(LayOutType_in) != typeid(NCHW_C16)) { - return SaberUnImplError; - } - conv_d_.src_d_dims = shape_to_jit_dim(conv_d_.src_d, inputs[0]->shape()); - conv_d_.dst_d_dims = shape_to_jit_dim(conv_d_.dst_d, outputs[0]->shape()); - conv_d_.padding[0][0] = param.conv_param.pad_h; - conv_d_.padding[0][1] = param.conv_param.pad_w; - conv_d_.strides[0] = param.conv_param.stride_h; - conv_d_.strides[1] = param.conv_param.stride_w; - rtus_prepare(rtus_, &conv_d_); - SaberStatus status; - status = kernel_->init_conf(this->jcp_, conv_d_, - param.conv_param.weight()->shape(), - param.conv_param.group, - param.conv_param.dilation_h, param.conv_param.dilation_w, - param.has_active, - param.activation_param.has_negative_slope() ? param.activation_param.negative_slope : 0.0, - omp_get_max_threads(), - param.conv_param.bias() != NULL, - rtus_.reduce_src_); - - if (status != SaberSuccess) { - return status; - } - - if (!kernel_) { - kernel_ = new jit::jit_avx512_common_1x1_conv_kernel(this->jcp_); - } - - init_rtus_driver(&rtus_driver_, rtus_, jcp_, ws_per_thread_, &scratch_, - inputs[0]->shape(), param.conv_param.stride_h, param.conv_param.stride_w); - return create(inputs, outputs, param, ctx); -} - -template -SaberStatus JitAvx512Conv1x1Act::create( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, Context &ctx) { - - return SaberSuccess; -} - -template -SaberStatus JitAvx512Conv1x1Act::dispatch( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) { - - ConvParam *conv_param = &(param.conv_param); - ActivationParam *act_param = &(param.activation_param); - const opTensor *weights = conv_param->weight(); - const opTensor *bias = conv_param->bias(); - - const dtype *ptr_src = reinterpret_cast(inputs[0]->get_buf()->get_data()); - const dtype *ptr_weights = reinterpret_cast(weights->get_buf()->get_data()); - const dtype *ptr_bias = reinterpret_cast(bias->get_buf()->get_data()); - auto ptr_dst = 
reinterpret_cast(outputs[0]->mutable_data()); - const auto &jcp = kernel_->jcp; - - const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; - - const int stride_h = conv_param->stride_h; - const int stride_w = conv_param->stride_w; - const int pad_t = conv_param->pad_h; - const int pad_l = conv_param->pad_w; - memory_block_t weights_d(weights->shape()); //TODO: Hard code - memory_block_t dst_d(outputs[0]->shape()); - memory_block_t src_d(inputs[0]->shape()); - - auto step = [](int default_step, int remaining, int tail_step) { - assert(default_step <= tail_step); - return remaining < tail_step ? remaining : default_step; - }; - - #pragma omp parallel - { - int ithr = omp_get_thread_num(), nthr = omp_get_num_threads(); - - jit::jit_1x1_conv_call_t p = {}; - - jit::rtus_driver_t::call_params_t rp = {}; - - const int nb_oc = jcp.nb_load; - const int nb_ic = jcp.nb_reduce; - const int nb_ic_blocking = jcp.nb_reduce_blocking; - const int os_block = jcp.bcast_block; - - int bcast_start{ 0 }, bcast_end{ 0 }, ocb_start{ 0 }, ocb_end{ 0 }; - balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, - jcp.nb_load, ocb_start, ocb_end, jcp.load_grp_count); - - auto init_bcast = [&](int iwork, int &n, int &g, int &bcast_step, - int &oh, int &ow, int &ih, int &iw) { - int osb{ 0 }; - jit::nd_iterator_init(iwork, n, jcp.mb, g, jcp.ngroups, osb, - jcp.nb_bcast); - bcast_step = step(jcp.nb_bcast_blocking, jcp.nb_bcast - osb, - jcp.nb_bcast_blocking_max); - bcast_step = utils::min(bcast_step, bcast_end - iwork); - - const int os = osb * os_block; - oh = os / jcp.ow; - ow = os % jcp.ow; - - ih = utils::max(oh * stride_h - pad_t, 0); - iw = utils::max(ow * stride_w - pad_l, 0); - rp.iw_start = iw; - - p.bcast_dim = jit::this_block_size(os, jcp.os, - bcast_step * os_block); - rp.os = p.bcast_dim; - }; - - auto init_load = [&](int ocb, int &load_step) { - load_step = step(jcp.nb_load_blocking, ocb_end - ocb, - jcp.nb_load_blocking_max); - p.load_dim = jit::this_block_size(ocb * 
jcp.oc_block, - ocb_end * jcp.oc_block, load_step * jcp.oc_block); - }; - - auto init_reduce = [&](int icb) { - const int nb_ic_blocking_step = - utils::min(icb + nb_ic_blocking, nb_ic) - icb; - p.reduce_pos_flag = 0 - | (icb == 0 ? FLAG_REDUCE_FIRST : 0) - | (icb + nb_ic_blocking_step >= nb_ic - ? FLAG_REDUCE_LAST : 0); - - p.reduce_dim = jit::this_block_size(icb * jcp.ic_block, - jcp.ic, nb_ic_blocking_step * jcp.ic_block); - rp.icb = p.reduce_dim / jcp.reduce_block; - }; - - auto inner_ker = [&](int ocb, int icb, int n, int g, int oh, int ow, - int ih, int iw) { - const int _ocb = g * nb_oc + ocb; - const size_t dst_off = dst_d.blk_off(n, _ocb, oh, ow); - - p.output_data = &ptr_dst[dst_off]; - p.bias_data = &ptr_bias[_ocb * jcp.oc_block]; - p.load_data = &ptr_weights[conv_param->group > -1 - ? weights_d.blk_off(g, ocb, icb) - : weights_d.blk_off(ocb, icb)]; - - const int _icb = g * nb_ic + icb; - - if (rtus_.reduce_src_) { - rp.ws = scratch_ + ithr * ws_per_thread_ - + _icb * jcp.is * jcp.ic_block; - if (ocb == ocb_start) { - rp.src = ptr_src + src_d.blk_off(n, _icb, ih, iw); - rtus_driver_->ker_(&rp); - } - p.bcast_data = rp.ws; - } else - p.bcast_data = ptr_src + src_d.blk_off(n, _icb, ih, iw); - - kernel_->jit_ker(&p); - }; - - if (jcp.loop_order == loop_rlb) { - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, load_step); - int iwork = bcast_start; - while (iwork < bcast_end) { - int n, g, bcast_step, oh, ow, ih, iw; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); - inner_ker(ocb, icb, n, g, oh, ow, ih, iw); - iwork += bcast_step; - } - ocb += load_step; - } - } - } else if (jcp.loop_order == loop_lbr) { - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, load_step); - int iwork = bcast_start; - while (iwork < bcast_end) { - int n, g, bcast_step, oh, ow, ih, iw; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, 
iw); - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - inner_ker(ocb, icb, n, g, oh, ow, ih, iw); - } - iwork += bcast_step; - } - ocb += load_step; - } - } else if (jcp.loop_order == loop_rbl) { - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - int iwork = bcast_start; - while (iwork < bcast_end) { - int n, g, bcast_step, oh, ow, ih, iw; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, load_step); - inner_ker(ocb, icb, n, g, oh, ow, ih, iw); - ocb += load_step; - } - iwork += bcast_step; - } - } - } else if (jcp.loop_order == loop_blr) { - int iwork = bcast_start; - while (iwork < bcast_end) { - int n, g, bcast_step, oh, ow, ih, iw; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, load_step); - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - inner_ker(ocb, icb, n, g, oh, ow, ih, iw); - } - ocb += load_step; - } - iwork += bcast_step; - } - } - else { - assert(!"unsupported loop order"); - } - } - - return SaberSuccess; -} - -template class JitAvx512Conv1x1Act; -template class JitAvx512Conv1x1Act; -template class JitAvx512Conv1x1Act; -template class JitAvx512Conv1x1Act; -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_avx512_conv1x1_act.h b/saber/funcs/impl/x86/jit_avx512_conv1x1_act.h deleted file mode 100644 index b5f31fc87..000000000 --- a/saber/funcs/impl/x86/jit_avx512_conv1x1_act.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV1X1_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV1X1_ACT_H - -#include -#include "saber/saber_funcs_param.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/core/tensor.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_act_kernel.h" - -#include "x86_utils.h" - -namespace anakin { -namespace saber { - -namespace jit { -struct jit_avx512_common_1x1_conv_kernel; -} - -template -class JitAvx512Conv1x1Act : public ImplBase, - Tensor, - Tensor, - ConvActiveParam>> { -public: - typedef Tensor inTensor; - typedef Tensor outTensor; - typedef Tensor opTensor; - typedef typename inTensor::Dtype dtype; - - JitAvx512Conv1x1Act() - : rtus_({}), jcp_({}), - kernel_(nullptr), rtus_driver_(nullptr), - scratch_(nullptr) - {} - - ~JitAvx512Conv1x1Act() { - if (kernel_) { - delete kernel_; - } - if (rtus_driver_) { - delete rtus_driver_; - } - if (scratch_) { - zfree(scratch_); - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) override; -private: - reduce_to_unit_stride_t rtus_; - jit::jit_1x1_conv_conf_t 
jcp_; - conv_1x1_desc conv_d_; - jit::jit_avx512_common_1x1_conv_kernel *kernel_; - jit::rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - float *scratch_; // TODO float -}; - -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV1X1_ACT_H diff --git a/saber/funcs/impl/x86/jit_avx512_conv_act.cpp b/saber/funcs/impl/x86/jit_avx512_conv_act.cpp deleted file mode 100644 index 3c6eb5971..000000000 --- a/saber/funcs/impl/x86/jit_avx512_conv_act.cpp +++ /dev/null @@ -1,285 +0,0 @@ -#include - -#include "saber/funcs/impl/x86/jit_avx512_conv_act.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -using jit_conv_ker_t = void (*)(jit_conv_call_t *); - -inline bool is_1stconv(const jit_conv_conf_t &jcp) { - return utils::one_of(jcp.ic, 1, 3); -} - -inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t &p, - const void *src, const void *dst, const void *filt, const void *bias, - int channel, int kh_padding) -{ -#define PIPELINE(field) \ - do { \ - p.field = p.field ## _prf; \ - p.field ## _prf = field; \ - } while (0) - - PIPELINE(src); - PIPELINE(dst); - PIPELINE(filt); - PIPELINE(bias); - PIPELINE(channel); - PIPELINE(kh_padding); - - if (p.src) { - ker(&p); - } -} - -template -SaberStatus JitAvx512ConvAct::init( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) { - // get context of avx512_conv_act - this->_ctx = ctx; - ConvParam *conv_param = &(param.conv_param); - ActivationParam *act_param = &(param.activation_param); - - const opTensor *weights = conv_param->weight(); - - Shape src_shape(inputs[0]->shape()); - Shape dst_shape(outputs[0]->shape()); - Shape weights_shape(weights->shape()); - - const bool with_groups = false; - conf.ngroups = with_groups ? 
weights_shape[0] : 1; - - conf.mb = src_shape[0]; - if (src_shape.dims() == 5) { - conf.ic = src_shape[1] * src_shape[4] / conf.ngroups; - } - else { - conf.ic = src_shape[1] / conf.ngroups; - } - conf.ih = src_shape[2]; - conf.iw = src_shape[3]; - - if (dst_shape.dims() == 5) { - conf.oc = dst_shape[1] * dst_shape[4] / conf.ngroups; - } else { - conf.oc = dst_shape[1] / conf.ngroups; - } - conf.oh = dst_shape[2]; - conf.ow = dst_shape[3]; - - conf.kh = weights_shape[2]; - conf.kw = weights_shape[3]; - conf.stride_h = conv_param->stride_h; - conf.stride_w = conv_param->stride_w; - conf.t_pad = conv_param->pad_h; - conf.l_pad = conv_param->pad_w; - conf.dilate_h = conv_param->dilation_h; - conf.dilate_w = conv_param->dilation_w; - - conf.with_relu = param.has_active; - if (conf.with_relu) { - conf.relu_negative_slope = static_cast(act_param->negative_slope); - } - conf.with_bias = !(conv_param->bias() == NULL); - conf.is_1stconv = is_1stconv(conf); - - // check memory layout - if (conf.is_1stconv) { - if (!(std::is_same::value && - std::is_same::value && - std::is_same::value )) { - return SaberUnImplError; - } - } else { - if (!(std::is_same::value && - std::is_same::value && - std::is_same::value)) { - return SaberUnImplError; - } - } - - SaberStatus status = jit_conv_act_kernel::init_conf(conf); - if (status == SaberSuccess) { - return create(inputs, outputs, param, ctx); - } else { - return SaberUnImplError; - } -} - -template -SaberStatus JitAvx512ConvAct::create( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) { - kernel_ = new jit_conv_act_kernel(conf); - - ConvParam *conv_param = &(param.conv_param); - opTensor *weights = conv_param->mutable_weight(); - weights_internal.reset(new opTensor(weights->shape())); - if (std::is_same::value) { - weight_reorder_OIhwi16o(*weights, *weights_internal); - } else if (std::is_same::value) { - weight_reorder_OIhw16i16o(*weights, *weights_internal); - } - - return SaberSuccess; -} 
- -template -SaberStatus JitAvx512ConvAct::dispatch( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) { - - ConvParam *conv_param = &(param.conv_param); - const opTensor *bias = conv_param->bias(); - - const dtype *ptr_src = reinterpret_cast( - inputs[0]->get_buf()->get_data()); - const dtype *ptr_weights = reinterpret_cast( - weights_internal->get_buf()->get_data()); - const dtype *ptr_bias = reinterpret_cast( - bias->get_buf()-> get_data()); - auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); - - const auto &jcp = kernel_->jcp; - - #pragma omp parallel - { - int ithr = omp_get_thread_num(), nthr = omp_get_num_threads(); - int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; - int start, end, start_copy; - int work_amount = jcp.mb * jcp.ngroups * oc_chunks * jcp.oh; - utils::balance211(work_amount, nthr, ithr, start, end); - start_copy = start; - - jit_conv_call_t par_conv = { 0 }; - size_t src_h_stride = jcp.iw * jcp.ic_block; - size_t src_c_stride = jcp.ih * jcp.iw * jcp.ic_block; - size_t dst_h_stride = jcp.ow * jcp.oc_block; - size_t wht_h_stride = jcp.kw * jcp.ic_block * jcp.oc_block; - size_t wht_ic_stride = jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block; - - if (jcp.is_1stconv) { - src_h_stride = jcp.iw; - src_c_stride = jcp.ih * jcp.iw; - wht_ic_stride = jcp.oc_block; - } - - for (int icb_l2 = 0; icb_l2 < jcp.nb_ic; icb_l2 += jcp.nb_ic_L2) { - start = start_copy; - int n{0}, g{0}, occ{0}, oh_s{0}; - if (jcp.loop_order == conv_loop_order_t::loop_cgn) { - utils::nd_iterator_init(start, occ, oc_chunks, - g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); - } - else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { - utils::nd_iterator_init(start, - g, jcp.ngroups, - n, jcp.mb, - occ, oc_chunks, - oh_s, jcp.oh); - } - - while (start < end) { - int ocb = occ * jcp.nb_oc_blocking; - int g_ocb = g * jcp.nb_oc + ocb; - int g_oc = g_ocb * jcp.oc_block; - int g_icb = g * jcp.nb_ic; - - int work_rem = end - start; - int ih_s = -jcp.t_pad + 
oh_s * jcp.stride_h; - int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem; - - size_t bias_blk_off = g_oc; - size_t dst_blk_off = n * jcp.oc * jcp.oh * jcp.ow + - (g_ocb * jcp.oh * jcp.ow + oh_s * jcp.ow) * jcp.oc_block; - size_t src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + - (g_icb + icb_l2) * jcp.ih * jcp.iw * jcp.ic_block - + ih_s * jcp.iw * jcp.ic_block; - size_t weight_blk_off= ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block + - icb_l2 * jcp.kh * jcp.kw * jcp.oc_block * jcp.ic_block; - - if (jcp.is_1stconv) { - src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + ih_s * jcp.iw; - weight_blk_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block; - } - - auto bias_w = ptr_bias ? ptr_bias + bias_blk_off : 0; - auto dst_w = ptr_dst + dst_blk_off; - auto src_w = ptr_src + src_blk_off; - auto wht_w = ptr_weights + weight_blk_off; - - for (int icb = icb_l2; - icb < utils::min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2); ++icb) { - auto src_c = src_w; - auto dst_c = dst_w; - for (int oj = oh_s, ij = ih_s; - oj < oh_e; ++oj, ij += jcp.stride_h) { - - int i_t_overflow = -utils::min(0, ij); - int i_b_overflow = utils::max(jcp.ih, ij + jcp.kh) - jcp.ih; - int kh_padding = utils::max(0, jcp.kh - i_t_overflow - i_b_overflow); - - jit_conv_ker_pipeline(kernel_->jit_ker, par_conv, - src_c + i_t_overflow * src_h_stride, - dst_c, wht_w + i_t_overflow * wht_h_stride, - bias_w, icb, kh_padding); - - src_c += src_h_stride * jcp.stride_h; - dst_c += dst_h_stride; - } - src_w += src_c_stride; - wht_w += wht_ic_stride; - } - - if (jcp.loop_order == conv_loop_order_t::loop_cgn) { - utils::nd_iterator_jump(start, end, - occ, oc_chunks, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); - } else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { - utils::nd_iterator_jump(start, end, g, jcp.ngroups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); - } - } - } - - jit_conv_ker_pipeline(kernel_->jit_ker, par_conv, - ptr_src, ptr_dst, ptr_weights, ptr_bias, 0, 0); - - } - - return SaberSuccess; -} - -template 
class JitAvx512ConvAct; -template class JitAvx512ConvAct; -template class JitAvx512ConvAct; - -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_avx512_conv_act.h b/saber/funcs/impl/x86/jit_avx512_conv_act.h deleted file mode 100644 index 2a9bb68f3..000000000 --- a/saber/funcs/impl/x86/jit_avx512_conv_act.h +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV_ACT_H - -#include - -#include "saber/funcs/impl/impl_base.h" -#include "saber/core/tensor.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.h" -#include "saber/saber_funcs_param.h" - -namespace anakin{ -namespace saber{ - -using namespace jit; - -template -class JitAvx512ConvAct : public ImplBase, - Tensor, - Tensor, - ConvActiveParam>> { -public: - typedef Tensor inTensor; - typedef Tensor outTensor; - typedef Tensor opTensor; - typedef typename inTensor::Dtype dtype; - - JitAvx512ConvAct() - : kernel_(NULL) - {} - - ~JitAvx512ConvAct() { - if (kernel_ != NULL) { - delete kernel_; - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context 
&ctx) override; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) override; - -private: - jit::jit_conv_conf_t conf; - jit::jit_conv_act_kernel *kernel_; - std::shared_ptr weights_internal; -}; - - -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV_ACT_H diff --git a/saber/funcs/impl/x86/jit_call_conf.h b/saber/funcs/impl/x86/jit_call_conf.h index dd935131e..dfe807344 100644 --- a/saber/funcs/impl/x86/jit_call_conf.h +++ b/saber/funcs/impl/x86/jit_call_conf.h @@ -1,16 +1,17 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ #ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_CALL_CONF_H #define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_CALL_CONF_H @@ -252,4 +253,4 @@ struct jit_concat_conf_t { } // namespace saber } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/impl/x86/jit_uni_dw_convolution.cpp b/saber/funcs/impl/x86/jit_uni_dw_convolution.cpp deleted file mode 100644 index 0a093aca4..000000000 --- a/saber/funcs/impl/x86/jit_uni_dw_convolution.cpp +++ /dev/null @@ -1,254 +0,0 @@ -#include "saber/funcs/impl/x86/jit_uni_dw_convolution.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/x86_utils.h" -#include - -namespace anakin { -namespace saber { - -using namespace jit; - -template -SaberStatus JitUniDWConvolution::init( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) { - - if (!(std::is_same::value && - std::is_same::value && - std::is_same::value && - OpDtype == AK_FLOAT )) { - return SaberUnImplError; - } - - // get context of uni_dw_convolution - this->_ctx = ctx; - - ConvParam *conv_param = &(param.conv_param); - ActivationParam *act_param = &(param.activation_param); - - const opTensor *weights = conv_param->weight(); - const opTensor *bias = conv_param->bias(); - - Shape src_shape(inputs[0]->shape()); - Shape dst_shape(outputs[0]->shape()); - Shape weights_shape(weights->shape()); - - conf.ngroups = weights_shape[0]; - conf.mb = src_shape[0]; - if (src_shape.dims() == 5) { - conf.ic = src_shape[1] * src_shape[4]; - } - else { - conf.ic = src_shape[1]; - } - conf.ih = src_shape[2]; - conf.iw = src_shape[3]; - - if (src_shape.dims() == 5) { - conf.oc = dst_shape[1] * dst_shape[4]; - } else { - conf.oc = dst_shape[1]; - } - conf.oh = dst_shape[2]; - conf.ow = dst_shape[3]; - - conf.kh = weights_shape[2]; - conf.kw = weights_shape[3]; - - conf.stride_h = conv_param->stride_h; - conf.stride_w = conv_param->stride_w; - conf.t_pad = conv_param->pad_h; - conf.l_pad = 
conv_param->pad_w; - conf.b_pad = conv_param->pad_h; - conf.r_pad = conv_param->pad_w; - conf.dilate_h = conv_param->dilation_h; - conf.dilate_w = conv_param->dilation_w; - - conf.with_bias = (bias != NULL); - conf.with_relu = param.has_active; - if (conf.with_relu) { - conf.relu_negative_slope = static_cast(act_param->negative_slope); - } - - conf.is_dw = conf.oc / conf.ngroups == weights_shape[1]; - bool ok = true - && conf.oc == conf.ngroups - && conf.ic == conf.ngroups - && conf.is_dw; - if (!ok) { - LOG(ERROR) << "dw conv init fail, return UnImplError"; - return SaberUnImplError; - } - - SaberStatus status = jit_uni_dw_conv_kernel_f32::init_conf(conf); - if (status == SaberSuccess) { - return create(inputs, outputs, param, ctx); - } else { - return SaberUnImplError; - } -} - -template -SaberStatus JitUniDWConvolution::create( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) { - - kernel_ = new jit_uni_dw_conv_kernel_f32(conf); - ConvParam *conv_param = &(param.conv_param); - opTensor *weights = conv_param->mutable_weight(); - weights_internal.reset(new opTensor(weights->shape())); - weight_reorder_OIhwi16o(*weights, *weights_internal); - return SaberSuccess; -} - -template -SaberStatus JitUniDWConvolution::dispatch( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) { - - ConvParam *conv_param = &(param.conv_param); - const opTensor *bias = conv_param->bias(); - - const dtype *ptr_src = reinterpret_cast(inputs[0]->get_buf()->get_data()); - const dtype *ptr_weights = reinterpret_cast(weights_internal->get_buf()->get_data()); - const dtype *ptr_bias = reinterpret_cast(bias->get_buf()-> get_data()); - auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); - - const auto &jcp = kernel_->jcp; - - int dil_h = jcp.dilate_h + 1; - int dil_w = jcp.dilate_w + 1; - int str_h = jcp.stride_h; - int str_w = jcp.stride_w; - - int MB = jcp.mb; - int chb_work = utils::div_up(jcp.nb_ch, jcp.nb_ch_blocking); 
- const size_t work_amount = MB * chb_work * jcp.oh; - - auto kernel_params = [&](int ur_w_step, int ow, int oh, int ih, int kh, - int kh_padding, int ch, int ch_num, int n) { - jit_conv_call_t par_conv = {}; - - const int i_l_overflow = utils::max(0, (jcp.l_pad - ow * str_w)); - const int i_r_overflow = utils::max(jcp.iw, (ow * str_w - + (jcp.kw - 1)*dil_w - jcp.l_pad + 1)) - jcp.iw; - - const int iw = utils::max((ow*str_w - jcp.l_pad - + utils::div_up(i_l_overflow, dil_w)*dil_w), 0); - const int kw = utils::div_up(i_l_overflow, dil_w); - - const int kw_padding = jcp.kw - utils::div_up(i_l_overflow, dil_w) - - utils::div_up(i_r_overflow, dil_w); - - par_conv.src = ptr_src + n * jcp.ic * jcp.iw * jcp.ih + ch * jcp.iw * jcp.ih * 16 + ih * jcp.iw * 16 + iw * 16; - par_conv.src = ptr_dst + n * jcp.oc * jcp.ow * jcp.oh + ch * jcp.iw * jcp.ih * 16 + oh * jcp.ow * 16 + ow * 16; - - //par_conv.filt = &weights[weights_d.blk_off(ch, 0, 0, kh, kw)]; - par_conv.filt = ptr_weights + ch * jcp.ngroups * jcp.kh * jcp.kw + kh * jcp.kw * 16 + kw *16; - if (bias) { - par_conv.bias = ptr_bias + ch * jcp.ch_block; - } - - par_conv.kh_padding = (size_t)utils::max(0, kh_padding); - par_conv.kw_padding = (size_t)utils::max(0, kw_padding); - - par_conv.ur_w = (size_t)ur_w_step; - - par_conv.ch_blocks = utils::min(ch + ch_num, jcp.nb_ch) - ch; - - return par_conv; - }; - - auto ker = [&](const int ithr, const int nthr) { - size_t start{0}, end{0}; - utils::balance211(work_amount, nthr, ithr, start, end); - - size_t n{0}, chb{0}, oh{0}; - utils::nd_iterator_init(start, n, MB, chb, chb_work, oh, jcp.oh); - for (size_t iwork = start; iwork < end; ++iwork) { - int ch = chb * jcp.nb_ch_blocking; - int ch_num = jcp.nb_ch_blocking; - - const int i_t_overflow = utils::max(0, (int)(jcp.t_pad - oh*str_h)); - const int i_b_overflow = utils::max(jcp.ih, - (int)(oh*str_h + (jcp.kh - 1)*dil_h - jcp.t_pad + 1)) - jcp.ih; - - const int ih = utils::max((int)(oh*str_h - jcp.t_pad - + 
utils::div_up(i_t_overflow, dil_h)*dil_h), 0); - const int kh = utils::div_up(i_t_overflow, dil_h); - const int kh_padding = jcp.kh - utils::div_up(i_t_overflow, dil_h) - - utils::div_up(i_b_overflow, dil_h); - - // left border - int ow = 0; - int l_border = utils::min(utils::div_up(jcp.l_pad, str_w), jcp.ow); - int ur_w_step = 1; - for (; ow < l_border; ow++) { - jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, - kh, kh_padding, ch, ch_num, n); - - kernel_->jit_ker(&par_conv); - } - - // main loop - ur_w_step = (jcp.iw - (jcp.kw - 1)*dil_w + jcp.l_pad - 1) - / jcp.stride_w - ow + 1; - if (ur_w_step > 0) { - jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, - kh, kh_padding, ch, ch_num, n); - - kernel_->jit_ker(&par_conv); - - ow += ur_w_step; - } - - // right border - ur_w_step = 1; - for (; ow < jcp.ow; ow++) { - jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, - kh, kh_padding, ch, ch_num, n); - - kernel_->jit_ker(&par_conv); - } - - utils::nd_iterator_step(n, MB, chb, chb_work, oh, jcp.oh); - } - }; - - #pragma omp parallel - { - ker(omp_get_thread_num(), omp_get_num_threads()); - } - - return SaberSuccess; -} - -template class JitUniDWConvolution; -template class JitUniDWConvolution; -template class JitUniDWConvolution; - -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_uni_dw_convolution.h b/saber/funcs/impl/x86/jit_uni_dw_convolution.h deleted file mode 100644 index 9933ce21e..000000000 --- a/saber/funcs/impl/x86/jit_uni_dw_convolution.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_DW_CONVOLUTION_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_DW_CONVOLUTION_H - -#include "saber/funcs/impl/impl_base.h" -#include "saber/core/tensor.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.h" -#include "saber/saber_funcs_param.h" - -namespace anakin{ -namespace saber{ - -using namespace jit; -template -class JitUniDWConvolution : public ImplBase, - Tensor, - Tensor, - ConvActiveParam>> { -public: - typedef Tensor inTensor; - typedef Tensor outTensor; - typedef Tensor opTensor; - typedef typename inTensor::Dtype dtype; - - JitUniDWConvolution() - : kernel_(NULL) - {} - ~JitUniDWConvolution() { - if (kernel_ != NULL) { - delete kernel_; - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) override; - -private: - jit_conv_conf_t conf; - jit_uni_dw_conv_kernel_f32 *kernel_; - std::shared_ptr weights_internal; -}; - -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_DW_CONVOLUTION_H diff --git a/saber/funcs/impl/x86/jit_uni_dwconv.cpp b/saber/funcs/impl/x86/jit_uni_dwconv.cpp new file mode 100644 index 000000000..0e82e21aa --- /dev/null +++ b/saber/funcs/impl/x86/jit_uni_dwconv.cpp @@ 
-0,0 +1,300 @@ +#include "saber/funcs/impl/x86/jit_uni_dwconv.h" +#include "saber/funcs/impl/x86/jit_call_conf.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include + +namespace anakin { +namespace saber { + +using namespace jit; + +template <> +SaberStatus JitUniDWConv::check_conf( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + ConvParam *conv_param = &(param); + const Tensor *weights = conv_param->weight(); + const Tensor *bias = conv_param->bias(); + const jit_conv_conf_t jcp = kernel->jcp; + Tensor *input = inputs[0]; + Tensor *output = outputs[0]; + + // check format +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value && +// inDtype == AK_FLOAT)) { +// LOG(ERROR) << "wrong format"; +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW_C16) + || (outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + + // check param + bool param_ok = true + && jcp.t_pad == conv_param->pad_h + && jcp.l_pad == conv_param->pad_w + && jcp.b_pad == conv_param->pad_h + && jcp.r_pad == conv_param->pad_w + && jcp.stride_h == conv_param->stride_h + && jcp.stride_w == conv_param->stride_w + && jcp.dilate_h == conv_param->dilation_h + && jcp.dilate_w == conv_param->dilation_w; + + // check shape + bool shape_ok = true + && jcp.kh == weights->height() + && jcp.kw == weights->width() + && jcp.ngroups == weights->num() + && jcp.mb == input->num() + && jcp.ic == input->channel() + && jcp.ih == input->height() + && jcp.iw == input->width() + && jcp.oc == output->channel() + && jcp.oh == output->height() + && jcp.ow == output->width(); + + if (param_ok && shape_ok) { + return SaberSuccess; + } else { + LOG(INFO) << "param or shape changed, re-init kernel"; + return SaberNotInitialized; + } +} + +template <> +SaberStatus JitUniDWConv::create( + const std::vector*>& inputs, + std::vector*>& 
outputs, + ConvParam ¶m, + Context &ctx) { + SaberStatus status; + ConvParam *conv_param = &(param); + ActivationParam *act_param = nullptr; + const Tensor *weights = conv_param->weight(); + const Tensor *bias = conv_param->bias(); + Tensor *input = inputs[0]; + Tensor *output = outputs[0]; + + // check conf + if (kernel) { + status = check_conf(inputs, outputs, param); + if (status != SaberNotInitialized) { + return status; + } + } + + // init conf + conf.ngroups = weights->num(); + conf.mb = input->num(); + conf.ic = input->channel(); + conf.ih = input->height(); + conf.iw = input->width(); + conf.oc = output->channel(); + conf.oh = output->height(); + conf.ow = output->width(); + + conf.kh = weights->height(); + conf.kw = weights->width(); + + conf.stride_h = conv_param->stride_h; + conf.stride_w = conv_param->stride_w; + conf.t_pad = conv_param->pad_h; + conf.l_pad = conv_param->pad_w; + conf.b_pad = conv_param->pad_h; + conf.r_pad = conv_param->pad_w; + conf.dilate_h = conv_param->dilation_h; + conf.dilate_w = conv_param->dilation_w; + + conf.with_bias = (bias != NULL); + conf.with_relu = param.activation_param.has_active; + if (conf.with_relu) { + act_param = &(param.activation_param); + conf.relu_negative_slope = static_cast(act_param->negative_slope); + } + + conf.is_dw = (conf.oc / conf.ngroups == weights->channel()); + bool ok = true + && conf.oc == conf.ngroups + && conf.ic == conf.ngroups + && conf.is_dw; + if (!ok) { + LOG(ERROR) << "dw conv init fail, return UnImplError"; + return SaberUnImplError; + } + + status = jit_uni_dwconv_kernel_f32::init_conf(conf); + if (status == SaberSuccess) { + if (kernel != nullptr) { + delete kernel; + kernel = nullptr; + } + kernel = new jit_uni_dwconv_kernel_f32(conf); + } else { + return SaberUnImplError; + } + + // reorder weights + Tensor *weights_reorder = conv_param->mutable_weight(); + weights_internal.reset(new Tensor(weights_reorder->valid_shape())); + weight_reorder_Goihw16g(*weights_reorder, 
*weights_internal); + + return status; +} + +template <> +SaberStatus JitUniDWConv::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) { + +// if (!(std::is_same::value && +// std::is_same::value && +// std::is_same::value && +// OpDtype == AK_FLOAT)) { +// return SaberUnImplError; +// } + if ((inputs[0]->get_layout() != Layout_NCHW_C16) + || (outputs[0]->get_layout() != Layout_NCHW_C16) + || (param.weight()->get_layout() != Layout_NCHW)) { + + LOG(ERROR) << "wrong format"; + return SaberUnImplError; + } + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus JitUniDWConv::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) { + + ConvParam *conv_param = &(param); + const Tensor *bias = conv_param->bias(); + + const float *ptr_src = reinterpret_cast(inputs[0]->data()); + const float *ptr_weights = reinterpret_cast(weights_internal->data()); + const float *ptr_bias = reinterpret_cast(bias->data()); + auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + + const auto &jcp = kernel->jcp; + + int dil_h = jcp.dilate_h + 1; + int dil_w = jcp.dilate_w + 1; + int str_h = jcp.stride_h; + int str_w = jcp.stride_w; + + int MB = jcp.mb; + int chb_work = utils::div_up(jcp.nb_ch, jcp.nb_ch_blocking); + const size_t work_amount = MB * chb_work * jcp.oh; + + auto kernel_params = [&](int ur_w_step, int ow, int oh, int ih, int kh, + int kh_padding, int ch, int ch_num, int n) { + jit_conv_call_t par_conv; + + const int i_l_overflow = utils::max(0, (jcp.l_pad - ow * str_w)); + const int i_r_overflow = utils::max(jcp.iw, (ow * str_w + + (jcp.kw - 1)*dil_w - jcp.l_pad + 1)) - jcp.iw; + + const int iw = utils::max((ow*str_w - jcp.l_pad + + utils::div_up(i_l_overflow, dil_w)*dil_w), 0); + const int kw = utils::div_up(i_l_overflow, dil_w); + + const int kw_padding = jcp.kw - utils::div_up(i_l_overflow, dil_w) + - utils::div_up(i_r_overflow, dil_w); + + par_conv.src 
= ptr_src + n * jcp.ic * jcp.iw * jcp.ih + ch * jcp.iw * jcp.ih * 16 + ih * jcp.iw * 16 + iw * 16; + par_conv.dst = ptr_dst + n * jcp.oc * jcp.ow * jcp.oh + ch * jcp.ow * jcp.oh * 16 + oh * jcp.ow * 16 + ow * 16; + + //par_conv.filt = &weights[weights_d.blk_off(ch, 0, 0, kh, kw)]; + par_conv.filt = ptr_weights + (ch * jcp.kh * jcp.kw + kh * jcp.kw + kw) *16; + if (bias) { + par_conv.bias = ptr_bias + ch * jcp.ch_block; + } + + par_conv.kh_padding = (size_t)utils::max(0, kh_padding); + par_conv.kw_padding = (size_t)utils::max(0, kw_padding); + + par_conv.ur_w = (size_t)ur_w_step; + + par_conv.ch_blocks = utils::min(ch + ch_num, jcp.nb_ch) - ch; + + return par_conv; + }; + + auto ker = [&](const int ithr, const int nthr) { + size_t start{0}, end{0}; + utils::balance211(work_amount, nthr, ithr, start, end); + + size_t n{0}, chb{0}, oh{0}; + utils::nd_iterator_init(start, n, MB, chb, chb_work, oh, jcp.oh); + for (size_t iwork = start; iwork < end; ++iwork) { + int ch = chb * jcp.nb_ch_blocking; + int ch_num = jcp.nb_ch_blocking; + + const int i_t_overflow = utils::max(0, (int)(jcp.t_pad - oh*str_h)); + const int i_b_overflow = utils::max(jcp.ih, + (int)(oh*str_h + (jcp.kh - 1)*dil_h - jcp.t_pad + 1)) - jcp.ih; + + const int ih = utils::max((int)(oh*str_h - jcp.t_pad + + utils::div_up(i_t_overflow, dil_h)*dil_h), 0); + const int kh = utils::div_up(i_t_overflow, dil_h); + const int kh_padding = jcp.kh - utils::div_up(i_t_overflow, dil_h) + - utils::div_up(i_b_overflow, dil_h); + + // left border + int ow = 0; + int l_border = utils::min(utils::div_up(jcp.l_pad, str_w), jcp.ow); + int ur_w_step = 1; + for (; ow < l_border; ow++) { + jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, + kh, kh_padding, ch, ch_num, n); + + kernel->jit_ker(&par_conv); + } + + // main loop + ur_w_step = (jcp.iw - (jcp.kw - 1)*dil_w + jcp.l_pad - 1) / jcp.stride_w - ow + 1; + if (ur_w_step > 0) { + jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, + kh, kh_padding, 
ch, ch_num, n); + + kernel->jit_ker(&par_conv); + + ow += ur_w_step; + } + + // right border + ur_w_step = 1; + for (; ow < jcp.ow; ow++) { + jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, + kh, kh_padding, ch, ch_num, n); + + kernel->jit_ker(&par_conv); + } + + utils::nd_iterator_step(n, MB, chb, chb_work, oh, jcp.oh); + } + }; + +#pragma omp parallel + { + ker(omp_get_thread_num(), omp_get_num_threads()); + } + + return SaberSuccess; +} + +template class JitUniDWConv; +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/jit_uni_dwconv.h b/saber/funcs/impl/x86/jit_uni_dwconv.h new file mode 100644 index 000000000..a5826f732 --- /dev/null +++ b/saber/funcs/impl/x86/jit_uni_dwconv.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_DW_CONVOLUTION_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_DW_CONVOLUTION_H + +#include "saber/funcs/impl/impl_base.h" +#include "saber/core/tensor.h" +#include "saber/funcs/impl/x86/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h" +#include "saber/saber_funcs_param.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +template +class JitUniDWConv : public ImplBase< + X86, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + JitUniDWConv() + : kernel(nullptr) + {} + ~JitUniDWConv() { + if (kernel != nullptr) { + delete kernel; + } + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m) override; + +private: + jit_conv_conf_t conf; + jit_uni_dwconv_kernel_f32 *kernel = nullptr; + std::shared_ptr > weights_internal; + SaberStatus check_conf(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam ¶m); +}; + + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_DW_CONVOLUTION_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.cpp similarity index 83% rename from saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.cpp rename to saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.cpp index 1efa9cf59..fb851acac 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.cpp +++ b/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.cpp @@ -1,6 +1,6 @@ #include -#include "jit_avx2_conv_act_kernel.h" +#include "jit_avx2_conv_kernel.h" #define GET_OFF(field) offsetof(jit_conv_call_t, field) @@ -11,7 +11,7 @@ namespace 
jit { using namespace Xbyak; inline void jit_avx2_conv_act_kernel::oh_step_unroll_kw(int ur_w, - int pad_l, int pad_r, int oc_blocks) { + int pad_l, int pad_r, int oc_blocks) { int iw = jcp.iw; int ih = jcp.ih; int kw = jcp.kw; @@ -42,15 +42,15 @@ inline void jit_avx2_conv_act_kernel::oh_step_unroll_kw(int ur_w, vmovups(ymm15, ptr[aux_reg_kernel + sizeof(float) * ker_off]); for (int jj = jj_start; jj < jj_end; jj++) vfmadd231ps(Ymm(ur_w * ii + jj), - Ymm(oc_blocks * ur_w + jj), ymm15); + Ymm(oc_blocks * ur_w + jj), ymm15); } } } } inline void jit_avx2_conv_act_kernel::oh_step_nopad(int ur_w, - int pad_l, int pad_r, char pad_tag, - int oc_blocks, char oc_blocks_tag) { + int pad_l, int pad_r, char pad_tag, + int oc_blocks, char oc_blocks_tag) { jit_tagged_label kw_label("kw", pad_tag, oc_blocks_tag); int iw = jcp.iw; @@ -74,14 +74,14 @@ inline void jit_avx2_conv_act_kernel::oh_step_nopad(int ur_w, for (int jj = jj_start; jj < jj_end; jj++) { int inp_off; // if (jcp.src_fmt == nchw) - inp_off = ifm2 * ih * iw + (jj * stride_w - pad_l); + inp_off = ifm2 * ih * iw + (jj * stride_w - pad_l); // else // inp_off = (jj * stride_w - pad_l) * (dw ? 
g_blk : ic_blk) + ifm2; // if (dw) // vmovups(Ymm(oc_blocks * ur_w + jj), ptr[aux_reg_input + sizeof(float) * inp_off]); // else - vbroadcastss(Ymm(oc_blocks * ur_w + jj), - ptr[aux_reg_input + sizeof(float) * inp_off]); + vbroadcastss(Ymm(oc_blocks * ur_w + jj), + ptr[aux_reg_input + sizeof(float) * inp_off]); } for (int ii = 0; ii < oc_blocks; ii++) { int aux_kernel_offset = ii * nb_ic * kh * kw * ic_blk * oc_blk + ifm2 * oc_blk; @@ -101,8 +101,8 @@ inline void jit_avx2_conv_act_kernel::oh_step_nopad(int ur_w, } inline void jit_avx2_conv_act_kernel::width_blk_step(int ur_w, - int pad_l, int pad_r, char pad_tag, - int oc_blocks, char oc_blocks_tag) { + int pad_l, int pad_r, char pad_tag, + int oc_blocks, char oc_blocks_tag) { int iw = jcp.iw; int kw = jcp.kw; int ow = jcp.ow; @@ -142,7 +142,7 @@ inline void jit_avx2_conv_act_kernel::width_blk_step(int ur_w, for (int ii = 0; ii < oc_blocks; ii++) { for (int jj = 0; jj < ur_w; jj++) { vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), - yword[reg_bias + sizeof(float) * ii * oc_blk]); + yword[reg_bias + sizeof(float) * ii * oc_blk]); } } } @@ -171,12 +171,14 @@ inline void jit_avx2_conv_act_kernel::width_blk_step(int ur_w, cmp(kj, 0); je(skip_kh_loop, T_NEAR); } + jit_tagged_label kh_label("kh", pad_tag, oc_blocks_tag); + L(kh_label); { if (jcp.kw >= 5 && pad_l == 0 && pad_r == 0) { oh_step_nopad(ur_w, pad_l, pad_r, pad_tag, oc_blocks, - oc_blocks_tag); + oc_blocks_tag); sub(aux_reg_input, sizeof(float) * kw * inp_off); add(aux_reg_input, sizeof(float) * iw * inp_mult); } else { @@ -203,11 +205,11 @@ inline void jit_avx2_conv_act_kernel::width_blk_step(int ur_w, } vxorps(yzero, yzero, yzero); if (jcp.relu_negative_slope == 0) { - ymm_relu_ns = yzero; + ymm_relu_ns = yzero; } else { - mov(imm_addr64, float2int(jcp.relu_negative_slope)); - movq(xmm_relu_ns, imm_addr64); - uni_vbroadcastss(ymm_relu_ns, xmm_relu_ns); + mov(imm_addr64, float2int(jcp.relu_negative_slope)); + movq(xmm_relu_ns, imm_addr64); + 
uni_vbroadcastss(ymm_relu_ns, xmm_relu_ns); } for (int ii = 0; ii < oc_blocks; ii++) { @@ -253,19 +255,19 @@ inline void jit_avx2_conv_act_kernel::solve_common( int l_pad = jcp.l_pad; int r_pad = saber::utils::max(0, (int(jcp.ow) - 1) * str_w + (kw - 1) * dilate_w - - (iw + l_pad - 1)); + - (iw + l_pad - 1)); int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w - - (iw + l_pad - 1); + - (iw + l_pad - 1); if (r_pad1 > 0) n_oi--; if (l_pad > 0) { n_oi--; if (n_oi < 0 && r_pad1 > 0) width_blk_step(ur_w, l_pad, r_pad1, - 'l', oc_blocks, oc_blocks_tag); // "lrpad" + 'l', oc_blocks, oc_blocks_tag); // "lrpad" else width_blk_step(ur_w, l_pad, 0, - 'l', oc_blocks, oc_blocks_tag); // "lpad" + 'l', oc_blocks, oc_blocks_tag); // "lpad" add(reg_input, sizeof(float) * (ur_w * str_w - l_pad) * inp_mult); add(reg_output, sizeof(float) * ur_w * oc_blk); } @@ -277,7 +279,7 @@ inline void jit_avx2_conv_act_kernel::solve_common( L(ow_loop_label); width_blk_step(ur_w, 0, 0, - 'm', oc_blocks, oc_blocks_tag); // "middle" + 'm', oc_blocks, oc_blocks_tag); // "middle" add(reg_input, sizeof(float) * ur_w * str_w * inp_mult); add(reg_output, sizeof(float) * ur_w * oc_blk); @@ -288,19 +290,19 @@ inline void jit_avx2_conv_act_kernel::solve_common( if (r_pad1 > 0 && n_oi >=0) { width_blk_step(ur_w, 0, r_pad1, - 'r', oc_blocks, oc_blocks_tag); // "rpad" + 'r', oc_blocks, oc_blocks_tag); // "rpad" add(reg_input, sizeof(float) * ur_w * str_w * inp_mult); add(reg_output, sizeof(float) * ur_w * oc_blk); } if (ur_w_tail != 0) width_blk_step(ur_w_tail, 0, r_pad, - 't', oc_blocks, oc_blocks_tag); // "tail" + 't', oc_blocks, oc_blocks_tag); // "tail" } void jit_avx2_conv_act_kernel::generate() { this->preamble(); - + mov(reg_input, ptr[this->param1 + GET_OFF(src)]); mov(reg_output, ptr[this->param1 + GET_OFF(dst)]); mov(reg_kernel, ptr[this->param1 + GET_OFF(filt)]); @@ -339,15 +341,16 @@ void jit_avx2_conv_act_kernel::generate() { SaberStatus jit_avx2_conv_act_kernel::init_conf(jit_conv_conf_t 
&jcp) { if (!mayiuse(avx2)) { + LOG(ERROR) << "init a AVX2 kernel in a non-avx2 machine is not permitted"; return SaberUnImplError; } bool with_groups = false; const bool flat = jcp.ic == 3; const bool depthwise = true - && with_groups - && jcp.oc == 1 - && jcp.ic == 1; + && with_groups + && jcp.oc == 1 + && jcp.ic == 1; const bool mimo = !flat && !depthwise; jcp.is_dw = depthwise ? true : false; const int simd_w = 8; @@ -361,13 +364,14 @@ SaberStatus jit_avx2_conv_act_kernel::init_conf(jit_conv_conf_t &jcp) { jcp.nb_oc_blocking = 4; /* the optimal value for the kernel */ bool args_ok = true - && (jcp.oc % simd_w == 0) - && jcp.l_pad <= jcp.ur_w - && utils::implication(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) - || (jcp.stride_w == 1 && jcp.stride_h == 1)) - && utils::implication(mimo, jcp.ic % simd_w == 0); - // && implication(depthwise, jcp.ngroups % simd_w == 0); + && (jcp.oc % simd_w == 0) + && jcp.l_pad <= jcp.ur_w + && utils::implication(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) + || (jcp.stride_w == 1 && jcp.stride_h == 1)) + && utils::implication(mimo, jcp.ic % simd_w == 0); + // && implication(depthwise, jcp.ngroups % simd_w == 0); if (!args_ok) { + LOG(ERROR) << "arguments check failed"; return SaberUnImplError; } @@ -381,13 +385,15 @@ SaberStatus jit_avx2_conv_act_kernel::init_conf(jit_conv_conf_t &jcp) { jcp.ur_w_tail = jcp.ow % jcp.ur_w; /* check again ... 
*/ r_pad_no_tail = saber::utils::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w - + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); + + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); if ((r_pad_no_tail > jcp.ur_w) || (jcp.ow < jcp.ur_w)) { + LOG(ERROR) << "tail should not be greater than ur_w"; return SaberUnImplError; } } if (jcp.l_pad > jcp.ur_w) { + LOG(ERROR) << "pad should not be greater than ur_w"; return SaberUnImplError; } @@ -409,8 +415,3 @@ SaberStatus jit_avx2_conv_act_kernel::init_conf(jit_conv_conf_t &jcp) { } // namespace jit } // namespace saber } // namespace anakin - - - - - diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.h similarity index 78% rename from saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.h rename to saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.h index 4370701d4..7cf62639e 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx2_conv_act_kernel.h +++ b/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.h @@ -1,13 +1,13 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX2_CONV_ACT_KERNEL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX2_CONV_ACT_KERNEL_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX2_CONV_KERNEL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX2_CONV_KERNEL_H #include #include +#include "jit_generator.h" #include "saber/funcs/impl/x86/jit_call_conf.h" #include "saber/saber_types.h" #include "saber/funcs/impl/x86/x86_utils.h" -#include "saber/funcs/impl/x86/kernel/jit_generator.h" namespace anakin { namespace saber { @@ -15,8 +15,7 @@ namespace jit { struct jit_avx2_conv_act_kernel: public jit_generator { - jit_avx2_conv_act_kernel(jit_conv_conf_t ajcp) : jcp(ajcp) - { + jit_avx2_conv_act_kernel(jit_conv_conf_t ajcp) : jcp(ajcp) { this->generate(); jit_ker = (void (*)(jit_conv_call_t *))this->getCode(); } @@ -52,11 +51,11 @@ struct jit_avx2_conv_act_kernel: public jit_generator { Xbyak::Ymm ymask 
= Xbyak::Ymm(14); inline void oh_step_unroll_kw(int ur_w, int pad_l, int pad_r, - int oc_blocks); + int oc_blocks); inline void oh_step_nopad(int ur_w, int pad_l, int pad_r, - char pad_label, int oc_blocks, char oc_blocks_label); + char pad_label, int oc_blocks, char oc_blocks_label); inline void width_blk_step(int ur_w, int pad_l, int pad_r, - char pad_label, int oc_blocks, char oc_blocks_label); + char pad_label, int oc_blocks, char oc_blocks_label); inline void solve_common(int oc_blocks, char oc_blocks_label); void generate(); diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_act_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.cpp similarity index 75% rename from saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_act_kernel.cpp rename to saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.cpp index 3015843fd..8095f2db7 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_act_kernel.cpp +++ b/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.cpp @@ -1,8 +1,8 @@ - -#include "jit_avx512_conv1x1_act_kernel.h" +#include "jit_avx512_conv1x1_kernel.h" #include "saber/funcs/impl/x86/jit_call_conf.h" + #include "saber/funcs/impl/x86/x86_utils.h" -#include "saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h" +#include "jit_uni_1x1_conv_utils.h" using namespace anakin::saber::utils; @@ -37,9 +37,9 @@ void jit_avx512_common_1x1_conv_kernel::bcast_loop(int load_loop_blk) { add(aux_reg_output_data, jcp.bcast_loop_output_substep); } else { add(aux1_reg_bcast_data, jcp.bcast_loop_bcast_step - - (num_substeps - 1) * jcp.bcast_loop_bcast_substep); + - (num_substeps - 1) * jcp.bcast_loop_bcast_substep); add(aux_reg_output_data, jcp.bcast_loop_output_step - - (num_substeps - 1) * jcp.bcast_loop_output_substep); + - (num_substeps - 1) * jcp.bcast_loop_output_substep); } } sub(bcast_loop_iter, jcp.bcast_block); @@ -59,10 +59,10 @@ void jit_avx512_common_1x1_conv_kernel::bcast_loop(int load_loop_blk) { void 
jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, - int ur, int substep, bool wraparound) { + int ur, int substep, bool wraparound) { auto vreg_load = [=](int i_load, int i_fma) { return Zmm(rnd_up(ur * load_loop_blk, jcp.fma_step) - + jcp.fma_step * i_load + i_fma); + + jcp.fma_step * i_load + i_fma); }; auto vreg_accum = [=](int i_load, int i_ur) { @@ -71,7 +71,7 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, auto bias_ptr = [=](int i_load) { return EVEX_compress_addr(reg_bias_data, - jcp.typesize_out * jcp.oc_block * i_load); + jcp.typesize_out * jcp.oc_block * i_load); }; auto bcast_ptr = [=](int i_reduce, int i_ur, bool bcast) { @@ -81,11 +81,11 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, assert(jcp.reduce_loop_unroll == jcp.reduce_block); offt = (i_reduce == jcp.reduce_loop_unroll) - ? (jcp.bcast_dim + i_ur) * jcp.reduce_loop_unroll - : i_ur * jcp.reduce_loop_unroll + i_reduce; + ? (jcp.bcast_dim + i_ur) * jcp.reduce_loop_unroll + : i_ur * jcp.reduce_loop_unroll + i_reduce; return EVEX_compress_addr(aux_reg_bcast_data, jcp.typesize_in * offt, - bcast); + bcast); }; auto load_ptr = [=](int i_reduce, int i_load) { @@ -95,14 +95,14 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, offt = (i_load * jcp.reduce_dim + u0) * jcp.load_block; return EVEX_compress_addr(aux_reg_load_data, - u1 * jcp.reduce_loop_load_step - + jcp.typesize_in * offt); + u1 * jcp.reduce_loop_load_step + + jcp.typesize_in * offt); }; auto output_ptr = [=](int i_load, int i_ur) { return EVEX_compress_addr(aux_reg_output_data, - (i_load * jcp.bcast_dim + i_ur) * jcp.load_block - * jcp.typesize_out); + (i_load * jcp.bcast_dim + i_ur) * jcp.load_block + * jcp.typesize_out); }; auto init = [=]() { @@ -138,7 +138,7 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, }; auto vcmp = [=](Xbyak::Opmask kmask, - Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2, const unsigned char cmp) { + 
Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2, const unsigned char cmp) { if (jcp.ver == ver_4vnni) vpcmpd(kmask, zmm_src1, zmm_src2, cmp); else @@ -146,7 +146,7 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, }; auto vmul = [=](Xbyak::Zmm zmm_dst, Xbyak::Opmask kmask, - Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2) { + Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2) { if (jcp.ver == ver_4vnni) vpmulld(zmm_dst | kmask, zmm_src1, zmm_src2); else @@ -194,9 +194,9 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, for (int i_ur = 0; i_ur < ur; ++i_ur) for (int i_load = 0; i_load < load_loop_blk; ++i_load) { vcmp(vmask, vreg_accum(i_load, i_ur), zmm_zero, - _cmp_lt_os); + _cmp_lt_os); vmul(vreg_accum(i_load, i_ur), vmask, - vreg_accum(i_load, i_ur), zmm_relu_ns); + vreg_accum(i_load, i_ur), zmm_relu_ns); } L(store_norelu); } @@ -206,10 +206,10 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, for (int i_load = 0; i_load < load_loop_blk; ++i_load) if (output_is_aligned && jcp.use_vmovntps) vmovntps(output_ptr(i_load, i_ur), - vreg_accum(i_load, i_ur)); + vreg_accum(i_load, i_ur)); else vmovups(output_ptr(i_load, i_ur), - vreg_accum(i_load, i_ur)); + vreg_accum(i_load, i_ur)); }; Label unaligned_store, end_store; @@ -218,18 +218,18 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, store_output(true); jmp(end_store, T_NEAR); L(unaligned_store); { - store_output(false); - } + store_output(false); + } L(end_store); }; auto prefetch_callback = [=](int ur, int i_reduce, int i_ur, int i_load, - bool last_block, bool wraparound, int reduce_step) { + bool last_block, bool wraparound, int reduce_step) { bool pf_ker_l1 = true; bool pf_ker_l2 = wraparound; int n_ops = (jcp.reduce_loop_unroll / reduce_step) * ur * load_loop_blk; int i_op = (i_reduce / reduce_step) * ur * load_loop_blk + - i_ur * load_loop_blk + i_load; + i_ur * load_loop_blk + i_load; int n_pf_ker_l1 = pf_ker_l1 ? 
jcp.reduce_block : 0; int n_pf_ker_l2 = pf_ker_l2 && wraparound ? jcp.reduce_block : 0; @@ -241,10 +241,10 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, pf_inp_trigger = max(1, pf_inp_ops / ur); int n_other_pf = - load_loop_blk * (n_pf_ker_l1 + n_pf_ker_l2 + n_pf_out_l1); + load_loop_blk * (n_pf_ker_l1 + n_pf_ker_l2 + n_pf_out_l1); int n_other_pf_ops = n_ops - pf_inp_ops; int other_pf_trigger - = n_other_pf ? max(1, n_other_pf_ops / n_other_pf) : 0; + = n_other_pf ? max(1, n_other_pf_ops / n_other_pf) : 0; if (i_op < pf_inp_ops && i_op % pf_inp_trigger == 0) { // input prefetches have the highest priority b/c the @@ -252,14 +252,14 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, // cache lines int i_pf = i_op / pf_inp_trigger; auto pf_reg = wraparound && last_block - ? reg_bcast_data - : (last_block ? aux1_reg_bcast_data - : aux_reg_bcast_data); + ? reg_bcast_data + : (last_block ? aux1_reg_bcast_data + : aux_reg_bcast_data); int offt = i_pf; { offt += wraparound && last_block - ? 0 - : (last_block ? jcp.ur : jcp.bcast_dim); + ? 0 + : (last_block ? jcp.ur : jcp.bcast_dim); offt *= jcp.reduce_block; } mic_prefetcht0(ptr[pf_reg + offt * jcp.typesize_in]); @@ -273,24 +273,24 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, int i_pf = i_op / (load_loop_blk * other_pf_trigger); if (i_pf < n_pf_ker_l2) { int offt = (i_pf + (i_load + 1) * jcp.reduce_dim) - * jcp.load_block; + * jcp.load_block; mic_prefetcht1(ptr[aux_reg_load_data - + offt * jcp.typesize_in]); + + offt * jcp.typesize_in]); } else if (i_pf < n_pf_ker_l2 + n_pf_ker_l1) { i_pf -= n_pf_ker_l2; auto pf_reg = last_block ? reg_load_data - : aux_reg_load_data; + : aux_reg_load_data; int offt = (i_pf + i_load * jcp.reduce_dim - + (last_block - ? (wraparound ? jcp.reduce_dim : 0) - : jcp.reduce_block)) - * jcp.load_block; + + (last_block + ? (wraparound ? 
jcp.reduce_dim : 0) + : jcp.reduce_block)) + * jcp.load_block; mic_prefetcht0(ptr[pf_reg + offt * jcp.typesize_in]); } else if (i_pf < n_pf_ker_l1 + n_pf_ker_l2 + n_pf_out_l1) { i_pf -= n_pf_ker_l1 + n_pf_ker_l2; int offt = i_pf * jcp.load_block; mic_prefetcht0(ptr[aux_reg_output_data - + offt * jcp.typesize_out]); + + offt * jcp.typesize_out]); } } } @@ -304,7 +304,7 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, reduce_step *= 2; for (int i_reduce = 0; i_reduce < jcp.reduce_loop_unroll; - i_reduce += reduce_step) { + i_reduce += reduce_step) { int load_scale = (jcp.ver == ver_4vnni) ? 2 : 1; for (int i_load = 0; i_load < load_loop_blk; ++i_load) { // if transposed input data used and if spatial size is @@ -322,27 +322,27 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, for (int i_fma = 0; i_fma < jcp.fma_step; i_fma++) { if (i_fma < n_loads) vmovups(vreg_load(i_load, i_fma), - load_ptr(i_reduce + load_scale * i_fma, - i_load)); + load_ptr(i_reduce + load_scale * i_fma, + i_load)); else vpxord(vreg_load(i_load, i_fma), - vreg_load(i_load, i_fma), - vreg_load(i_load, i_fma)); + vreg_load(i_load, i_fma), + vreg_load(i_load, i_fma)); } jmp(load_finish); L(load_all); for (int i_fma = 0; i_fma < jcp.fma_step; i_fma++) { vmovups(vreg_load(i_load, i_fma), - load_ptr(i_reduce + load_scale * i_fma, i_load)); + load_ptr(i_reduce + load_scale * i_fma, i_load)); } L(load_finish); } else { for (int i_fma = 0; i_fma < jcp.fma_step; i_fma++) { vmovups(vreg_load(i_load, i_fma), - load_ptr(i_reduce - + load_scale * i_fma, - i_load)); + load_ptr(i_reduce + + load_scale * i_fma, + i_load)); } } } @@ -354,22 +354,22 @@ void jit_avx512_common_1x1_conv_kernel::reduce_loop(int load_loop_blk, for (int i_load = 0; i_load < load_loop_blk; ++i_load) { if (jcp.ver == ver_4fma) v4fmaddps(vreg_accum(i_load, i_ur), - vreg_load(i_load, 0), - bcast_ptr(i_reduce, i_ur, false)); + vreg_load(i_load, 0), + bcast_ptr(i_reduce, i_ur, false)); else if 
(jcp.ver == ver_4vnni) vp4dpwssd(vreg_accum(i_load, i_ur), - vreg_load(i_load, 0), - bcast_ptr(i_reduce, i_ur, false)); + vreg_load(i_load, 0), + bcast_ptr(i_reduce, i_ur, false)); else if (jcp.ver == ver_avx512_core && jcp.expl_bcast - && load_loop_blk > 1) + && load_loop_blk > 1) vfmadd231ps(vreg_accum(i_load, i_ur), - vreg_load(i_load, 0), vreg_bcast); + vreg_load(i_load, 0), vreg_bcast); else vfmadd231ps(vreg_accum(i_load, i_ur), - vreg_load(i_load, 0), - bcast_ptr(i_reduce, i_ur, true)); + vreg_load(i_load, 0), + bcast_ptr(i_reduce, i_ur, true)); prefetch_callback(ur, i_reduce, i_ur, i_load, - last_block, wraparound, reduce_step); + last_block, wraparound, reduce_step); } } } @@ -409,8 +409,9 @@ void jit_avx512_common_1x1_conv_kernel::generate() { sub(rsp, stack_space_needed); - if (jcp.with_bias) + if (jcp.with_bias) { mov(reg_bias_data, ptr[param1 + GET_OFF(bias_data)]); + } mov(reg_load_loop_work, ptr[param1 + GET_OFF(load_dim)]); mov(reg_bcast_loop_work, ptr[param1 + GET_OFF(bcast_dim)]); @@ -429,7 +430,7 @@ void jit_avx512_common_1x1_conv_kernel::generate() { add(reg_output_data, load_loop_blk * jcp.bcast_dim * jcp.load_block * jcp.typesize_out); - + sub(reg_load_loop_work, load_loop_blk * jcp.load_loop_iter_step); }; @@ -442,20 +443,20 @@ void jit_avx512_common_1x1_conv_kernel::generate() { static const int ur_cases_4fma[] = { 2, 4, 6, 12, 32 }; const int size_ur_cases_fma - = (jcp.ver == ver_avx512_core && jcp.expl_bcast) ? - sizeof(ur_cases_fma_expl_bcast) : - sizeof(ur_cases_fma_embd_bcast); + = (jcp.ver == ver_avx512_core && jcp.expl_bcast) ? + sizeof(ur_cases_fma_expl_bcast) : + sizeof(ur_cases_fma_embd_bcast); const int size_ur_cases_4fma = sizeof(ur_cases_4fma); const int *ur_cases_fma = (jcp.ver == ver_avx512_core && jcp.expl_bcast) ? - ur_cases_fma_expl_bcast : - ur_cases_fma_embd_bcast; + ur_cases_fma_expl_bcast : + ur_cases_fma_embd_bcast; const int *ur_cases = (jcp.ver == ver_4fma || jcp.ver == ver_4vnni) - ? ur_cases_4fma : ur_cases_fma; + ? 
ur_cases_4fma : ur_cases_fma; const int num_ur_cases = (jcp.ver == ver_4fma || jcp.ver == ver_4vnni ? - size_ur_cases_4fma : - size_ur_cases_fma) - / sizeof(*ur_cases); + size_ur_cases_4fma : + size_ur_cases_fma) + / sizeof(*ur_cases); for (int ur_idx = num_ur_cases - 1; ur_idx > 0; ur_idx--) { int label_idx = num_ur_cases - ur_idx - 1; @@ -501,92 +502,39 @@ void jit_avx512_common_1x1_conv_kernel::generate() { SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jcp, - conv_1x1_desc &conv_d, - const Shape &weights_d, - int group, - int dilation_h, - int dilation_w, - bool with_relu, float relu_negative_slope, - int nthreads, bool with_bias, bool reduce_src) { - jcp.ngroups = group; // with_groups ? weights_d.dims()[0] : 1; - const bool with_groups = jcp.ngroups > 1; // weights_d.ndims() == src_d.ndims() + 1; - jcp.mb = conv_d.src_d[0]; - - jcp.oc = conv_d.dst_d[1] / jcp.ngroups; - jcp.ic = conv_d.src_d[1] / jcp.ngroups; - - jcp.ih = conv_d.src_d[2]; - jcp.iw = conv_d.src_d[3]; - jcp.oh = conv_d.dst_d[2]; - jcp.ow = conv_d.dst_d[3]; - - jcp.kh = weights_d[with_groups + 2]; - jcp.kw = weights_d[with_groups + 3]; - - jcp.t_pad = conv_d.padding[0][0]; - jcp.l_pad = conv_d.padding[0][1]; - - jcp.stride_h = conv_d.strides[0]; - jcp.stride_w = conv_d.strides[1]; - - //jcp.src_fmt = src_d.format(); - jcp.with_bias = with_bias; - jcp.with_relu = with_relu; - jcp.relu_negative_slope = relu_negative_slope; + conv_1x1_desc &conv_d, + int nthreads, + bool reduce_src) { + if (!mayiuse(avx512_common)) { + LOG(ERROR) <<"init a AVX512 kernel in non-avx512 machine is not permitted"; + return SaberUnImplError; + } + + // const bool with_groups = jcp.ngroups > 1; + bool args_ok = true; + + const int simd_w = 16; + args_ok = true && + jcp.oc % simd_w == 0 && jcp.ic % simd_w == 0 && + jcp.t_pad == 0 && jcp.l_pad == 0 && + jcp.stride_w == 1 && jcp.stride_h == 1 && + jcp.kh == 1 && jcp.kw == 1; + if (!args_ok) { + LOG(ERROR) << "ic:" << jcp.ic << ", oc:" << jcp.oc 
<< ", stride_h:" << jcp.stride_h << ", stride_w:" << jcp.stride_w << ", kh:" << jcp.kh << ", kw:" << jcp.kw << ", pad:" << jcp.t_pad; + return SaberUnImplError; + } jcp.os = jcp.oh * jcp.ow; jcp.is = jcp.ih * jcp.iw; jcp.tr_is = rnd_up(jcp.is, 4); - //const auto &p = attr.post_ops_; - //jcp.with_sum = p.find(primitive_kind::sum) != -1; - - //if (!jcp.with_relu) { - // jcp.with_relu = p.find(primitive_kind::eltwise) != -1; - // jcp.relu_negative_slope = 0; - //} - - //bool args_ok = true - // && jcp.ngroups == 1 - // && src_d.format() == nChw16c - // && one_of(cd.bias_desc.format, memory_format::undef, any, x) - // && dst_d.format() == nChw16c; - //if (!args_ok) return status::unimplemented; - - const int simd_w = 16; - bool args_ok = true; - args_ok = true - && jcp.oc % simd_w == 0 && jcp.ic % simd_w == 0 - && jcp.t_pad == 0 && jcp.l_pad == 0 - && jcp.stride_w == 1 && jcp.stride_h == 1 // TODO: support some strides - && jcp.kh == 1 && jcp.kw == 1; - if (!args_ok) return SaberUnImplError; - jcp.ic_block = jcp.oc_block = simd_w; jcp.transpose_src = false; - - //if (everyone_is(data_type::f32, src_d.data_type(), - // weights_d.data_type(), dst_d.data_type())) - { - //constexpr memory_format_t weights_formats[2] = { OIhw16i16o , gOIhw16i16o }; - //memory_format_t weights_format = weights_formats[with_groups]; - - //if (weights_d.format() != weights_format) - // return status::unimplemented; - if ( mayiuse(avx512_mic_4ops) && - (jcp.ic_block) % 4 == 0) { - jcp.ver = ver_4fma; - jcp.fma_step = 4; - } - else { - jcp.ver = (mayiuse(avx512_core)) ? ver_avx512_core : ver_fma; - jcp.fma_step = 1; - } - jcp.typesize_in = sizeof(float); - jcp.typesize_out = sizeof(float); - } - + jcp.ver = (mayiuse(avx512_core)) ? 
ver_avx512_core : ver_fma; + jcp.fma_step = 1; + jcp.typesize_in = sizeof(float); + jcp.typesize_out = sizeof(float); const int SMALL_SPATIAL = 10; const int BIG_SPATIAL = 28; @@ -607,26 +555,19 @@ SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc const int L2_capacity = (L2_size * 3) / 4; { - { - jcp.reduce_dim = jcp.ic; - jcp.reduce_block = jcp.ic_block; + jcp.reduce_dim = jcp.ic; + jcp.reduce_block = jcp.ic_block; - jcp.load_dim = jcp.oc; - jcp.load_block = jcp.oc_block; + jcp.load_dim = jcp.oc; + jcp.load_block = jcp.oc_block; - jcp.bcast_dim = jcp.is; - } + jcp.bcast_dim = jcp.is; jcp.reduce_loop_unroll = jcp.reduce_block; - jcp.reduce_loop_bcast_step - = jcp.reduce_loop_unroll * jcp.bcast_dim * jcp.typesize_in; + jcp.reduce_loop_bcast_step = jcp.reduce_loop_unroll * jcp.bcast_dim * jcp.typesize_in; - { - jcp.reduce_loop_load_step - = jcp.reduce_loop_unroll * jcp.load_block * jcp.typesize_in; - jcp.load_loop_load_step - = jcp.reduce_dim * jcp.load_block * jcp.typesize_in; - } + jcp.reduce_loop_load_step = jcp.reduce_loop_unroll * jcp.load_block * jcp.typesize_in; + jcp.load_loop_load_step = jcp.reduce_dim * jcp.load_block * jcp.typesize_in; // adjusting registry blocking int max_regs, min_regs, size_treshold, ur_step; @@ -638,8 +579,8 @@ SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc ur_step = 1; jcp.expl_bcast = true; - if (jcp.load_dim > 128 && jcp.load_dim < BIG_LOAD_DIM - && spatial > SMALL_SPATIAL && spatial < BIG_SPATIAL) { + if (jcp.load_dim > 128 && jcp.load_dim < BIG_LOAD_DIM && + spatial > SMALL_SPATIAL && spatial < BIG_SPATIAL) { max_regs = 6; min_regs = 5; } @@ -652,6 +593,7 @@ SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc jcp.expl_bcast = false; jcp.use_vmovntps = true; } + jcp.ur = 1; for (int ur_w = max_regs; ur_w >= min_regs; ur_w -= ur_step) { if ((spatial >= size_treshold && spatial % ur_w == 0) @@ -678,7 +620,7 @@ SaberStatus 
jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc jcp.reduce_loop_unroll = jcp.reduce_block; jcp.reduce_loop_bcast_step - = jcp.reduce_loop_unroll * jcp.bcast_dim * jcp.typesize_in; + = jcp.reduce_loop_unroll * jcp.bcast_dim * jcp.typesize_in; jcp.bcast_block = jcp.ur; @@ -714,7 +656,7 @@ SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc if (spatial <= SMALL_SPATIAL && jcp.reduce_dim >= BIG_REDUCE_DIM) reduce_blocking = 16; else if (spatial > SMALL_SPATIAL - && jcp.reduce_dim >= BIG_REDUCE_DIM) + && jcp.reduce_dim >= BIG_REDUCE_DIM) reduce_blocking = 8; reduce_blocking = best_divider(nb_reduce, 1, reduce_blocking, true); reduce_blocking *= jcp.reduce_block; @@ -728,43 +670,41 @@ SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc jcp.load_grp_count = div_up(nthreads, jcp.mb * jcp.ngroups * nb_bcast); jcp.load_grp_count = best_divider( - nthreads, jcp.load_grp_count, 2 * jcp.load_grp_count, false); + nthreads, jcp.load_grp_count, 2 * jcp.load_grp_count, false); if (jcp.ver == ver_avx512_core && jcp.expl_bcast && jcp.bcast_dim <= 64 && jcp.load_dim * jcp.reduce_dim >= L2_size) { jcp.load_grp_count = max(jcp.load_grp_count, 4); } else if (jcp.bcast_dim <= 49 && jcp.mb <= nthreads - && jcp.load_dim > 512 && jcp.load_dim / jcp.reduce_dim >= 4) { + && jcp.load_dim > 512 && jcp.load_dim / jcp.reduce_dim >= 4) { jcp.load_grp_count = max(jcp.load_grp_count, 2); load_blocking = jcp.load_block; } bcast_blocking = div_up(jcp.mb * jcp.ngroups * nb_bcast, - div_up(nthreads, jcp.load_grp_count)) - * jcp.bcast_block; + div_up(nthreads, jcp.load_grp_count)) + * jcp.bcast_block; bcast_blocking = min(jcp.bcast_dim, bcast_blocking); bcast_blocking = rnd_up(bcast_blocking, jcp.bcast_block); int space_for_bcast - = (L2_capacity - /* kernel_size - */ - 2 * jcp.load_block * reduce_blocking - - jcp.ur * reduce_blocking - 3 * 1024); + = (L2_capacity - /* kernel_size - */ + 2 * jcp.load_block * reduce_blocking + - 
jcp.ur * reduce_blocking - 3 * 1024); if (jcp.reduce_dim * jcp.bcast_dim > L2_capacity) space_for_bcast /= 2; int bcast_in_cache - = max(jcp.bcast_block, space_for_bcast / reduce_blocking); + = max(jcp.bcast_block, space_for_bcast / reduce_blocking); bcast_blocking = min( - bcast_blocking, rnd_dn(bcast_in_cache, jcp.bcast_block)); + bcast_blocking, rnd_dn(bcast_in_cache, jcp.bcast_block)); load_blocking_max = load_blocking; bcast_blocking_max = bcast_blocking * 3 / 2; reduce_blocking_max = reduce_blocking; - } - assert(load_blocking); assert(load_blocking_max); assert(bcast_blocking); @@ -775,13 +715,6 @@ SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc assert(reduce_blocking % jcp.reduce_block == 0); assert(load_blocking_max % jcp.load_block == 0); assert(reduce_blocking_max % jcp.reduce_block == 0); - if (jcp.ver == ver_4fma || jcp.ver == ver_4vnni) { - if (jcp.ver == ver_4fma) - assert(jcp.reduce_loop_unroll % jcp.fma_step == 0); - if (jcp.ver == ver_4vnni) - assert(jcp.reduce_loop_unroll % (2 * jcp.fma_step) == 0); - assert(jcp.reduce_dim % jcp.reduce_loop_unroll == 0); - } assert(jcp.bcast_block % jcp.ur == 0); assert(jcp.reduce_dim % jcp.reduce_block == 0); @@ -803,11 +736,11 @@ SaberStatus jit_avx512_common_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jc } void jit_avx512_common_1x1_conv_kernel::balance(jit_1x1_conv_conf_t &jcp, - int nthreads) { + int nthreads) { if (nthreads < jcp.ngroups) { /* simplification... 
fortunately it doesn't hurt much */ jcp.nthr = jcp.nthr_mb = jcp.nthr_g = - jcp.nthr_oc_b = jcp.nthr_ic_b = 1; + jcp.nthr_oc_b = jcp.nthr_ic_b = 1; return; } const int nb_bcast = utils::div_up(jcp.bcast_dim, jcp.bcast_block); @@ -835,17 +768,17 @@ void jit_avx512_common_1x1_conv_kernel::balance(jit_1x1_conv_conf_t &jcp, output_koeff = 8; } return 0 - + bcast_koeff * div_up(jcp.mb * nb_reduce, nthr_mb) - * div_up(jcp.ngroups, jcp.nthr_g) - * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block * jcp.reduce_block - / jcp.stride_h / jcp.stride_w /* (n1) */ - + load_koeff * div_up(jcp.mb * nb_reduce, nthr_mb) - * div_up(jcp.ngroups, jcp.nthr_g) - * div_up(nb_load, nthr_oc_b) * jcp.oc_block * jcp.reduce_block - + output_koeff /* (n2) */ - * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_load, nthr_oc_b) - * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block - * jcp.oc_block; + + bcast_koeff * div_up(jcp.mb * nb_reduce, nthr_mb) + * div_up(jcp.ngroups, jcp.nthr_g) + * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block * jcp.reduce_block + / jcp.stride_h / jcp.stride_w /* (n1) */ + + load_koeff * div_up(jcp.mb * nb_reduce, nthr_mb) + * div_up(jcp.ngroups, jcp.nthr_g) + * div_up(nb_load, nthr_oc_b) * jcp.oc_block * jcp.reduce_block + + output_koeff /* (n2) */ + * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_load, nthr_oc_b) + * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block + * jcp.oc_block; }; int nthr_mb = 1, nthr_oc_b = 1, nthr_ic_b = 1; diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_act_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.h similarity index 82% rename from saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_act_kernel.h rename to saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.h index 380342313..71e15ea36 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_act_kernel.h +++ b/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.h @@ -12,23 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_JIT_AVX512_CONV1X1_ACT_KERNEL_H -#define ANAKIN_SABER_FUNCS_JIT_AVX512_CONV1X1_ACT_KERNEL_H +#ifndef ANAKIN_SABER_FUNCS_JIT_AVX512_CONV1X1_KERNEL_H +#define ANAKIN_SABER_FUNCS_JIT_AVX512_CONV1X1_KERNEL_H #include "saber/funcs/impl/impl_base.h" #include "saber/core/tensor.h" #include "saber/saber_types.h" #include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h" -#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#include "jit_uni_1x1_conv_utils.h" +#include "jit_generator.h" namespace anakin { namespace saber { namespace jit { struct jit_avx512_common_1x1_conv_kernel : public jit_generator { - jit_avx512_common_1x1_conv_kernel(jit_1x1_conv_conf_t ajcp) : jcp(ajcp) - { + jit_avx512_common_1x1_conv_kernel(jit_1x1_conv_conf_t ajcp) : jcp(ajcp) { this->generate(); jit_ker = (void (*)(jit_1x1_conv_call_t *)) this->getCode(); } @@ -36,17 +35,12 @@ struct jit_avx512_common_1x1_conv_kernel : public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_common_1x1_conv_kernel) static SaberStatus init_conf(jit_1x1_conv_conf_t &jcp, conv_1x1_desc &conv_d, - const Shape &weights_d, - int group, - int dilation_h, - int dilation_w, - bool with_relu, float relu_negative_slope, - int nthreads, bool with_bias = false, bool reduce_src = false); + int nthreads, bool reduce_src = false); jit_1x1_conv_conf_t jcp; void (*jit_ker)(jit_1x1_conv_call_t *); - private: +private: using reg64_t = const Xbyak::Reg64; using zmm_t = const Xbyak::Zmm; using mask_t = const Xbyak::Opmask; @@ -88,4 +82,5 @@ struct jit_avx512_common_1x1_conv_kernel : public jit_generator { } // namespace jit } // namespace saber } // namespace anakin -#endif // ANAKIN_SABER_FUNCS_JIT_AVX512_CONV1X1_ACT_KERNEL_H + +#endif // ANAKIN_SABER_FUNCS_JIT_AVX512_CONV1X1_KERNEL_H diff --git 
a/saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.cpp deleted file mode 100644 index b6610b6c7..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.cpp +++ /dev/null @@ -1,583 +0,0 @@ -#include - -#include "jit_avx512_conv_act_kernel.h" - -#define GET_OFF(field) offsetof(jit_conv_call_t, field) -#define KNx_L2_EFFECTIVE_CAPACITY ((512 - 64) * 1024) - -namespace anakin { -namespace saber { -namespace jit { - -using namespace Xbyak; - -static unsigned int L1_cache_size = get_cache_size(1, true); - -static inline void pick_loop_order(jit_conv_conf_t &jcp) { - // auto w = jcp.ow; - // auto h = jcp.oh; - switch (jcp.ver) { - case ver_fma: - jcp.loop_order = loop_cgn; - break; - default: - assert(!"unsupported convolution version"); - } -} - - -void jit_conv_act_kernel::prepare_output(int ur_w) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - vpxord(zmm, zmm, zmm); - int aux_output_offset = get_output_offset(j, k); - mic_prefetcht1(EVEX_compress_addr(reg_out_prf, aux_output_offset)); - } -} - - -void jit_conv_act_kernel::store_output(int ur_w) { - - Label no_update_label, store_label, relu_label; - - mov(reg_channel, ptr[param1 + GET_OFF(channel)]); - if (jcp.with_bias) { - mov(reg_bias, ptr[param1 + GET_OFF(bias)]); - } - - if (!jcp.with_sum) { - cmp(reg_channel, 0); - je(no_update_label, T_NEAR); - } - - for (int k = 0; k < jcp.nb_oc_blocking; k++) { - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - int aux_output_offset = get_output_offset(j, k); - vadd(zmm, reg_out, aux_output_offset); - } - } - - if (!jcp.with_sum) { - jmp(relu_label, T_NEAR); - } else { - cmp(reg_channel, 0); - jne(relu_label, T_NEAR); - } - - - L(no_update_label); - if (jcp.with_bias) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) { - int bias_offset = jcp.typesize_out * k * jcp.oc_block; - for (int j = 0; j < ur_w; j++) { - Zmm zmm 
= zmm_out(j, k); - vadd(zmm, reg_bias, bias_offset); - } - mic_prefetcht1(EVEX_compress_addr(reg_bias, bias_offset + 64)); - } - } - - L(relu_label); - if (jcp.with_relu) { - vpxord(zmm_zero, zmm_zero, zmm_zero); - if (jcp.relu_negative_slope == 0 || jcp.ver == ver_4vnni) { - zmm_relu_ns = zmm_zero; - } else { - mov(imm_addr64, float2int(jcp.relu_negative_slope)); - vmovq(xmm_relu_ns, imm_addr64); - vbroadcastss(zmm_relu_ns, xmm_relu_ns); - } - cmp(reg_channel, jcp.nb_ic - 1); - jl(store_label, T_NEAR); - for (int k = 0; k < jcp.nb_oc_blocking; k++) - for (int j = 0; j < ur_w; j++){ - Opmask kmask = Opmask(7); - Zmm zmm = zmm_out(j, k); - vcmp(kmask, zmm, zmm_zero, _cmp_lt_os); - vmul(zmm, kmask, zmm, zmm_relu_ns); - } - } - - L(store_label); - for (int k = 0; k < jcp.nb_oc_blocking; k++) { - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - int aux_output_offset - = typesize * (k * jcp.oh * jcp.ow + j) * jcp.oc_block; - vmovups(EVEX_compress_addr(reg_out, aux_output_offset), zmm); - mic_prefetcht0(EVEX_compress_addr(reg_out_prf, aux_output_offset)); - } - } -} - - -void jit_conv_act_kernel::compute_loop_fma_core(int ur_w, - int pad_l, int pad_r) { - int kw = jcp.kw; - int stride_w = jcp.stride_w; - int ic_block = jcp.ic_block; - int oc_block = jcp.oc_block; - int nb_oc_block = jcp.nb_oc_blocking; - Label kh_label, skip_kh_loop; - int shift_kernel_ptr = jcp.typesize_in * jcp.kw * jcp.oc_block - * jcp.ic_block; - int shift_input_ptr = jcp.typesize_in * jcp.iw - * (!jcp.is_1stconv ? 
ic_block : 1); - auto input_offset = [=](int oi, int ic, int ki) { - return jcp.typesize_in * ((ki + oi * stride_w - pad_l) * ic_block + ic); - }; - mov(aux_reg_inp, reg_inp); - mov(aux_reg_ker, reg_ker); - - prepare_output(ur_w); - - mov(reg_kj, reg_kh); - if (jcp.kh <= jcp.t_pad) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } - - L(kh_label); - { - for (int ki = 0; ki < kw; ki++) { - int jj_start = get_ow_start(ki, pad_l); - int jj_end = get_ow_end(ur_w, ki, pad_r); - for (int ic = 0; ic < ic_block; ic++) { - if (jcp.kernel_kind == expl_bcast) { - for (int jj = jj_start; jj < jj_end; jj++) { - int aux_input_offset = input_offset(jj, ic, ki); - vbroadcastss(zmm_inp(jj, nb_oc_block), - ptr[aux_reg_inp + aux_input_offset]); - } - } - - for (int ii = 0; ii < nb_oc_block; ii++) { - int aux_kernel_offset = jcp.typesize_in - * (ii * jcp.nb_ic * jcp.kh * jcp.kw * ic_block - * oc_block + ki * ic_block * oc_block + ic * oc_block); - if (jj_end - jj_start > 0) - vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker, - aux_kernel_offset)); - for (int jj = jj_start; jj < jj_end; jj++) - if (jcp.kernel_kind == expl_bcast) - vfmadd231ps(zmm_out(jj, ii), - zmm_inp(jj, nb_oc_block), zmm_wei); - else - vfmadd231ps(zmm_out(jj, ii), zmm_wei, - EVEX_compress_addr(aux_reg_inp, - input_offset(jj, ic, ki), true)); - } - } - } - add(aux_reg_ker, shift_kernel_ptr); - add(aux_reg_inp, shift_input_ptr); - dec(reg_kj); - cmp(reg_kj, 0); - jg(kh_label, T_NEAR); - } - - L(skip_kh_loop); - store_output(ur_w); -} - - -void jit_conv_act_kernel::compute_loop_fma(int ur_w, int pad_l, int pad_r) { - bool prf_ker = true; - bool prf_inp = true; - int iw = jcp.iw; - int ih = jcp.ih; - int kw = jcp.kw; - int stride_w = jcp.stride_w; - int ic_block = jcp.ic_block; - int oc_block = jcp.oc_block; - int nb_oc_block = jcp.nb_oc_blocking; - Label kh_label; - - int ker_pipeline_depth = 4; - assert(ker_reg_base_idx + ker_pipeline_depth <= 32); - assert(oc_block >= ker_pipeline_depth); - - int num_ker_loads = 
ic_block * nb_oc_block * kw; - const int simd_w = 16; - int num_ker_prfs = prf_ker ? num_ker_loads : 0; - int num_inp_prfs = prf_inp ? - ur_w * utils::min(kw, stride_w) + utils::max(0, kw - stride_w) : 0; - if (jcp.is_1stconv && prf_inp) { - num_inp_prfs = utils::div_up(num_inp_prfs, simd_w) * ic_block; - } - int num_prfs = num_ker_prfs + num_inp_prfs; - int num_fmas = num_ker_loads * ur_w; - int prf_inst_spacing - = (prf_ker || prf_inp) ? utils::max(1, num_fmas / num_prfs) : 1; - int prf_inst_trigger = (num_fmas % prf_inst_spacing) / 2; - - mov(aux_reg_inp, reg_inp); - mov(aux_reg_ker, reg_ker); - - prepare_output(ur_w); - - mov(aux_reg_inp_prf, reg_inp_prf); - mov(aux_reg_ker_prf, reg_ker_prf); - mov(reg_kj, reg_kh); - Label skip_kh_loop; - if (jcp.kh <= jcp.t_pad) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } - align(16); - L(kh_label); - { - int step = 0; - int ker_prfs = 0; - for (int ki = 0; ki < kw; ki++) { - for (int ic = 0; ic < ic_block; ic++) { - int aux_kernel_offset = 0; - if (step == 0) { - for (int i = 0; i < ker_pipeline_depth; i++) { - aux_kernel_offset = get_kernel_offset(ki, ic, 0, i); - vmovups(zmm_ker(i), EVEX_compress_addr( - aux_reg_ker, aux_kernel_offset)); - } - } else if (step < num_ker_loads - ker_pipeline_depth + 1) { - int load_offset = ker_pipeline_depth - 1; - int ker_load_reg_idx - = (step + load_offset) % ker_pipeline_depth; - aux_kernel_offset = get_kernel_offset(ki,ic,0,load_offset); - vmovups(zmm_ker(ker_load_reg_idx), - EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); - } - - bool ker_prf_inserted = false; - Zmm zmm_kernel = zmm_ker(step % ker_pipeline_depth); - int j_start = get_ow_start(ki, pad_l); - int j_end = get_ow_end(ur_w, ki, pad_r); - for (int j = j_start; j < j_end; j++) { - int aux_input_offset = get_input_offset(ki, ic, j, pad_l); - vfmadd231ps(zmm_out(j, 0), zmm_kernel, - EVEX_compress_addr(aux_reg_inp, aux_input_offset, true)); - - int fma_idx = step * ur_w + j; - int prf_slot_idx = fma_idx / 
prf_inst_spacing; - if (fma_idx % prf_inst_spacing == prf_inst_trigger) { - if (prf_ker && !ker_prf_inserted - && ker_prfs < num_ker_prfs) { - int ker_prf_offset - = jcp.typesize_in * ker_prfs * jcp.oc_block; - mic_prefetcht2(EVEX_compress_addr( - aux_reg_ker_prf, ker_prf_offset)); - ker_prf_inserted = true; - ker_prfs++; - } else if (prf_inp) { - int inp_prf_idx = prf_slot_idx - ker_prfs; - if (inp_prf_idx < num_inp_prfs) { - int inp_prf_stride = utils::max(kw, stride_w); - int inp_prf_offset; - if (!jcp.is_1stconv) { - inp_prf_offset - = ic_block * jcp.typesize_in - * ((inp_prf_idx / kw) - * inp_prf_stride - + (inp_prf_idx % kw)); - } else { - int ic_prf_stride = jcp.typesize_in*iw*ih; - int iw_prf_stride = jcp.typesize_in*simd_w; - inp_prf_offset = ((inp_prf_idx / ic_block) - * iw_prf_stride - + (inp_prf_idx % ic_block) - * ic_prf_stride); - } - - mic_prefetcht0(EVEX_compress_addr( - aux_reg_inp_prf, inp_prf_offset)); - } - } - } - } - - step++; - } - } - add(aux_reg_ker, jcp.typesize_in * kw * oc_block * ic_block); - if (prf_ker) { - add(aux_reg_ker_prf, jcp.typesize_in * kw * oc_block * ic_block); - } - int inp_mul = !jcp.is_1stconv ? 
ic_block : 1; - add(aux_reg_inp, jcp.typesize_in * iw * inp_mul); - if (prf_inp) { - add(aux_reg_inp_prf, jcp.typesize_in * iw * inp_mul); - } - - dec(reg_kj); - cmp(reg_kj, 0); - jg(kh_label, T_NEAR); - } - - L(skip_kh_loop); - store_output(ur_w); -} - - -void jit_conv_act_kernel::compute_loop(int ur_w, int pad_l, int pad_r) { - - if (jcp.ver == ver_fma){ - if (jcp.is_1stconv || mayiuse(avx512_mic)) { - compute_loop_fma(ur_w, pad_l, pad_r); - } - else if (jcp.kernel_kind == embd_bcast && jcp.nb_oc_blocking == 1) { - compute_loop_fma(ur_w, pad_l, pad_r); - } - else { - compute_loop_fma_core(ur_w, pad_l, pad_r); - } - } else { - assert(!"unknown convolution version"); - } - } - - -void jit_conv_act_kernel::generate() { - int iw = jcp.iw; - int ow = jcp.ow; - int kw = jcp.kw; - int l_pad = jcp.l_pad; - int ur_w = jcp.ur_w; - int ur_w_tail = jcp.ur_w_tail; - int stride_w = jcp.stride_w; - int ic_block = jcp.ic_block; - int oc_block = jcp.oc_block; - - int inp_mult = !jcp.is_1stconv ? ic_block : 1; - int inp_shift_pad = jcp.typesize_in * (ur_w * stride_w - l_pad) * inp_mult; - int inp_shift = jcp.typesize_in * (ur_w * stride_w * inp_mult); - int out_shift = jcp.typesize_out * (ur_w * oc_block); - preamble(); - - mov(reg_inp, ptr[param1 + GET_OFF(src)]); - mov(reg_out, ptr[param1 + GET_OFF(dst)]); - mov(reg_ker, ptr[param1 + GET_OFF(filt)]); - mov(reg_ker_prf, ptr[param1 + GET_OFF(filt_prf)]); - mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]); - - int r_pad = utils::max(0, (ow - 1) * stride_w + (kw - 1) - (iw + l_pad - 1)); - - int n_oi = ow / ur_w; - int r_pad1 = (ur_w * n_oi - 1) * stride_w + kw - 1 - (iw + l_pad - 1); - if (r_pad1 > 0) n_oi--; - - - if (ow == ur_w) { - mov(reg_inp_prf, ptr[param1 + GET_OFF(src_prf)]); - mov(reg_out_prf, ptr[param1 + GET_OFF(dst_prf)]); - compute_loop(ur_w, l_pad, r_pad); - } else { - //TODO: potentially suboptimal - mov(reg_inp_prf, reg_inp); - mov(reg_out_prf, reg_out); - - if (n_oi == 0) { - add(reg_inp_prf, inp_shift_pad); - 
add(reg_out_prf, out_shift); - compute_loop(ur_w, l_pad, r_pad1); - add(reg_inp, inp_shift_pad); - add(reg_out, out_shift); - if (ur_w_tail != 0) { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w_tail, 0, r_pad); - } - } else { - xor_(reg_oi, reg_oi); - if (l_pad > 0) { - add(reg_inp_prf, inp_shift_pad); - add(reg_out_prf, out_shift); - compute_loop(ur_w, l_pad, 0); - add(reg_inp, inp_shift_pad); - add(reg_out, out_shift); - inc(reg_oi); - } - if ((l_pad <= 0 && n_oi > 0) || (l_pad > 0 && n_oi > 1)) { - if (l_pad <= 0 && r_pad1 > 0) - n_oi--; - Label ow_loop_label; - L(ow_loop_label); - { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w, 0, 0); - add(reg_inp, inp_shift); - add(reg_out, out_shift); - inc(reg_oi); - cmp(reg_oi, n_oi); - jl(ow_loop_label, T_NEAR); - } - } - if (r_pad1 > 0) { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w, 0, r_pad1); - add(reg_inp, inp_shift); - add(reg_out, out_shift); - } - if (ur_w_tail != 0) { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w_tail, 0, r_pad); - } - } - } - postamble(); -} - - -SaberStatus jit_conv_act_kernel::init_conf(jit_conv_conf_t &jcp) { - if (!mayiuse(avx512_common)) - return SaberUnImplError; - - const int simd_w = cpu_isa_traits::vlen / sizeof(float); - const int regs = 28; - - //jcp = zero(); - jcp.ur_h = 1; - jcp.oc_block = simd_w; - jcp.ic_block = (jcp.ic % simd_w != 0) ? 
jcp.ic : simd_w; - - // check dilation - if (jcp.dilate_h != 0 || jcp.dilate_w != 0) - return SaberUnImplError; - - // check ic%16==0 - //jcp.is_1stconv = is_1stconv(jcp); - if (jcp.ic % simd_w != 0 && !jcp.is_1stconv) - return SaberUnImplError; - - if (mayiuse(avx512_common)) { - jcp.ver = ver_fma; - jcp.typesize_in = sizeof(float); - jcp.typesize_out = sizeof(float); - - if (jcp.is_1stconv) { - // TODO: fix & remove constraints below - if (jcp.l_pad != 0 || jcp.r_pad != 0 - || jcp.b_pad != 0 || jcp.t_pad != 0 - || (jcp.kw < 7 && jcp.kh < 7)) - jcp.ver = ver_fma; - } - } - - // set jcp.ur_w - if (jcp.is_1stconv) { - jcp.ur_w = utils::min(jcp.ow, regs); - } else { - for (int ur_w = regs; ur_w > 0; --ur_w) { - if (jcp.ow % ur_w == 0) { - jcp.ur_w = ur_w; - break; - } - } - if (jcp.ur_w == 1) { - jcp.ur_w = utils::min(jcp.ow, regs); - } - } - - // TODO (Tanya): currenly applied to Segnet convolutions only. - // Need to try for other topologies - if (jcp.ow > 150 && jcp.ur_w < regs/2) - jcp.ur_w = regs; - - int n_oi = (jcp.ow / jcp.ur_w); - int r_pad = (jcp.ur_w * n_oi - 1) * jcp.stride_w + jcp.kw - jcp.iw - jcp.l_pad; - if (jcp.l_pad > 0 && r_pad > 0) - n_oi--; - - bool large_code_size = jcp.ur_w != jcp.ow && jcp.l_pad > 0 && r_pad > 0 - && ((jcp.l_pad <= 0 && n_oi > 0) || (jcp.l_pad > 0 && n_oi > 1)); - if (large_code_size) { - const int max_code_size = 24 * 1024; - const int num_ops_per_reg = 6 + jcp.ic_block * jcp.kw; - int mult = 1; - if (jcp.l_pad > 0) mult += 1; - if (r_pad > 0) mult += 1; - for (int ur_w = jcp.ur_w; ur_w > regs/2; --ur_w) { - if (ur_w * mult * num_ops_per_reg * 9.0 < max_code_size) { - jcp.ur_w = ur_w; - break; - } - } - } - - jcp.nb_ic = jcp.ic / jcp.ic_block; - jcp.nb_oc = jcp.oc / jcp.oc_block; - jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; - if (jcp.ver == ver_fma && mayiuse(avx512_core)) { - int try_nb_oc_blocking = 2; - unsigned int ker_inp_size = typesize * (jcp.iw / jcp.stride_w) - * jcp.ic_block * jcp.kh; - unsigned int ker_out_size = 
typesize * jcp.ow * jcp.oc_block - * try_nb_oc_blocking; - unsigned int ker_wei_size = typesize * jcp.kh * jcp.kw * jcp.ic_block - * jcp.oc_block * try_nb_oc_blocking; - unsigned int ker_total_size = ker_inp_size + ker_out_size - + ker_wei_size; - - if (jcp.mb == 1) { - jcp.kernel_kind = embd_bcast; - } else if (jcp.is_1stconv || jcp.kw > 3 - || ((jcp.kw == 3 && jcp.ow <= 28 && ker_total_size < L1_cache_size) - && !(jcp.kw == 3 && jcp.ow == 13 && jcp.ic >= 192) - && !(jcp.kw == 3 && jcp.ow == 28 && jcp.ic >= 512)) - ) { - jcp.kernel_kind = embd_bcast; - jcp.ur_w = utils::min(jcp.ow, regs); - jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; - if (ker_total_size < L1_cache_size && jcp.ow <= 8 && jcp.kh <= 3 - && jcp.kw <= 3) { - if (jcp.nb_oc % try_nb_oc_blocking == 0 && !jcp.is_1stconv) { - jcp.nb_oc_blocking = try_nb_oc_blocking; - jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); - if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; - } - } - } else { - jcp.kernel_kind = expl_bcast; - jcp.nb_ic_blocking = 1; - jcp.nb_oc_blocking = 4; - if (jcp.nb_oc < jcp.nb_oc_blocking) jcp.nb_oc_blocking = jcp.nb_oc; - if (jcp.nb_oc % jcp.nb_oc_blocking != 0) - for (int i = jcp.nb_oc_blocking; i > 0; i--) - if (jcp.nb_oc % i == 0) { - jcp.nb_oc_blocking = i; - break; - } - jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); - if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; - } - } - - jcp.ur_w_tail = jcp.ow % jcp.ur_w; - - bool args_ok = true - && jcp.oc % simd_w == 0 - && jcp.l_pad <= jcp.ur_w - && utils::implication(!jcp.is_1stconv, jcp.ic % simd_w == 0); - if (!args_ok) - return SaberUnImplError; - - int r_pad_no_tail = utils::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w - + jcp.kw - jcp.iw - jcp.l_pad); - if (r_pad_no_tail > jcp.ur_w) - return SaberUnImplError; - - pick_loop_order(jcp); - jcp.nb_ic_L2 = jcp.nb_ic; - - return SaberSuccess; - -} - - -} // namespace jit -} // namespace saber -} // namespace anakin - diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.cpp 
b/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.cpp new file mode 100644 index 000000000..7bca384d5 --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.cpp @@ -0,0 +1,594 @@ +#include + +#include "jit_avx512_conv_kernel.h" + +#define GET_OFF(field) offsetof(jit_conv_call_t, field) +#define KNx_L2_EFFECTIVE_CAPACITY ((512 - 64) * 1024) + +namespace anakin { +namespace saber { +namespace jit { + +using namespace Xbyak; + +static unsigned int L1_cache_size = get_cache_size(1, true); + +static inline void pick_loop_order(jit_conv_conf_t &jcp) { + // auto w = jcp.ow; + // auto h = jcp.oh; + switch (jcp.ver) { + case ver_fma: + jcp.loop_order = loop_cgn; + break; + default: + assert(!"unsupported convolution version"); + } +} + + +void jit_conv_kernel::prepare_output(int ur_w) { + for (int k = 0; k < jcp.nb_oc_blocking; k++) + for (int j = 0; j < ur_w; j++) { + Zmm zmm = zmm_out(j, k); + vpxord(zmm, zmm, zmm); + int aux_output_offset = get_output_offset(j, k); + mic_prefetcht1(EVEX_compress_addr(reg_out_prf, aux_output_offset)); + } +} + + +void jit_conv_kernel::store_output(int ur_w) { + + Label no_update_label, store_label, relu_label; + + mov(reg_channel, ptr[param1 + GET_OFF(channel)]); + if (jcp.with_bias) { + mov(reg_bias, ptr[param1 + GET_OFF(bias)]); + } + + if (!jcp.with_sum) { + cmp(reg_channel, 0); + je(no_update_label, T_NEAR); + } + + for (int k = 0; k < jcp.nb_oc_blocking; k++) { + for (int j = 0; j < ur_w; j++) { + Zmm zmm = zmm_out(j, k); + int aux_output_offset = get_output_offset(j, k); + vadd(zmm, reg_out, aux_output_offset); + } + } + + if (!jcp.with_sum) { + jmp(relu_label, T_NEAR); + } else { + cmp(reg_channel, 0); + jne(relu_label, T_NEAR); + } + + + L(no_update_label); + if (jcp.with_bias) { + for (int k = 0; k < jcp.nb_oc_blocking; k++) { + int bias_offset = jcp.typesize_out * k * jcp.oc_block; + for (int j = 0; j < ur_w; j++) { + Zmm zmm = zmm_out(j, k); + vadd(zmm, reg_bias, bias_offset); + } + 
mic_prefetcht1(EVEX_compress_addr(reg_bias, bias_offset + 64)); + } + } + + L(relu_label); + if (jcp.with_relu) { + vpxord(zmm_zero, zmm_zero, zmm_zero); + if (jcp.relu_negative_slope == 0 || jcp.ver == ver_4vnni) { + zmm_relu_ns = zmm_zero; + } else { + mov(imm_addr64, float2int(jcp.relu_negative_slope)); + vmovq(xmm_relu_ns, imm_addr64); + vbroadcastss(zmm_relu_ns, xmm_relu_ns); + } + cmp(reg_channel, jcp.nb_ic - 1); + jl(store_label, T_NEAR); + for (int k = 0; k < jcp.nb_oc_blocking; k++) + for (int j = 0; j < ur_w; j++){ + Opmask kmask = Opmask(7); + Zmm zmm = zmm_out(j, k); + vcmp(kmask, zmm, zmm_zero, _cmp_lt_os); + vmul(zmm, kmask, zmm, zmm_relu_ns); + } + } + + L(store_label); + for (int k = 0; k < jcp.nb_oc_blocking; k++) { + for (int j = 0; j < ur_w; j++) { + Zmm zmm = zmm_out(j, k); + int aux_output_offset + = typesize * (k * jcp.oh * jcp.ow + j) * jcp.oc_block; + vmovups(EVEX_compress_addr(reg_out, aux_output_offset), zmm); + mic_prefetcht0(EVEX_compress_addr(reg_out_prf, aux_output_offset)); + } + } +} + + +void jit_conv_kernel::compute_loop_fma_core(int ur_w, + int pad_l, int pad_r) { + int kw = jcp.kw; + int stride_w = jcp.stride_w; + int ic_block = jcp.ic_block; + int oc_block = jcp.oc_block; + int nb_oc_block = jcp.nb_oc_blocking; + Label kh_label, skip_kh_loop; + int shift_kernel_ptr = jcp.typesize_in * jcp.kw * jcp.oc_block + * jcp.ic_block; + int shift_input_ptr = jcp.typesize_in * jcp.iw + * (!jcp.is_1stconv ? 
ic_block : 1); + auto input_offset = [=](int oi, int ic, int ki) { + return jcp.typesize_in * ((ki + oi * stride_w - pad_l) * ic_block + ic); + }; + mov(aux_reg_inp, reg_inp); + mov(aux_reg_ker, reg_ker); + + prepare_output(ur_w); + + mov(reg_kj, reg_kh); + if (jcp.kh <= jcp.t_pad) { + cmp(reg_kj, 0); + je(skip_kh_loop, T_NEAR); + } + + L(kh_label); + { + for (int ki = 0; ki < kw; ki++) { + int jj_start = get_ow_start(ki, pad_l); + int jj_end = get_ow_end(ur_w, ki, pad_r); + for (int ic = 0; ic < ic_block; ic++) { + if (jcp.kernel_kind == expl_bcast) { + for (int jj = jj_start; jj < jj_end; jj++) { + int aux_input_offset = input_offset(jj, ic, ki); + vbroadcastss(zmm_inp(jj, nb_oc_block), + ptr[aux_reg_inp + aux_input_offset]); + } + } + + for (int ii = 0; ii < nb_oc_block; ii++) { + int aux_kernel_offset = jcp.typesize_in + * (ii * jcp.nb_ic * jcp.kh * jcp.kw * ic_block + * oc_block + ki * ic_block * oc_block + ic * oc_block); + if (jj_end - jj_start > 0) { + vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker, + aux_kernel_offset)); + } + for (int jj = jj_start; jj < jj_end; jj++) { + if (jcp.kernel_kind == expl_bcast) { + vfmadd231ps(zmm_out(jj, ii), + zmm_inp(jj, nb_oc_block), zmm_wei); + } + else { + vfmadd231ps(zmm_out(jj, ii), zmm_wei, + EVEX_compress_addr(aux_reg_inp, + input_offset(jj, ic, ki), true)); + } + } + } + } + } + add(aux_reg_ker, shift_kernel_ptr); + add(aux_reg_inp, shift_input_ptr); + dec(reg_kj); + cmp(reg_kj, 0); + jg(kh_label, T_NEAR); + } + + L(skip_kh_loop); + store_output(ur_w); +} + + +void jit_conv_kernel::compute_loop_fma(int ur_w, int pad_l, int pad_r) { + bool prf_ker = true; + bool prf_inp = true; + int iw = jcp.iw; + int ih = jcp.ih; + int kw = jcp.kw; + int stride_w = jcp.stride_w; + int ic_block = jcp.ic_block; + int oc_block = jcp.oc_block; + int nb_oc_block = jcp.nb_oc_blocking; + Label kh_label; + + int ker_pipeline_depth = 4; + assert(ker_reg_base_idx + ker_pipeline_depth <= 32); + assert(oc_block >= ker_pipeline_depth); + + int 
num_ker_loads = ic_block * nb_oc_block * kw; + const int simd_w = 16; + int num_ker_prfs = prf_ker ? num_ker_loads : 0; + int num_inp_prfs = prf_inp ? + ur_w * utils::min(kw, stride_w) + utils::max(0, kw - stride_w) : 0; + if (jcp.is_1stconv && prf_inp) { + num_inp_prfs = utils::div_up(num_inp_prfs, simd_w) * ic_block; + } + int num_prfs = num_ker_prfs + num_inp_prfs; + int num_fmas = num_ker_loads * ur_w; + int prf_inst_spacing + = (prf_ker || prf_inp) ? utils::max(1, num_fmas / num_prfs) : 1; + int prf_inst_trigger = (num_fmas % prf_inst_spacing) / 2; + + mov(aux_reg_inp, reg_inp); + mov(aux_reg_ker, reg_ker); + + prepare_output(ur_w); + + mov(aux_reg_inp_prf, reg_inp_prf); + mov(aux_reg_ker_prf, reg_ker_prf); + mov(reg_kj, reg_kh); + Label skip_kh_loop; + if (jcp.kh <= jcp.t_pad) { + cmp(reg_kj, 0); + je(skip_kh_loop, T_NEAR); + } + align(16); + L(kh_label); + { + int step = 0; + int ker_prfs = 0; + for (int ki = 0; ki < kw; ki++) { + for (int ic = 0; ic < ic_block; ic++) { + int aux_kernel_offset = 0; + if (step == 0) { + for (int i = 0; i < ker_pipeline_depth; i++) { + aux_kernel_offset = get_kernel_offset(ki, ic, 0, i); + vmovups(zmm_ker(i), EVEX_compress_addr( + aux_reg_ker, aux_kernel_offset)); + } + } else if (step < num_ker_loads - ker_pipeline_depth + 1) { + int load_offset = ker_pipeline_depth - 1; + int ker_load_reg_idx + = (step + load_offset) % ker_pipeline_depth; + aux_kernel_offset = get_kernel_offset(ki,ic,0,load_offset); + vmovups(zmm_ker(ker_load_reg_idx), + EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); + } + + bool ker_prf_inserted = false; + Zmm zmm_kernel = zmm_ker(step % ker_pipeline_depth); + int j_start = get_ow_start(ki, pad_l); + int j_end = get_ow_end(ur_w, ki, pad_r); + for (int j = j_start; j < j_end; j++) { + int aux_input_offset = get_input_offset(ki, ic, j, pad_l); + vfmadd231ps(zmm_out(j, 0), zmm_kernel, + EVEX_compress_addr(aux_reg_inp, aux_input_offset, true)); + + int fma_idx = step * ur_w + j; + int prf_slot_idx = 
fma_idx / prf_inst_spacing; + if (fma_idx % prf_inst_spacing == prf_inst_trigger) { + if (prf_ker && !ker_prf_inserted + && ker_prfs < num_ker_prfs) { + int ker_prf_offset + = jcp.typesize_in * ker_prfs * jcp.oc_block; + mic_prefetcht2(EVEX_compress_addr( + aux_reg_ker_prf, ker_prf_offset)); + ker_prf_inserted = true; + ker_prfs++; + } else if (prf_inp) { + int inp_prf_idx = prf_slot_idx - ker_prfs; + if (inp_prf_idx < num_inp_prfs) { + int inp_prf_stride = utils::max(kw, stride_w); + int inp_prf_offset; + if (!jcp.is_1stconv) { + inp_prf_offset + = ic_block * jcp.typesize_in + * ((inp_prf_idx / kw) + * inp_prf_stride + + (inp_prf_idx % kw)); + } else { + int ic_prf_stride = jcp.typesize_in*iw*ih; + int iw_prf_stride = jcp.typesize_in*simd_w; + inp_prf_offset = ((inp_prf_idx / ic_block) + * iw_prf_stride + + (inp_prf_idx % ic_block) + * ic_prf_stride); + } + + mic_prefetcht0(EVEX_compress_addr( + aux_reg_inp_prf, inp_prf_offset)); + } + } + } + } + + step++; + } + } + add(aux_reg_ker, jcp.typesize_in * kw * oc_block * ic_block); + if (prf_ker) { + add(aux_reg_ker_prf, jcp.typesize_in * kw * oc_block * ic_block); + } + int inp_mul = !jcp.is_1stconv ? 
ic_block : 1; + add(aux_reg_inp, jcp.typesize_in * iw * inp_mul); + if (prf_inp) { + add(aux_reg_inp_prf, jcp.typesize_in * iw * inp_mul); + } + + dec(reg_kj); + cmp(reg_kj, 0); + jg(kh_label, T_NEAR); + } + + L(skip_kh_loop); + store_output(ur_w); +} + + +void jit_conv_kernel::compute_loop(int ur_w, int pad_l, int pad_r) { + + if (jcp.ver == ver_fma){ + if (jcp.is_1stconv || mayiuse(avx512_mic)) { + compute_loop_fma(ur_w, pad_l, pad_r); + } + else if (jcp.kernel_kind == embd_bcast && jcp.nb_oc_blocking == 1) { + compute_loop_fma(ur_w, pad_l, pad_r); + } + else { + compute_loop_fma_core(ur_w, pad_l, pad_r); + } + } else { + assert(!"unknown convolution version"); + } +} + + +void jit_conv_kernel::generate() { + int iw = jcp.iw; + int ow = jcp.ow; + int kw = jcp.kw; + int l_pad = jcp.l_pad; + int ur_w = jcp.ur_w; + int ur_w_tail = jcp.ur_w_tail; + int stride_w = jcp.stride_w; + int ic_block = jcp.ic_block; + int oc_block = jcp.oc_block; + + int inp_mult = !jcp.is_1stconv ? ic_block : 1; + int inp_shift_pad = jcp.typesize_in * (ur_w * stride_w - l_pad) * inp_mult; + int inp_shift = jcp.typesize_in * (ur_w * stride_w * inp_mult); + int out_shift = jcp.typesize_out * (ur_w * oc_block); + preamble(); + + mov(reg_inp, ptr[param1 + GET_OFF(src)]); + mov(reg_out, ptr[param1 + GET_OFF(dst)]); + mov(reg_ker, ptr[param1 + GET_OFF(filt)]); + mov(reg_ker_prf, ptr[param1 + GET_OFF(filt_prf)]); + mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]); + + int r_pad = utils::max(0, (ow - 1) * stride_w + (kw - 1) - (iw + l_pad - 1)); + + int n_oi = ow / ur_w; + int r_pad1 = (ur_w * n_oi - 1) * stride_w + kw - 1 - (iw + l_pad - 1); + if (r_pad1 > 0) n_oi--; + + + if (ow == ur_w) { + mov(reg_inp_prf, ptr[param1 + GET_OFF(src_prf)]); + mov(reg_out_prf, ptr[param1 + GET_OFF(dst_prf)]); + compute_loop(ur_w, l_pad, r_pad); + } else { + //TODO: potentially suboptimal + mov(reg_inp_prf, reg_inp); + mov(reg_out_prf, reg_out); + + if (n_oi == 0) { + add(reg_inp_prf, inp_shift_pad); + 
add(reg_out_prf, out_shift); + compute_loop(ur_w, l_pad, r_pad1); + add(reg_inp, inp_shift_pad); + add(reg_out, out_shift); + if (ur_w_tail != 0) { + add(reg_inp_prf, inp_shift); + add(reg_out_prf, out_shift); + compute_loop(ur_w_tail, 0, r_pad); + } + } else { + xor_(reg_oi, reg_oi); + if (l_pad > 0) { + add(reg_inp_prf, inp_shift_pad); + add(reg_out_prf, out_shift); + compute_loop(ur_w, l_pad, 0); + add(reg_inp, inp_shift_pad); + add(reg_out, out_shift); + inc(reg_oi); + } + if ((l_pad <= 0 && n_oi > 0) || (l_pad > 0 && n_oi > 1)) { + if (l_pad <= 0 && r_pad1 > 0) + n_oi--; + Label ow_loop_label; + L(ow_loop_label); + { + add(reg_inp_prf, inp_shift); + add(reg_out_prf, out_shift); + compute_loop(ur_w, 0, 0); + add(reg_inp, inp_shift); + add(reg_out, out_shift); + inc(reg_oi); + cmp(reg_oi, n_oi); + jl(ow_loop_label, T_NEAR); + } + } + if (r_pad1 > 0) { + add(reg_inp_prf, inp_shift); + add(reg_out_prf, out_shift); + compute_loop(ur_w, 0, r_pad1); + add(reg_inp, inp_shift); + add(reg_out, out_shift); + } + if (ur_w_tail != 0) { + add(reg_inp_prf, inp_shift); + add(reg_out_prf, out_shift); + compute_loop(ur_w_tail, 0, r_pad); + } + } + } + postamble(); +} + + +SaberStatus jit_conv_kernel::init_conf(jit_conv_conf_t &jcp) { + if (!mayiuse(avx512_common)) { + LOG(ERROR) << "init a AVX512 kernel in non-avx512 machine is not permitted"; + return SaberUnImplError; + } + + const int simd_w = cpu_isa_traits::vlen / sizeof(float); + const int regs = 28; + + jcp.ur_h = 1; + jcp.oc_block = simd_w; + jcp.ic_block = (jcp.ic % simd_w != 0) ? 
jcp.ic : simd_w; + + if (mayiuse(avx512_common)) { + jcp.ver = ver_fma; + jcp.typesize_in = sizeof(float); + jcp.typesize_out = sizeof(float); + + if (jcp.is_1stconv) { + // TODO: fix & remove constraints below + if (jcp.l_pad != 0 || jcp.r_pad != 0 + || jcp.b_pad != 0 || jcp.t_pad != 0 + || (jcp.kw < 7 && jcp.kh < 7)) + jcp.ver = ver_fma; + } + } + + // set jcp.ur_w + if (jcp.is_1stconv) { + jcp.ur_w = utils::min(jcp.ow, regs); + } else { + for (int ur_w = regs; ur_w > 0; --ur_w) { + if (jcp.ow % ur_w == 0) { + jcp.ur_w = ur_w; + break; + } + } + if (jcp.ur_w == 1) { + jcp.ur_w = utils::min(jcp.ow, regs); + } + } + + // TODO (Tanya): currenly applied to Segnet convolutions only. + // Need to try for other topologies + if (jcp.ow > 150 && jcp.ur_w < regs / 2) { + jcp.ur_w = regs; + } + + int n_oi = (jcp.ow / jcp.ur_w); + int r_pad = (jcp.ur_w * n_oi - 1) * jcp.stride_w + jcp.kw - jcp.iw - jcp.l_pad; + if (jcp.l_pad > 0 && r_pad > 0) { + n_oi--; + } + + bool large_code_size = jcp.ur_w != jcp.ow && jcp.l_pad > 0 && r_pad > 0 && + ((jcp.l_pad <= 0 && n_oi > 0) || (jcp.l_pad > 0 && n_oi > 1)); + if (large_code_size) { + const int max_code_size = 24 * 1024; + const int num_ops_per_reg = 6 + jcp.ic_block * jcp.kw; + int mult = 1; + if (jcp.l_pad > 0) { + mult += 1; + } + if (r_pad > 0) { + mult += 1; + } + for (int ur_w = jcp.ur_w; ur_w > regs / 2; --ur_w) { + if (ur_w * mult * num_ops_per_reg * 9.0 < max_code_size) { + jcp.ur_w = ur_w; + break; + } + } + } + + jcp.nb_ic = jcp.ic / jcp.ic_block; + jcp.nb_oc = jcp.oc / jcp.oc_block; + jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; + if (jcp.ver == ver_fma && mayiuse(avx512_core)) { + int try_nb_oc_blocking = 2; + unsigned int ker_inp_size = typesize * (jcp.iw / jcp.stride_w) + * jcp.ic_block * jcp.kh; + unsigned int ker_out_size = typesize * jcp.ow * jcp.oc_block + * try_nb_oc_blocking; + unsigned int ker_wei_size = typesize * jcp.kh * jcp.kw * jcp.ic_block + * jcp.oc_block * try_nb_oc_blocking; + unsigned int 
ker_total_size = ker_inp_size + ker_out_size + + ker_wei_size; + + if (jcp.mb == 1) { + jcp.kernel_kind = embd_bcast; + } else if (jcp.is_1stconv || jcp.kw > 3 + || ((jcp.kw == 3 && jcp.ow <= 28 && ker_total_size < L1_cache_size) + && !(jcp.kw == 3 && jcp.ow == 13 && jcp.ic >= 192) + && !(jcp.kw == 3 && jcp.ow == 28 && jcp.ic >= 512)) + ) { + jcp.kernel_kind = embd_bcast; + jcp.ur_w = utils::min(jcp.ow, regs); + jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; + if (ker_total_size < L1_cache_size && jcp.ow <= 8 && jcp.kh <= 3 + && jcp.kw <= 3) { + if (jcp.nb_oc % try_nb_oc_blocking == 0 && !jcp.is_1stconv) { + jcp.nb_oc_blocking = try_nb_oc_blocking; + jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); + if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; + } + } + } else { + jcp.kernel_kind = expl_bcast; + jcp.nb_ic_blocking = 1; + jcp.nb_oc_blocking = 4; + if (jcp.nb_oc < jcp.nb_oc_blocking) { + jcp.nb_oc_blocking = jcp.nb_oc; + } + if (jcp.nb_oc % jcp.nb_oc_blocking != 0) { + for (int i = jcp.nb_oc_blocking; i > 0; i--) { + if (jcp.nb_oc % i == 0) { + jcp.nb_oc_blocking = i; + break; + } + } + } + jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); + if (jcp.ow < jcp.ur_w) { + jcp.ur_w = jcp.ow; + } + } + } + + jcp.ur_w_tail = jcp.ow % jcp.ur_w; + + bool args_ok = true && + jcp.oc % simd_w == 0 && + jcp.l_pad <= jcp.ur_w && + utils::implication(!jcp.is_1stconv, jcp.ic % simd_w == 0) && + jcp.dilate_h == 0 && jcp.dilate_w == 0; + if (!args_ok) { + LOG(ERROR) << "arguments check failed"; + return SaberUnImplError; + } + + int r_pad_no_tail = utils::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + + jcp.kw - jcp.iw - jcp.l_pad); + if (r_pad_no_tail > jcp.ur_w) { + LOG(ERROR) << "tail should not be greater than ur_w"; + return SaberUnImplError; + } + + pick_loop_order(jcp); + jcp.nb_ic_L2 = jcp.nb_ic; + + return SaberSuccess; +} + + +} // namespace jit +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.h 
b/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h similarity index 78% rename from saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.h rename to saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h index 834bcdd8a..b4fd94aa4 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.h +++ b/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h @@ -1,10 +1,10 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CONV_ACT_KERNEL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX512_CONV_ACT_KERNEL_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CONV_KERNEL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX512_CONV_KERNEL_H #include #include -#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#include "jit_generator.h" #include "saber/funcs/impl/x86/jit_call_conf.h" #include "saber/saber_types.h" #include "saber/funcs/impl/x86/x86_utils.h" @@ -13,11 +13,10 @@ namespace anakin { namespace saber { namespace jit { -struct jit_conv_act_kernel : public jit_generator { +struct jit_conv_kernel : public jit_generator { public: - jit_conv_act_kernel(jit_conv_conf_t ajcp) : jcp(ajcp) - { + jit_conv_kernel(jit_conv_conf_t ajcp) : jcp(ajcp) { generate(); jit_ker = (void (*)(jit_conv_call_t *))getCode(); } @@ -25,7 +24,7 @@ struct jit_conv_act_kernel : public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_conv_act_kernel); static SaberStatus init_conf(jit_conv_conf_t &jcp); - + jit_conv_conf_t jcp; void (*jit_ker)(jit_conv_call_t *); @@ -107,27 +106,27 @@ struct jit_conv_act_kernel : public jit_generator { void generate(); inline void vpXdpwssd(Xbyak::Zmm zmm1, Xbyak::Zmm zmm2, reg64_t reg, - int offset) { - vpdpwssd(zmm1, zmm2, EVEX_compress_addr(reg, offset, true)); + int offset) { + vpdpwssd(zmm1, zmm2, EVEX_compress_addr(reg, offset, true)); } inline void vadd(Xbyak::Zmm zmm, reg64_t reg, int offset) { - vaddps(zmm, zmm, EVEX_compress_addr(reg, offset)); + vaddps(zmm, zmm, EVEX_compress_addr(reg, offset)); } inline void 
vcmp(Xbyak::Opmask kmask, - Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2, const unsigned char cmp) { - vcmpps(kmask, zmm_src1, zmm_src2, cmp); + Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2, const unsigned char cmp) { + vcmpps(kmask, zmm_src1, zmm_src2, cmp); } inline void vmul(Xbyak::Zmm zmm_dst, Xbyak::Opmask kmask, Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2) { - vmulps(zmm_dst | kmask, zmm_src1, zmm_src2); + vmulps(zmm_dst | kmask, zmm_src1, zmm_src2); } inline int get_output_offset(int oi, int n_oc_block) { return jcp.typesize_out - * (n_oc_block * jcp.oh * jcp.ow + oi) * jcp.oc_block; + * (n_oc_block * jcp.oh * jcp.ow + oi) * jcp.oc_block; } inline int get_input_offset(int ki, int ic, int oi, int pad_l) { @@ -135,14 +134,14 @@ struct jit_conv_act_kernel : public jit_generator { int iw_str = !jcp.is_1stconv ? jcp.ic_block : 1; int ic_str = !jcp.is_1stconv ? 1 : jcp.iw * jcp.ih; return jcp.typesize_in - * ((ki + oi * jcp.stride_w - pad_l) * iw_str + scale * ic * ic_str); + * ((ki + oi * jcp.stride_w - pad_l) * iw_str + scale * ic * ic_str); } inline int get_kernel_offset(int ki,int ic,int n_oc_block,int ker_number) { int scale = 1; return jcp.typesize_in * jcp.oc_block - * (n_oc_block * jcp.nb_ic * jcp.ic_block * jcp.kh * jcp.kw - + (ic + ker_number) * scale + ki * jcp.ic_block); + * (n_oc_block * jcp.nb_ic * jcp.ic_block * jcp.kh * jcp.kw + + (ic + ker_number) * scale + ki * jcp.ic_block); } inline int get_ow_start(int ki, int pad_l) { @@ -151,9 +150,9 @@ struct jit_conv_act_kernel : public jit_generator { inline int get_ow_end(int ur_w, int ki, int pad_r) { return ur_w - utils::max(0, - (ki + pad_r - (jcp.kw - 1) + jcp.stride_w - 1) / jcp.stride_w); + (ki + pad_r - (jcp.kw - 1) + jcp.stride_w - 1) / jcp.stride_w); } - + }; diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h b/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h index e48c02765..12c86e105 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h +++ 
b/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h @@ -19,13 +19,16 @@ limitations under the License. */ #include "saber/saber_types.h" #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_generator.h" -#include "saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h" +#include "jit_generator.h" +#include "jit_uni_1x1_conv_utils.h" + namespace anakin { namespace saber { namespace jit { +using namespace Xbyak; + struct rtus_driver_t : public jit_generator { struct call_params_t { @@ -60,11 +63,11 @@ struct rtus_driver_t : public jit_generator { Vmm reg_v; rtus_driver_t(int iw, int stride_w, int src_step_h, - int src_step_icb, int ws_step_icb, bool src_to_ws, size_t typesize) - : iw_(iw), stride_w_(stride_w), src_step_h_(src_step_h) - , src_step_icb_(src_step_icb), ws_step_icb_(ws_step_icb) - , src_to_ws_(src_to_ws), typesize_(typesize) { - using namespace Xbyak; + int src_step_icb, int ws_step_icb, + bool src_to_ws, size_t typesize) + : iw_(iw), stride_w_(stride_w), src_step_h_(src_step_h) + , src_step_icb_(src_step_icb), ws_step_icb_(ws_step_icb) + , src_to_ws_(src_to_ws), typesize_(typesize) { vlen_ = cpu_isa_traits::vlen; vlen_shift_ = cpu_isa_traits::vlen_shift; if (typesize_ == 2) { @@ -79,8 +82,6 @@ struct rtus_driver_t : public jit_generator { } void loop_is() { - using namespace Xbyak; - mov(reg_cur_src, reg_src); mov(reg_cur_iw, reg_iw_start); mov(reg_cur_os, reg_os); @@ -134,8 +135,6 @@ struct rtus_driver_t : public jit_generator { } void generate() { - using namespace Xbyak; - #if defined(_WIN32) assert(reg_src == abi_not_param1 && abi_not_param1 == rdi); push(rdi); @@ -154,8 +153,9 @@ struct rtus_driver_t : public jit_generator { shl(reg_os, vlen_shift_); - if (!src_to_ws_) + if (!src_to_ws_) { uni_vpxor(reg_zero, reg_zero, reg_zero); + } Label icb_loop; L(icb_loop); @@ -173,41 +173,37 @@ struct rtus_driver_t : public jit_generator { #endif ret(); - this->ker_ = 
reinterpret_cast(const_cast( - this->getCode())); + this->ker_ = reinterpret_cast(const_cast(this->getCode())); } }; -inline void init_rtus_driver(rtus_driver_t **p_rtus_driver_, - reduce_to_unit_stride_t &rtus_, - jit_1x1_conv_conf_t &jcp_, size_t &ws_per_thread_, - float **p_scratch_, const Shape &src_d, - int stride_h, int stride_w) { - - if (!rtus_.reduce_src_) { - return; - } - +inline void init_rtus_driver(rtus_driver_t **p_rtus_driver, + jit_1x1_conv_conf_t &jcp, + conv_1x1_desc &conv_d, + size_t &ws_per_thread, + float **p_scratch) { const int max_threads = omp_get_max_threads(); size_t factor = 0; - factor = jcp_.nb_reduce; + factor = jcp.nb_reduce; - size_t typesize = sizeof(decltype(**p_scratch_)); + size_t typesize = sizeof(decltype(**p_scratch)); - ws_per_thread_ = factor * jcp_.is * jcp_.ic_block; - *p_scratch_ = (float*)zmalloc(max_threads * ws_per_thread_ * typesize, 64); + ws_per_thread = factor * jcp.is * jcp.ic_block; + *p_scratch = (float*)zmalloc(max_threads * ws_per_thread * typesize, 64); - const int ih = src_d[2]; - const int iw = src_d[3]; + const int ih = conv_d.ih; + const int iw = conv_d.iw; - const int src_step_h = stride_h * iw; + const int src_step_h = conv_d.stride_h * iw; const int src_step_icb = ih * iw; - const int ws_step_icb = jcp_.is; + const int ws_step_icb = jcp.is; + const bool src_to_ws = true; - *p_rtus_driver_ = new rtus_driver_t(iw, stride_w, src_step_h, - src_step_icb, ws_step_icb, src_to_ws, typesize); + + *p_rtus_driver = new rtus_driver_t(iw, conv_d.stride_w, src_step_h, + src_step_icb, ws_step_icb, src_to_ws, typesize); } diff --git a/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h b/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h index 228efe420..7d7a9860f 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h +++ b/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h @@ -15,9 +15,9 @@ limitations under the License. 
*/ #define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_1X1_CONV_UTIL_H #include - #include -#include "x86_utils.h" + +#include "saber/funcs/impl/x86/x86_utils.h" namespace anakin { namespace saber { @@ -83,18 +83,17 @@ typedef int jit_dims_t[JIT_TENSOR_MAX_DIMS]; typedef int jit_strides_t[JIT_TENSOR_MAX_DIMS]; struct conv_1x1_desc { - jit_dims_t src_d; - int src_d_dims; - jit_dims_t dst_d; - int dst_d_dims; - jit_dims_t padding[2]; - int strides[2]; -}; - -struct reduce_to_unit_stride_t { - jit_strides_t src_dstrides; - conv_1x1_desc *conv_d_; - bool reduce_src_; + int n; + int ic; + int ih; + int iw; + int oc; + int oh; + int ow; + int stride_h; + int stride_w; + int t_pad; + int l_pad; }; diff --git a/saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.cpp b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.cpp similarity index 72% rename from saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.cpp rename to saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.cpp index a49b5261e..6efcd57bb 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.cpp +++ b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.cpp @@ -1,8 +1,10 @@ -#include "saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.h" #include +#include "jit_uni_dwconv_kernel_f32.h" +#include "utils/logger/logger.h" + #define GET_OFF(field) offsetof(jit_conv_call_t, field) - + namespace anakin { namespace saber { namespace jit { @@ -10,7 +12,7 @@ namespace jit { using namespace Xbyak; template -void jit_uni_dw_conv_kernel_f32::load_src(int ur_ch_blocks, int ur_w) { +void jit_uni_dwconv_kernel_f32::load_src(int ur_ch_blocks, int ur_w) { int repeats = isa == sse42 ? 
2 : 1; for (int i = 0; i < repeats; i++) { for (int ch = 0; ch < ur_ch_blocks; ch++) { @@ -19,15 +21,15 @@ void jit_uni_dw_conv_kernel_f32::load_src(int ur_ch_blocks, int ur_w) { int b_off = ch*jcp.ch_block + i*4; if (this->jcp.with_bias) { uni_vmovups(vmm_acc, - vmmword[reg_bias + b_off*sizeof(float)]); + vmmword[reg_bias + b_off*sizeof(float)]); } else { uni_vpxor(vmm_acc, vmm_acc, vmm_acc); } int o_off = ch*jcp.oh*jcp.ow*jcp.ch_block - + ow*jcp.ch_block + i*4; + + ow*jcp.ch_block + i*4; if (this->jcp.with_sum) { uni_vaddps(vmm_acc, vmm_acc, - vmmword[reg_output + o_off*sizeof(float)]); + vmmword[reg_output + o_off*sizeof(float)]); } } } @@ -35,7 +37,7 @@ void jit_uni_dw_conv_kernel_f32::load_src(int ur_ch_blocks, int ur_w) { } template -void jit_uni_dw_conv_kernel_f32::apply_filter( +void jit_uni_dwconv_kernel_f32::apply_filter( int ur_ch_blocks, int ur_w) { int ch_blk = jcp.ch_block; int dilate_h = jcp.dilate_h + 1; @@ -57,39 +59,36 @@ void jit_uni_dw_conv_kernel_f32::apply_filter( int repeats = isa == sse42 ? 
2 : 1; for (int i = 0; i < repeats; i++) { for (int ch = 0; ch < ur_ch_blocks; ch++) { - int ker_off = ch*jcp.kh*jcp.kw*ch_blk + i*4; + int ker_off = ch * jcp.kh * jcp.kw * ch_blk + i * 4; Vmm vmm_ker = get_ker_reg(0); - uni_vmovups(vmm_ker, ptr[aux1_reg_kernel - + ker_off*sizeof(float)]); + uni_vmovups(vmm_ker, ptr[aux1_reg_kernel + ker_off * sizeof(float)]); for (int ow = 0; ow < ur_w; ow++) { - int inp_off = ch*jcp.ih*jcp.iw*ch_blk - + ow*stride_w*ch_blk + i*4; + int inp_off = ch * jcp.ih * jcp.iw * ch_blk + + ow * stride_w * ch_blk + i * 4; Vmm vmm_src = get_src_reg(0); - uni_vmovups(vmm_src, ptr[aux1_reg_input - + inp_off*sizeof(float)]); - Vmm vmm_acc = get_acc_reg(i*ur_ch_blocks*ur_w - + ch*ur_w + ow); + uni_vmovups(vmm_src, ptr[aux1_reg_input + inp_off * sizeof(float)]); + Vmm vmm_acc = get_acc_reg(i * ur_ch_blocks * ur_w + ch * ur_w + ow); uni_vfmadd231ps(vmm_acc, vmm_src, vmm_ker); } } } - add(aux1_reg_kernel, ch_blk*sizeof(float)); - add(aux1_reg_input, ch_blk*dilate_w*sizeof(float)); + add(aux1_reg_kernel, ch_blk * sizeof(float)); + add(aux1_reg_input, ch_blk * dilate_w * sizeof(float)); dec(iter_kw); cmp(iter_kw, 0); jg(kw_label, T_NEAR); } - add(aux_reg_kernel, jcp.kw*ch_blk*sizeof(float)); - add(aux_reg_input, jcp.iw*ch_blk*dilate_h*sizeof(float)); + add(aux_reg_kernel, jcp.kw * ch_blk * sizeof(float)); + add(aux_reg_input, jcp.iw * ch_blk * dilate_h * sizeof(float)); dec(iter_kh); cmp(iter_kh, 0); jg(kh_label, T_NEAR); } L(iter_exit_label); } + template -void jit_uni_dw_conv_kernel_f32::apply_filter_unrolled( - int ur_ch_blocks, int ur_w) { +void jit_uni_dwconv_kernel_f32::apply_filter_unrolled(int ur_ch_blocks, int ur_w) { int ch_blk = jcp.ch_block; int dilate_h = jcp.dilate_h + 1; int dilate_w = jcp.dilate_w + 1; @@ -104,34 +103,33 @@ void jit_uni_dw_conv_kernel_f32::apply_filter_unrolled( for (int i = 0; i < repeats; i++) { for (int ch = 0; ch < ur_ch_blocks; ch++) { for (int kw = 0; kw < jcp.kw; kw++) { - int ker_off = ch*jcp.kh*jcp.kw*ch_blk + 
kw*ch_blk + i*4; + int ker_off = ch * jcp.kh * jcp.kw * ch_blk + kw * ch_blk + i * 4; Vmm vmm_ker = get_ker_reg(0); - uni_vmovups(vmm_ker, ptr[aux_reg_kernel - + ker_off*sizeof(float)]); + uni_vmovups(vmm_ker, ptr[aux_reg_kernel + ker_off * sizeof(float)]); for (int ow = 0; ow < ur_w; ow++) { - int inp_off = ch*jcp.ih*jcp.iw*ch_blk - + ow*stride_w*ch_blk + kw*ch_blk*dilate_w + i*4; + int inp_off = ch * jcp.ih * jcp.iw * ch_blk + + ow * stride_w * ch_blk + kw * ch_blk * dilate_w + i * 4; Vmm vmm_src = get_src_reg(0); - uni_vmovups(vmm_src, ptr[aux_reg_input - + inp_off*sizeof(float)]); - Vmm vmm_acc = get_acc_reg(i*ur_ch_blocks*ur_w - + ch*ur_w + ow); + uni_vmovups(vmm_src, ptr[aux_reg_input + + inp_off * sizeof(float)]); + Vmm vmm_acc = get_acc_reg(i * ur_ch_blocks * ur_w + + ch * ur_w + ow); uni_vfmadd231ps(vmm_acc, vmm_src, vmm_ker); } } } } - add(aux_reg_kernel, jcp.kw*ch_blk*sizeof(float)); - add(aux_reg_input, jcp.iw*ch_blk*dilate_h*sizeof(float)); + add(aux_reg_kernel, jcp.kw * ch_blk * sizeof(float)); + add(aux_reg_input, jcp.iw * ch_blk * dilate_h * sizeof(float)); dec(iter_kh); cmp(iter_kh, 0); jg(kh_label, T_NEAR); } L(iter_exit_label); } + template -void jit_uni_dw_conv_kernel_f32::apply_activation( - int ur_ch_blocks, int ur_w) { +void jit_uni_dwconv_kernel_f32::apply_activation(int ur_ch_blocks, int ur_w) { if (this->jcp.with_relu) { uni_vpxor(vmm_zero, vmm_zero, vmm_zero); if (jcp.relu_negative_slope == 0) { @@ -146,7 +144,7 @@ void jit_uni_dw_conv_kernel_f32::apply_activation( for (int ch = 0; ch < ur_ch_blocks; ch++) { for (int ow = 0; ow < ur_w; ow++) { Vmm vmm_dst = get_acc_reg(i*ur_ch_blocks*ur_w - + ch*ur_w + ow); + + ch*ur_w + ow); if (isa == sse42) { pxor(vmm_mask, vmm_mask); cmpps(vmm_mask, vmm_dst, _cmp_gt_os); @@ -167,23 +165,25 @@ void jit_uni_dw_conv_kernel_f32::apply_activation( } } } + template -void jit_uni_dw_conv_kernel_f32::store_dst( +void jit_uni_dwconv_kernel_f32::store_dst( int ur_ch_blocks, int ur_w) { int ch_blk = 
jcp.ch_block; int repeats = isa == sse42 ? 2 : 1; for (int i = 0; i < repeats; i++) { for (int ch = 0; ch < ur_ch_blocks; ch++) { for (int ow = 0; ow < ur_w; ow++) { - int o_off = ch*jcp.oh*jcp.ow*ch_blk + ow*ch_blk + i*4; - Vmm vmm_dst = get_acc_reg(i*ur_ch_blocks*ur_w + ch*ur_w + ow); - uni_vmovups(vmmword[reg_output + o_off*sizeof(float)], vmm_dst); + int o_off = ch * jcp.oh * jcp.ow * ch_blk + ow * ch_blk + i * 4; + Vmm vmm_dst = get_acc_reg(i * ur_ch_blocks * ur_w + ch * ur_w + ow); + uni_vmovups(vmmword[reg_output + o_off * sizeof(float)], vmm_dst); } } } } + template -void jit_uni_dw_conv_kernel_f32::loop_body(int ur_ch_blocks) { +void jit_uni_dwconv_kernel_f32::loop_body(int ur_ch_blocks) { Label unrolled_w_label; Label tail_w_label; Label exit_label; @@ -221,7 +221,7 @@ void jit_uni_dw_conv_kernel_f32::loop_body(int ur_ch_blocks) { } template -void jit_uni_dw_conv_kernel_f32::generate() { +void jit_uni_dwconv_kernel_f32::generate() { this->preamble(); mov(reg_input, ptr[this->param1 + GET_OFF(src)]); @@ -251,10 +251,10 @@ void jit_uni_dw_conv_kernel_f32::generate() { template -SaberStatus jit_uni_dw_conv_kernel_f32::init_conf(jit_conv_conf_t &jcp){ - - if (!mayiuse(isa)) { - return SaberUnImplError; +SaberStatus jit_uni_dwconv_kernel_f32::init_conf(jit_conv_conf_t &jcp) { + if (!mayiuse(isa) && isa == avx512_common) { + LOG(ERROR) << "Init an AVX512 kernel in a non-avx512 machine is not permitted"; + return SaberUnImplError; } const int simd_w = isa == avx512_common ? 16 : 8; @@ -266,12 +266,12 @@ SaberStatus jit_uni_dw_conv_kernel_f32::init_conf(jit_conv_conf_t &jcp){ jcp.nb_ch_blocking = isa == avx512_common ? 4 : isa == avx2 ? 
3 : 2; if (jcp.nb_ch < jcp.nb_ch_blocking) { jcp.nb_ch_blocking = jcp.nb_ch; - } - + } + return SaberSuccess; } -template struct jit_uni_dw_conv_kernel_f32; +template struct jit_uni_dwconv_kernel_f32; } } diff --git a/saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.h b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h similarity index 82% rename from saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.h rename to saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h index 089a5ba48..fb3f2d965 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_dw_conv_kernel_f32.h +++ b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h @@ -1,7 +1,7 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_UNI_DW_CONV_KERNEL_F32_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_UNI_DW_CONV_KERNEL_F32_H #define ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_UNI_DW_CONV_KERNEL_F32_H -#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#include "jit_generator.h" #include "saber/funcs/impl/x86/jit_call_conf.h" #include "saber/saber_types.h" #include "saber/funcs/impl/x86/x86_utils.h" @@ -13,30 +13,29 @@ namespace saber { namespace jit { template -struct jit_uni_dw_conv_kernel_f32 : public jit_generator { +struct jit_uni_dwconv_kernel_f32 : public jit_generator { public: - jit_uni_dw_conv_kernel_f32(jit_conv_conf_t ajcp) : jcp(ajcp) - { + jit_uni_dwconv_kernel_f32(jit_conv_conf_t ajcp) : jcp(ajcp) { generate(); jit_ker = (void (*)(jit_conv_call_t *))getCode(); } - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_dw_conv_kernel_f32); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_dwconv_kernel_f32); static SaberStatus init_conf(jit_conv_conf_t &jcp); - + jit_conv_conf_t jcp; void (*jit_ker)(jit_conv_call_t *); private: using Vmm = typename utils::conditional3::type; + isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type; using reg64_t = const Xbyak::Reg64; const Xbyak::AddressFrame &vmmword = (isa == sse42) - ? xword : (isa == avx2) ? yword : zword; + ? xword : (isa == avx2) ? 
yword : zword; const int vlen = cpu_isa_traits::vlen; - // dw convolution + // dw convolution reg64_t reg_input = r8; reg64_t aux_reg_input = r9; reg64_t aux1_reg_input = r10; @@ -73,9 +72,8 @@ struct jit_uni_dw_conv_kernel_f32 : public jit_generator { }; - -} -} -} +} // namespace jit +} // namespace saber +} // namespace anakin #endif diff --git a/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h b/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h index 4baa1bb60..77f8383a3 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h +++ b/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h @@ -1,3 +1,19 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + + #ifndef CPU_JIT_UNI_POOL_KERNEL_F32_H #define CPU_JIT_UNI_POOL_KERNEL_F32_H diff --git a/saber/funcs/impl/x86/mkl_packed_weight.h b/saber/funcs/impl/x86/mkl_packed_weight.h new file mode 100644 index 000000000..f89aa9d2f --- /dev/null +++ b/saber/funcs/impl/x86/mkl_packed_weight.h @@ -0,0 +1,136 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_MKL_PACKED_WEIGHT_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_MKL_PACKED_WEIGHT_H + +#include +#include +#include +#include "saber/core/tensor.h" + +namespace anakin { +namespace saber { + +template +class MatrixInfo { +public: + // default construct + MatrixInfo() : + _buf_(nullptr), _height_(0), _width_(0) { + }; + + // construct the class with allocated buf and + // the matrix info including height(row number) and width(column number) + MatrixInfo(T *buf, size_t height, size_t width) : + _buf_(buf), _height_(height), _width_(width){ + }; + + // get the raw data buf point + T *buf() { + return _buf_; + }; + + // return the height of the buffer; + // equal to row number + size_t height() { + return _height_; + }; + + // return the width of the buffer; + // equal to column number + size_t width() { + return _width_; + } + + // get the sub buf between start and end; + // return sub buffer : [start,end) + MatrixInfo subMatrixInfo(int start, int end) { + MatrixInfo ret(_buf_ + start * _width_, end -start, _width_); + return ret; + } + + // print the value to log + void log_dump() { + for (int i = 0; i < _height_ * _width_; i++) { + LOG(INFO) <<"i:" < +class mkl_packed_weight { + +public: + typedef Tensor ioTensor; + typedef Dtype dtype; + explicit mkl_packed_weight(MatrixInfo *weight, bool transW = false) { + packed_weight_ = nullptr; + weight_ = weight->buf(); + height_ = weight->height(); + width_ = weight->width(); + trans_w_ = transW; + } + + ~mkl_packed_weight() { + if (packed_weight_) { + cblas_sgemm_free(packed_weight_); + packed_weight_ = nullptr; + } + } + + void pack() { + if (!packed_weight_) { + packed_weight_ = 
cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_); + } + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasNoTrans, + 1, + width_, + height_, + 1.0, + weight_, + width_, + packed_weight_); + } + + void gemm_compute(MatrixInfo& src, MatrixInfo* dst, float beta = 1.0) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + src.height(), + width_, + height_, + src.buf(), + src.width(), + packed_weight_, + width_, + beta, + dst->buf(), + dst->width() + ); + } +protected: + /// The pointer of weight + dtype * weight_; + /// The pointer of cblas packed gemm to weight + dtype *packed_weight_; + size_t height_; + size_t width_; + bool trans_w_; +}; + +template class mkl_packed_weight; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/impl/x86/saber_activation.cpp b/saber/funcs/impl/x86/saber_activation.cpp index 4c80446c7..ff4c60783 100644 --- a/saber/funcs/impl/x86/saber_activation.cpp +++ b/saber/funcs/impl/x86/saber_activation.cpp @@ -1,90 +1,153 @@ #include "saber/funcs/impl/x86/saber_activation.h" +#include + namespace anakin{ namespace saber { -template -SaberStatus SaberActivation::init( - const std::vector& inputs, - std::vector& outputs, - ActivationParam ¶m, - Context &ctx) -{ - - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - +template +SaberStatus SaberActivation::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam ¶m, + Context &ctx) { + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } -template -SaberStatus SaberActivation::create( - const std::vector& inputs, - std::vector& outputs, - ActivationParam ¶m, - Context &ctx) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - +template +SaberStatus 
SaberActivation::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam ¶m, + Context &ctx) { + this->_ctx = &ctx; return SaberSuccess; } -template -SaberStatus SaberActivation::dispatch( - const std::vector& inputs, - std::vector& outputs, - ActivationParam ¶m) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - - // TODO !! need add other types of activation +template +SaberStatus SaberActivation::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam ¶m) { + typedef typename DataTrait::Dtype OpDataType; + // x > 0 ? x :0 if (param.active == Active_relu) { for (size_t vc = 0; vc < inputs.size(); vc++) { - size_t len = inputs[vc]->size(); - float *input_data = inputs[vc]->mutable_data(); - float *output_data = outputs[vc]->mutable_data(); + size_t len = inputs[vc]->valid_size(); + OpDataType *input_data = (OpDataType*)inputs[vc]->mutable_data(); + OpDataType *output_data = (OpDataType*)outputs[vc]->mutable_data(); for (size_t i = 0; i < len; i++) { - if (*input_data > 0) { - *output_data = *input_data; - } else { - *output_data = 0; - } - + *output_data = *input_data > (OpDataType)0 ? 
*input_data : (OpDataType)0; input_data++; output_data++; } } } + + // stanh : b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} + if (param.active == Active_stanh) { + for (size_t i = 0; i < inputs.size(); i++) { + size_t len = inputs[i]->valid_size(); + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); + //negative_slope = scale_a + //coef = scale_b + for (size_t j = 0; j < len; j++) { + output_data[j] = param.coef * tanh(param.negative_slope * input_data[j]); + } + } + } + // sigmoid: 1/(exp(-x) + 1) + if (param.active == Active_sigmoid) { + for ( size_t i = 0; i < inputs.size() ; i++) { + size_t len = inputs[i]->valid_size(); + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); + + for (size_t j = 0; j < len; j++) { + output_data[j] = 1.0f / (1.0f + exp(-input_data[j])); + } + } + } + + // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) + if (param.active == Active_tanh) { + for (size_t i = 0; i < inputs.size(); i++) { + size_t len = inputs[i]->valid_size(); + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); + + for (size_t j = 0; j < len; j++) { + output_data[j] = tanh(input_data[j]); + } + } + } + + // clipped_relu + // x > 0 ? x : 0; + // x < threshold ? x : threshold + if (param.active == Active_clipped_relu) { + const OpDataType threshold = param.coef; + for (size_t i = 0; i < inputs.size(); i++) { + size_t len = inputs[i]->valid_size(); + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); + + for(size_t j = 0; j < len; j++){ + output_data[j] = input_data[j] > 0 ? input_data[j] : 0; + output_data[j] = output_data[j] < threshold ? output_data[j] : threshold; + } + } + } + + //elu: x > 0 ? 
x : coef * (exp(x) - 1) + if (param.active == Active_elu) { + const OpDataType coef = param.coef; + for (size_t i = 0; i < inputs.size(); i++) { + size_t len = inputs[i]->valid_size(); + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); + + for(size_t j = 0; j < len; j++){ + output_data[j] = input_data[j] > 0 ? input_data[j] : param.coef * (exp(input_data[j]) - 1); + } + } + } + //prelu: x > 0 ? x : slope[c] * x + if (param.active == Active_prelu) { + PreluParam prelu = param.prelu_param; + for (size_t i = 0; i < inputs.size(); i++) { + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); + Shape shin = inputs[i]->valid_shape(); + int num = shin[0]; + int channel = shin[1]; + int size = shin[2] * shin[3]; + for (int n = 0; n < num; n++){ + const OpDataType *in_ptr = input_data + n * channel * size; + OpDataType *out_ptr = output_data + n * channel * size; + OpDataType *slope_ptr = (OpDataType*)prelu.slope->data(); + for (int c = 0; c < channel; c++){ + const OpDataType *in_ch_ptr = in_ptr + c * size; + OpDataType *out_ch_ptr = out_ptr + c * size; + OpDataType slope = prelu.channel_shared ? slope_ptr[0]: slope_ptr[c]; + for (int k = 0; k < size; k++){ + out_ch_ptr[k] = in_ch_ptr[k] > 0 ? 
in_ch_ptr[k] : in_ch_ptr[k] * slope; + } + } + } + } + } + for (size_t i = 0; i < inputs.size(); i++) { + outputs[i]->set_seq_offset(inputs[i]->get_seq_offset()); + } return SaberSuccess; } -template class SaberActivation; - +template class SaberActivation; +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, X86, AK_INT8); } } // namespace anakin diff --git a/saber/funcs/impl/x86/saber_activation.h b/saber/funcs/impl/x86/saber_activation.h index 398d712f1..3d4c39493 100644 --- a/saber/funcs/impl/x86/saber_activation.h +++ b/saber/funcs/impl/x86/saber_activation.h @@ -20,43 +20,31 @@ namespace anakin { namespace saber { -template -class SaberActivation : public ImplBase< - Tensor, - Tensor, - Tensor, - ActivationParam > > -{ +template +class SaberActivation : + public ImplBase< + X86, OpDtype, + ActivationParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; - SaberActivation() - {} + SaberActivation() {} - ~SaberActivation() { - } + ~SaberActivation() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ActivationParam ¶m, + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam ¶m, Context &ctx) override; - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ActivationParam ¶m, + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam ¶m, Context &ctx) override; - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ActivationParam ¶m) override; + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam ¶m) override; private: diff --git a/saber/funcs/impl/x86/saber_argmax.cpp b/saber/funcs/impl/x86/saber_argmax.cpp new file mode 100644 index 000000000..f5e56127b --- /dev/null 
+++ b/saber/funcs/impl/x86/saber_argmax.cpp @@ -0,0 +1,167 @@ +#include "saber/funcs/impl/x86/saber_argmax.h" + +namespace anakin{ + +namespace saber{ + +template +void Argmax_kernel_axis(const dtype* din, dtype* dout, int num, int in_stride, \ + int out_stride, int size, int in_ss, int out_ss, int top, bool out_max){ + for (int n = 0; n < num * out_stride; n++){ + for(int k = 0; k < in_stride; k ++){ + const dtype* din_ch = din + n * in_ss + k; + std::vector< std::pair > vec; + vec.resize(size); + for (int i = 0; i < size; i++){ + vec[i] = std::make_pair(din_ch[i * in_stride], i); + } + //sort + std::partial_sort(vec.begin(), vec.begin() + top, vec.end(), std::greater< std::pair >()); + //out + dtype* dout_ch = dout + n * out_ss + k; + for(int i = 0; i < top ;i ++){ + if(out_max) + dout_ch[i * in_stride] = vec[i].first; + else + dout_ch[i * in_stride] = vec[i].second; + } + } + } + } + +template +void Argmax_kernel(const dtype* din, dtype* dout, int num, int in_channel, \ + int out_channel, int top, bool out_max){ + for (int n = 0; n < num; n++){ + const dtype* din_ch = din + n * in_channel; + std::vector< std::pair > vec; + vec.resize(in_channel); + for (int i = 0; i < in_channel; i++){ + vec[i] = std::make_pair(din_ch[i], i); + } + //sort + std::partial_sort(vec.begin(), vec.begin() + top, vec.end(), std::greater< std::pair >()); + //out + if(out_max){ + dtype* dout_ch = dout + n * out_channel; + dtype* dout_data = dout_ch; + dtype* dout_index = dout_ch + top; + for (int i = 0; i < top; i++){ + dout_data[i] = vec[i].first; + dout_index[i] = vec[i].second; + //LOG(INFO) << "max_data: " < +SaberStatus SaberArgmax::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, ArgmaxParam ¶m) { + int num = inputs[0]->num(); + int channel = inputs[0]->channel(); + int height = inputs[0]->height(); + int width = inputs[0]->width(); + + int ch_out = outputs[0]->channel(); + int w_out = outputs[0]->width(); + int h_out = outputs[0]->height(); + + int top = 
param.top_k; + bool has_ax = param.has_axis; + int ax = param.axis; + bool out_max = param.out_max_val; + + const OpDataType* din = (const OpDataType*)inputs[0]->data(); + OpDataType* dout = (OpDataType*)outputs[0]->mutable_data(); + int in_channel = channel * height * width; + int out_channel = ch_out * w_out * h_out; + + if (has_ax){//nchw + auto shape = inputs[0]->valid_shape(); + int stride = shape.count(ax+1, shape.dims()); + int out_stride = shape.count(1, ax); + int out_ss = outputs[0]->valid_shape().count(ax, shape.dims()); + int in_ss = shape.count(ax, shape.dims()); + // LOG(INFO) << "stride: "< > vec; + vec.resize(size); + for (int i = 0; i < size; i++){ + vec[i] = std::make_pair(din_ch[i*stride], i); + } + //sort + std::partial_sort(vec.begin(), vec.begin() + top, vec.end(), std::greater< std::pair >()); + //out + OpDataType* dout_ch = dout + n * out_ss + k; + for(int i = 0; i < top ;i ++){ + if(out_max) + dout_ch[i*stride] = vec[i].first; + else + dout_ch[i*stride] = vec[i].second; + } + } + } + */ + Argmax_kernel_axis(din, dout, num, stride, out_stride, size, in_ss, out_ss, top, out_max); + }else{//all + if(in_channel < top){ + LOG(INFO) << "input data size less than topk"; + return SaberUnImplError; + } + /* + for (int n = 0; n < num; n++){ + const OpDataType* din_ch = din + n * in_channel; + std::vector< std::pair > vec; + vec.resize(in_channel); + for (int i = 0; i < in_channel; i++){ + vec[i] = std::make_pair(din_ch[i], i); + } + //sort + std::partial_sort(vec.begin(), vec.begin() + top, vec.end(), std::greater< std::pair >()); + //out + if(out_max){ + OpDataType* dout_ch = dout + n * out_channel; + OpDataType* dout_data = dout_ch; + OpDataType* dout_index = dout_ch + top; + for (int i = 0; i < top; i++){ + dout_data[i] = vec[i].first; + dout_index[i] = vec[i].second; + //LOG(INFO) << "max_data: " <(din, dout, num, in_channel, out_channel, top, out_max); + } + return SaberSuccess; +} + +template class SaberArgmax; +DEFINE_OP_TEMPLATE(SaberArgmax, 
ArgmaxParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberArgmax, ArgmaxParam, X86, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_argmax.h b/saber/funcs/impl/x86/saber_argmax.h new file mode 100644 index 000000000..a73349850 --- /dev/null +++ b/saber/funcs/impl/x86/saber_argmax.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ARGMAX_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ARGMAX_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_argmax.h" +#include "saber/core/tensor.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberArgmax : \ + public ImplBase< + X86, + OpDtype, + ArgmaxParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberArgmax() = default; + ~SaberArgmax() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ArgmaxParam ¶m, Context &ctx){ + //get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ArgmaxParam ¶m, Context &ctx){ + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ArgmaxParam ¶m)override; + +private: +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ARGMAX_H diff --git 
a/saber/funcs/impl/x86/saber_avx2_math.h b/saber/funcs/impl/x86/saber_avx2_math.h new file mode 100644 index 000000000..6beea9a33 --- /dev/null +++ b/saber/funcs/impl/x86/saber_avx2_math.h @@ -0,0 +1,603 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + AVX implementation of sin, cos, sincos, exp and log + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. 
This notice may not be removed or altered from any source distribution. + (this is the zlib license) +*/ +#ifndef SABER_FUNCS_IMPL_X86_SABER_AVX2_MATH_H +#define SABER_FUNCS_IMPL_X86_SABER_AVX2_MATH_H + + +#if defined(__AVX2__) and defined(__FMA__) + +namespace anakin { +namespace saber { + +#include +/* yes I know, the top of this file is quite ugly */ +#define ALIGN32_BEG +#define ALIGN32_END __attribute__((aligned(32))) + +/* __m128 is ugly to write */ +typedef __m256 v8sf; // vector of 8 float (avx) +typedef __m256i v8si; // vector of 8 int (avx) + + +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ + Val, Val, Val, Val} + +_PI32AVX_CONST(1, 1); +_PI32AVX_CONST(inv1, ~1); +_PI32AVX_CONST(2, 2); +_PI32AVX_CONST(4, 4); + +/* declare some AVX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} + +_PS256_CONST(1, 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1); 
+_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + + +#define avx2_mm256_slli_epi32 _mm256_slli_epi32 +#define avx2_mm256_srli_epi32 _mm256_srli_epi32 +#define avx2_mm256_and_si256 _mm256_and_si256 +#define avx2_mm256_andnot_si256 _mm256_andnot_si256 +#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 +#define avx2_mm256_sub_epi32 _mm256_sub_epi32 +#define avx2_mm256_add_epi32 _mm256_add_epi32 + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +inline v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps( + x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x, x); + + v8sf y = 
*(v8sf*)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +inline v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + + /* how to perform a floorf 
with SSE: just below */ + // imm0 = _mm256_cvttps_epi32(fx); + // tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, substract 1 */ + // v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = avx2_mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +inline v8sf exp256_ps_fma(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + + fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + + + tmp = _mm256_floor_ps(fx); + + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = 
_mm256_mul_ps(x, x); + + v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + y = _mm256_fmadd_ps(y, x, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_fmadd_ps(y, x, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_fmadd_ps(y, x, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_fmadd_ps(y, x, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_fmadd_ps(y, x, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_fmadd_ps(y, z, x); + y = _mm256_add_ps(y, one); + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = avx2_mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +inline __m256 _mm256_expfaster_ps(const __m256& a) { + + const __m256 C1 = _mm256_set1_ps(1064872507.1541044f); + const __m256 C2 = _mm256_set1_ps(12102203.161561485f); + + return _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_fmadd_ps(C2, a, C1))); +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + +/* evaluation of 8 sines at onces using AVX intrisics + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. 
+*/ +inline v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ + + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); + imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4); + imm0 = avx2_mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +namespace anakin { +namespace saber { + +static inline __m512 _mm512_expfaster_ps(const __m512& a) { + + const __m512 C1 = _mm512_set1_ps(1064872507.1541044f); + const __m512 C2 = _mm512_set1_ps(12102203.161561485f); + + return _mm512_castsi512_ps(_mm512_cvttps_epi32(_mm512_fmadd_ps(C2, a, C1))); +} + +inline __m512 exp512_ps_fma(__m512 x) { + __m512 tmp = _mm512_setzero_ps(), fx; + __m512i imm0; + __m512 one = _mm512_set1_ps(1.f); + __m512 _ps512_exp_hi = _mm512_set1_ps(88.3762626647949f); + __m512 _ps512_exp_lo = _mm512_set1_ps(-88.3762626647949f); + x = _mm512_min_ps(x, _ps512_exp_hi); + x = _mm512_max_ps(x, _ps512_exp_lo); + + __m512 _ps512_cephes_LOG2EF = _mm512_set1_ps(1.44269504088896341f); + fx = _mm512_mul_ps(x, _ps512_cephes_LOG2EF); + __m512 _ps512_0p5 = _mm512_set1_ps(0.5); + fx = _mm512_add_ps(fx, _ps512_0p5); + + tmp = _mm512_floor_ps(fx); + + 
//TODO:check _mm512_cmp_ps_mask _mm512_cmp_ps + __mmask16 mask_16 = _mm512_cmp_ps_mask(tmp, fx, _CMP_GT_OS); + __m512 zero = _mm512_setzero_ps(); + __m512 mask = _mm512_mask_add_ps(zero, mask_16, zero, one); + + // __m512 mask = _mm512_cmp_ps(tmp, fx, _CMP_GT_OS); + // mask = _mm512_and_ps(mask, one); + fx = _mm512_sub_ps(tmp, mask); + + __m512 _ps512_cephes_exp_C1 = _mm512_set1_ps(0.693359375f); + __m512 _ps512_cephes_exp_C2 = _mm512_set1_ps(-2.12194440E-4f); + tmp = _mm512_mul_ps(fx, _ps512_cephes_exp_C1); + __m512 z = _mm512_mul_ps(fx, _ps512_cephes_exp_C2); + x = _mm512_sub_ps(x, tmp); + x = _mm512_sub_ps(x, z); + z = _mm512_mul_ps(x, x); + + __m512 _ps512_cephes_exp_p0 = _mm512_set1_ps(1.9875691500E-4f); + __m512 _ps512_cephes_exp_p1 = _mm512_set1_ps(1.3981999507E-3f); + __m512 _ps512_cephes_exp_p2 = _mm512_set1_ps(8.3334519073E-3f); + __m512 _ps512_cephes_exp_p3 = _mm512_set1_ps(4.1665795894E-2f); + __m512 _ps512_cephes_exp_p4 = _mm512_set1_ps(1.6666665459E-1f); + __m512 _ps512_cephes_exp_p5 = _mm512_set1_ps(5.0000001201E-1f); + __m512 y = _ps512_cephes_exp_p0; + y = _mm512_fmadd_ps(y, x, _ps512_cephes_exp_p1); + y = _mm512_fmadd_ps(y, x, _ps512_cephes_exp_p2); + y = _mm512_fmadd_ps(y, x, _ps512_cephes_exp_p3); + y = _mm512_fmadd_ps(y, x, _ps512_cephes_exp_p4); + y = _mm512_fmadd_ps(y, x, _ps512_cephes_exp_p5); + y = _mm512_fmadd_ps(y, z, x); + y = _mm512_add_ps(y, one); + /* build 2^n */ + imm0 = _mm512_cvttps_epi32(fx); + // another two AVX2 instructions + __m512i _pi32_512_0x7f = _mm512_set1_epi32(0x7f); + imm0 = _mm512_add_epi32(imm0, _pi32_512_0x7f); + imm0 = _mm512_slli_epi32(imm0, 23); + __m512 pow2n = _mm512_castsi512_ps(imm0); + y = _mm512_mul_ps(y, pow2n); + return y; +} + +} +} +#endif +#endif //ANAKIN_SABER_AVX512_MATH_H \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_axpy.cpp b/saber/funcs/impl/x86/saber_axpy.cpp new file mode 100644 index 000000000..98ddd79a1 --- /dev/null +++ b/saber/funcs/impl/x86/saber_axpy.cpp @@ -0,0 
+1,56 @@ +#include "saber/funcs/impl/x86/saber_axpy.h" + +namespace anakin{ + +namespace saber{ + + +template +void axpy_kernel(const int len, const dtype* src, dtype* dst) { + if (dst != src) { + memcpy(dst, src, sizeof(dtype) * len); + } +} +template +SaberStatus SaberAxpy::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, AxpyParam ¶m) { + //compare + if (!(inputs[1]->valid_shape() == outputs[0]->valid_shape()) + || !(inputs[2]->valid_shape() == outputs[0]->valid_shape())) { + return SaberUnKownError; + } + + const OpDataType* scale = (OpDataType*)inputs[0]->data(); + const OpDataType* x = (OpDataType*)inputs[1]->data(); + const OpDataType* y = (OpDataType*)inputs[2]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); + + int num = inputs[2]->num(); + int channel = inputs[2]->channel(); + int size = inputs[2]->height() * inputs[2]->width(); + int in_channel = channel * size; + // scale*x + y + for (int i = 0; i < num; i++){ + const OpDataType* din_ptr = x + i * in_channel; + const OpDataType* bias_ptr = y + i * in_channel; + const OpDataType* scale_ptr = scale + i * channel; + OpDataType* dout_ptr = dst + i * in_channel; + for(int j = 0; j < channel; j++){ + const OpDataType* din_ch_ptr = din_ptr + j * size; + OpDataType* dout_ch_ptr = dout_ptr + j * size; + const OpDataType* scale_ch_ptr = scale_ptr + j; + const OpDataType* bias_ch_ptr = bias_ptr + j * size; + for (int k = 0; k < size; k++){ + dout_ch_ptr[k] = din_ch_ptr[k] * scale_ch_ptr[0] + bias_ch_ptr[k]; + } + } + } + return SaberSuccess; +} + +template class SaberAxpy; +DEFINE_OP_TEMPLATE(SaberAxpy, AxpyParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberAxpy, AxpyParam, X86, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_axpy.h b/saber/funcs/impl/x86/saber_axpy.h new file mode 100644 index 000000000..6ce96d93d --- /dev/null +++ b/saber/funcs/impl/x86/saber_axpy.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. 
All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AXPY_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AXPY_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_axpy.h" +#include "saber/core/tensor.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberAxpy : \ + public ImplBase< + X86, + OpDtype, + AxpyParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberAxpy() = default; + ~SaberAxpy() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + AxpyParam ¶m, Context &ctx){ + //get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + AxpyParam ¶m, Context &ctx){ + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + AxpyParam ¶m)override; + +private: +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AXPY_H diff --git a/saber/funcs/impl/x86/saber_cast.cpp b/saber/funcs/impl/x86/saber_cast.cpp new file mode 100644 index 000000000..4cba1d85e --- /dev/null +++ b/saber/funcs/impl/x86/saber_cast.cpp @@ -0,0 +1,52 @@ +#include "saber/funcs/impl/x86/saber_cast.h" + +namespace anakin{ + +namespace saber{ + +template +void cast_kernel(const Dtype* src, Ttype* dst, int count) { + for (int i = 0; i < count; i++){ + dst[i] = 
static_cast(src[i]); + } +} + +template +SaberStatus SaberCast::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, CastParam ¶m) { + + int count = inputs[0]->valid_size(); + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + + if(_inDtype == _outDtype){ + outputs[0]->copy_from(*inputs[0]); + return SaberSuccess; + } + + if(inputs[0]->get_dtype() == 1){//AK_FLOAT + const float* in_data = (const float*)inputs[0]->data(); + int* out_data = (int*)outputs[0]->mutable_data(); + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { + cast_kernel(in_data, out_data, count); + } + + } + + if(inputs[0]->get_dtype() == 5){//AK_INT32 + const int* in_data = (const int*)inputs[0]->data(); + float* out_data = (float*)outputs[0]->mutable_data(); + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { + cast_kernel(in_data, out_data, count); + } + } + + return SaberSuccess; +} + +template class SaberCast; +template class SaberCast; +DEFINE_OP_TEMPLATE(SaberCast, CastParam, X86, AK_INT16); +DEFINE_OP_TEMPLATE(SaberCast, CastParam, X86, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_cast.h b/saber/funcs/impl/x86/saber_cast.h new file mode 100644 index 000000000..2b65bcf3b --- /dev/null +++ b/saber/funcs/impl/x86/saber_cast.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CAST_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CAST_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_cast.h" +#include "saber/core/tensor.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberCast : \ + public ImplBase< + X86, + OpDtype, + CastParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberCast() = default; + ~SaberCast() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + CastParam ¶m, Context &ctx){ + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + CastParam ¶m, Context &ctx){ + + _inDtype = param.in_type; + _outDtype = param.out_type; + if(_inDtype != 1 && _inDtype !=5){// AK_FLOAT AK_INT32 + LOG(FATAL) << "Cast not impl other type: " << _inDtype; + } + if(_outDtype != 1 && _outDtype !=5){ + LOG(FATAL) << "Cast not impl other type: " << _outDtype; + } + CHECK_EQ(_inDtype, inputs[0]->get_dtype()) << "inputs data type should be same with param.in_type"; + CHECK_EQ(_outDtype, outputs[0]->get_dtype()) << "outputs data type should be same with param.out_type"; + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + CastParam ¶m)override; + +private: + int _inDtype; + int _outDtype; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_Cast_H diff --git a/saber/funcs/impl/x86/saber_concat.cpp b/saber/funcs/impl/x86/saber_concat.cpp index 0de93b19d..8319492ab 100644 --- a/saber/funcs/impl/x86/saber_concat.cpp +++ b/saber/funcs/impl/x86/saber_concat.cpp @@ -1,7 +1,5 @@ #include "saber/funcs/impl/x86/saber_concat.h" -#ifdef USE_X86_PLACE - namespace anakin{ namespace saber{ @@ -13,20 +11,11 @@ void concat_kernel(const int len, const dtype* src, dtype* dst) { } } -template -SaberStatus 
SaberConcat::dispatch(\ - const std::vector& inputs, - std::vector& outputs, - ConcatParam ¶m) { +template +SaberStatus SaberConcat::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, ConcatParam ¶m) { int input_size = inputs.size(); - //! get output data, valid shape and stride shape int offset_concat_axis = 0; Shape out_shape = outputs[0]->valid_shape(); @@ -37,29 +26,27 @@ LayOutType_op, LayOutType_in, LayOutType_out>::dispatch(\ return SaberSuccess; } - OutDataType* dout = outputs[0]->mutable_data(); + OpDataType* dout = (OpDataType*)outputs[0]->mutable_data(); for (int i = 0; i < input_size; ++i) { Shape sh_in = inputs[i]->valid_shape(); - const InDataType* din = inputs[i]->data(); + const OpDataType* din = (const OpDataType*)inputs[i]->data(); const int in_concat_axis = sh_in[param.axis]; for (int n = 0; n < _num_concats; ++n) { - concat_kernel(in_concat_axis * _concat_input_size, + concat_kernel(in_concat_axis * _concat_input_size, din + n * in_concat_axis * _concat_input_size, dout + (n * out_concat_axis + offset_concat_axis) * _concat_input_size); } offset_concat_axis += in_concat_axis; } - CHECK_GE(inputs[0]->get_seq_offset().size(), 2); outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); return SaberSuccess; } -template class SaberConcat; - +template class SaberConcat; +DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, X86, AK_INT8); } //namespace anakin } //namespace anakin - -#endif // USE_X86_PLACE \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_concat.h b/saber/funcs/impl/x86/saber_concat.h index 49681da37..1566d3d33 100644 --- a/saber/funcs/impl/x86/saber_concat.h +++ b/saber/funcs/impl/x86/saber_concat.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,56 +20,42 @@ #include "saber/funcs/impl/impl_concat.h" #include "saber/core/tensor.h" -#ifdef USE_X86_PLACE - namespace anakin{ namespace saber{ -template -class SaberConcat : \ +template +class SaberConcat : \ public ImplBase< - Tensor, - Tensor, - Tensor, - ConcatParam > > { + X86, + OpDtype, + ConcatParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; + typedef typename DataTrait::Dtype OpDataType; SaberConcat() = default; ~SaberConcat() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConcatParam ¶m, Context &ctx){ + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ConcatParam ¶m, Context &ctx){ // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConcatParam ¶m, Context &ctx){ + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ConcatParam ¶m, Context &ctx){ _num_concats = inputs[0]->count_valid(0, param.axis); _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConcatParam ¶m); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConcatParam ¶m)override; private: int _num_concats; @@ -80,6 +66,4 @@ class SaberConcat +SaberStatus SaberConv2D::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + this->_ctx = &ctx; + return this->impl->create(inputs, outputs, param, ctx); +} 
+template <> +SaberStatus SaberConv2D::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + + this->_ctx = &ctx; + bool use_avx512 = mayiuse(avx512_common); + bool use_avx2 = mayiuse(avx2); + if (use_avx512 && param.group == inputs[0]->channel() && param.group == outputs[0]->channel()) { + this->impl = new JitUniDWConv; + } else if (use_avx512 && param.weight()->height() == 1 && param.weight()->width() == 1) { + this->impl = new JitAvx512Conv1x1; + } else if (use_avx512 && outputs[0]->get_layout() == Layout_NCHW_C16) { + this->impl = new JitAvx512Conv; + } else if (use_avx2 && (outputs[0]->get_layout() == Layout_NCHW_C8)) { + this->impl = new JitAvx2Conv; + } else { + this->impl = new SaberIm2colConv; + } + this->impl->init(inputs, outputs, param, ctx); + return create(inputs, outputs, param, ctx); + +} + +template <> +SaberStatus SaberConv2D::\ + dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + return this->impl->dispatch(inputs, outputs, param); +} + +template <> +SaberStatus SaberConv2D::\ + create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + return SaberInvalidValue; +} + +template <> +SaberStatus SaberConv2D::\ + init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + return SaberInvalidValue; +} + +template <> +SaberStatus SaberConv2D::\ + dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + + return SaberInvalidValue; +} +DEFINE_OP_TEMPLATE(SaberConv2D, ConvParam, X86, AK_INT16); +} +} diff --git a/saber/funcs/impl/x86/saber_conv.h b/saber/funcs/impl/x86/saber_conv.h new file mode 100644 index 000000000..8103dfe7d --- /dev/null +++ b/saber/funcs/impl/x86/saber_conv.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_H + +#include "saber/funcs/impl/impl_conv.h" + +namespace anakin { +namespace saber { + +template +class SaberConv2D : public ImplBase< + X86, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + typedef ImplBase > Impl_t; + + SaberConv2D() + : impl(nullptr) + {} + + ~SaberConv2D() { + if (impl != nullptr) { + delete impl; + } + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param); + + SaberStatus trans_weights(Tensor &target_weights, + int stride_h, int stride_w, int group) { + return SaberUnImplError; + } + +private: + Impl_t* impl; +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_H diff --git a/saber/funcs/impl/x86/saber_conv_act.cpp b/saber/funcs/impl/x86/saber_conv_act.cpp deleted file mode 100644 index f45137079..000000000 --- a/saber/funcs/impl/x86/saber_conv_act.cpp +++ /dev/null @@ -1,131 +0,0 @@ -#include "saber/saber_types.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/saber_funcs_param.h" -#include 
"saber/funcs/impl/x86/saber_conv_act.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/jit_uni_dw_convolution.h" -#include "saber/funcs/impl/x86/jit_avx512_conv1x1_act.h" -#include "saber/funcs/impl/x86/jit_avx512_conv_act.h" -#include "saber/funcs/impl/x86/jit_avx2_conv_act.h" - -namespace anakin { -namespace saber { - -template -SaberStatus SaberConv2DAct::init( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) -{ - SaberStatus ret = SaberUnImplError; - - ConvParam *conv_param = &(param.conv_param); - const OpTensor *weight = conv_param->weight(); - Shape weight_shape(weight->shape()); - - // go to different engines per different input parameters - if (conv_param->group == weight_shape[0] && conv_param->group == weight_shape[1]) { - // depth-wise convolution - if (this->impl) { - delete this->impl; - } - this->impl = new JitUniDWConvolution; - ret = this->impl->init(inputs, outputs, param, ctx); - if (ret == SaberSuccess) { -// LOG(INFO) << "++++++++++++JitUniDWConvolution"; - return ret; - } - } else if (weight_shape[2] == 1 && weight_shape[3] == 1) { - // 1x1 convolution+act - if (this->impl) { - delete this->impl; - } - this->impl = new JitAvx512Conv1x1Act; - ret = this->impl->init(inputs, outputs, param, ctx); - if (ret == SaberSuccess) { -// LOG(INFO) << "++++++++++++JitAvx512Conv1x1Act"; - return ret; - } - } else if (std::is_same::value) { - if (this->impl) { - delete this->impl; - } - this->impl = new JitAvx512ConvAct; - ret = this->impl->init(inputs, outputs, param, ctx); - if (ret == SaberSuccess) { -// LOG(INFO) << "++++++++++++JitAvx512ConvAct"; - return ret; - } - } else if (std::is_same::value) { - if (this->impl) { - delete this->impl; - } - this->impl = new JitAvx2ConvAct; - ret = this->impl->init(inputs, outputs, param, ctx); - if (ret == SaberSuccess) { -// LOG(INFO) << "++++++++++++JitAvx2ConvAct"; - return ret; - } - } - return SaberUnImplError; -} - -template 
-SaberStatus SaberConv2DAct::create( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) -{ - SaberStatus ret = SaberSuccess; - if (!this->impl) { - LOG(ERROR) << "impl is NULL"; - return SaberNotInitialized; - } - ret = this->impl->create(inputs, outputs, param, ctx); - return ret; -} - -template -SaberStatus SaberConv2DAct::dispatch( - const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) -{ - SaberStatus ret = SaberSuccess; - if (!this->impl) { - LOG(ERROR) << "impl is NULL"; - return SaberNotInitialized; - } - ret = this->impl->dispatch(inputs, outputs, param); - return ret; -} -template class SaberConv2DAct; -template class SaberConv2DAct; -template class SaberConv2DAct; -//template class SaberConvAct; -} -} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_conv_act.h b/saber/funcs/impl/x86/saber_conv_act.h deleted file mode 100644 index 7223759f7..000000000 --- a/saber/funcs/impl/x86/saber_conv_act.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_ACT_H - -#include "saber/funcs/impl/impl_conv_act.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" -#include "saber/funcs/impl/x86/jit_uni_dw_convolution.h" -#include "saber/funcs/impl/x86/jit_avx512_conv1x1_act.h" -#include "saber/funcs/impl/x86/jit_avx512_conv_act.h" -#include "saber/funcs/impl/x86/jit_avx2_conv_act.h" - -namespace anakin { -namespace saber { -template -class SaberConv2DAct : public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActiveParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef ConvActiveParam Param_t; - typedef ImplBase Impl_t; - - SaberConv2DAct() - : impl(NULL) - {} - - ~SaberConv2DAct() { - if (impl != NULL) { - delete impl; - } - } - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - ConvActiveParam ¶m) override; - -private: - Impl_t* impl; -}; -} -} -#endif \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_conv_act_pooling.cpp b/saber/funcs/impl/x86/saber_conv_act_pooling.cpp deleted file mode 100644 index 8b88ac656..000000000 --- a/saber/funcs/impl/x86/saber_conv_act_pooling.cpp +++ /dev/null @@ -1,151 +0,0 @@ - -#include "saber/funcs/impl/x86/saber_conv_act_pooling.h" -#include "saber/funcs/impl/x86/saber_conv_act.h" -#include "saber/funcs/impl/x86/saber_pooling.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" - -namespace anakin{ -namespace saber { - -using namespace jit; - -template -SaberStatus SaberConv2DActPooling::init( - const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam ¶m, - Context &ctx) -{ - SaberStatus ret = 
SaberUnImplError; - - Convact_param_t c_param(param.conv_param, param.activation_param); - Pooling_param_t p_param = param.pooling_param; - - if (!((std::is_same::value - && std::is_same::value - && std::is_same::value) || - (std::is_same::value - && std::is_same::value - && std::is_same::value))) { - return ret; - } - - Shape out = outputs[0]->shape(); - Shape shape_buf(out[0], out[1], ((out[2] - 1) * p_param.stride_h) - + p_param.window_h - p_param.pad_h, - ((out[3] - 1) * p_param.stride_w) - + p_param.window_w - p_param.pad_w, 16); - -// std::cout << "buf shape n:" << shape_buf[0] -// << " c:" << shape_buf[1] -// << " h:" << shape_buf[2] -// << " w:" << shape_buf[3] -// << std::endl; - DataTensor_out *b_info = new DataTensor_out(shape_buf); - std::for_each(this->buf.begin(), this->buf.end(), - [&](DataTensor_out* t) { - delete t; - t = nullptr; - }); - buf.push_back(b_info); - - this->c_impl = new SaberConv2DAct; - ret = this->c_impl->init(inputs, buf, c_param, ctx); - if (ret != SaberSuccess) { - return ret; - } - - this->p_impl = new SaberPooling; - ret = this->p_impl->init(buf, outputs, p_param, ctx); - return ret; -} - -template -SaberStatus SaberConv2DActPooling::create( - const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam ¶m, - Context &ctx) -{ - SaberStatus ret = SaberSuccess; - if (!this->c_impl || !this->p_impl) { - LOG(ERROR) << "impl is NULL"; - return SaberNotInitialized; - } - - Convact_param_t c_param(param.conv_param, param.activation_param); - Pooling_param_t p_param = param.pooling_param; - - Shape out = outputs[0]->shape(); - Shape shape_buf(out[0], out[1], ((out[2] - 1) * p_param.stride_h) - + p_param.window_h - p_param.pad_h, - ((out[3] - 1) * p_param.stride_w) - + p_param.window_w - p_param.pad_w, 16); - - LOG(INFO) << "create buf shape n:" << shape_buf[0] - << " c:" << shape_buf[1] - << " h:" << shape_buf[2] - << " w:" << shape_buf[3] - << std::endl; - - DataTensor_out *b_info = new DataTensor_out(shape_buf); - 
std::for_each(this->buf.begin(), this->buf.end(), - [&](DataTensor_out* t) { - delete t; - t = nullptr; - }); - buf.push_back(b_info); - - ret = this->c_impl->create(inputs, buf, c_param, ctx); - if (ret != SaberSuccess) { - return ret; - } - - ret = this->p_impl->create(buf, outputs, p_param, ctx); - return ret; -} - -template -SaberStatus SaberConv2DActPooling::dispatch( - const std::vector& inputs, - std::vector& outputs, - ConvActivePoolingParam ¶m) -{ - SaberStatus ret = SaberSuccess; - if (!this->c_impl || !this->p_impl) { - LOG(ERROR) << "impl is NULL"; - return SaberNotInitialized; - } - - Convact_param_t c_param(param.conv_param, param.activation_param); - Pooling_param_t p_param = param.pooling_param; - ret = this->c_impl->dispatch(inputs, buf, c_param); - if (ret != SaberSuccess) { - return ret; - } - ret = this->p_impl->dispatch(buf, outputs, p_param); - return ret; -} -template class SaberConv2DActPooling; -} -} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_conv_act_pooling.h b/saber/funcs/impl/x86/saber_conv_act_pooling.h deleted file mode 100644 index f3a73d9af..000000000 --- a/saber/funcs/impl/x86/saber_conv_act_pooling.h +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_ACT_POOLING_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_ACT_POOLING_H - -#include "saber/funcs/impl/impl_conv_act_pooling.h" -#include "saber/funcs/impl/x86/jit_call_conf.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -template -class SaberConv2DActPooling : public ImplBase< - Tensor, - Tensor, - Tensor, - ConvActivePoolingParam > > { -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef ConvActivePoolingParam Param_t; - typedef ImplBase Impl_t; - typedef ConvActiveParam Convact_param_t; - typedef PoolingParam Pooling_param_t; - - typedef ImplBase Conv_impl_t; - typedef ImplBase Pooling_impl_t; - - SaberConv2DActPooling() - : c_impl(nullptr), p_impl(nullptr) {} - - ~SaberConv2DActPooling() { - if (c_impl != nullptr) { - delete c_impl; - } - if (p_impl != nullptr) { - delete p_impl; - } - std::for_each(this->buf.begin(), this->buf.end(), - [&](DataTensor_out *t) { - delete t; - t = nullptr; - }); - } - - virtual SaberStatus init(const std::vector &inputs, - std::vector &outputs, - ConvActivePoolingParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector &inputs, - std::vector &outputs, - ConvActivePoolingParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector &inputs, - std::vector &outputs, - ConvActivePoolingParam ¶m) override; - -private: - Conv_impl_t *c_impl = nullptr; - Pooling_impl_t *p_impl = nullptr; - std::vector buf; -}; - -} -} - -#endif \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_crf_decoding.cpp b/saber/funcs/impl/x86/saber_crf_decoding.cpp index ea85ebc62..a3140625a 100644 --- a/saber/funcs/impl/x86/saber_crf_decoding.cpp +++ b/saber/funcs/impl/x86/saber_crf_decoding.cpp @@ -1,68 +1,158 @@ #include "saber/funcs/impl/x86/saber_crf_decoding.h" #include "saber/saber_funcs_param.h" +#include "x86_utils.h" #include #include #include 
+#include -namespace anakin{ +namespace anakin { namespace saber { -template -SaberStatus SaberCrfDecoding::init( - const std::vector& inputs, - std::vector& outputs, - CrfDecodingParam ¶m, Context &ctx) { - - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; +template +SaberStatus SaberCrfDecoding::init( + const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam ¶m, Context &ctx) { + + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } -template -SaberStatus SaberCrfDecoding::create( - const std::vector& inputs, - std::vector& outputs, - CrfDecodingParam ¶m, +template +SaberStatus SaberCrfDecoding::create( + const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam ¶m, Context &ctx) { - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - _alpha.re_alloc(inputs[0]->valid_shape()); - _track.re_alloc(inputs[0]->valid_shape()); + CHECK_EQ(inputs[0]->get_dtype(), OpDtype) << "inputs data type should be same with OpDtype"; + CHECK_EQ(outputs[0]->get_dtype(), OpDtype) << "outputs data type should be same with OpDtype"; + + this->_ctx = &ctx; + _track.re_alloc(inputs[0]->valid_shape(), AK_INT32); + +#ifdef __AVX2__ + int tag_num = inputs[0]->channel(); + _aligned_tag_num = (tag_num % 8) ? 
(tag_num / 8 + 1) * 8 : tag_num; + // get transposed transition weight + const OpDataType *transition_ptr = (const OpDataType*)param.transition_weight()->data(); + Shape trans_shape({tag_num + 2, _aligned_tag_num, 1, 1}, Layout_NCHW); + _trans.re_alloc(trans_shape, OpDtype); + OpDataType *transition = (OpDataType*)_trans.mutable_data(); + memcpy(transition, transition_ptr, sizeof(OpDataType) * tag_num); + memcpy(transition + _aligned_tag_num, transition_ptr + tag_num, sizeof(OpDataType) * tag_num); + for (int i = 0; i < tag_num; i++) { + for (int j = 0; j < tag_num; j++) { + transition[(i + 2) * _aligned_tag_num + j] = transition_ptr[(j + 2) * tag_num + i]; + } + for (int j = tag_num; j < _aligned_tag_num; j++) { + transition[(i + 2) * _aligned_tag_num + j] = 0; + } + } + + Shape emis_shape({inputs[0]->num(), _aligned_tag_num, 1, 1}, Layout_NCHW); + _emis.re_alloc(emis_shape, OpDtype); + _alpha.re_alloc(emis_shape, OpDtype); +#else + _alpha.re_alloc(inputs[0]->valid_shape(), OpDtype); +#endif return SaberSuccess; } -template -void decoding(dtype* path, const dtype* emission, const dtype* transition, - dtype* alpha_value, int* track_value, int seq_len, int tag_num) { - const dtype* x = emission; - const dtype* w = transition; + +template +void decoding(Dtype* path, const Dtype* emission, const Dtype* transition, + Dtype* alpha_value, int* track_value, int aligned_tag_num, int seq_len, int tag_num) { +#ifdef __AVX2__ + const Dtype* x = emission; + const Dtype* w = transition; + const int state_trans_base_idx = 2; + + { + __m256 *ww = (__m256*)w; + __m256 *xx = (__m256*)x; + __m256 *aa = (__m256*)alpha_value; + for (int i = 0; i < aligned_tag_num / 8; ++i) { + aa[i] = ww[i] + xx[i]; + } + } + + int tail = ((aligned_tag_num == tag_num) ? 
8 : tag_num % 8); + + for (int k = 1; k < seq_len; ++k) { + for (int i = 0; i < tag_num; ++i) { + Dtype max_score = -std::numeric_limits::max(); + int max_j = 0; + + __m256 *aa = (__m256*)(alpha_value + (k - 1) * aligned_tag_num); + __m256 *ww = (__m256*)(w + (i + state_trans_base_idx) * aligned_tag_num); + __m256 score_v; + Dtype *score = (Dtype*)(&score_v); + for (size_t j = 0; j < aligned_tag_num / 8 - 1; ++j) { + score_v = aa[j] + ww[j]; + for (int m = 0; m < 8; m++) { + if (score[m] > max_score) { + max_score = score[m]; + max_j = j * 8 + m; + } + } + } + int tail_idx = aligned_tag_num / 8 - 1; + score_v = aa[tail_idx] + ww[tail_idx]; + for (int m = 0; m < tail; m++) { + if (score[m] > max_score) { + max_score = score[m]; + max_j = tail_idx * 8 + m; + } + } + + alpha_value[k * aligned_tag_num + i] = max_score + x[k * aligned_tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + + Dtype max_score = -std::numeric_limits::max(); + int max_i = 0; + __m256* aa = (__m256*)(alpha_value + (seq_len - 1) * aligned_tag_num); + __m256* ww = (__m256*)(w + aligned_tag_num); + __m256 score_v; + Dtype *score = (Dtype*)(&score_v); + for (size_t i = 0; i < aligned_tag_num / 8 - 1; ++i) { + score_v = aa[i] + ww[i]; + for (int m = 0; m < 8; m++) { + if (score[m] > max_score) { + max_score = score[m]; + max_i = i * 8 + m; + } + } + } + int tail_idx = aligned_tag_num / 8 - 1; + score_v = aa[tail_idx] + ww[tail_idx]; + for (int m = 0; m < tail; m++) { + if (score[m] > max_score) { + max_score = score[m]; + max_i = tail_idx * 8 + m; + } + } + + path[seq_len - 1] = max_i; + for (int k = seq_len - 1; k >= 1; --k) { + path[k - 1] = max_i = track_value[k * tag_num + max_i]; + } +#else + const Dtype* x = emission; + const Dtype* w = transition; const int state_trans_base_idx = 2; for (int i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; for (int k = 1; k < seq_len; ++k) { for (int i = 0; i < tag_num; ++i) { - dtype max_score = -std::numeric_limits::max(); + Dtype max_score 
= -std::numeric_limits::max(); int max_j = 0; for (size_t j = 0; j < tag_num; ++j) { - dtype score = alpha_value[(k - 1) * tag_num + j] + + Dtype score = alpha_value[(k - 1) * tag_num + j] + w[(j + state_trans_base_idx) * tag_num + i]; if (score > max_score) { max_score = score; @@ -73,10 +163,10 @@ void decoding(dtype* path, const dtype* emission, const dtype* transition, track_value[k * tag_num + i] = max_j; } } - dtype max_score = -std::numeric_limits::max(); + Dtype max_score = -std::numeric_limits::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) { - dtype score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; + Dtype score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; if (score > max_score) { max_score = score; max_i = i; @@ -86,46 +176,64 @@ void decoding(dtype* path, const dtype* emission, const dtype* transition, for (int k = seq_len - 1; k >= 1; --k) { path[k - 1] = max_i = track_value[k * tag_num + max_i]; } +#endif } -template -SaberStatus SaberCrfDecoding::dispatch( - const std::vector& inputs, - std::vector& outputs, - CrfDecodingParam ¶m) { - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - - std::vector seq_offset = inputs[0]->get_seq_offset(); - - const DataType_in *emission_ptr = inputs[0]->data(); - const DataType_op *transition_ptr = param.transition_weight()->data(); - DataType_out *decoded_path = outputs[0]->mutable_data(); - - int seq_num = seq_offset.size() - 1; - int slice_size = inputs[0]->channel() - * inputs[0]->height() - * inputs[0]->width(); +template +SaberStatus SaberCrfDecoding::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam ¶m) { + std::vector> seq_offset = inputs[0]->get_seq_offset(); + + const OpDataType *emission_ptr = (const OpDataType*)inputs[0]->data(); + int tag_num = inputs[0]->channel(); + const OpDataType *transition_ptr = (const 
OpDataType*)param.transition_weight()->data(); + int slice_size = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width(); + +#ifdef __AVX2__ + if (tag_num % 8) { + transition_ptr = (OpDataType*)_trans.data(); + + // align emission to AVX2 register width + OpDataType *emission = (OpDataType*)_emis.mutable_data(); + for (int i = 0; i < inputs[0]->num(); i++) { + OpDataType* to = emission + i * _aligned_tag_num; + OpDataType* from = emission_ptr + i * tag_num; + memcpy(to, from, tag_num * sizeof(OpDataType)); + for (int j = tag_num; j < _aligned_tag_num; j++) { + to[j] = 0; + } + } + emission_ptr = emission; + slice_size = _aligned_tag_num; + } +#endif + OpDataType *decoded_path = (OpDataType*) outputs[0]->mutable_data(); + int seq_num = seq_offset[0].size() - 1; + int nthreads = omp_get_max_threads(); + + if (nthreads > seq_num) { + nthreads = seq_num; + } + #pragma omp parallel for num_threads(nthreads) if(seq_num > 1) for (int i = 0; i < seq_num; ++i) { - int seq_len = seq_offset[i+1] - seq_offset[i]; - decoding(decoded_path, emission_ptr, transition_ptr, - _alpha.mutable_data(), _track.mutable_data(), - seq_len, inputs[0]->channel()); + int seq_len = seq_offset[0][i+1] - seq_offset[0][i]; + // LOG(INFO) << "slice_size: " << slice_size << ", seq_num: " << seq_num << ", seq_len: " << seq_len; + decoding(decoded_path, emission_ptr, transition_ptr, + (OpDataType*)_alpha.mutable_data(), (int*)_track.mutable_data(), + _aligned_tag_num, seq_len, tag_num); decoded_path += seq_len; emission_ptr += slice_size * seq_len; } + //LOG(INFO) << "dispatch success "; return SaberSuccess; - } -template class SaberCrfDecoding; + +template class SaberCrfDecoding; +DEFINE_OP_TEMPLATE(SaberCrfDecoding, CrfDecodingParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberCrfDecoding, CrfDecodingParam, X86, AK_INT8); } } // namespace anakin diff --git a/saber/funcs/impl/x86/saber_crf_decoding.h b/saber/funcs/impl/x86/saber_crf_decoding.h index 5b9ef3238..c643c36fd 100644 --- 
a/saber/funcs/impl/x86/saber_crf_decoding.h +++ b/saber/funcs/impl/x86/saber_crf_decoding.h @@ -21,49 +21,39 @@ namespace anakin{ namespace saber { -template -class SaberCrfDecoding : public ImplBase< - Tensor, - Tensor, - Tensor, - CrfDecodingParam > > +template +class SaberCrfDecoding : public ImplBase< + X86, OpDtype, + CrfDecodingParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; + typedef typename DataTrait::Dtype OpDataType; SaberCrfDecoding() = default; ~SaberCrfDecoding() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - CrfDecodingParam ¶m, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam ¶m, Context &ctx) override; - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - CrfDecodingParam ¶m, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam ¶m, Context &ctx) override; - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - CrfDecodingParam ¶m) override; + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + CrfDecodingParam ¶m) override; private: - DataTensor_in _alpha; - Tensor _track; + Tensor _alpha; + Tensor _track; + Tensor _trans; + Tensor _emis; + int _aligned_tag_num; }; } } -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/impl/x86/saber_crop.cpp b/saber/funcs/impl/x86/saber_crop.cpp new file mode 100644 index 000000000..5779f81a3 --- /dev/null +++ b/saber/funcs/impl/x86/saber_crop.cpp @@ -0,0 +1,39 @@ +#include "saber/funcs/impl/x86/saber_crop.h" + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberCrop::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + 
CropParam& param) { + typedef typename DataTrait::Dtype DataType_in; + typedef typename DataTrait::Dtype DataType_out; + int num = inputs[0] -> num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + const DataType_in* ptr_in = (const DataType_in*)inputs[0]->data(); + DataType_out* ptr_out = (DataType_out*)outputs[0]->mutable_data(); + for(int i =0; i < num; ++i){ + int offset_n = i * in_c * in_h * in_w; + for(int j=_c_off; j < _c_end; ++j){ + int offset_c = offset_n + j * in_h * in_w; + for(int k=_h_off; k < _h_end; ++k){ + int offset_h = offset_c + k * in_w; + for(int l=_w_off; l < _w_end; ++l){ + ptr_out[0]=ptr_in[offset_h + l]; + ptr_out++; + } + } + } + } + return SaberSuccess; +} + +template class SaberCrop; +DEFINE_OP_TEMPLATE(SaberCrop, CropParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberCrop, CropParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_crop.h b/saber/funcs/impl/x86/saber_crop.h new file mode 100644 index 000000000..89e657482 --- /dev/null +++ b/saber/funcs/impl/x86/saber_crop.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2018 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CROP_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CROP_H + +#include "saber/funcs/impl/impl_crop.h" +#include "saber/funcs/crop.h" +namespace anakin { +namespace saber { + +template +class SaberCrop : + public ImplBase< + X86, OpDtype, + CropParam > +{ +public: + + SaberCrop() + {} + + ~SaberCrop() { + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + CropParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + }; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + CropParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + this->_param = ¶m; + CHECK_EQ(param.shape.size(),4); + if (param.axis == 1) { + CHECK_EQ(param.offset.size(), 3); + _c_off = param.offset[0]; + _h_off = param.offset[1]; + _w_off = param.offset[2]; + _c_end = param.shape[1]+_c_off; + _h_end = param.shape[2]+_h_off; + _w_end = param.shape[3]+_w_off; + } else if (param.axis == 2) { + CHECK_EQ(param.offset.size(), 2); + _c_off = 0; + _h_off = param.offset[0]; + _w_off = param.offset[1]; + _c_end = param.shape[1]; + _h_end = param.shape[2]+_h_off; + _w_end = param.shape[3]+_w_off; + } else if (param.axis == 3) { + CHECK_EQ(param.offset.size(), 1); + _c_off = 0; + _h_off = 0; + _w_off = param.offset[0]; + _c_end = param.shape[1]; + _h_end = param.shape[2]; + _w_end = param.shape[3]+_w_off; + } else { + return SaberInvalidValue; + } + + return SaberSuccess; + }; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + CropParam ¶m) override; + +private: + int _c_off; + int _h_off; + int _w_off; + int _c_end; + int _h_end; + int _w_end; + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_eltwise.cpp b/saber/funcs/impl/x86/saber_eltwise.cpp index 1ea374ab1..97f27027d 100644 --- a/saber/funcs/impl/x86/saber_eltwise.cpp +++ b/saber/funcs/impl/x86/saber_eltwise.cpp @@ -1,136 +1,177 @@ - #include 
"saber/funcs/impl/x86/saber_eltwise.h" #include "saber/funcs/impl/x86/x86_utils.h" -namespace anakin{ +namespace anakin { namespace saber { -template class SaberEltwise; - -template -SaberStatus SaberEltwise::init( - const std::vector& inputs, - std::vector& outputs, - EltwiseParam ¶m, - Context &ctx) -{ +template class SaberEltwise; + +template +SaberStatus SaberEltwise::init( + const std::vector& inputs, + std::vector& outputs, + EltwiseParam& param, + Context& ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; + _with_relu = param.has_eltwise && param.activation_param.active == Active_relu; + _other_activation = param.has_eltwise && param.activation_param.active != Active_relu + && param.activation_param.active != Active_unknow; + + if (_other_activation) { + LOG(FATAL) << "not support other_activation"; + } + return create(inputs, outputs, param, ctx); } -template -SaberStatus SaberEltwise::create( - const std::vector& inputs, - std::vector& outputs, - EltwiseParam& param, - Context &ctx) -{ +template +SaberStatus SaberEltwise::create( + const std::vector& inputs, + std::vector& outputs, + EltwiseParam& param, + Context& ctx) { this->_param = ¶m; - if (this->_param->operation != Eltwise_sum) { - LOG(INFO) << "eltwise type " - << this->_param->operation << " is not supported now"; - return SaberUnImplError; - } - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; + this->_ctx = &ctx; return SaberSuccess; } +template +template +void SaberEltwise::simple_sum(const std::vector& inputs, + std::vector& outputs, + EltwiseParam& param) { + const int input_num = inputs.size(); + const size_t inner_size = inputs[0]->valid_size(); + OpDataType* target = (OpDataType*) outputs[0]->mutable_data(); + std::vector in_ptrs(input_num); + + for (int i = 0; i < input_num; ++i) { + in_ptrs[i] = (OpDataType*) inputs[i]->data(); + } + + const 
OpDataType* coeff = static_cast(param.coeff.data()); -template -void SaberEltwise::simple_sum( - const std::vector& inputs, + //TODO:can be SIMD to improve cache efficient + for (int inner_id = 0; inner_id < inner_size; ++inner_id) { + OpDataType tmp = coeff[0] * in_ptrs[0][inner_id]; + + for (int input_id = 1; input_id < input_num; ++input_id) { + tmp += coeff[input_id] * in_ptrs[input_id][inner_id]; + } + + if (with_relu) { + target[inner_id] = tmp > 0 ? tmp : 0; + } else { + target[inner_id] = tmp; + } + + } +} +template +template +void SaberEltwise::simple_prod(const std::vector& inputs, std::vector& outputs, - EltwiseParam& param){ - - const int num_arrs = inputs.size(); - const size_t nelems = inputs[0]->size(); - const size_t block_size = 16 * 1024 / sizeof(float); - const size_t blocks_number = nelems / block_size; - const size_t tail = nelems % block_size; -#pragma omp parallel - { - const int ithr = omp_get_thread_num(); - const int nthr = omp_get_num_threads(); - size_t start{0}, end{0}; - utils::balance211(blocks_number, nthr, ithr, start, end); - - for (size_t nb = start; nb < end; ++nb) { - size_t start_e = nb * block_size; - size_t end_e = start_e + block_size; - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] = param.coeff[0] * inputs[0]->mutable_data()[e]; - } - for (int a = 1; a < num_arrs; a++) { - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] += param.coeff[a] * inputs[a]->mutable_data()[e]; - } - } + EltwiseParam& param) { + const int input_num = inputs.size(); + const size_t inner_size = inputs[0]->valid_size(); + OpDataType* target = (OpDataType*) outputs[0]->mutable_data(); + std::vector in_ptrs(input_num); + + for (int i = 0; i < input_num; ++i) { + in_ptrs[i] = (OpDataType*) inputs[i]->data(); + } + + for (int inner_id = 0; inner_id < inner_size; ++inner_id) { + OpDataType tmp = in_ptrs[0][inner_id]; + + for (int input_id = 1; input_id < 
input_num; ++input_id) { + tmp *= in_ptrs[input_id][inner_id]; } - if (tail != 0 && ithr == nthr - 1) { - size_t start_e = nelems - tail; - size_t end_e = nelems; - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] = param.coeff[0] * inputs[0]->mutable_data()[e]; - } - for (int a = 1; a < num_arrs; a++) { - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] += param.coeff[a] * inputs[a]->mutable_data()[e]; - } - } + if (with_relu) { + target[inner_id] = tmp > 0 ? tmp : 0; + } else { + target[inner_id] = tmp; } } } -template -SaberStatus SaberEltwise::dispatch( - const std::vector& inputs, +template +template +void SaberEltwise::simple_max(const std::vector& inputs, std::vector& outputs, - EltwiseParam ¶m) -{ + EltwiseParam& param) { + const int input_num = inputs.size(); + volatile const size_t inner_size = inputs[0]->valid_size(); + OpDataType* target = (OpDataType*) outputs[0]->mutable_data(); + std::vector in_ptrs(input_num); + + for (int i = 0; i < input_num; ++i) { + in_ptrs[i] = (OpDataType*) inputs[i]->data(); + } + + for (int inner_id = 0; inner_id < inner_size; ++inner_id) { + OpDataType tmp = in_ptrs[0][inner_id]; + + for (int input_id = 1; input_id < input_num; ++input_id) { + tmp = tmp >= in_ptrs[input_id][inner_id] ? tmp : in_ptrs[input_id][inner_id]; + } + + if (with_relu) { + target[inner_id] = tmp > 0 ? 
tmp : 0; + } else { + target[inner_id] = tmp; + } + } +} + + +template +SaberStatus SaberEltwise::dispatch( + const std::vector& inputs, + std::vector& outputs, + EltwiseParam& param) { CHECK_EQ(outputs.size(), (size_t)1); + switch (param.operation) { - case Eltwise_sum: - simple_sum(inputs, outputs, param); - return SaberSuccess; - default: - return SaberUnImplError; + case Eltwise_sum: + if (_with_relu) { + simple_sum(inputs, outputs, param); + } else { + simple_sum(inputs, outputs, param); + } + + break; + + case Eltwise_prod: + if (_with_relu) { + simple_prod(inputs, outputs, param); + } else { + simple_prod(inputs, outputs, param); + } + + break; + + case Eltwise_max: + if (_with_relu) { + simple_max(inputs, outputs, param); + } else { + simple_max(inputs, outputs, param); + } + + break; + + default: + LOG(FATAL) << "unknown elementwise operation. "; } - -} + return SaberSuccess; + +} +template class SaberEltwise; +DEFINE_OP_TEMPLATE(SaberEltwise, EltwiseParam, X86, AK_INT16); +DEFINE_OP_TEMPLATE(SaberEltwise, EltwiseParam, X86, AK_INT8); } } // namespace anakin diff --git a/saber/funcs/impl/x86/saber_eltwise.h b/saber/funcs/impl/x86/saber_eltwise.h index ab87d9544..735c5a4ca 100644 --- a/saber/funcs/impl/x86/saber_eltwise.h +++ b/saber/funcs/impl/x86/saber_eltwise.h @@ -20,24 +20,16 @@ namespace anakin{ namespace saber { -template -class SaberEltwise : public ImplBase< - Tensor, - Tensor, - Tensor, - EltwiseParam > > +template +class SaberEltwise : public ImplBase< + X86, + OpDtype, + EltwiseParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef typename DataTrait::Dtype OpDataType; SaberEltwise() {} @@ -46,24 +38,36 @@ class SaberEltwise& inputs, std::vector& outputs, - EltwiseParam ¶m, + EltwiseParam ¶m, Context &ctx) override; virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - EltwiseParam ¶m, + 
EltwiseParam ¶m, Context &ctx) override; virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, - EltwiseParam ¶m) override; + EltwiseParam ¶m) override; private: + template void simple_sum(const std::vector& inputs, std::vector& outputs, - EltwiseParam ¶m); + EltwiseParam ¶m); + template + void simple_prod(const std::vector& inputs, + std::vector& outputs, + EltwiseParam ¶m); + template + void simple_max(const std::vector& inputs, + std::vector& outputs, + EltwiseParam ¶m); + + bool _with_relu; + bool _other_activation; }; } } -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/impl/x86/saber_eltwise_act.cpp b/saber/funcs/impl/x86/saber_eltwise_act.cpp deleted file mode 100644 index 14a701c58..000000000 --- a/saber/funcs/impl/x86/saber_eltwise_act.cpp +++ /dev/null @@ -1,159 +0,0 @@ -#include "saber/funcs/impl/x86/saber_eltwise_act.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -namespace anakin{ -namespace saber { - -template class SaberEltwiseActive; - -template -void SaberEltwiseActive::simple_sum( - const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam& param) { - const int num_arrs = inputs.size(); - const size_t nelems = inputs[0]->size(); - const size_t block_size = 16 * 1024 / sizeof(float); - const size_t blocks_number = nelems / block_size; - const size_t tail = nelems % block_size; -#pragma omp parallel - { - const int ithr = omp_get_thread_num(); - const int nthr = omp_get_num_threads(); - size_t start{0}, end{0}; - utils::balance211(blocks_number, nthr, ithr, start, end); - for (size_t nb = start; nb < end; ++nb) { - size_t start_e = nb * block_size; - size_t end_e = start_e + block_size; - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] = param.eltwise_param.coeff[0] - * inputs[0]->mutable_data()[e]; - } - for (int a = 1; a < num_arrs; a++) { - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] += 
param.eltwise_param.coeff[a] - * inputs[a]->mutable_data()[e]; - } - } - } - - if (tail != 0 && ithr == nthr - 1) { - size_t start_e = nelems - tail; - size_t end_e = nelems; - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] = param.eltwise_param.coeff[0] - * inputs[0]->mutable_data()[e]; - } - for (int a = 1; a < num_arrs; a++) { - // #pragma omp simd - for (size_t e = start_e; e < end_e; e++) { - outputs[0]->mutable_data()[e] += param.eltwise_param.coeff[a] - * inputs[a]->mutable_data()[e]; - } - } - } - } -} -template -void SaberEltwiseActive::simple_relu( - std::vector& outputs) { - - for (int i = 0; i < outputs[0]->size(); i++) { - if (outputs[0]->mutable_data()[i] < 0) { - outputs[0]->mutable_data()[i] = 0; - } - } -} - -template -SaberStatus SaberEltwiseActive::init( - const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m, Context &ctx) -{ - - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - - return create(inputs, outputs, param, ctx); -} - -template -SaberStatus SaberEltwiseActive - ::create(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m, Context &ctx) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - - return SaberSuccess; -} - -template -SaberStatus SaberEltwiseActive - ::dispatch(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m) -{ - CHECK_EQ(outputs.size(), (size_t)1); - switch (param.eltwise_param.operation) { - case Eltwise_sum: - simple_sum(inputs, outputs, param); - break; - default: - return SaberUnImplError; - } - - if (param.has_activation) { - switch (param.activation_param.active) { - case Active_relu: - simple_relu(outputs); - break; - default: - return 
SaberUnImplError; - } - } - return SaberSuccess; - -} - -} -} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_eltwise_act.h b/saber/funcs/impl/x86/saber_eltwise_act.h deleted file mode 100644 index 61897b946..000000000 --- a/saber/funcs/impl/x86/saber_eltwise_act.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ELTWISE_ACT_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ELTWISE_ACT_H - -#include "saber/funcs/impl/impl_eltwise_act.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -namespace anakin{ -namespace saber { - -template -class SaberEltwiseActive : public ImplBase< - Tensor, - Tensor, - Tensor, - EltwiseActiveParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - SaberEltwiseActive() - {} - - ~SaberEltwiseActive() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam ¶m) override; - -private: - void simple_sum(const std::vector& inputs, - std::vector& outputs, - EltwiseActiveParam& param); - - void simple_relu(std::vector& outputs); -}; - -} -} -#endif \ 
No newline at end of file diff --git a/saber/funcs/impl/x86/saber_embedding.cpp b/saber/funcs/impl/x86/saber_embedding.cpp index 3e92ab9d8..d4cdf0b4c 100644 --- a/saber/funcs/impl/x86/saber_embedding.cpp +++ b/saber/funcs/impl/x86/saber_embedding.cpp @@ -6,77 +6,66 @@ namespace anakin{ namespace saber { -template class SaberEmbedding; -template -SaberStatus SaberEmbedding::init( - const std::vector& inputs, - std::vector& outputs, - EmbeddingParam ¶m, +template +SaberStatus SaberEmbedding::init( + const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam ¶m, Context &ctx) { // get context - this->_ctx = ctx; + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } -template -SaberStatus SaberEmbedding::create( - const std::vector& inputs, - std::vector& outputs, - EmbeddingParam& param, +template +SaberStatus SaberEmbedding::create( + const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam ¶m, Context &ctx) { return SaberSuccess; } -template -SaberStatus SaberEmbedding::dispatch( - const std::vector& inputs, - std::vector& outputs, - EmbeddingParam ¶m) +template +SaberStatus SaberEmbedding::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam ¶m) { - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; + + typedef typename DataTrait::Dtype DataType_out; CHECK_EQ(inputs.size(), (size_t)1); CHECK_EQ(outputs.size(), (size_t)1); + CHECK_EQ(inputs[0]->get_dtype(), AK_FLOAT) << "embedding only support float inputs!"; outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); const int num_word = inputs[0]->valid_size(); - auto in_data = inputs[0]->data(); - auto out_data = outputs[0]->mutable_data(); + + //outputs: chose corresponding informations of words. + //inputs: word_id [Its type maybe float or int] + //outputs = weights[inputs[j]]. 
+ + const float *in_data = (const float*)inputs[0]->data(); + DataType_out *out_data = (DataType_out*)outputs[0]->mutable_data(); int emb_dim = param.emb_dim; for (int i = 0; i < num_word; i++) { if (in_data[i] == param.padding_idx) { memset(out_data + i * emb_dim, 0, sizeof(DataType_out) * emb_dim); - } else { + } else { CHECK_GE(in_data[i], 0); CHECK_LT(in_data[i], param.word_num); - memcpy(out_data + i * emb_dim, param.weight()->data(int(in_data[i]) * emb_dim), sizeof(DataType_out) * emb_dim); + memcpy(out_data + i * emb_dim, (DataType_out*)param.weight()->data()+int(in_data[i]) * emb_dim, sizeof(DataType_out) * emb_dim); } } - + } +template class SaberEmbedding; +template class SaberEmbedding; +DEFINE_OP_TEMPLATE(SaberEmbedding, EmbeddingParam, X86, AK_HALF); } -} // namespace anakin +} // namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_embedding.h b/saber/funcs/impl/x86/saber_embedding.h index 76dc49373..77ca1b105 100644 --- a/saber/funcs/impl/x86/saber_embedding.h +++ b/saber/funcs/impl/x86/saber_embedding.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 Anakin Authors All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,54 +13,40 @@ #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_EMBEDDING_H #include "saber/funcs/impl/impl_embedding.h" -#include "saber/saber_types.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/x86/x86_utils.h" -#include "saber/saber_funcs_param.h" -namespace anakin{ +namespace anakin { namespace saber { -template -class SaberEmbedding : public ImplBase< - Tensor, - Tensor, - Tensor, - EmbeddingParam > > -{ +template +class SaberEmbedding : + public ImplBase< + X86, OpDtype, + EmbeddingParam > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef typename DataTrait::Dtype OpDataType; - SaberEmbedding() - {} + SaberEmbedding() {} - ~SaberEmbedding() { - } + ~SaberEmbedding() {} - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - EmbeddingParam ¶m, + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam ¶m, Context &ctx) override; - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - EmbeddingParam ¶m, + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam ¶m, Context &ctx) override; - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - EmbeddingParam ¶m) override; + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + EmbeddingParam ¶m) override; + +private: + }; } } - -#endif +#endif \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_gru.cpp b/saber/funcs/impl/x86/saber_gru.cpp index 18ec58591..ec27034ab 100644 --- a/saber/funcs/impl/x86/saber_gru.cpp +++ b/saber/funcs/impl/x86/saber_gru.cpp @@ -3,78 +3,14 @@ #include "saber/funcs/impl/x86/saber_gru.h" #include "saber/core/tensor_op.h" #include "mkl_cblas.h" +#include "saber_normal_activation.h" #include -#include "avx_mathfun.h" +#include "sys/time.h" + namespace anakin { namespace saber { - -#define 
SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - -inline __m256 InValidAct(__m256 a) { - CHECK_EQ(0,1)<<"InValidAct"; -} - -inline __m256 Exp(__m256 a) { - return exp256_ps(a); - // return exp(a); -} - -inline __m256 Relu(const __m256 a) { - __m256 tmp = _mm256_set1_ps(0.0f); - return _mm256_max_ps(a, tmp); -} - -inline __m256 Sigmoid_fluid(const __m256 a) { - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 tmp = _mm256_max_ps(a, min); - tmp = _mm256_min_ps(tmp, max); - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = Exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; -} - -inline __m256 Sigmoid(const __m256 a) { - __m256 tmp = a; - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = Exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; -} - -inline __m256 Tanh_fluid(const __m256 a) { - __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = _mm256_min_ps(tmp, max); - tmp = Exp(tmp); - return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), - _mm256_set1_ps(1.0f)); -} - -inline __m256 Tanh(const __m256 a) { - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = Exp(tmp); - return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), - _mm256_set1_ps(1.0f)); -} - -__m256 Identity(const __m256 a) { - return a; -} - -static __m256 ( *act_funcs[10])(const __m256)={&InValidAct,&Sigmoid,&Relu,&Tanh,&InValidAct,\ - &InValidAct,&Identity,&Sigmoid_fluid,&Tanh_fluid}; - - //inline static void gemm(const bool TransA, const bool TransB, int m, int n, int k, const float alpha, const float* a, const float* b, const float beta, float* c) { @@ -85,629 +21,119 @@ static void 
gemm(const bool TransA, const bool TransB, int m, int n, int k, cons (!TransA/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE cuTransB = (!TransB/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, a, k, b, n, beta, c, n); -}; - -template -inline Dtype Sigmoid(const Dtype a) { - return static_cast(1.0) / (static_cast(1.0) + exp(-a)); -} - -template -inline Dtype Sigmoid_fluid(const Dtype a) { - const Dtype min = SIGMOID_THRESHOLD_MIN; - const Dtype max = SIGMOID_THRESHOLD_MAX; - Dtype tmp = (a < min) ? min : ((a > max) ? max : a); - return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); -} - -template -inline Dtype Tanh_fluid(const Dtype a) { - Dtype tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - -template -inline Dtype Tanh(const Dtype a) { - Dtype tmp = -2.0 * a; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - -template -inline Dtype Relu(const Dtype a) { - return a > static_cast(0.0) ? 
a : static_cast(0.0); -} - -template -inline Dtype Identity(const Dtype a) { - return a; -} - - -static float ( *act_funcs_f[10])(const float)={&InValidAct,&Sigmoid,&Relu,&Tanh,&InValidAct,\ - &InValidAct,&Identity,&Sigmoid_fluid,&Tanh_fluid}; - -template<> -SaberStatus SaberGru::naiv_gru(\ - const std::vector& inputs, - std::vector& outputs, - GruParam& param) { - CHECK_NE(param.formula, GRU_CUDNN) << "X86 gru not support cudnn formula now"; - const OpDataType* weight_h = _weights_h2h.data(); - const OpDataType* weight_w = _weights_i2h.data(); - const OpDataType* bias = _weights_bias.data(); - - float(* gat_act)(const float)=act_funcs_f[param.gate_activity]; - float(* h_act)(const float)=act_funcs_f[param.h_activity]; - - std::vector offset_vec = inputs[0]->get_seq_offset(); - bool is_hw2seq = offset_vec.size() > 2; - std::vector length_vec(offset_vec.size() - 1); - int batch_size = offset_vec.size() - 1; - int seqsum = 0; - int max_seq_len = 0; - - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - max_seq_len = max_seq_len > len ? max_seq_len : len; - length_vec[i] = len; - seqsum += len; - } - - int word_sum = is_hw2seq ? 
offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); - const OutDataType* h_init = nullptr; - - if (inputs.size() > 1) { - h_init = inputs[1]->data(); - } else if (param.init_hidden() != nullptr) { - h_init = param.init_hidden()->data(); - }else{ - _init_hidden.try_expand_size(batch_size*_hidden_size); - h_init=_init_hidden.data(); - } - - const InDataType* x = inputs[0]->data(); - OutDataType* out = outputs[0]->mutable_data(); - - bool is_reverse = param.is_reverse; - - // Shape wx_shaep(1,seqsum,3,_aligned_hidden_size_iter_num,_aligned_size); - _temp_wx.try_expand_size(seqsum * 3 * _hidden_size); - _temp_wh.try_expand_size(batch_size * 2 * _hidden_size); - _temp_whr.try_expand_size(batch_size * _hidden_size); - - OutDataType* temp_wh = _temp_wh.mutable_data(); - OutDataType* temp_wx = _temp_wx.mutable_data(); - OutDataType* temp_whr = _temp_whr.mutable_data(); - - -// LOG(INFO) << "gemm b" << inputs[0]->valid_shape().count() << "," << -// _weights_i2h.valid_shape().count() << "," << _temp_wx.valid_shape().count(); - //wx - gemm(false, false, seqsum, 3 * _hidden_size, _word_size, 1.f, x, weight_w, 0.f, temp_wx); - - int o_offset = 0; - int r_offset = 1; - int z_offset = 2; - const OpDataType* b_r = bias + r_offset * _hidden_size; - const OpDataType* b_z = bias + z_offset * _hidden_size; - const OpDataType* b_o = bias + o_offset * _hidden_size; - - - for (int batch_id = 0; batch_id < batch_size; ++batch_id) { - int batch_offset = offset_vec[batch_id]; - int batch_length = length_vec[batch_id]; - - for (int seq_id_in_batch = 0; seq_id_in_batch < length_vec[batch_id]; ++seq_id_in_batch) { - int seqid = batch_offset + seq_id_in_batch; - int last_seq_id = seqid - 1; - - if (is_reverse) { - seqid = batch_offset + batch_length - 1 - seq_id_in_batch; - last_seq_id = seqid + 1; - } - - const OutDataType* hin; - OutDataType* hout = seqid * _hidden_size + out; - - if (seq_id_in_batch == 0) { - hin = h_init + batch_id * _hidden_size; - - } else { - hin = out + 
last_seq_id * _hidden_size; - } - - gemm(false, false, 1, 2 * _hidden_size, _hidden_size, 1.0, hin, - weight_h + _hidden_size * _hidden_size, - 0.f, temp_wh); - - volatile OutDataType r; - volatile OutDataType z; - volatile OutDataType _h; - OutDataType* w_x_r = temp_wx + r_offset * _hidden_size - + seqid * _hidden_size * 3; - OutDataType* w_x_z = temp_wx + z_offset * _hidden_size - + seqid * _hidden_size * 3; - OutDataType* w_x_o = temp_wx + o_offset * _hidden_size - + seqid * _hidden_size * 3; - - OutDataType* w_h_r = temp_wh + 0 * _hidden_size; - OutDataType* w_h_z = temp_wh + 1 * _hidden_size; - OpDataType* w_o = weight_h; -//#pragma simd - for (int frame_id = 0; frame_id < _hidden_size; ++frame_id) { - r = w_x_r[frame_id] + w_h_r[frame_id] + b_r[frame_id]; //h_out=gate_r - r = gat_act(r); - hout[frame_id] = r * hin[frame_id]; - } - - - gemm(false, false, 1, _hidden_size, _hidden_size, 1.0, hout, w_o, 0.f, temp_whr); -//#pragma simd - for (int frame_id = 0; frame_id < _hidden_size; ++frame_id) { - z = gat_act(w_x_z[frame_id] + w_h_z[frame_id] + b_z[frame_id]); - _h = w_x_o[frame_id] + temp_whr[frame_id] + b_o[frame_id]; - _h = h_act(_h); - hout[frame_id] = (1 - z) * hin[frame_id] + z * _h; - } - } - - } - - return SaberSuccess; + cblas_sgemm(CblasRowMajor, cuTransA, cuTransB, m, n, k, alpha, a, k, b, n, beta, c, n); }; -template<> -SaberStatus SaberGru::batch_gru(\ - const std::vector& inputs, - std::vector& outputs, - GruParam& param) { - CHECK_NE(param.formula, GRU_CUDNN) << "X86 gru not support cudnn formula now"; - const OpDataType* weight_h = _weights_h2h.data(); - const OpDataType* weight_w = _weights_i2h.data(); - const OpDataType* bias = _weights_bias.data(); - - std::vector offset_vec = inputs[0]->get_seq_offset(); - bool is_hw2seq = offset_vec.size() > 2; - int word_sum = is_hw2seq ? 
offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); - - const InDataType* x = inputs[0]->data(); - OutDataType* out = outputs[0]->mutable_data(); - bool is_reverse = param.is_reverse; - - - std::vector length_vec(offset_vec.size() - 1); - int batch_size = offset_vec.size() - 1; - int seqsum = 0; - int max_seq_len = 0; - - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - max_seq_len = max_seq_len > len ? max_seq_len : len; - length_vec[i] = len; - seqsum += len; - } - - const OutDataType* h_init = nullptr; - - if (inputs.size() > 1) { - h_init = inputs[1]->data(); - } else if (param.init_hidden() != nullptr) { - CHECK_EQ(param.init_hidden()->valid_shape().count(), - batch_size * _hidden_size) << "hinit must match batchsize"; - h_init = param.init_hidden()->data(); - } - - - // Shape wx_shaep(1,seqsum,3,_aligned__hidden_size_iter_num,_aligned_size); - _temp_wx.try_expand_size(seqsum * 3 * _hidden_size); - _temp_wh.try_expand_size(batch_size * 2 * _hidden_size); - _temp_whr.try_expand_size(batch_size * _hidden_size); - - OutDataType* temp_wh = _temp_wh.mutable_data(); - OutDataType* temp_wx = _temp_wx.mutable_data(); - OutDataType* temp_whr = _temp_whr.mutable_data(); - /////////////////////////////////////////////////////// - std::vector emit_offset_vec; - int emit_length = 0; - - utils::SeqSortedseqTranseUtil transe_util; - bool transform = transe_util.get_sorted_map(offset_vec, emit_offset_vec, emit_length); - // print_vec(_map_vec.data(),_map_vec.size(),"map "); - float* inner_h_out = out; - float* inner_x = x; - const float* inner_h_init = h_init; - - // print_vec(x,word_sum*word_size,"before x"); - if (transform) { - _temp_out.try_expand_size(seqsum * _hidden_size * param.num_direction); - _temp_x.try_expand_size(seqsum * _word_size); - inner_h_out = _temp_out.mutable_data(); - inner_x = _temp_x.mutable_data(); - transe_util.seq_2_sorted_seq(x, inner_x, _word_size); - - if (inner_h_init != nullptr) { - 
_temp_h_init.try_expand_size(batch_size * _hidden_size); - transe_util.hidden_2_sorted_hidden(inner_h_init, _temp_h_init.mutable_data(), _hidden_size); - inner_h_init = _temp_h_init.data(); +template +static inline void cal_gru_reset_gate(float* hout,const float* hin,const BIT* b_r,const float* temp_wx,const float* temp_wh, + int emit_word_id_start,int emit_word_id_end,int _aligned_hidden_size,int r_offset,BIT(*gate_act)(const BIT)){ + + for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { + int emit_id_offset = emit_word_id - emit_word_id_start; + BIT* w_x_r = (BIT*)(temp_wx + r_offset * _aligned_hidden_size + + emit_word_id * _aligned_hidden_size * 3); + BIT* w_h_r = (BIT*)(temp_wh + 0 * _aligned_hidden_size + + emit_id_offset * _aligned_hidden_size * 2); + BIT* emit_hout = (BIT*)(hout + emit_id_offset * _aligned_hidden_size); + const BIT* emit_hin = (BIT*)(hin + emit_id_offset * _aligned_hidden_size); + + for (int frame_id = 0; frame_id < _aligned_hidden_size / (sizeof(BIT) / 4); ++frame_id) { + BIT r; + r = w_x_r[frame_id] + w_h_r[frame_id] + b_r[frame_id]; //h_out=gate_r + r = gate_act(r); +// LOG(INFO)<<"SABER:"< -SaberStatus SaberGru:: -naiv_256(const std::vector& inputs, - std::vector& outputs, - GruParam& param) { - CHECK_NE(param.formula, GRU_CUDNN) << "X86 gru not support cudnn formula now"; - const OpDataType* weight_h = _weights_h2h.data(); - const OpDataType* weight_w = _weights_i2h.data(); - const OpDataType* bias = _weights_bias.data(); - - std::vector offset_vec = inputs[0]->get_seq_offset(); - bool is_hw2seq = offset_vec.size() > 2; - int word_sum = is_hw2seq ? 
offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); +template +static inline void cal_gru_forgate_output_gate(float* hout,const float* hin,const BIT* b_z,const BIT* b_o,const float* temp_wx,const float* temp_whr, + const float* temp_wh,int emit_word_id_start,int emit_word_id_end,int _aligned_hidden_size,int z_offset,int o_offset,BIT(*gate_act)(const BIT),BIT(*hid_act)(const BIT) ){ - const OutDataType* h_init = nullptr; - if (inputs.size() > 1) { - h_init = inputs[1]->data(); - } else if (param.init_hidden() != nullptr) { - h_init = param.init_hidden()->data(); - } + for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { - const InDataType* x = inputs[0]->data(); - OutDataType* out = outputs[0]->mutable_data(); - bool is_reverse = param.is_reverse; + int emit_offset = emit_word_id - emit_word_id_start; + BIT* w_x_z = (BIT*)(temp_wx + z_offset * _aligned_hidden_size + + emit_word_id * _aligned_hidden_size * 3); + BIT* w_x_o = (BIT*)(temp_wx + o_offset * _aligned_hidden_size + + emit_word_id * _aligned_hidden_size * 3); - std::vector length_vec(offset_vec.size() - 1); - int batch_size = offset_vec.size() - 1; - int seqsum = 0; - int max_seq_len = 0; + BIT* w_h_z = (BIT*)(temp_wh + 1 * _aligned_hidden_size + + emit_offset * _aligned_hidden_size * 2); - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - length_vec[i] = len; - max_seq_len = max_seq_len > len ? 
max_seq_len : len; - seqsum += len; - } - - _temp_wx.try_expand_size(seqsum * 3 * _hidden_size); - _temp_wh.try_expand_size(batch_size * 2 * _hidden_size); - _temp_whr.try_expand_size(batch_size * _hidden_size); - - OutDataType* temp_wh = _temp_wh.mutable_data(); - OutDataType* temp_wx = _temp_wx.mutable_data(); - OutDataType* temp_whr = _temp_whr.mutable_data(); - ///////////////////////////////////////////////// - //wx - gemm(false, false, seqsum, 3 * _hidden_size, _word_size, 1.f, x, weight_w, 0.f, temp_wx); - // for(float i :_temp_WX){ - // cout<<" "<get_seq_offset().size(), 2); - std::vector offset_vec = inputs[0]->get_seq_offset(); - std::vector length_vec(offset_vec.size() - 1); - int batch_size = offset_vec.size() - 1; - int seqsum = 0; - int max_seq_len = 0; - bool is_hw2seq = offset_vec.size() > 2; - int word_sum = is_hw2seq ? offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); - __m256 (*gate_act)(const __m256)=act_funcs[param.gate_activity]; - __m256 (*hid_act)(const __m256)=act_funcs[param.h_activity]; - utils::AlignedUtils aligned_utils; - utils::VectorPrint vector_print; - const OutDataType* h_init = nullptr; - - if (inputs.size() > 1) { - h_init = inputs[1]->data(); - _aligned_init_hidden.try_expand_size(batch_size * _aligned_hidden_size); - aligned_utils.aligned_last_dim(h_init, _aligned_init_hidden.mutable_data(), - batch_size * _hidden_size, _hidden_size, _aligned_hidden_size); - h_init = _aligned_init_hidden.data(); - } else if (param.init_hidden() != nullptr) { - h_init = param.init_hidden()->data(); - //FIXME:is it correct - } else { - _aligned_init_hidden.try_expand_size(batch_size * _aligned_hidden_size); - h_init = _aligned_init_hidden.data(); - } - - const InDataType* x = inputs[0]->data(); - OutDataType* out = outputs[0]->mutable_data(); - bool is_reverse = param.is_reverse; - - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - length_vec[i] = len; - max_seq_len = max_seq_len > len ? 
max_seq_len : len; - seqsum += len; - } - - _temp_wx.try_expand_size(seqsum * 3 * _aligned_hidden_size); - _temp_wh.try_expand_size(batch_size * 2 * _aligned_hidden_size); - _temp_whr.try_expand_size(batch_size * _aligned_hidden_size); - _temp_out.try_expand_size(seqsum * _aligned_hidden_size); - OutDataType* temp_wh = _temp_wh.mutable_data(); - OutDataType* temp_wx = _temp_wx.mutable_data(); - OutDataType* temp_whr = _temp_whr.mutable_data(); - ///////////////////////////////////////////////// - //wx - gemm(false, false, seqsum, 3 * _aligned_hidden_size, _word_size, 1.f, x, weight_w, 0.f, temp_wx); - // for(float i :_temp_WX){ - // cout<<" "< -SaberStatus SaberGru:: -batch_256_s_aligned(const std::vector& inputs, - std::vector& outputs, - GruParam& param) { - CHECK_NE(param.formula, GRU_CUDNN) << "X86 gru not support cudnn formula now"; - const OpDataType* weight_h = _aligned_weights_h2h.data(); - const OpDataType* weight_w = _aligned_weights_i2h.data(); - const OpDataType* bias = _aligned_weights_bias.data(); - __m256 (*gate_act)(const __m256)=act_funcs[param.gate_activity]; - __m256 (*hid_act)(const __m256)=act_funcs[param.h_activity]; - std::vector offset_vec = inputs[0]->get_seq_offset(); +template <> +template +SaberStatus SaberGru:: +batch_s_aligned(const std::vector& inputs, + std::vector& outputs, + GruParam& param) { +// CHECK_NE(param.formula, GRU_CUDNN) << "X86 gru not support cudnn formula now"; + int loop_div = sizeof(BIT) / sizeof(float); + // LOG(INFO)<<"loop_div "<(param.gate_activity); + BIT(*hid_act)(const BIT) =Activate_inner(param.h_activity); + std::vector> offset_vec_vec = inputs[0]->get_seq_offset(); + std::vectoroffset_vec=offset_vec_vec[offset_vec_vec.size()-1]; std::vector length_vec(offset_vec.size() - 1); int batch_size = offset_vec.size() - 1; int seqsum = 0; @@ -716,24 +142,27 @@ batch_256_s_aligned(const std::vector& inputs, int word_sum = is_hw2seq ? 
offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); utils::AlignedUtils aligned_utils; utils::VectorPrint vector_print; - const OutDataType* h_init = nullptr; + const OpDataType* h_init = nullptr; - const InDataType* x = inputs[0]->data(); - OutDataType* out = outputs[0]->mutable_data(); + const OpDataType* x = (const OpDataType*)inputs[0]->data(); + OpDataType* out = ( OpDataType*)outputs[0]->mutable_data(); bool is_reverse = param.is_reverse; if (inputs.size() > 1) { - h_init = inputs[1]->data(); - _aligned_init_hidden.try_expand_size(batch_size * _aligned_hidden_size); - aligned_utils.aligned_last_dim(h_init, _aligned_init_hidden.mutable_data(), + h_init = (const OpDataType*)inputs[1]->data(); + utils::try_expand_tensor(_aligned_init_hidden,batch_size * _aligned_hidden_size); + aligned_utils.aligned_last_dim(h_init, (OpDataType*)_aligned_init_hidden.mutable_data(), batch_size * _hidden_size, _hidden_size, _aligned_hidden_size); - h_init = _aligned_init_hidden.data(); + h_init = (const OpDataType*)_aligned_init_hidden.data(); } else if (param.init_hidden() != nullptr) { - h_init = param.init_hidden()->data(); + h_init = (const OpDataType*)param.init_hidden()->data(); //FIXME:is it correct? 
} else { - _aligned_init_hidden.try_expand_size(batch_size * _aligned_hidden_size); - h_init = _aligned_init_hidden.data(); + bool need_clear=utils::try_expand_tensor(_aligned_init_hidden,batch_size * _aligned_hidden_size); + if(need_clear){ + memset(_aligned_init_hidden.mutable_data(),0,_aligned_init_hidden.valid_shape().count()* sizeof(OpDataType)); + } + h_init = (const OpDataType*)_aligned_init_hidden.data(); } std::vector emit_offset_vec; @@ -741,9 +170,9 @@ batch_256_s_aligned(const std::vector& inputs, utils::SeqSortedseqTranseUtil transe_util(is_reverse); bool transform = transe_util.get_sorted_map(offset_vec, emit_offset_vec, emit_length); - float* inner_h_out = out; - float* inner_x = x; - const float* inner_h_init = h_init; + OpDataType* inner_h_out = out; + OpDataType* inner_x = (OpDataType*)x; + const OpDataType* inner_h_init = h_init; for (int i = 0; i < offset_vec.size() - 1; ++i) { int len = offset_vec[i + 1] - offset_vec[i]; @@ -752,52 +181,59 @@ batch_256_s_aligned(const std::vector& inputs, seqsum += len; } - _temp_wx.try_expand_size(seqsum * 3 * _aligned_hidden_size); - _temp_wh.try_expand_size(batch_size * 2 * _aligned_hidden_size); - _temp_whr.try_expand_size(batch_size * _aligned_hidden_size); - _temp_out.try_expand_size(seqsum * _aligned_hidden_size * param.num_direction); + utils::try_expand_tensor(_temp_wx,seqsum * 3 * _aligned_hidden_size); + utils::try_expand_tensor(_temp_out,seqsum * _aligned_hidden_size * param.num_direction); + if(param.formula==GRU_ORIGIN) { + utils::try_expand_tensor(_temp_wh, batch_size * 2 * _aligned_hidden_size); + utils::try_expand_tensor(_temp_whr, batch_size * _aligned_hidden_size); + }else{ + utils::try_expand_tensor(_temp_wh, batch_size * 3 * _aligned_hidden_size); + } + if (transform) { - _temp_x.try_expand_size(seqsum * _word_size); - inner_h_out = _temp_out.mutable_data(); - inner_x = _temp_x.mutable_data(); + utils::try_expand_tensor(_temp_x,seqsum * _word_size); + inner_h_out = 
(OpDataType*)_temp_out.mutable_data(); + inner_x = (OpDataType*)_temp_x.mutable_data(); transe_util.seq_2_sorted_seq(x, inner_x, _word_size); if (inner_h_init != nullptr) { - _temp_h_init.try_expand_size(batch_size * _aligned_hidden_size); - transe_util.hidden_2_sorted_hidden(inner_h_init, _temp_h_init.mutable_data(), _aligned_hidden_size); - inner_h_init = _temp_h_init.data(); + utils::try_expand_tensor(_temp_h_init,batch_size * _aligned_hidden_size); + transe_util.hidden_2_sorted_hidden(inner_h_init, (OpDataType*)_temp_h_init.mutable_data(), _aligned_hidden_size); + inner_h_init = (const OpDataType*)_temp_h_init.data(); } } else if (_hidden_size != _aligned_hidden_size) { - inner_h_out = _temp_out.mutable_data(); + inner_h_out = (OpDataType*)_temp_out.mutable_data(); } - OutDataType* temp_wh = _temp_wh.mutable_data(); - OutDataType* temp_wx = _temp_wx.mutable_data(); - OutDataType* temp_whr = _temp_whr.mutable_data(); + OpDataType* temp_wh = (OpDataType*)_temp_wh.mutable_data(); + OpDataType* temp_wx = (OpDataType*)_temp_wx.mutable_data(); + OpDataType* temp_whr = (OpDataType*)_temp_whr.mutable_data(); ///////////////////////////////////////////////// //wx + gemm(false, false, seqsum, 3 * _aligned_hidden_size, _word_size, 1.f, inner_x, weight_w, 0.f, temp_wx); + int o_offset = 0; int r_offset = 1; int z_offset = 2; - const __m256* b_r = (__m256*)(bias + r_offset * _aligned_hidden_size); - const __m256* b_z = (__m256*)(bias + z_offset * _aligned_hidden_size); - const __m256* b_o = (__m256*)(bias + o_offset * _aligned_hidden_size); + const BIT* b_r = (BIT*)(bias + r_offset * _aligned_hidden_size); + const BIT* b_z = (BIT*)(bias + z_offset * _aligned_hidden_size); + const BIT* b_o = (BIT*)(bias + o_offset * _aligned_hidden_size); + - int mod_num = _hidden_size % 8; - int reverse_out_offset=seqsum; + int reverse_out_offset = seqsum; for (int word_id = 0; word_id < emit_length; word_id++) { int real_word_id = word_id; int last_word_id = word_id - 1; - if 
(param.is_reverse&&batch_size==1) { + if (param.is_reverse && batch_size == 1) { real_word_id = emit_length - word_id - 1; last_word_id = real_word_id + 1; } @@ -810,106 +246,70 @@ batch_256_s_aligned(const std::vector& inputs, if (word_id == 0) { hin = inner_h_init; } else { -// if(is_reverse){ -// hin = inner_h_out + reverse_out_offset * _aligned_hidden_size; -// }else{ -// hin = inner_h_out + emit_offset_vec[last_word_id] * _aligned_hidden_size; -// } hin = inner_h_out + emit_offset_vec[last_word_id] * _aligned_hidden_size; } float* hout = nullptr; -// if(is_reverse){ -// reverse_out_offset-=emit_word_length; -// hout=reverse_out_offset*_aligned_hidden_size + inner_h_out; -// } else{ - hout=emit_offset_vec[real_word_id] * _aligned_hidden_size + inner_h_out; -// } - //wh - gemm(false, false, emit_word_length, 2 * _aligned_hidden_size, _aligned_hidden_size, 1.0, hin, - weight_h + _hidden_size * _aligned_hidden_size, - 0.f, temp_wh); - - __m256 r; - __m256 z; - __m256 _h; - __m256* hout_256 = (__m256*) hout; - const __m256* hin_256 = (__m256*) hin; - - for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { - int emit_id_offset = emit_word_id - emit_word_id_start; - __m256* w_x_r = (__m256*)(temp_wx + r_offset * _aligned_hidden_size - + emit_word_id * _aligned_hidden_size * 3); - __m256* w_h_r = (__m256*)(temp_wh + 0 * _aligned_hidden_size - + emit_id_offset * _aligned_hidden_size * 2); - __m256* emit_hout = (__m256*)(hout + emit_id_offset * _aligned_hidden_size); - const __m256* emit_hin = (__m256*)(hin + emit_id_offset * _aligned_hidden_size); - - for (int frame_id = 0; frame_id < _aligned_hidden_size / 8; ++frame_id) { - r = w_x_r[frame_id] + w_h_r[frame_id] + b_r[frame_id]; //h_out=gate_r - r = gate_act(r); - - emit_hout[frame_id] = r * emit_hin[frame_id]; - } + hout = emit_offset_vec[real_word_id] * _aligned_hidden_size + inner_h_out; - } + if(param.formula==GRU_ORIGIN) { + //wh + gemm(false, false, emit_word_length, 2 * 
_aligned_hidden_size, _aligned_hidden_size, 1.0, hin, + weight_h , + 0.f, temp_wh); - // cout << "hout = " << hout[0] << endl; - gemm(false, false, emit_word_length, _aligned_hidden_size, _aligned_hidden_size, 1.0, hout, - weight_h, 0.f, temp_whr); + //#pragma omp parallel for + cal_gru_reset_gate(hout, hin, b_r, temp_wx, temp_wh, emit_word_id_start, emit_word_id_end, + _aligned_hidden_size, r_offset, gate_act); - for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { - int emit_offset = emit_word_id - emit_word_id_start; - __m256* w_x_z = (__m256*)(temp_wx + z_offset * _aligned_hidden_size - + emit_word_id * _aligned_hidden_size * 3); - __m256* w_x_o = (__m256*)(temp_wx + o_offset * _aligned_hidden_size - + emit_word_id * _aligned_hidden_size * 3); - __m256* w_h_z = (__m256*)(temp_wh + 1 * _aligned_hidden_size - + emit_offset * _aligned_hidden_size * 2); + gemm(false, false, emit_word_length, _aligned_hidden_size, _aligned_hidden_size, 1.0, hout, + static_cast(_aligned_weights_h2h_o.data()), 0.f, temp_whr); - __m256* w_h_o = (__m256*)(temp_whr + emit_offset * _aligned_hidden_size); - __m256* emit_hout = (__m256*)(hout + emit_offset * _aligned_hidden_size) ; - const __m256* emit_hin = (__m256*)(hin + emit_offset * _aligned_hidden_size) ; + cal_gru_forgate_output_gate(hout, hin, b_z, b_o, temp_wx, temp_whr, temp_wh, emit_word_id_start, + emit_word_id_end, + _aligned_hidden_size, z_offset, o_offset, gate_act, hid_act); + }else if(param.formula==GRU_CUDNN){ - for (int frame_id = 0; frame_id < _aligned_hidden_size / 8; ++frame_id) { + gemm(false, false, emit_word_length, 3 * _aligned_hidden_size, _aligned_hidden_size, 1.0, hin, + weight_h, + 0.f, temp_wh); - z = gate_act(w_x_z[frame_id] + w_h_z[frame_id] + b_z[frame_id]); - _h = w_x_o[frame_id] + w_h_o[frame_id] + b_o[frame_id]; - _h = hid_act(_h); - // vector_print.print_float(&z); - emit_hout[frame_id] = (1 - z) * emit_hin[frame_id] + z * _h; - } + 
cal_gru_cudnn(hout,hin,b_o,b_r,b_z,temp_wx,temp_wh,emit_word_id_start,emit_word_id_end, + _aligned_hidden_size,o_offset,r_offset,z_offset); } } - if (transform){ + if (transform) { transe_util.sorted_seq_2_seq(inner_h_out, out, _hidden_size, _aligned_hidden_size); - }else if (_hidden_size != _aligned_hidden_size) { - aligned_utils.unaligned_last_dim(_temp_out.data(), out, seqsum * _hidden_size, _hidden_size, + } else if (_hidden_size != _aligned_hidden_size) { + aligned_utils.unaligned_last_dim((const OpDataType*)_temp_out.data(), out, seqsum * _hidden_size, _hidden_size, _aligned_hidden_size); } + return SaberSuccess; }; + + template<> -SaberStatus SaberGru::dispatch(\ - const std::vector& inputs, - std::vector& outputs, - GruParam& param) { - // return naiv_256_s_aligned(inputs, outputs, param); -// return naiv_gru(inputs, outputs, param); -// return batch_gru(inputs, outputs, param); +SaberStatus SaberGru::dispatch(\ + const std::vector& inputs, + std::vector& outputs, + GruParam& param) { + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); - if(inputs[0]->get_seq_offset().size()>2) { - return batch_256_s_aligned(inputs, outputs, param); - }else { - return naiv_256_s_aligned(inputs, outputs, param); - } + + batch_s_aligned(inputs, outputs, param); + + return SaberSuccess; + }; -template class SaberGru; +template class SaberGru; +DEFINE_OP_TEMPLATE(SaberGru, GruParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberGru, GruParam, X86, AK_INT8); } } diff --git a/saber/funcs/impl/x86/saber_gru.h b/saber/funcs/impl/x86/saber_gru.h index f787a47a7..3d6bebc94 100644 --- a/saber/funcs/impl/x86/saber_gru.h +++ b/saber/funcs/impl/x86/saber_gru.h @@ -4,46 +4,43 @@ #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_GRU_H #include "saber/funcs/impl/impl_gru.h" #include "saber/funcs/impl/x86/x86_utils.h" + +#if defined(__AVX512F__) +#define SABER_X86_TYPE __m512 +#elif defined(__AVX2__) and defined(__FMA__) +#define SABER_X86_TYPE __m256 +#elif defined(__SSE4_2__) and defined(__FMA__) 
+#define SABER_X86_TYPE __m128 +#else +#define SABER_X86_TYPE float +#endif + namespace anakin { namespace saber { -template -class SaberGru : \ +template +class SaberGru : \ public ImplBase < - Tensor, \ - Tensor, \ - Tensor, \ - GruParam >> { + X86, OpDtype,GruParam> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - + typedef typename DataTrait::Dtype OpDataType; + typedef Tensor OpTensor; SaberGru() {} ~SaberGru() {} - virtual SaberStatus init(const std::vector& inputs, \ - std::vector& outputs, \ - GruParam& gru_param, Context& ctx) { - this->_ctx=ctx; - CHECK_EQ(gru_param.formula ,GRU_ORIGIN)<<"only support gru_origin now"; - _hidden_size = gru_param.bias()->valid_size() / 3; - if (gru_param.formula == GRU_ORIGIN&&_aligned_way) { + virtual SaberStatus init(const std::vector& inputs, \ + std::vector& outputs, \ + GruParam& gru_param, Context& ctx) { + this->_ctx = &ctx; +// CHECK_EQ(gru_param.formula, GRU_ORIGIN) << "only support gru_origin now"; + + if (gru_param.formula == GRU_ORIGIN ) { //FIXME:aligned should be determine by framework - int aligned_byte=64; - int c_size=aligned_byte/sizeof(OpDataType); + int aligned_byte = sizeof(SABER_X86_TYPE); + int c_size = aligned_byte / sizeof(OpDataType); _hidden_size = gru_param.bias()->valid_size() / 3; int weights_bias_size = _hidden_size * 3; @@ -51,135 +48,147 @@ class SaberGruvalid_size() - weights_h2h_size; _word_size = weights_i2h_size / _hidden_size / 3; - _aligned_size=c_size; - _aligned_word_size=utils::round_up(_word_size,c_size); - _aligned_hidden_size=utils::round_up(_hidden_size,c_size); - _aligned_word_size_iter_num=_aligned_word_size/c_size; - _aligned_hidden_size_iter_num=_aligned_hidden_size/c_size; + _aligned_size = c_size; + _aligned_word_size = utils::round_up(_word_size, c_size); + 
_aligned_hidden_size = utils::round_up(_hidden_size, c_size); - Shape weights_i2h_shape(1,_word_size,3,_aligned_hidden_size); - Shape weights_h2h_shape(1,_aligned_hidden_size,3,_aligned_hidden_size); - Shape weights_bias_shape(1,1,3,_aligned_hidden_size); - _aligned_weights_i2h.try_expand_size(weights_i2h_shape); - _aligned_weights_h2h.try_expand_size(weights_h2h_shape); - _aligned_weights_bias.try_expand_size(weights_bias_shape); + Shape weights_i2h_shape({1, _word_size, 3, _aligned_hidden_size},Layout_NCHW); + Shape weights_h2h_shape({1, _aligned_hidden_size, 2, _aligned_hidden_size},Layout_NCHW); + Shape weights_h2h_o_shape({1, _aligned_hidden_size, 1, _aligned_hidden_size},Layout_NCHW); + Shape weights_bias_shape({1, 1, 3, _aligned_hidden_size},Layout_NCHW); + utils::try_expand_clean_tensor(_aligned_weights_i2h,weights_i2h_shape); + utils::try_expand_clean_tensor(_aligned_weights_h2h,weights_h2h_shape); + utils::try_expand_clean_tensor(_aligned_weights_h2h_o,weights_h2h_o_shape); + utils::try_expand_clean_tensor(_aligned_weights_bias,weights_bias_shape); utils::AlignedUtils aligned_tool; - aligned_tool.aligned_last_dim(gru_param.weight()->data(),_aligned_weights_i2h.mutable_data(), - weights_i2h_size,_hidden_size,_aligned_hidden_size); - - aligned_tool.aligned_last_dim(gru_param.weight()->data() + weights_i2h_size,_aligned_weights_h2h.mutable_data(), - weights_h2h_size,_hidden_size,_aligned_hidden_size); - - aligned_tool.aligned_last_dim(gru_param.bias()->data(),_aligned_weights_bias.mutable_data(), - weights_bias_size,_hidden_size,_aligned_hidden_size); - - _weights_i2h.try_expand_size(weights_i2h_size); - _weights_h2h.try_expand_size(weights_h2h_size); - _weights_bias.try_expand_size(weights_bias_size); - //FIXME:format pitch - memcpy(_weights_i2h.mutable_data(), gru_param.weight()->data(), - sizeof(InDataType) * weights_i2h_size); - memcpy(_weights_h2h.mutable_data(), gru_param.weight()->data() + weights_i2h_size, - sizeof(InDataType) * weights_h2h_size); - 
memcpy(_weights_bias.mutable_data(), gru_param.bias()->data(), - sizeof(InDataType) * weights_bias_size); - -// Shape wh_shape(1,1,2,_aligned_hidden_size/c_size,c_size); -// Shape whr_shape(1,1,1,_aligned_hidden_size/c_size,c_size); -// _temp_wh.try_expand_size(wh_shape); -// _temp_whr.try_expand_size(whr_shape); - }else if(gru_param.formula == GRU_ORIGIN){ + aligned_tool.aligned_last_dim(static_cast(gru_param.weight()->data()), ( OpDataType*)_aligned_weights_i2h.mutable_data(), + weights_i2h_size, _hidden_size, _aligned_hidden_size); + + aligned_tool.aligned_last_dim(static_cast(gru_param.weight()->data()) + weights_i2h_size+_hidden_size*_hidden_size, + (OpDataType*) _aligned_weights_h2h.mutable_data(), + weights_h2h_size-_hidden_size*_hidden_size, _hidden_size, _aligned_hidden_size); + + aligned_tool.aligned_last_dim(static_cast(gru_param.weight()->data()) + weights_i2h_size, + (OpDataType*) _aligned_weights_h2h_o.mutable_data(), + _hidden_size*_hidden_size, _hidden_size, _aligned_hidden_size); + + aligned_tool.aligned_last_dim(static_cast(gru_param.bias()->data()), (OpDataType*)_aligned_weights_bias.mutable_data(), + weights_bias_size, _hidden_size, _aligned_hidden_size); + + + } else if (gru_param.formula == GRU_CUDNN) { + int aligned_byte = sizeof(SABER_X86_TYPE); + int c_size = aligned_byte / sizeof(OpDataType); + _hidden_size = gru_param.bias()->valid_size() / 3; int weights_bias_size = _hidden_size * 3; int weights_h2h_size = _hidden_size * _hidden_size * 3; int weights_i2h_size = gru_param.weight()->valid_size() - weights_h2h_size; _word_size = weights_i2h_size / _hidden_size / 3; - _weights_i2h.try_expand_size(weights_i2h_size); - _weights_h2h.try_expand_size(weights_h2h_size); - _weights_bias.try_expand_size(weights_bias_size); + _aligned_size = c_size; + _aligned_word_size = utils::round_up(_word_size, c_size); + _aligned_hidden_size = utils::round_up(_hidden_size, c_size); + + Shape weights_i2h_shape({1, _word_size, 3, 
_aligned_hidden_size},Layout_NCHW); + Shape weights_h2h_shape({1, _aligned_hidden_size, 3, _aligned_hidden_size},Layout_NCHW); + Shape weights_bias_shape({1, 1, 3, _aligned_hidden_size},Layout_NCHW); + utils::try_expand_clean_tensor(_aligned_weights_i2h,weights_i2h_shape); + utils::try_expand_clean_tensor(_aligned_weights_h2h,weights_h2h_shape); + utils::try_expand_clean_tensor(_aligned_weights_bias,weights_bias_shape); + + OpTensor temp_tensor; + utils::try_expand_tensor(temp_tensor,weights_h2h_size); + OpTensor temp_tensor_origin; + utils::try_expand_tensor(temp_tensor_origin,weights_h2h_size); + + float* temp_tensor_ptr= static_cast(temp_tensor_origin.mutable_data()); + memcpy(temp_tensor_ptr, static_cast(gru_param.weight()->data()) + weights_i2h_size, + sizeof(OpDataType) * _hidden_size*_hidden_size); + + float* rz_temp_tensor_ptr=temp_tensor_ptr+_hidden_size*_hidden_size; + const float* rz_weights_tensor_ptr=static_cast(gru_param.weight()->data()) + weights_i2h_size+_hidden_size*_hidden_size; + for(int row=0;row<_hidden_size;row++){ + for(int block=0;block<2;block++) { + int block_offset=block*_hidden_size; + for (int cow = 0; cow < _hidden_size; cow++) { + rz_temp_tensor_ptr[block*_hidden_size*_hidden_size+row*_hidden_size+cow]=rz_weights_tensor_ptr[row*(2*_hidden_size)+cow+block_offset]; + } + } + } + + float* orz_temp_tensor_ptr=temp_tensor_ptr; + float* orz_weights_tensor_ptr=static_cast(temp_tensor.mutable_data()); + for(int row=0;row<_hidden_size;row++){ + for(int block=0;block<3;block++) { + int block_offset=block*_hidden_size; + for (int cow = 0; cow < _hidden_size; cow++) { + orz_weights_tensor_ptr[row*(3*_hidden_size)+cow+block_offset]=orz_temp_tensor_ptr[block*_hidden_size*_hidden_size+row*_hidden_size+cow]; + } + } + } + + utils::AlignedUtils aligned_tool; + aligned_tool.aligned_last_dim((const OpDataType*)gru_param.weight()->data(), ( OpDataType*)_aligned_weights_i2h.mutable_data(), + weights_i2h_size, _hidden_size, _aligned_hidden_size); + + 
aligned_tool.aligned_last_dim((const OpDataType*)temp_tensor.data(), + (OpDataType*) _aligned_weights_h2h.mutable_data(), + weights_h2h_size, _hidden_size, _aligned_hidden_size); + + aligned_tool.aligned_last_dim((const OpDataType*)gru_param.bias()->data(), (OpDataType*)_aligned_weights_bias.mutable_data(), + weights_bias_size, _hidden_size, _aligned_hidden_size); - memcpy(_weights_i2h.mutable_data(), gru_param.weight()->data(), - sizeof(InDataType) * weights_i2h_size); - memcpy(_weights_h2h.mutable_data(), gru_param.weight()->data() + weights_i2h_size, - sizeof(InDataType) * weights_h2h_size); - memcpy(_weights_bias.mutable_data(), gru_param.bias()->data(), - sizeof(InDataType) * weights_bias_size); } - LOG(INFO)<<"success init"; - return create(inputs,outputs,gru_param,ctx); + + return create(inputs, outputs, gru_param, ctx); } - virtual SaberStatus create(const std::vector& inputs, \ - std::vector& outputs, \ - GruParam& gru_param, Context& ctx) { + virtual SaberStatus create(const std::vector& inputs, \ + std::vector& outputs, \ + GruParam& gru_param, Context& ctx) { return SaberSuccess; } - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - GruParam& param); - SaberStatus naiv_gru( - const std::vector& inputs, - std::vector& outputs, - GruParam& param); + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + GruParam& param); + private: int _word_size; int _hidden_size; - bool _aligned_way=true; int _aligned_word_size; int _aligned_hidden_size; int _aligned_size; - int _aligned_word_size_iter_num; - int _aligned_hidden_size_iter_num; - OpTensor _weights_i2h; - OpTensor _weights_h2h; - OpTensor _weights_bias; - DataTensor_out _init_hidden; + OpTensor _init_hidden; OpTensor _aligned_weights_i2h; OpTensor _aligned_weights_h2h; + OpTensor _aligned_weights_h2h_o; OpTensor _aligned_weights_bias; - DataTensor_out _aligned_init_hidden; - - DataTensor_out _temp_wx; - DataTensor_out _temp_wh; - DataTensor_out 
_temp_whr; - - DataTensor_in _temp_x; - DataTensor_out _temp_out; - DataTensor_out _temp_h_init; -// lod_no_batch_gru(const OpDataType* weight_w, const OpDataType* weight_h,const OpDataType* b, const OutDataType* h_init, OutDataType* h_out, -// const InDataType* x,OutDataType *temp_wx,OutDataType *temp_wh,OutDataType *temp_whr, -// int hidden_size, int word_size, std::vector& offset_vec, bool is_reverse); - - - SaberStatus batch_gru(\ - const std::vector& inputs, - std::vector& outputs, - GruParam& param); - - SaberStatus naiv_256(\ - const std::vector& inputs, - std::vector& outputs, - GruParam& param); - - SaberStatus naiv_256_s_aligned(\ - const std::vector& inputs, - std::vector& outputs, - GruParam& param); - - SaberStatus batch_256_s_aligned(\ - const std::vector& inputs, - std::vector& outputs, - GruParam& param); + OpTensor _aligned_init_hidden; + + OpTensor _temp_wx; + OpTensor _temp_wh; + OpTensor _temp_whr; + + OpTensor _temp_x; + OpTensor _temp_out; + OpTensor _temp_h_init; + + template + SaberStatus batch_s_aligned(\ + const std::vector& inputs, + std::vector& outputs, + GruParam& param); + }; } diff --git a/saber/funcs/impl/x86/saber_im2col_conv.cpp b/saber/funcs/impl/x86/saber_im2col_conv.cpp new file mode 100644 index 000000000..4f6db4488 --- /dev/null +++ b/saber/funcs/impl/x86/saber_im2col_conv.cpp @@ -0,0 +1,166 @@ + +#include "saber/funcs/impl/x86/saber_im2col_conv.h" + +namespace anakin { +namespace saber { + +inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { + return static_cast(a) < static_cast(b); +} + +template +void im2col_cpu(const Dtype* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + Dtype* data_col) { + + const int output_h = (height + 2 * pad_h - + (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int output_w = (width + 2 * pad_w - + 
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channel_size = height * width; + + for (int channel = channels; channel--; data_im += channel_size) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_row = -pad_h + kernel_row * dilation_h; + + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + for (int output_cols = output_w; output_cols; output_cols--) { + *(data_col++) = 0; + } + } else { + int input_col = -pad_w + kernel_col * dilation_w; + + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + *(data_col++) = data_im[input_row * width + input_col]; + } else { + *(data_col++) = 0; + } + input_col += stride_w; + } + } + input_row += stride_h; + } + } + } + } +} + +template <> +SaberStatus SaberIm2colConv::create(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) { + this->_ctx = &ctx; + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int kernel_h = param.weight()->height(); + int kernel_w = param.weight()->width(); + + int slice_size = in_c * kernel_h * kernel_w * out_h * out_w; + Shape _im2col_shape({slice_size}, Layout_W); + _im2col_tensor.reshape(_im2col_shape); + + int out_stride = out_h * out_w; + _gemm.init(false, false, out_c, out_stride, in_c * kernel_h * kernel_w, *(this->_ctx)); + + return SaberSuccess; +} + +template <> +SaberStatus SaberIm2colConv::init(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} +template <> +SaberStatus SaberIm2colConv::dispatch(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam 
¶m) { + + int batch_size = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int kernel_h = param.weight()->height(); + int kernel_w = param.weight()->width(); + int in_stride = in_h * in_w; + int out_stride = out_h * out_w; + + const float* din = (const float*)inputs[0]->data(); + float* dout = (float*)outputs[0]->mutable_data(); + const float* weights_d = (const float*)param.weight()->data(); + + bool flag_bias = (param.bias()->valid_size() > 0); + bool flag_relu = param.activation_param.has_active; + const float* bias = flag_bias ? (const float*)param.bias()->data() : nullptr; + if (param.group != 1) { + return SaberUnImplError; + } + + for (int i = 0; i < batch_size; i++) { + + im2col_cpu(din, in_c, in_h, in_w, kernel_h, kernel_w, param.pad_h, param.pad_w, + param.stride_h, param.stride_w, param.dilation_h, param.dilation_w, + (float*)_im2col_tensor.mutable_data()); + + _gemm.dispatch(1.f, 0.f, weights_d, (const float*)_im2col_tensor.data(), dout); + + din += in_c * in_stride; + dout += out_c * out_stride; + } + + if (flag_bias && !flag_relu) { + float *output = (float*)outputs[0]->mutable_data(); + int id = 0; + for (int i = 0; i < batch_size; i++) { + for (int oc = 0; oc < out_c; ++oc) { + for (int inner_id = 0; inner_id < out_stride; ++inner_id,++id) { + output[id] += bias[oc]; + } + } + } + } else if (!flag_bias && flag_relu) { + float *output = (float*)outputs[0]->mutable_data(); + int id = 0; + for (int i = 0; i < batch_size; i++) { + for (int oc = 0; oc < out_c; ++oc) { + for (int inner_id = 0; inner_id < out_stride; ++inner_id, ++id) { + if (output[id] < 0) { + output[id] = 0; + } + } + } + } + } else if (flag_bias && flag_relu) { + float *output = (float*)outputs[0]->mutable_data(); + int id = 0; + for (int i = 0; i < batch_size; i++) { + for (int oc = 0; oc < out_c; ++oc) { 
+ for (int inner_id = 0; inner_id < out_stride; ++inner_id, ++id) { + float temp = output[id]; + temp += bias[oc]; + if (temp < 0) { + temp = 0; + } + output[id] = temp; + } + } + } + } + return SaberSuccess; +} +} +} diff --git a/saber/funcs/impl/x86/saber_im2col_conv.h b/saber/funcs/impl/x86/saber_im2col_conv.h new file mode 100644 index 000000000..a0876f118 --- /dev/null +++ b/saber/funcs/impl/x86/saber_im2col_conv.h @@ -0,0 +1,35 @@ + +#include "saber/core/tensor.h" +#include "saber_funcs_param.h" +#include "saber_types.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/gemm.h" + +namespace anakin { +namespace saber { + +template +class SaberIm2colConv : public ImplBase< + X86, OpDtype, ConvParam > { + + typedef typename DataTrait::Dtype OpDataType; +public: + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) override; + virtual SaberStatus create(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) override; + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m) override; + +private: + Tensor _im2col_tensor; + Gemm _gemm; +}; + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_im2sequence.cpp b/saber/funcs/impl/x86/saber_im2sequence.cpp new file mode 100644 index 000000000..356592deb --- /dev/null +++ b/saber/funcs/impl/x86/saber_im2sequence.cpp @@ -0,0 +1,66 @@ +#include "saber/funcs/impl/x86/saber_im2sequence.h" + +namespace anakin { + +namespace saber { + +/** + * @brief Extract image patches from input tensor to a tensor with the shape + * [batch_size * output_h * ouput_w, window_h * window_w * channels] + * output_h = (padding_up + padding_down + input_h - window_h)/strid_h + 1; + * output_w = (padding_left + padding_right + input_w - windwo_w)/strid_w + 1; + * @tparam OpDtype + */ +template +SaberStatus SaberIm2Sequence::dispatch(\ + const std::vector *>& inputs, \ + 
std::vector *>& outputs, \ + Im2SequenceParam& param) { + + //brief for each channel: + //get patches[kernel_extern_w * kernel_extern_h] to dst tensor util the channel has been finished. + int out_rows_id = 0; + int old_row; + int out_cols = outputs[0]->channel(); + const OpDataType* input_ptr = (const OpDataType*)inputs[0]->data(); + OpDataType* output_ptr = (OpDataType*)outputs[0]->mutable_data(); + int H_pad = H + param.pad_up + param.pad_down; + int W_pad = W + param.pad_left + param.pad_right; + int wd_id = 0; + int wd_num_each_channel = output_height * output_width; + int wd_size = param.window_h * param.window_w; + int m = 0; //the id which is mapped to the j th element of i th window + int input_id; + int st_id; + int get_stride_h = param.dilation_h ? param.dilation_h : 1; + int get_stride_w = param.dilation_w ? param.dilation_w : 1; + for (int i = 0; i < N; i++) { + wd_id = 0; + out_rows_id = i * wd_num_each_channel + wd_id % wd_num_each_channel; + for (int j = 0; j < C; j++) { + for (int k = 0; k < H_pad - kernel_extern_h + 1; k += param.stride_h) { + for (int l = 0; l < W_pad - kernel_extern_w + 1; l += param.stride_w) { + m = 0; + //consider dilation. 
+ for (int wd_h = k; wd_h < k + kernel_extern_h; wd_h += get_stride_h) { + for (int wd_w = l; wd_w < l + kernel_extern_w; wd_w += get_stride_w) { + input_id = i * C * H_pad * W_pad + j * H_pad * W_pad + wd_h * W_pad + wd_w; + st_id = out_rows_id * out_cols + j * wd_size + m; + output_ptr[st_id] = input_ptr[input_id]; + m++; + } + } + wd_id++; + out_rows_id = i * wd_num_each_channel + wd_id % wd_num_each_channel; + } + } + } + } + + return SaberSuccess; +} +template class SaberIm2Sequence; +DEFINE_OP_TEMPLATE(SaberIm2Sequence, Im2SequenceParam, X86, AK_INT16); +DEFINE_OP_TEMPLATE(SaberIm2Sequence, Im2SequenceParam, X86, AK_INT8); +} +} diff --git a/saber/funcs/impl/x86/saber_im2sequence.h b/saber/funcs/impl/x86/saber_im2sequence.h new file mode 100644 index 000000000..031101b11 --- /dev/null +++ b/saber/funcs/impl/x86/saber_im2sequence.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_IM2SEQUENCE_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_IM2SEQUENCE_H + +#include "saber/funcs/impl/impl_im2sequence.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberIm2Sequence:\ + public ImplBase > { + +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberIm2Sequence() {} + + ~SaberIm2Sequence() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + Im2SequenceParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + Im2SequenceParam ¶m, + Context &ctx) { + N = inputs[0]->num(); + C = inputs[0]->channel(); + H = inputs[0]->height(); + W = inputs[0]->width(); + //extern kernel height + kernel_extern_h = param.dilation_h * (param.window_h - 1) + 1; + output_height = (H + param.pad_up + param.pad_down - kernel_extern_h) + / param.stride_h + 1; + + //extern kernel width. 
+ kernel_extern_w = param.dilation_w * (param.window_w - 1) + 1; + output_width = (W + param.pad_left + param.pad_right - kernel_extern_w) + / param.stride_w + 1; + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + Im2SequenceParam ¶m); + +private: + int N, C, H, W; + int output_height; + int output_width; + int kernel_extern_h; + int kernel_extern_w; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_IM2SEQUENCE_H diff --git a/saber/funcs/impl/x86/saber_layer_norm.cpp b/saber/funcs/impl/x86/saber_layer_norm.cpp new file mode 100644 index 000000000..6b64fb396 --- /dev/null +++ b/saber/funcs/impl/x86/saber_layer_norm.cpp @@ -0,0 +1,48 @@ +#include "saber/funcs/impl/x86/saber_layer_norm.h" +#include + +namespace anakin{ + +namespace saber{ + +template +SaberStatus SaberLayerNorm::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + LayerNormParam ¶m) { + + const OpDataType* src = (const OpDataType*)inputs[0]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); + const OpDataType* bias = (const OpDataType*)(param.bias_weights()->data()); + const OpDataType* scale = (const OpDataType*)(param.scale_weights()->data()); + + for (int i = 0; i < outer_size; ++i) { + OpDataType mean = 0; + OpDataType std = 0; + const OpDataType* src_ptr = src + i * inner_size; + OpDataType* dst_ptr = dst + i * inner_size; + for (int j = 0; j < inner_size; ++j) { + mean += src_ptr[j]; + } + mean /= inner_size; + for (int j = 0; j < inner_size; ++j) { + std += (src_ptr[j] - mean) * (src_ptr[j] - mean); + } + std = std / inner_size; + //printf("std pre: %.6f\n", std); + std = 1.f / (sqrtf(std) + param.eps); + //printf("mean: %.6f, std: %.6f\n", mean, std); + for (int j = 0; j < inner_size; ++j) { + dst_ptr[j] = (flag_scale? scale[j] : 1) * (src_ptr[j] - mean) * std + (flag_bias? 
bias[j] : 0); + } + } + + return SaberSuccess; +} + +template class SaberLayerNorm; +DEFINE_OP_TEMPLATE(SaberLayerNorm, LayerNormParam, X86, AK_INT16); +DEFINE_OP_TEMPLATE(SaberLayerNorm, LayerNormParam, X86, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_layer_norm.h b/saber/funcs/impl/x86/saber_layer_norm.h new file mode 100644 index 000000000..2350b9899 --- /dev/null +++ b/saber/funcs/impl/x86/saber_layer_norm.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LAYER_NORM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LAYER_NORM_H + +#include "saber/funcs/impl/impl_layer_norm.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberLayerNorm:public ImplBase > { + +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberLayerNorm() = default; + ~SaberLayerNorm() {} + + virtual SaberStatus init(const std::vector* >& inputs, + std::vector* >& outputs, + LayerNormParam ¶m, + Context &ctx) { + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector* >& inputs, + std::vector* >& outputs, + LayerNormParam ¶m, + Context &ctx) { + + inner_size = inputs[0]->count_valid(param.axis, inputs[0]->dims()); + outer_size = inputs[0]->count_valid(0, param.axis); + + if (param.scale_weights()->valid_size() == 0) { + flag_scale = false; + } else { + flag_scale = true; + } + if (param.bias_weights()->valid_size() == 0) { + flag_bias = false; + } else { + flag_bias = true; + } + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector* >& inputs, + std::vector* >& outputs, + LayerNormParam ¶m); + + +private: + int inner_size; + int outer_size; + bool flag_scale{true}; + bool flag_bias{true}; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LAYER_NORM_H diff --git a/saber/funcs/impl/x86/saber_lstm.cpp b/saber/funcs/impl/x86/saber_lstm.cpp new file mode 100644 index 000000000..c6eb97be3 --- /dev/null +++ b/saber/funcs/impl/x86/saber_lstm.cpp @@ -0,0 +1,303 @@ +#include "saber/funcs/impl/x86/saber_lstm.h" +#include "sys/time.h" +#include "saber_normal_activation.h" +#include "mkl_cblas.h" + + +namespace anakin { + +namespace saber { + + +//inline +static void gemm(const bool TransA, const bool TransB, int m, int n, int k, const float alpha, + const float* a, const float* b, const float beta, float* c) { + // cout << "(" << m << "," << n 
<< "," << k << ")" << endl; + int lda = (!TransA/* == CblasNoTrans*/) ? k : m; + int ldb = (!TransB/* == CblasNoTrans*/) ? n : k; + CBLAS_TRANSPOSE cuTransA = + (!TransA/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE cuTransB = + (!TransB/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; + cblas_sgemm(CblasRowMajor, cuTransA, cuTransB, m, n, k, alpha, a, k, b, n, beta, c, n); +}; + +template +static inline void cal_first_lstm_nullhidden(int emit_word_id_start,int emit_word_id_end,OpDataType* temp_wx,const OpDataType* weight_peephole, + OpDataType* hout,OpDataType* inner_cell,const BIT* b_i, const BIT* b_f, const BIT* b_c, const BIT* b_o, + ActiveType gate_activity, ActiveType cell_activity, ActiveType candi_activity,int hidden_size){ + const int i_offset = 0; + const int c_offset = 2; + const int o_offset = 3; + BIT(*gate_act)(const BIT) = Activate_inner(gate_activity); + BIT(*cell_act)(const BIT) = Activate_inner(cell_activity); + BIT(*candi_act)(const BIT) = Activate_inner(candi_activity); + for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { + int emit_wx_offset = emit_word_id * hidden_size * 4; + const BIT* w_x_i = (BIT*)(temp_wx + i_offset * hidden_size + emit_wx_offset); + const BIT* w_x_c = (BIT*)(temp_wx + c_offset * hidden_size + emit_wx_offset); + const BIT* w_x_o = (BIT*)(temp_wx + o_offset * hidden_size + emit_wx_offset); + + const BIT* w_co = (BIT*)(weight_peephole + 2 * hidden_size); + int emit_id_offset = emit_word_id - emit_word_id_start; + BIT* gate_h_p = (BIT*)(hout + emit_id_offset * hidden_size); + BIT* gate_c_p = (BIT*)(inner_cell + emit_id_offset * hidden_size); + + if(with_peephole) { + for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); + ++frame_id) { + BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id]); + BIT gate_c_s = cell_act(w_x_c[frame_id] + b_c[frame_id]); + BIT gate_c = gate_i * gate_c_s; + BIT gate_o = gate_act(w_x_o[frame_id] + 
b_o[frame_id] + gate_c * w_co[frame_id]); + gate_c_p[frame_id] = gate_c; + gate_h_p[frame_id] = gate_o * candi_act(gate_c); + } + } else{ + for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); + ++frame_id) { + BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id]); + BIT gate_c_s = cell_act(w_x_c[frame_id] + b_c[frame_id]); + BIT gate_c = gate_i * gate_c_s; + BIT gate_o = gate_act(w_x_o[frame_id] + b_o[frame_id]); + gate_c_p[frame_id] = gate_c; + gate_h_p[frame_id] = gate_o * candi_act(gate_c); + } + } + } +} + +template +static inline void cal_lstm_batch(int emit_word_id_start,int emit_word_id_end,OpDataType* temp_wx,const OpDataType* weight_peephole, + OpDataType* hout,OpDataType* inner_cell,const BIT* b_i, const BIT* b_f, const BIT* b_c, const BIT* b_o, + ActiveType gate_activity, ActiveType cell_activity, ActiveType candi_activity,int hidden_size){ + const int i_offset = 0; + const int f_offset = 1; + const int c_offset = 2; + const int o_offset = 3; + BIT(*gate_act)(const BIT) = Activate_inner(gate_activity); + BIT(*cell_act)(const BIT) = Activate_inner(cell_activity); + BIT(*candi_act)(const BIT) = Activate_inner(candi_activity); + for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { + int emit_wx_offset = emit_word_id * hidden_size * 4; + const BIT* w_x_i = (BIT*)(temp_wx + i_offset * hidden_size + emit_wx_offset); + const BIT* w_x_f = (BIT*)(temp_wx + f_offset * hidden_size + emit_wx_offset); + const BIT* w_x_c = (BIT*)(temp_wx + c_offset * hidden_size + emit_wx_offset); + const BIT* w_x_o = (BIT*)(temp_wx + o_offset * hidden_size + emit_wx_offset); + + int emit_id_offset = emit_word_id - emit_word_id_start; + + const BIT* w_ci = (BIT*)(weight_peephole + 0 * hidden_size); + const BIT* w_cf = (BIT*)(weight_peephole + 1 * hidden_size); + const BIT* w_co = (BIT*)(weight_peephole + 2 * hidden_size); + + BIT* gate_h_p = (BIT*)(hout + emit_id_offset * hidden_size); + BIT* gate_c_p = 
(BIT*)(inner_cell + emit_id_offset * hidden_size); + + if(with_peephole) { + for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); + ++frame_id) { + BIT c_1 = gate_c_p[frame_id]; + BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id] + w_ci[frame_id] * c_1); + BIT gate_f = gate_act(w_x_f[frame_id] + b_f[frame_id] + w_cf[frame_id] * c_1); + BIT gate_c_s = cell_act(w_x_c[frame_id] + b_c[frame_id]); + BIT gate_c = gate_f * c_1 + gate_i * gate_c_s; + BIT gate_o = gate_act(w_x_o[frame_id] + b_o[frame_id] + gate_c * w_co[frame_id]); + gate_c_p[frame_id] = gate_c; + gate_h_p[frame_id] = gate_o * candi_act(gate_c); + + } + }else{ + for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); + ++frame_id) { + BIT c_1 = gate_c_p[frame_id]; + BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id]); + BIT gate_f = gate_act(w_x_f[frame_id] + b_f[frame_id]); + BIT gate_c_s = cell_act(w_x_c[frame_id] + b_c[frame_id]); + BIT gate_c = gate_f * c_1 + gate_i * gate_c_s; + BIT gate_o = gate_act(w_x_o[frame_id] + b_o[frame_id]); + gate_c_p[frame_id] = gate_c; + gate_h_p[frame_id] = gate_o * candi_act(gate_c); + } + } + } +} + +template<> +template +SaberStatus SaberLstm:: +avx_dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) { + + int loop_div = sizeof(BIT) / sizeof(OpDataType); + const OpDataType* weight_h = (const OpDataType*)_aligned_weights_h2h.data(); + const OpDataType* weight_w = (const OpDataType*)_aligned_weights_i2h.data(); + const OpDataType* bias = (const OpDataType*)_aligned_weights_bias.data(); + const OpDataType* weight_peephole = (const OpDataType*)_aligned_weights_peephole.data(); + BIT(*gate_act)(const BIT) = Activate_inner(param.gate_activity); + BIT(*cell_act)(const BIT) = Activate_inner(param.cell_activity); + BIT(*candi_act)(const BIT) = Activate_inner(param.candidate_activity); + + std::vector offset_vec = inputs[0]->get_seq_offset()[inputs[0]->get_seq_offset().size()-1]; + 
std::vector length_vec(offset_vec.size() - 1); + int batch_size = offset_vec.size() - 1; + int seqsum = 0; + int max_seq_len = 0; + bool is_hw2seq = offset_vec.size() > 2; + int word_sum = is_hw2seq ? offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); + utils::AlignedUtils aligned_utils; + const OpDataType* h_init = nullptr; + const OpDataType* cell_init = nullptr; + + const OpDataType* x = (const OpDataType*)inputs[0]->data(); + OpDataType* out = (OpDataType*)outputs[0]->mutable_data(); + bool is_reverse = param.is_reverse; + + if (inputs.size() > 1) { + h_init = (const OpDataType*)inputs[1]->data(); + utils::try_expand_tensor(_aligned_init_hidden,batch_size * _aligned_hidden_size); + aligned_utils.aligned_last_dim(h_init, (OpDataType*)_aligned_init_hidden.mutable_data(), + batch_size * _hidden_size, _hidden_size, _aligned_hidden_size); + h_init = (const OpDataType*)_aligned_init_hidden.data(); + } else if (param.init_hidden() != nullptr) { + h_init =(const OpDataType*) param.init_hidden()->data(); + //FIXME:is it correct? + } else { + // _aligned_init_hidden.try_expand_tensor(batch_size * _aligned_hidden_size); + // _aligned_init_celll.try_expand_tensor(batch_size * _aligned_hidden_size); + // h_init = _aligned_init_hidden.data(); + // cell_init=_aligned_init_celll.data(); + } + + std::vector emit_offset_vec; + int emit_length = 0; + utils::SeqSortedseqTranseUtil transe_util(is_reverse); + bool transform = transe_util.get_sorted_map(offset_vec, emit_offset_vec, emit_length); + + OpDataType* inner_h_out = out; + OpDataType* inner_cell = nullptr; + const OpDataType* inner_x = x; + const OpDataType* inner_h_init = h_init; + + for (int i = 0; i < offset_vec.size() - 1; ++i) { + int len = offset_vec[i + 1] - offset_vec[i]; + length_vec[i] = len; + max_seq_len = max_seq_len > len ? 
max_seq_len : len; + seqsum += len; + } + + utils::try_expand_tensor(_temp_wx,seqsum * 4 * _aligned_hidden_size); + utils::try_expand_tensor(_temp_wh,batch_size * 4 * _aligned_hidden_size); + utils::try_expand_tensor(_temp_out,seqsum * _aligned_hidden_size * param.num_direction); + utils::try_expand_tensor(_temp_cell,batch_size * _aligned_hidden_size); + + if (transform) { + utils::try_expand_tensor(_temp_x,seqsum * _word_size); + inner_h_out = (OpDataType*)_temp_out.mutable_data(); + inner_x = (OpDataType*)_temp_x.mutable_data(); + transe_util.seq_2_sorted_seq(x, (OpDataType*)inner_x, _word_size); + + if (inner_h_init != nullptr) { + utils::try_expand_tensor(_temp_h_init,batch_size * _aligned_hidden_size); + transe_util.hidden_2_sorted_hidden(inner_h_init, (OpDataType*)_temp_h_init.mutable_data(), _aligned_hidden_size); + inner_h_init = (const OpDataType*)_temp_h_init.data(); + } + } else if (_hidden_size != _aligned_hidden_size) { + inner_h_out = (OpDataType*)_temp_out.mutable_data(); + } + + inner_cell = (OpDataType*)_temp_cell.mutable_data(); + memset(inner_cell, 0, _temp_cell.valid_size()* sizeof(OpDataType)); + + OpDataType* temp_wh = (OpDataType*)_temp_wh.mutable_data(); + OpDataType* temp_wx = (OpDataType*)_temp_wx.mutable_data(); + + gemm(false, false, seqsum, 4 * _aligned_hidden_size, _word_size, 1.f, inner_x, weight_w, 0.f, + temp_wx); + + const int i_offset = 0; + const int f_offset = 1; + const int c_offset = 2; + const int o_offset = 3; + const BIT* b_i = (BIT*)(bias + i_offset * _aligned_hidden_size); + const BIT* b_f = (BIT*)(bias + f_offset * _aligned_hidden_size); + const BIT* b_c = (BIT*)(bias + c_offset * _aligned_hidden_size); + const BIT* b_o = (BIT*)(bias + o_offset * _aligned_hidden_size); + + for (int word_id = 0; word_id < emit_length; word_id++) { + int real_word_id = word_id; + int last_word_id = word_id - 1; + + if (param.is_reverse && batch_size == 1) { + real_word_id = emit_length - word_id - 1; + last_word_id = real_word_id + 1; + } 
+ + int emit_word_id_start = emit_offset_vec[real_word_id]; + int emit_word_id_end = emit_offset_vec[real_word_id + 1]; + int emit_word_length = emit_word_id_end - emit_word_id_start; + const float* hin; + + if (word_id == 0 && inner_h_init == nullptr) { + float* hout = nullptr; + hout = emit_offset_vec[real_word_id] * _aligned_hidden_size + inner_h_out; + + cal_first_lstm_nullhidden(emit_word_id_start,emit_word_id_end,temp_wx,weight_peephole, + hout,inner_cell,b_i,b_f,b_c,b_o, + param.gate_activity, param.cell_activity, param.candidate_activity, _aligned_hidden_size); + + continue; + + } else if (word_id == 0) { + hin = inner_h_init; + } else { + hin = inner_h_out + emit_offset_vec[last_word_id] * _aligned_hidden_size; + } + + float* hout = nullptr; + hout = emit_offset_vec[real_word_id] * _aligned_hidden_size + inner_h_out; + + //wh + gemm(false, false, emit_word_length, 4 * _aligned_hidden_size, _aligned_hidden_size, 1.0, hin, + weight_h, + 1.f, temp_wx+emit_word_id_start*4*_aligned_hidden_size); + + cal_lstm_batch(emit_word_id_start,emit_word_id_end,temp_wx,weight_peephole, + hout,inner_cell,b_i,b_f,b_c,b_o, + param.gate_activity, param.cell_activity, param.candidate_activity, _aligned_hidden_size); + } + + + if (transform) { + transe_util.sorted_seq_2_seq(inner_h_out, out, _hidden_size, _aligned_hidden_size); + } else if (_hidden_size != _aligned_hidden_size) { + aligned_utils.unaligned_last_dim((OpDataType*)_temp_out.data(), out, seqsum * _hidden_size, _hidden_size, + _aligned_hidden_size); + } + return SaberSuccess; +} + + +template<> +SaberStatus SaberLstm:: +dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) { + CHECK_EQ(inputs.size(), 1) << "only support input size = 1"; + CHECK_EQ(outputs.size(), 1) << "only support outputs size = 1"; + CHECK_EQ(param.init_hidden() == nullptr, true) << "only support param.init_hidden() == nullptr"; + CHECK_EQ(param.num_layers, 1) << "only support param.num_layers==1"; + + if 
(param.with_peephole) { + avx_dispatch(inputs, outputs, param); + } else { + avx_dispatch(inputs, outputs, param); + } + return SaberSuccess; +} + +DEFINE_OP_TEMPLATE(SaberLstm, LstmParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberLstm, LstmParam, X86, AK_INT8); +} +} diff --git a/saber/funcs/impl/x86/saber_lstm.h b/saber/funcs/impl/x86/saber_lstm.h new file mode 100644 index 000000000..57767263b --- /dev/null +++ b/saber/funcs/impl/x86/saber_lstm.h @@ -0,0 +1,134 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTM_H +#include "saber/funcs/impl/impl_lstm.h" +#include "saber_funcs_param.h" +#include "saber/funcs/impl/x86/x86_utils.h" + + +#if defined(__AVX512F__) +#define SABER_X86_TYPE __m512 +#elif defined(__AVX2__) and defined(__FMA__) +#define SABER_X86_TYPE __m256 +#elif defined(__SSE4_2__) and defined(__FMA__) +#define SABER_X86_TYPE __m128 +#else +#define SABER_X86_TYPE float +#endif + +//#define SABER_X86_TYPE __m128 + +namespace anakin { +namespace saber { + +template +class SaberLstm : + public ImplBase < + X86, OpDtype,LstmParam > { +public: + typedef typename DataTrait::Dtype OpDataType; +// typedef Tensor OpTensor; + SaberLstm() {} + + ~SaberLstm() {} + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx){ + if(param.with_peephole){ + _hidden_size=param.bias()->valid_size()/7; + }else{ + _hidden_size=param.bias()->valid_size()/4; + } + _word_size=(param.weight()->valid_size()-_hidden_size*_hidden_size*4)/_hidden_size/4; + + int weights_i2h_size=4*_hidden_size*_word_size; + int weights_h2h_size=4*_hidden_size*_hidden_size; + int weights_bias_size=4*_hidden_size; + int weights_peephole_size=3*_hidden_size; + + int aligned_byte= sizeof(SABER_X86_TYPE); + int c_size=aligned_byte/sizeof(OpDataType); + + _aligned_word_size=utils::round_up(_word_size,c_size); + _aligned_hidden_size=utils::round_up(_hidden_size,c_size); + + + Shape 
aligned_weights_i2h_shape({1,_word_size,4,_aligned_hidden_size}); + Shape aligned_weights_h2h_shape({1,_aligned_hidden_size,4,_aligned_hidden_size}); + Shape aligned_weights_bias_shape({1,1,4,_aligned_hidden_size}); + utils::try_expand_tensor(_aligned_weights_i2h,aligned_weights_i2h_shape); + utils::try_expand_tensor(_aligned_weights_h2h,aligned_weights_h2h_shape); + utils::try_expand_tensor(_aligned_weights_bias,aligned_weights_bias_shape); + + utils::AlignedUtils aligned_tool; + aligned_tool.aligned_last_dim((OpDataType*)(param.weight()->data()),(OpDataType*)_aligned_weights_i2h.mutable_data(), + weights_i2h_size,_hidden_size,_aligned_hidden_size); + + aligned_tool.aligned_last_dim((OpDataType*)(param.weight()->data()) + weights_i2h_size,(OpDataType*)_aligned_weights_h2h.mutable_data(), + weights_h2h_size,_hidden_size,_aligned_hidden_size); + + aligned_tool.aligned_last_dim((OpDataType*)param.bias()->data(),(OpDataType*)_aligned_weights_bias.mutable_data(), + weights_bias_size,_hidden_size,_aligned_hidden_size); + //FIXME:init weights tensor + if(param.with_peephole){ + Shape aligned_weights_peephole_shape({1,1,3,_aligned_hidden_size}); + utils::try_expand_tensor(_aligned_weights_peephole,aligned_weights_peephole_shape); + aligned_tool.aligned_last_dim((OpDataType*)(param.bias()->data())+weights_bias_size,(OpDataType*)_aligned_weights_peephole.mutable_data(), + weights_peephole_size,_hidden_size,_aligned_hidden_size); + } + + return create(inputs,outputs,param,ctx); + } ; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) { + return SaberSuccess; + }; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) ; + +private: + + int _word_size; + int _hidden_size; + int _aligned_word_size; + int _aligned_hidden_size; + + + Tensor _weights_i2h; + Tensor _weights_h2h; + Tensor _weights_bias; + Tensor _weights_peephole; + Tensor _init_hidden; + + Tensor 
_aligned_weights_i2h; + Tensor _aligned_weights_h2h; + Tensor _aligned_weights_bias; + Tensor _aligned_weights_peephole; + + Tensor _aligned_init_hidden; + + Tensor _temp_wx; + Tensor _temp_wh; + Tensor _temp_cell; + + Tensor _temp_x; + Tensor _temp_out; + Tensor _temp_h_init; + + template + SaberStatus avx_dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param); + + +}; + +} +} +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTM_H diff --git a/saber/funcs/impl/x86/saber_mvn.cpp b/saber/funcs/impl/x86/saber_mvn.cpp new file mode 100644 index 000000000..7940ffdd6 --- /dev/null +++ b/saber/funcs/impl/x86/saber_mvn.cpp @@ -0,0 +1,74 @@ + +#include "saber/funcs/impl/x86/saber_mvn.h" +#include "saber/funcs/impl/x86/x86_utils.h" + +namespace anakin{ +namespace saber { +/** + * @brief for each graph, do MVN(Mean-Variance Normalization): + * formula: + * (x - mean) / ( sqrt(var) + eps ) (the eps iterm avoid to divde 0). + * + * + * @tparam OpDtype + * @param inputs + * @param outputs + * @param param + * @return SaberStatus + */ +template +SaberStatus SaberMvn::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + MvnParam& param) +{ + int N = inputs[0]->num(); + int C = inputs[0]->channel(); + int H = inputs[0]->height(); + int W = inputs[0]->width(); + + const OpDataType* src = (const OpDataType*)inputs[0]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); + int num = N * C; + int inner_dim = H * W; + if (param.across_channels) { + num = N; + inner_dim *= C; //CHW + } + + for (int i = 0; i < num; i++) { + OpDataType mean = 0; + OpDataType std = 0; + OpDataType* dst_ptr = dst + i * inner_dim; + const OpDataType* src_ptr = src + i * inner_dim; + //compute mean + for (int j = 0; j < inner_dim; j++) { + mean += src_ptr[j]; + } + mean /= inner_dim; + //compute variance + for (int j = 0; j < inner_dim; ++j) { + std += (src_ptr[j] - mean) * (src_ptr[j] - mean); + } + std /= inner_dim; + std = 1.0f / (sqrtf(std) + 
param.eps); + // normalize: (x - mean)/(sqrt(var)+eps) + if (param.normalize_variance) { + for (int j = 0; j < inner_dim; j++) { + dst_ptr[j] = (src_ptr[j] - mean) * std; + } + }else { // normalize: x-mean; + for (int j = 0; j < inner_dim; j++) { + dst_ptr[j] = src_ptr[j] - mean; + } + } + } + + return SaberSuccess; +} + +template class SaberMvn; +DEFINE_OP_TEMPLATE(SaberMvn, MvnParam, X86, AK_INT8); +DEFINE_OP_TEMPLATE(SaberMvn, MvnParam, X86, AK_INT16); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_mvn.h b/saber/funcs/impl/x86/saber_mvn.h new file mode 100644 index 000000000..d4e462fcb --- /dev/null +++ b/saber/funcs/impl/x86/saber_mvn.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_MVN_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_MVN_H + +#include "saber/funcs/impl/impl_mvn.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberMvn: public ImplBase > { + +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberMvn() {} + + ~SaberMvn() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + MvnParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + MvnParam ¶m, + Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + MvnParam ¶m); + + +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_MVN_H diff --git a/saber/funcs/impl/x86/saber_normal_activation.h b/saber/funcs/impl/x86/saber_normal_activation.h new file mode 100644 index 000000000..b4efcb956 --- /dev/null +++ b/saber/funcs/impl/x86/saber_normal_activation.h @@ -0,0 +1,171 @@ + +#ifndef ANAKIN_SABER_NORMAL_ACTIVATION_H +#define ANAKIN_SABER_NORMAL_ACTIVATION_H + +#include "saber_types.h" + + +#include "saber_avx512_math.h" +#include "saber_avx2_math.h" +#include "saber_sse_math.h" + +namespace anakin { + +namespace saber { + + +template +inline Dtype InValidAct(Dtype a) { + CHECK_EQ(0, 1) << "InValidAct"; +} + +template +inline Dtype Sigmoid(const Dtype a) { + return static_cast(1.0) / (static_cast(1.0) + exp(-a)); +} + + +template +inline Dtype Tanh(const Dtype a) { + Dtype tmp = -2.0 * a; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +template +inline Dtype Relu(const Dtype a) { + return a > static_cast(0.0) ? 
a : static_cast(0.0); +} + +template +inline Dtype Identity(const Dtype a) { + return a; +} + +#if defined(__SSE4_2__) and defined(__FMA__) + + +template<> +inline __m128 Relu<__m128>(const __m128 a) { + __m128 tmp = _mm_set1_ps(0.0f); + return _mm_max_ps(a, tmp); +} + + +template<> +inline __m128 Sigmoid<__m128>(const __m128 a) { + __m128 tmp = a; + tmp = _mm_sub_ps(_mm_set1_ps(0.0f), tmp); + tmp = exp128_ps_fma(tmp); + tmp = _mm_add_ps(_mm_set1_ps(1.0f), tmp); + tmp = _mm_div_ps(_mm_set1_ps(1.0f), tmp); + return tmp; +} + + +template<> +inline __m128 Tanh<__m128>(const __m128 a) { + __m128 tmp = _mm_mul_ps(_mm_set1_ps(-2.0f), a); + tmp = exp128_ps_fma(tmp); + return _mm_sub_ps(_mm_div_ps(_mm_set1_ps(2.0f), + _mm_add_ps(_mm_set1_ps(1.0f), tmp)), + _mm_set1_ps(1.0f)); +} + + +#endif + + + + +#if defined(__AVX2__) and defined(__FMA__) + + +template<> +inline __m256 Relu<__m256>(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} + + +template<> +inline __m256 Sigmoid<__m256>(const __m256 a) { + __m256 tmp = a; + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = exp256_ps_fma(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} + +template<> +inline __m256 Tanh<__m256>(const __m256 a) { + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = exp256_ps_fma(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} + +#endif + + +#if defined(__AVX512F__) + + +template<> +inline __m512 Relu<__m512>(const __m512 a) { + __m512 tmp = _mm512_set1_ps(0.0f); + return _mm512_max_ps(a, tmp); +} + + +template<> +inline __m512 Sigmoid<__m512>(const __m512 a) { + __m512 tmp = a; + tmp = _mm512_sub_ps(_mm512_set1_ps(0.0f), tmp); + tmp = exp512_ps_fma(tmp); + tmp = _mm512_add_ps(_mm512_set1_ps(1.0f), tmp); + tmp = _mm512_div_ps(_mm512_set1_ps(1.0f), tmp); + return tmp; +} + +template<> 
+inline __m512 Tanh<__m512>(const __m512 a) { + __m512 tmp = _mm512_mul_ps(_mm512_set1_ps(-2.0f), a); + tmp = exp512_ps_fma(tmp); + return _mm512_sub_ps(_mm512_div_ps(_mm512_set1_ps(2.0f), + _mm512_add_ps(_mm512_set1_ps(1.0f), tmp)), + _mm512_set1_ps(1.0f)); +} + +#endif + + +template +struct ACTIVATION { + typedef Dtype(*Act)(const Dtype); +}; + +template +inline typename ACTIVATION::Act Activate_inner(ActiveType type) { + static typename ACTIVATION::Act vec[7] = {&InValidAct, &Sigmoid < Dtype >, &Relu < Dtype >, + &Tanh < Dtype >, + &InValidAct, &InValidAct, + &Identity < Dtype > + }; + return vec[type]; +} + +template +static inline Dtype Activate_inner(Dtype value,ActiveType type) { + static typename ACTIVATION::Act vec[7] = {&InValidAct, &Sigmoid < Dtype >, &Relu < Dtype >, + &Tanh < Dtype >, + &InValidAct, &InValidAct, + &Identity < Dtype > + }; + return vec[type](value); +} + +} +} +#endif //ANAKIN_SABER_NORMAL_ACTIVATION_H diff --git a/saber/funcs/impl/x86/saber_normalize.cpp b/saber/funcs/impl/x86/saber_normalize.cpp new file mode 100644 index 000000000..1be6ca22b --- /dev/null +++ b/saber/funcs/impl/x86/saber_normalize.cpp @@ -0,0 +1,126 @@ +#include "saber/funcs/impl/x86/saber_normalize.h" + +namespace anakin{ +namespace saber{ + +template class SaberNormalize; + +template <> +SaberStatus SaberNormalize::\ + dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + NormalizeParam ¶m){ + int p = param.p; + bool across_spatial = param.across_spatial; + bool has_scale = param.has_scale; + bool channel_shared = param.channel_shared; + float eps = param.eps; + int n = inputs[0]->num(); + int c = inputs[0]->channel(); + int h = inputs[0]->height(); + int w = inputs[0]->width(); + Tensor th_scale; + const float* scale; + if(has_scale){ + th_scale.re_alloc(param.scale->shape(), AK_FLOAT); + th_scale.copy_from(*param.scale); + scale=static_cast(th_scale.data()); + } + const float* src_ptr = static_cast(inputs[0]->data()); + float* dst_ptr = 
static_cast(outputs[0]->mutable_data()); + + if (across_spatial) { + int compute_size = h * w * c; + int outer_size = n * c * h * w / compute_size; + + for (int i = 0; i < outer_size; ++i) { + float sum = 0; + + for (int j = 0; j < compute_size; ++j) { + if (p == 1) { + sum += fabsf(src_ptr[j]); + } else { + sum += src_ptr[j] * src_ptr[j]; + } + } + + if (p == 1) { + sum = 1 / (sum + eps); + } else { + sum = 1 / sqrtf(sum+eps); + } + + if (has_scale) { //! with scale + if (channel_shared) { // scale is shared across channel + for (int j = 0; j < compute_size; ++j) { + dst_ptr[j] = src_ptr[j] * sum * scale[0]; + } + } else { + for (int j = 0; j < compute_size; ++j) { + int c_idx = j / (h * w); + dst_ptr[j] = src_ptr[j] * sum * scale[c_idx]; + } + } + } else { //! without scale + for (int j = 0; j < compute_size; ++j) { + dst_ptr[j] = src_ptr[j] * sum; + } + } + + src_ptr += compute_size; + dst_ptr += compute_size; + } + } else { + int channel_in_size = h * w; + + for (int i = 0; i < n; ++i) { + const float* src_batch_ptr = src_ptr + i * c * h * w; + float* dst_batch_ptr = dst_ptr + i * c * h * w; + + for (int j = 0; j < h; ++j) { + for (int k = 0; k < w; ++k) { + const float* src_pixel = src_batch_ptr + j * w + k; + float* dst_pixel = dst_batch_ptr + j * w + k; + float norm = 0.f; + //LOG(INFO)<<"c:"< + class SaberNormalize: + public ImplBase< + X86, OpDtype, + NormalizeParam > { + +public: + + typedef typename DataTrait::Dtype OpDataType; + + SaberNormalize() = default; + ~SaberNormalize() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + NormalizeParam ¶m, + Context &ctx) { + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + NormalizeParam ¶m, + Context &ctx) { + // compute norm size + int channel_index = inputs[0]->channel_index(); + _dims = inputs[0]->dims(); + _size = inputs[0]->valid_size(); + 
_channels = inputs[0]->channel(); + _batchs = inputs[0]->num(); + + //! check the scale size + if (param.has_scale) { + if (!param.channel_shared) { + CHECK_EQ(_channels, param.scale->valid_size()) << \ + "scale data size must = channels"; + } + } + + //! size of data to compute square root sum (eg. H * W for channel, C * H * W for batch) + if (param.across_spatial) { + _norm_size = _batchs; + } else { + _norm_size = _channels * _batchs; + } + _channel_stride = inputs[0]->count_valid(channel_index + 1, _dims); + _compute_size = _size / _norm_size; + Shape sh_norm({1, 1, 1, _norm_size}); + _norm_reduce.reshape(sh_norm); + + _is_continue_buf = outputs[0]->is_continue_mem() && inputs[0]->is_continue_mem(); + if (!_is_continue_buf) { + Shape sh_input_real_stride = inputs[0]->get_stride(); + Shape sh_output_real_stride = outputs[0]->get_stride(); + + //! re_alloc device memory + Shape sh({1, 1, 1, _dims}); + _valid_shape.reshape(sh); + _input_stride.reshape(sh); + _output_stride.reshape(sh); + + memcpy(_valid_shape.mutable_data(), inputs[0]->valid_shape().data(), sizeof(int) * _dims); + memcpy(_input_stride.mutable_data(), sh_input_real_stride.data(), sizeof(int) * _dims); + memcpy(_output_stride.mutable_data(), sh_output_real_stride.data(), sizeof(int) * _dims); + } + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + NormalizeParam ¶m); + + +private: + Tensor _norm_reduce; + int _size; + int _norm_size; + int _compute_size; + int _batchs; + int _channels; + int _dims; + int _channel_stride; + //todo: + Tensor _input_stride; + Tensor _output_stride; + Tensor _valid_shape; + + bool _is_continue_buf{true}; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_NORMALIZE_H diff --git a/saber/funcs/impl/x86/saber_permute.cpp b/saber/funcs/impl/x86/saber_permute.cpp new file mode 100644 index 000000000..1c71bef69 --- /dev/null +++ 
b/saber/funcs/impl/x86/saber_permute.cpp @@ -0,0 +1,112 @@ +#include "saber/funcs/impl/x86/saber_permute.h" + +namespace anakin{ +namespace saber{ +template class SaberPermute; + +template <> +SaberStatus SaberPermute::\ + create(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, Context &ctx) { + + Shape order_shape({_num_axes, 1, 1, 1}); + _in_steps.reshape(order_shape); + _out_steps.reshape(order_shape); + _out_valid_shape.reshape(order_shape); + + Shape in_stride = inputs[0]->get_stride(); + Shape out_stride = outputs[0]->get_stride(); + + memcpy(_in_steps.mutable_data(), &in_stride[0], sizeof(int) * _in_steps.size()); + memcpy(_out_steps.mutable_data(), &out_stride[0], sizeof(int) * _out_steps.size()); + memcpy(_out_valid_shape.mutable_data(), &((outputs[0]->valid_shape())[0]), sizeof(int) * _out_valid_shape.size()); + return SaberSuccess; +} + +template <> +SaberStatus SaberPermute::\ + init(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, Context &ctx) { + this->_ctx = &ctx; + _num_axes = inputs[0]->valid_shape().size(); + for (int i = 0; i < _num_axes; i++) { + if (std::find(_order_dims.begin(), _order_dims.end(), + param.order[i]) == _order_dims.end()) { + _order_dims.push_back(param.order[i]); + } + } + + CHECK_EQ(_num_axes, _order_dims.size()); + + // set _need_permute + _need_permute = false; + for (int i = 0; i < _num_axes; ++i) { + if (param.order[i] != i) { + _need_permute = true; + break; + } + } + Shape order_shape({_num_axes, 1, 1, 1}); + _permute_order.reshape(order_shape); + memcpy(_permute_order.mutable_data(), &(param.order[0]), sizeof(int) * _permute_order.size()); + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberPermute::\ + dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m){ + if (!_need_permute){ + outputs[0] -> copy_from(*inputs[0]); + return SaberSuccess; + } + const float* src_ptr = static_cast(inputs[0] -> data()); + float* 
dst_ptr = static_cast(outputs[0] -> mutable_data()); + std::vector orders = param.order; + int out_size = outputs[0] -> valid_size(); + int num_axes = inputs[0] -> valid_shape().size(); + std::vector new_steps = outputs[0] -> get_stride(); + std::vector old_steps = inputs[0] -> get_stride(); + std::vector new_valid_shape = outputs[0] -> valid_shape(); + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()){ + for (int j=0; j= 0; --i) { + int order = orders[i]; + int new_step = new_steps[i]; + int old_step = old_steps[order]; + int id = (j / new_valid_stride) % new_valid_shape[i]; + in_idx += id * old_step; + out_idx += id * new_step; + new_valid_stride *= new_valid_shape[i]; + } + dst_ptr[out_idx] = src_ptr[in_idx]; + } + } + return SaberSuccess; + +} + + +} //namespace saber + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_permute.h b/saber/funcs/impl/x86/saber_permute.h new file mode 100644 index 000000000..5461735fb --- /dev/null +++ b/saber/funcs/impl/x86/saber_permute.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_H + +#include "saber/funcs/impl/impl_permute.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberPermute:\ + public ImplBase< + X86, + OpDtype, + PermuteParam> { + +public: + + SaberPermute() {} + ~SaberPermute() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, + Context &ctx); + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m, + Context &ctx); + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PermuteParam ¶m); + +private: + int _num_axes; + bool _need_permute; + std::vector _order_dims; + Tensor _permute_order; + Tensor _in_steps; + Tensor _out_steps; + Tensor _out_valid_shape; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_H diff --git a/saber/funcs/impl/x86/saber_permute_power.cpp b/saber/funcs/impl/x86/saber_permute_power.cpp new file mode 100644 index 000000000..e7735bfc1 --- /dev/null +++ b/saber/funcs/impl/x86/saber_permute_power.cpp @@ -0,0 +1,124 @@ +#include "saber/funcs/impl/x86/saber_permute_power.h" + +namespace anakin{ +namespace saber{ + +template class SaberPermutePower; + +template <> +SaberStatus SaberPermutePower::\ + dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam ¶m){ + const float* src_ptr = static_cast(inputs[0] -> data()); + float* dst_ptr = static_cast(outputs[0] -> mutable_data()); + + float p = param.power_param.power; + float scale = param.power_param.scale; + float shift = param.power_param.shift; + + if (!_need_permute){ + outputs[0] -> copy_from(*inputs[0]); + } else { + std::vector orders = param.permute_param.order; + int out_size = outputs[0] -> valid_size(); + int num_axes = outputs[0] -> valid_shape().size(); + std::vector new_steps = outputs[0] -> get_stride(); + 
std::vector old_steps = inputs[0] -> get_stride(); + std::vector new_valid_shape = outputs[0] -> valid_shape(); + if (outputs[0] -> is_continue_mem() && inputs[0] -> is_continue_mem()){ + for (int j=0; j= 0; --i) { + int order = orders[i]; + int new_step = new_steps[i]; + int old_step = old_steps[order]; + int id = (j / new_valid_stride) % new_valid_shape[i]; + in_idx += id * old_step; + out_idx += id * new_step; + new_valid_stride *= new_valid_shape[i]; + } + if (p == 1){ + dst_ptr[out_idx] = src_ptr[in_idx]*scale + shift; + } else { + dst_ptr[out_idx] = pow(src_ptr[in_idx]*scale + shift, p); + } + } + } + }//if !need_permute + + //if _need_permute is false, do power individually + if (!_need_permute){ + int out_size = outputs[0] -> valid_size(); + + if (outputs[0] -> is_continue_mem() && inputs[0] -> is_continue_mem()){ + if (p ==1){ + for (int i=0; i < out_size; ++i){ + dst_ptr[i] = dst_ptr[i] * scale + shift; + } + } else { + for (int i=0; i < out_size; ++i){ + dst_ptr[i] = pow(dst_ptr[i] * scale + shift, p); + } + } + } else { + int num_axes = outputs[0] -> valid_shape().size(); + std::vector new_steps = outputs[0] -> get_stride(); + std::vector old_steps = inputs[0] -> get_stride(); + std::vector new_valid_shape = outputs[0] -> valid_shape(); + + if (p ==1){ + for (int i=0; i=0; --axis_id){ + int id = (i / new_valid_stride) % new_valid_shape[axis_id]; + in_idx += id*old_steps[axis_id]; + out_idx += id*new_steps[axis_id]; + new_valid_stride *= new_valid_shape[axis_id]; + } + dst_ptr[out_idx] = dst_ptr[in_idx] *scale + shift; + } + } else { + for (int i=0; i=0; --axis_id){ + int id = (i / new_valid_stride) % new_valid_shape[axis_id]; + in_idx += id*old_steps[axis_id]; + out_idx += id*new_steps[axis_id]; + new_valid_stride *= new_valid_shape[axis_id]; + } + dst_ptr[out_idx] = pow(dst_ptr[in_idx] *scale + shift, p); + } + }//if p=1 + }//if is_continue_mem + } + return SaberSuccess; +} + + +} +} diff --git a/saber/funcs/impl/x86/saber_permute_power.h 
b/saber/funcs/impl/x86/saber_permute_power.h new file mode 100644 index 000000000..78c2de22e --- /dev/null +++ b/saber/funcs/impl/x86/saber_permute_power.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_POWER_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_POWER_H + +#include "saber/funcs/impl/impl_permute_power.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberPermutePower:\ + public ImplBase< + X86, + OpDtype, + PermutePowerParam > { + +public: + + SaberPermutePower() {} + + ~SaberPermutePower() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam ¶m, + Context &ctx) { + _num_axes = inputs[0]->shape().size(); + PermuteParam permute_param = param.permute_param; + for (int i = 0; i < _num_axes; i++) { + if (std::find(_order_dims.begin(), _order_dims.end(), permute_param.order[i]) == _order_dims.end()) { + _order_dims.push_back(permute_param.order[i]); + } + } + CHECK_EQ(_num_axes, _order_dims.size()); + + // set _need_permute + _need_permute = false; + for (int i = 0; i < _num_axes; ++i) { + if (permute_param.order[i] != i) { + _need_permute = true; + break; + } + } + 
Shape order_shape({_num_axes, 1, 1, 1}); + _permute_order.re_alloc(order_shape, OpDtype); + _old_steps.re_alloc(order_shape, OpDtype); + _new_steps.re_alloc(order_shape, OpDtype); + _out_valid_shape.re_alloc(order_shape, OpDtype); + Shape in_stride = inputs[0]->get_stride(); + Shape out_stride = outputs[0]->get_stride(); + Shape out_valid_shape = outputs[0]->valid_shape(); + memcpy(_old_steps.mutable_data(), &in_stride[0], sizeof(int) * _num_axes); + memcpy(_new_steps.mutable_data(), &out_stride[0], sizeof(int) * _num_axes); + memcpy(_permute_order.mutable_data(), &(permute_param.order[0]), sizeof(int) * _num_axes); + memcpy(_out_valid_shape.mutable_data(), &out_valid_shape[0], sizeof(int) * _num_axes); + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PermutePowerParam ¶m); + +private: + int _num_axes; + bool _need_permute; + std::vector _order_dims; + Tensor _permute_order; + Tensor _out_valid_shape; + Tensor _old_steps; + Tensor _new_steps; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_POWER_H diff --git a/saber/funcs/impl/x86/saber_pooling.cpp b/saber/funcs/impl/x86/saber_pooling.cpp index 5c2118d82..49215f6f4 100644 --- a/saber/funcs/impl/x86/saber_pooling.cpp +++ b/saber/funcs/impl/x86/saber_pooling.cpp @@ -7,150 +7,160 @@ namespace saber { using namespace jit; -template class SaberPooling; - -template -SaberStatus SaberPooling::init( - const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m, Context &ctx) -{ - - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - - return create(inputs, outputs, param, ctx); +template class SaberPooling; + +template <> +SaberStatus SaberPooling::init_conf( + jit_pool_conf_t &jpp, const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m) { + + using 
namespace utils; + + Shape src_shape(inputs[0]->shape()); + Shape dst_shape(outputs[0]->shape()); + bool ok = true + && mayiuse(avx512_common) + // && std::is_same::value + // && std::is_same::value + && one_of(param.pooling_type, Pooling_max, + Pooling_average_include_padding, + Pooling_average_exclude_padding); + if (!ok) { + return SaberUnImplError; + } + + const int simd_w = 16; + const int ndims = 4; + + jpp.ndims = ndims; + jpp.mb = src_shape[0]; + jpp.c = src_shape[1] * 16; + jpp.id = (ndims == 5) ? src_shape[2] : 1; + jpp.ih = src_shape[ndims - 2]; + jpp.iw = src_shape[ndims - 1]; + jpp.od = (ndims == 5) ? dst_shape[2] : 1; + jpp.oh = dst_shape[ndims - 2]; + jpp.ow = dst_shape[ndims - 1]; + + jpp.stride_d = 1; + jpp.stride_h = param.stride_h; + jpp.stride_w = param.stride_w; + jpp.kd = 1; + jpp.kh = param.window_h; + jpp.kw = param.window_w; + + jpp.f_pad = 0; + jpp.t_pad = param.pad_h; + jpp.l_pad = param.pad_w; + + jpp.alg = param.pooling_type; + + jpp.ind_dt = AK_FLOAT; + + jpp.simple_alg = false; + + jpp.c_block = simd_w; + + jpp.nb_c = jpp.c / jpp.c_block; + if (jpp.alg == Pooling_max) { + jpp.ur_w = 16; + } else { + jpp.ur_w = 24; + } + + if (jpp.ow < jpp.ur_w) { + jpp.ur_w = jpp.ow; + } + if (jpp.l_pad > jpp.ur_w) { + return SaberUnImplError; + } + + jpp.ur_w_tail = jpp.ow % jpp.ur_w; + if (jit_uni_pool_kernel_f32::init_conf(jpp)) { + return SaberSuccess; + } else { + return SaberUnImplError; + } } -template -SaberStatus SaberPooling::create( - const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m, - Context &ctx) -{ +template <> +SaberStatus SaberPooling::create( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, + Context &ctx){ + jit_pool_conf_t jpp_; if(init_conf(jpp_, inputs, outputs, param) != SaberSuccess) { return SaberUnImplError; } - kernel_ = new jit_uni_pool_kernel_f32(jpp_); + _kernel = new jit_uni_pool_kernel_f32(jpp_); return SaberSuccess; } -template -SaberStatus SaberPooling - ::dispatch(const 
std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - - return SaberSuccess; - +template <> +SaberStatus SaberPooling::init( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, Context &ctx){ + + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); } - -template -SaberStatus SaberPooling::init_conf( - jit_pool_conf_t &jpp, const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m) { - - using namespace utils; - - Shape src_shape(inputs[0]->shape()); - Shape dst_shape(outputs[0]->shape()); - bool ok = true - && mayiuse(avx512_common) - && std::is_same::value - && std::is_same::value - && one_of(param.pooling_type, Pooling_max, - Pooling_average_include_padding, - Pooling_average_exclude_padding); - if (!ok) { - return SaberUnImplError; - } - - const int simd_w = 16; - const int ndims = 4; - - jpp.ndims = ndims; - jpp.mb = src_shape[0]; - jpp.c = src_shape[1] * 16; - jpp.id = (ndims == 5) ? src_shape[2] : 1; - jpp.ih = src_shape[ndims - 2]; - jpp.iw = src_shape[ndims - 1]; - jpp.od = (ndims == 5) ? 
dst_shape[2] : 1; - jpp.oh = dst_shape[ndims - 2]; - jpp.ow = dst_shape[ndims - 1]; - - jpp.stride_d = 1; - jpp.stride_h = param.stride_h; - jpp.stride_w = param.stride_w; - jpp.kd = 1; - jpp.kh = param.window_h; - jpp.kw = param.window_w; - - jpp.f_pad = 0; - jpp.t_pad = param.pad_h; - jpp.l_pad = param.pad_w; - - jpp.alg = param.pooling_type; - - jpp.ind_dt = AK_FLOAT; - - jpp.simple_alg = false; - - jpp.c_block = simd_w; - - jpp.nb_c = jpp.c / jpp.c_block; - if (jpp.alg == Pooling_max) { - jpp.ur_w = 16; - } else { - jpp.ur_w = 24; - } - - if (jpp.ow < jpp.ur_w) { - jpp.ur_w = jpp.ow; - } - if (jpp.l_pad > jpp.ur_w) { + +template <> +SaberStatus SaberPooling + ::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m){ + + if (!mayiuse(avx512_common)) { return SaberUnImplError; } - - jpp.ur_w_tail = jpp.ow % jpp.ur_w; - if (jit_uni_pool_kernel_f32::init_conf(jpp)) { - return SaberSuccess; - } else { - return SaberUnImplError; + const float *src = (const float*)inputs[0]->data(); + float *dst = (float*)outputs[0]->mutable_data(); + + const auto &jpp = _kernel->jpp; + + auto ker = [&](int n, int b_c, int oh) { + jit_pool_call_t arg; + + const int ij = oh * jpp.stride_h; + const int i_t_overflow = std::max(0, jpp.t_pad - ij); + const int i_b_overflow = std::max(jpp.ih, ij + jpp.kh - jpp.t_pad) - jpp.ih; + const int ih = std::max(ij - jpp.t_pad, 0); + + // TODO verify the calulation + int index = n * jpp.ih * jpp.iw * jpp.c + b_c * jpp.iw * jpp.ih * jpp.c_block + ih * jpp.iw * jpp.c_block; + arg.src = &src[index]; + index = n * jpp.oh * jpp.ow * jpp.c + b_c * jpp.ow * jpp.oh * jpp.c_block + oh * jpp.ow * jpp.c_block; + arg.dst = &dst[index]; + + arg.oh = (oh == 0); + arg.kh_padding = jpp.kh - i_t_overflow - i_b_overflow; + arg.kh_padding_shift = i_t_overflow * jpp.kw; + arg.kw_padding = 0; + arg.ker_area_h = (float)(jpp.kh - + std::max(0, oh * jpp.stride_h - jpp.t_pad + jpp.kh - jpp.ih) - + std::max(0, jpp.t_pad - oh * jpp.stride_h)); + 
(*_kernel)(&arg); + }; + +#pragma omp parallel for collapse(3) schedule(static) + for (int n = 0; n < jpp.mb; ++n) { + for (int b_c = 0; b_c < jpp.nb_c; ++b_c) { + for (int oh = 0; oh < jpp.oh; ++oh) { + ker(n, b_c, oh); + } + } } + + return SaberSuccess; } -template class SaberPooling; + +DEFINE_OP_TEMPLATE(SaberPooling, PoolingParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberPooling, PoolingParam, X86, AK_INT8); } } // namespace anakin diff --git a/saber/funcs/impl/x86/saber_pooling.h b/saber/funcs/impl/x86/saber_pooling.h index f52115fc9..94393e9b0 100644 --- a/saber/funcs/impl/x86/saber_pooling.h +++ b/saber/funcs/impl/x86/saber_pooling.h @@ -1,21 +1,24 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POOLING_H #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POOLING_H #include "saber/funcs/impl/impl_pooling.h" +#include "saber/saber_funcs_param.h" #include "saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h" #include "saber/funcs/impl/x86/kernel/jit_generator.h" @@ -24,58 +27,49 @@ namespace saber { using namespace jit; -template -class SaberPooling : public ImplBase< - Tensor, - Tensor, - Tensor, - PoolingParam > > +template +class SaberPooling : public ImplBase< + X86, + OpDtype, + PoolingParam> { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; SaberPooling() - : kernel_(nullptr) - {} + : _kernel(nullptr) {} ~SaberPooling() { - if (kernel_ != nullptr) { - delete kernel_; + if (_kernel != nullptr) { + delete _kernel; } } virtual SaberStatus init(const std::vector& inputs, std::vector& outputs, - PoolingParam ¶m, + PoolingParam ¶m, Context &ctx) override; virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - PoolingParam ¶m, + PoolingParam ¶m, Context &ctx) override; virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, - PoolingParam ¶m) override; + PoolingParam ¶m) override; virtual SaberStatus init_conf(jit_pool_conf_t &jpp, const std::vector& inputs, std::vector& outputs, - PoolingParam& param); + PoolingParam& param); private: - jit_uni_pool_kernel_f32 *kernel_; + jit_uni_pool_kernel_f32* _kernel; }; } } -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/impl/x86/saber_pooling_with_index.cpp b/saber/funcs/impl/x86/saber_pooling_with_index.cpp new file mode 100644 index 000000000..6446ae5a4 --- /dev/null +++ b/saber/funcs/impl/x86/saber_pooling_with_index.cpp @@ -0,0 +1,80 @@ +#include "saber/funcs/impl/x86/saber_pooling_with_index.h" + +namespace anakin{ +namespace saber{ + +template class SaberPoolingWithIndex; + +template <> 
+SaberStatus SaberPoolingWithIndex::\ + dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m){ + if (!outputs[0] -> is_continue_mem() || !inputs[0] -> is_continue_mem()){ + LOG(ERROR) <<"pooling_with_index only support continue memory"; + return SaberUnImplError; + } + const float* src_ptr = static_cast(inputs[0] -> data()); + float* out_data_ptr = static_cast(outputs[0] -> mutable_data()); + float* out_index_ptr = static_cast(outputs[1] -> mutable_data()); + + int in_n = inputs[0] -> num(); + int in_c = inputs[0] -> channel(); + int in_h=inputs[0] -> height(); + int in_w=inputs[0] -> width(); + int size_in_n = in_c*in_h*in_w; + int size_in_c = in_h*in_w; + + int out_h = outputs[0] -> height(); + int out_w = outputs[0] -> width(); + int size_out_n = in_c*out_h*out_w; + int size_out_c = out_h*out_w; + + for(int ind_n = 0; ind_n < in_n; ++ind_n){ + for(int ind_c=0; ind_c < in_c; ++ind_c){ + for(int ind_h=0; ind_h 0) + { + sh=(sh - param.pad_h) < 0? 0 : sh-param.pad_h; + eh=(eh - param.pad_h)>in_h? in_h : eh-param.pad_h; + } + for(int ind_w=0; ind_w < out_w; ++ind_w){ + int sw = ind_w*param.stride_w; + int ew = sw + param.window_w; + if(param.pad_w > 0){ + sw = (sw - param.pad_w) < 0? 0 : sw-param.pad_w; + ew = (ew - param.pad_w) > in_w?in_w : ew-param.pad_w; + } + + float result; + float index; + + int dst_ind = ind_n*size_out_n + ind_c*size_out_c + ind_h*out_w + ind_w; + for(int kh = sh; kh= src_ptr[src_ind]? index : kh*in_w + kw; + result = result >= src_ptr[src_ind]? result : src_ptr[src_ind]; + } + + } + } + out_data_ptr[dst_ind] = result; + out_index_ptr[dst_ind] = index; + } + } + } + } + return SaberSuccess; + +} + + +} +} diff --git a/saber/funcs/impl/x86/saber_pooling_with_index.h b/saber/funcs/impl/x86/saber_pooling_with_index.h new file mode 100644 index 000000000..9836309f1 --- /dev/null +++ b/saber/funcs/impl/x86/saber_pooling_with_index.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POOLING_WITH_INDEX_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POOLING_WITH_INDEX_H + +#include "saber/funcs/impl/impl_pooling_with_index.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberPoolingWithIndex:\ + public ImplBase< + X86, OpDtype, + PoolingParam> { + +public: + typedef typename DataTrait :: Dtype dtype; + + SaberPoolingWithIndex() {} + + ~SaberPoolingWithIndex() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, \ + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam &power_param, + Context &ctx) { + Shape out_stride = outputs[0]->get_stride(); + Shape in_stride = inputs[0]->get_stride(); + int in_n_index = inputs[0]->num_index(); + int in_c_index = inputs[0]->channel_index(); + int in_h_index = inputs[0]->height_index(); + int in_w_index = inputs[0]->width_index(); + int out_n_index = outputs[0]->num_index(); + int out_c_index = outputs[0]->channel_index(); + int out_h_index = outputs[0]->height_index(); + int out_w_index = outputs[0]->width_index(); + _in_n_stride = in_stride[in_n_index]; + _in_c_stride = in_stride[in_c_index]; + _in_h_stride = in_stride[in_h_index]; + _in_w_stride = in_stride[in_w_index]; + _out_n_stride = out_stride[out_n_index]; + _out_c_stride = 
out_stride[out_c_index]; + _out_h_stride = out_stride[out_h_index]; + _out_w_stride = out_stride[out_w_index]; + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m); + +private: + int _in_n_stride; + int _in_c_stride; + int _in_h_stride; + int _in_w_stride; + int _out_n_stride; + int _out_c_stride; + int _out_h_stride; + int _out_w_stride; + +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POOLING_WITH_INDEX_H diff --git a/saber/funcs/impl/x86/saber_power.cpp b/saber/funcs/impl/x86/saber_power.cpp new file mode 100644 index 000000000..652829a4b --- /dev/null +++ b/saber/funcs/impl/x86/saber_power.cpp @@ -0,0 +1,65 @@ +#include "saber/funcs/impl/x86/saber_power.h" + +namespace anakin{ +namespace saber { + +template <> +SaberStatus SaberPower::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + PowerParam& param) { + const float p = param.power; + const float scale = param.scale; + const float shift = param.shift; + + const float* src_ptr = static_cast(inputs[0] -> data()); + float* dst_ptr = static_cast(outputs[0] -> mutable_data()); + int count = outputs[0] -> valid_size(); + if (inputs[0] -> is_continue_mem() && outputs[0] -> is_continue_mem()){ + if (p == 1){ + for (int i=0; i < count; ++i){ + dst_ptr[i] = src_ptr[i]* scale + shift; + } + } else { + for (int i=0; i < count; ++i){ + dst_ptr[i] = pow(src_ptr[i]*scale + shift, p); + } + } + } else { + int num_axis = outputs[0] -> dims(); + int in_offset = 0; + int out_offset = 0; + int valid_stride = 1; + const int* in_strides = static_cast(_in_steps.data()); + const int* out_strides = static_cast(_out_steps.data()); + const int* valid_shape = static_cast(_out_valid_shape.data()); + if (p ==1){ + for (int i=0; i < count; ++i){ + for (int axis_id = num_axis; axis_id >= 0; --axis_id){ + int id = (i / valid_stride) % valid_shape[axis_id]; + out_offset += 
id*out_strides[axis_id]; + in_offset += id*in_strides[axis_id]; + valid_stride *= valid_shape[axis_id]; + } + dst_ptr[out_offset] = src_ptr[in_offset]*scale + shift; + } + } else { + for (int i=0; i < count; ++i){ + for (int axis_id = num_axis; axis_id >= 0; --axis_id){ + int id = (i / valid_stride) % valid_shape[i]; + out_offset += id*out_strides[i]; + in_offset += id*in_strides[i]; + valid_stride *= valid_shape[i]; + } + dst_ptr[out_offset] = pow(src_ptr[in_offset]*scale + shift, p); + } + } + } + return SaberSuccess; +} + +template class SaberPower; +DEFINE_OP_TEMPLATE(SaberPower, PowerParam, X86, AK_INT16); +DEFINE_OP_TEMPLATE(SaberPower, PowerParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_power.h b/saber/funcs/impl/x86/saber_power.h new file mode 100644 index 000000000..7634a51d9 --- /dev/null +++ b/saber/funcs/impl/x86/saber_power.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POWER_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POWER_H + +#include "saber/funcs/impl/impl_power.h" +#include "saber/funcs/power.h" +namespace anakin { +namespace saber { + +template +class SaberPower : + public ImplBase< + X86, OpDtype, + PowerParam > +{ +public: + + SaberPower() + {} + + ~SaberPower() { + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PowerParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + }; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PowerParam ¶m, + Context &ctx) { + Shape shape({inputs[0]->dims(), 1, 1, 1}); + _in_steps.re_alloc(shape, OpDtype); + _out_steps.re_alloc(shape, OpDtype); + _out_valid_shape.re_alloc(shape, OpDtype); + Shape in_stride = inputs[0]->get_stride(); + Shape out_stride = outputs[0]->get_stride(); + Shape out_valid_shape = outputs[0]->valid_shape(); + memcpy(_out_steps.data(), &out_stride[0], sizeof(int)*4); + memcpy(_in_steps.data(), &in_stride[0], sizeof(int)*4); + memcpy(_out_valid_shape.data(), &out_valid_shape[0], sizeof(int)*4); + return SaberSuccess; + }; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PowerParam ¶m) override; +private: + Tensor _in_steps; + Tensor _out_steps; + Tensor _out_valid_shape; + + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_reverse_input.cpp b/saber/funcs/impl/x86/saber_reverse_input.cpp new file mode 100644 index 000000000..b4b453e35 --- /dev/null +++ b/saber/funcs/impl/x86/saber_reverse_input.cpp @@ -0,0 +1,50 @@ + +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/x86/saber_reverse_input.h" + +namespace anakin { +namespace saber { + +template +SaberStatus SaberReverseInput::init(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + return create(inputs,outputs,param,ctx); +}; +template +SaberStatus 
SaberReverseInput::create(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + return SaberSuccess; +}; +template +SaberStatus SaberReverseInput::dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m) { + int input_size=inputs.size(); + for(int input_id=0;input_id> offset_vec=inputs[input_id]->get_seq_offset(); + std::vector offset=offset_vec[offset_vec.size()-1]; + const OpDataType* in= static_cast(inputs[input_id]->data()); + OpDataType* out=static_cast(outputs[input_id]->mutable_data()); + for(int sequence_id=0;sequence_id; +template class SaberReverseInput; +template class SaberReverseInput; +template class SaberReverseInput; + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_reverse_input.h b/saber/funcs/impl/x86/saber_reverse_input.h new file mode 100644 index 000000000..632d8847d --- /dev/null +++ b/saber/funcs/impl/x86/saber_reverse_input.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REVERSE_INPUT_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REVERSE_INPUT_H + +#include "saber/funcs/impl/impl_reverse_input.h" +#include "saber/saber_funcs_param.h" + +namespace anakin{ +namespace saber { + + +template +class SaberReverseInput : public ImplBase< + X86, + OpDtype, + EmptyParam> +{ +public: +typedef Tensor OpTensor; +typedef typename DataTrait::Dtype OpDataType; + +SaberReverseInput() {} + +~SaberReverseInput() { + +} + +virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) override; + +virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) override; + +virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m) override; + + +private: + +}; + + +} +} + +#endif diff --git a/saber/funcs/impl/x86/saber_reverse_sequence.cpp b/saber/funcs/impl/x86/saber_reverse_sequence.cpp new file mode 100644 index 000000000..2eaad25c4 --- /dev/null +++ b/saber/funcs/impl/x86/saber_reverse_sequence.cpp @@ -0,0 +1,56 @@ + +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/x86/saber_reverse_sequence.h" + +namespace anakin { +namespace saber { + +template +SaberStatus SaberReverseSequence::init(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + return create(inputs,outputs,param,ctx); +}; +template +SaberStatus SaberReverseSequence::create(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) { + int input_size=inputs.size(); + CHECK_EQ(input_size,1)<<"only support one input now"; + return SaberSuccess; +}; +template +SaberStatus SaberReverseSequence::dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m) { + int input_size=inputs.size(); + CHECK_EQ(input_size,1)<<"only support one input now"; + + std::vector> offset_vec=inputs[0]->get_seq_offset(); + std::vector 
offset=offset_vec[offset_vec.size()-1]; + const OpDataType* in= static_cast(inputs[0]->data()); + OpDataType* out=static_cast(outputs[0]->mutable_data()); + int batch_size=offset.size()-1; + int word_size=inputs[0]->valid_shape()[1]; + for (int i = 0; i < batch_size; i++) { + int seq_len = offset[i + 1] - offset[i]; + int start_word_id=offset[i]; + for (int j = 0; j < seq_len; j++) { + int output_offset = word_size * (start_word_id + seq_len - j - 1); + int input_offset = word_size * (start_word_id + j); + memcpy(out + output_offset, in + input_offset, word_size * sizeof(OpDataType)); + } + } + return SaberSuccess; + +}; + +template class SaberReverseSequence; +template class SaberReverseSequence; +template class SaberReverseSequence; +template class SaberReverseSequence; + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_reverse_sequence.h b/saber/funcs/impl/x86/saber_reverse_sequence.h new file mode 100644 index 000000000..7ee21d2ff --- /dev/null +++ b/saber/funcs/impl/x86/saber_reverse_sequence.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REVERSE_SEQUENCE_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REVERSE_SEQUENCE_H + +#include "saber/funcs/impl/impl_reverse_sequence.h" +#include "saber/saber_funcs_param.h" + +namespace anakin{ +namespace saber { + + +template +class SaberReverseSequence : public ImplBase< + X86, + OpDtype, + EmptyParam> +{ +public: +typedef Tensor OpTensor; +typedef typename DataTrait::Dtype OpDataType; + +SaberReverseSequence() {} + +~SaberReverseSequence() { + +} + +virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) override; + +virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m, + Context &ctx) override; + +virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + EmptyParam ¶m) override; + + +private: + +}; + + +} +} + +#endif diff --git a/saber/funcs/impl/x86/saber_scale.cpp b/saber/funcs/impl/x86/saber_scale.cpp deleted file mode 100644 index 230c61709..000000000 --- a/saber/funcs/impl/x86/saber_scale.cpp +++ /dev/null @@ -1,96 +0,0 @@ - -#include "saber/funcs/impl/x86/saber_scale.h" -namespace anakin{ -namespace saber { - -template -SaberStatus SaberScale::init( - const std::vector& inputs, - std::vector& outputs, - ScaleParam ¶m, - Context &ctx) -{ - - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - - return create(inputs, outputs, param, ctx); -} - -template -SaberStatus SaberScale::create( - const std::vector& inputs, - std::vector& outputs, - ScaleParam ¶m, - Context &ctx) -{ - return SaberSuccess; -} - -template -SaberStatus SaberScale::dispatch( - const std::vector& inputs, - std::vector& outputs, - ScaleParam ¶m) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype 
DataType_op; - - auto in_data = inputs[0]->data(); - auto out_data = outputs[0]->mutable_data(); - DataType_op* scale_data = (inputs.size() > 1) ? inputs[1]->data() : &(param.scale_w[0]); - DataType_op* bias_data = param.bias_term ? &(param.scale_b[0]) : NULL; - - const int count = inputs[0]->valid_size(); - int axis = (param.num_axes == 0) ? 0 : param.axis; - int num_axes = param.num_axes >=0 ? param.num_axes : inputs[0]->shape().dims() - axis; - CHECK_LE(axis + num_axes, inputs[0]->shape().dims()); - int outer_dim = inputs[0]->count(0, axis); - int inner_dim = inputs[0]->count(axis + num_axes, inputs[0]->shape().dims()); - int scale_dim = inputs[0]->count(axis, axis + num_axes); - if (inputs.size() > 1) { - CHECK_EQ(scale_dim, inputs[1]->valid_size()) << "scale dim not valid"; - } else { - CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; - } - - // TODO !! need add other types of scale - for (int outer_id = 0; outer_id < outer_dim; outer_id++) { - for (int scale_id = 0; scale_id < scale_dim; scale_id++) { - auto scale = scale_data[scale_id]; - auto bias = param.bias_term ? 
bias_data[scale_id] : 0; - for (int inner_id = 0; inner_id < inner_dim; inner_id++) { - *out_data = (*in_data) * scale + bias; - in_data++; - out_data++; - } - } - } - return SaberSuccess; -} - -template class SaberScale; - -} -} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_sequence_pool.cpp b/saber/funcs/impl/x86/saber_sequence_pool.cpp deleted file mode 100644 index ab4d4e107..000000000 --- a/saber/funcs/impl/x86/saber_sequence_pool.cpp +++ /dev/null @@ -1,174 +0,0 @@ - -#include "saber/funcs/impl/x86/saber_sequence_pool.h" -#include "saber/saber_funcs_param.h" -#include "saber/funcs/impl/x86/kernel/jit_generator.h" -#include -#include - -namespace anakin{ -namespace saber { - -template -void seq_pool_average(dtype* dst, const dtype* src_in, - const int slice_num, const int slice_size) { - dtype sum = 0.f; - for (int i = 0; i < slice_size; ++i) { - sum = src_in[i]; - for (int s = 1; s < slice_num; ++s) { - dtype src_in_read = src_in[s * slice_size +i]; - sum += src_in_read; - } - dst[i] = sum / slice_num; - } -} - -template -void seq_pool_sum(dtype* dst, const dtype* src_in, - const int slice_num, const int slice_size) { - dtype sum = 0.f; - for (int i = 0; i < slice_size; ++i) { - sum = src_in[i]; - for (int s = 1; s < slice_num; ++s) { - dtype src_in_read = src_in[s * slice_size +i]; - sum += src_in_read; - } - dst[i] = sum; - } -} - -template -void seq_pool_sqrt(dtype* dst, const dtype* src_in, - const int slice_num, const int slice_size) { - dtype sqrt_len = sqrtf(slice_num); - dtype sum = 0.f; - for (int i = 0; i < slice_size; ++i) { - sum = src_in[i]; - for (int s = 1; s < slice_num; ++s) { - dtype src_in_read = src_in[s * slice_size +i]; - sum += src_in_read; - } - dst[i] = sum / sqrt_len; - } -} - -template -void seq_pool_max(dtype* dst, const dtype* src_in, - const int slice_num, const int slice_size) { - dtype max = 0.f; - for (int i = 0; i < slice_size; ++i) { - max = src_in[i]; - for (int s = 1; s < slice_num; ++s) { - dtype src_in_read 
= src_in[s * slice_size +i]; - if (max < src_in_read) { - max = src_in_read; - } - } - dst[i] = max; - } -} - -template -SaberStatus SaberSequencePool::init( - const std::vector& inputs, - std::vector& outputs, - SequencePoolParam ¶m, Context &ctx) { - - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; - kernel_direct_map = { - {Sequence_pool_unknow, []( - DataType_in*, const DataType_in*, const int, const int){ - LOG(ERROR) << " UNKNOWN seq pool type";}}, - - {Sequence_pool_average, seq_pool_average}, - {Sequence_pool_sum, seq_pool_sum}, - {Sequence_pool_sqrt, seq_pool_sqrt}, - {Sequence_pool_max, seq_pool_max}, - - {Sequence_pool_last, []( - DataType_in* dst, const DataType_in* src_in, - const int slice_num, const int slice_size) { - memcpy(dst, src_in + slice_size * (slice_num - 1), - sizeof(DataType_in)* slice_size); - }}, - {Sequence_pool_first, []( - DataType_in* dst, const DataType_in* src_in, - const int slice_num, const int slice_size) { - memcpy(dst, src_in, sizeof(DataType_in)* slice_size); - }}, - }; - return create(inputs, outputs, param, ctx); -} - -template -SaberStatus SaberSequencePool::create( - const std::vector& inputs, - std::vector& outputs, - SequencePoolParam ¶m, - Context &ctx) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - - this->_ctx = ctx; - - return SaberSuccess; -} - -template -SaberStatus SaberSequencePool::dispatch( - const std::vector& inputs, - std::vector& outputs, - SequencePoolParam ¶m) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - - CHECK_EQ(inputs[0]->channel(), outputs[0]->channel()); - CHECK_EQ(inputs[0]->height(), outputs[0]->height()); - CHECK_EQ(inputs[0]->width(), 
outputs[0]->width()); - - std::vector seq_offset = inputs[0]->get_seq_offset(); - int slice_size = outputs[0]->channel() - * outputs[0]->height() - * outputs[0]->width(); - - DataType_in *dst_ptr = outputs[0]->mutable_data(); - DataType_out *src_ptr = inputs[0]->data(); - for (int i = 0; i < seq_offset.size()-1; ++i) { - int slice_num = seq_offset[i+1] - seq_offset[i]; - - kernel_direct_map[param.sequence_pool_type]( - dst_ptr, src_ptr, slice_num, slice_size); - - dst_ptr += slice_size; - src_ptr += slice_size * slice_num; - } - return SaberSuccess; - -} -template class SaberSequencePool; -} -} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_sequence_pool.h b/saber/funcs/impl/x86/saber_sequence_pool.h deleted file mode 100644 index c6819c9b9..000000000 --- a/saber/funcs/impl/x86/saber_sequence_pool.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_POOL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_POOL_H - -#include "saber/funcs/impl/impl_sequence_pool.h" -#include "saber/saber_funcs_param.h" -#include -#include - -namespace anakin{ -namespace saber { - -template -class SaberSequencePool : public ImplBase< - Tensor, - Tensor, - Tensor, - SequencePoolParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - - SaberSequencePool() = default; - - ~SaberSequencePool() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - SequencePoolParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - SequencePoolParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - SequencePoolParam ¶m) override; -private: - typedef std::function seq_pool_direct_kernel; - std::map kernel_direct_map; - -}; -} -} - -#endif \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_softmax.cpp b/saber/funcs/impl/x86/saber_softmax.cpp deleted file mode 100644 index f9f3d4d01..000000000 --- a/saber/funcs/impl/x86/saber_softmax.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include "saber/funcs/impl/x86/saber_softmax.h" -#include "mkl_cblas.h" -#include "mkl_vml_functions.h" - -namespace anakin{ -namespace saber { - -template class SaberSoftmax; - -template -SaberStatus SaberSoftmax::init( - const std::vector& inputs, - std::vector& outputs, - SoftmaxParam ¶m, Context &ctx) -{ - this->_ctx = ctx; - return create(inputs, outputs, param, ctx); -} - -template -SaberStatus SaberSoftmax::create( - const std::vector& inputs, - std::vector& outputs, - SoftmaxParam ¶m, Context &ctx) -{ -// LOG(INFO)<<"here!!!"; - 
this->_param = ¶m; - this->_ctx = ctx; - - return SaberSuccess; -} - -template -void SaberSoftmax::_max( - int n, const float *x, float *max_data) { - max_data[0] = x[0]; - for (int c = 1; c < n; ++c) { - max_data[0] = max_data[0] > x[c] ? max_data[0] : x[c]; - } -} - -template -void SaberSoftmax::_sub( - int n, float alpha, const float *x, float *y) { - for (int c = 0; c < n; ++c) { - y[c] = x[c] - alpha; - } -} - -template -void SaberSoftmax::_exp( - int n, const float *a, float *r) { -#if 1 - vsExp(n, a, r); -#else - #pragma omp parallel for - for (int c = 0; c < n; ++c) { - r[c] = expf(a[c]); - } -#endif - return; -} - -template -void SaberSoftmax::_sum( - int n, const float *x, float *sum_data) { - sum_data[0] = 0; - for (int c = 0; c < n; ++c) { - sum_data[0] += x[c]; - } -} - -template -void SaberSoftmax::_scal - (int n, float alpha, float *x) { -#if 0 - cblas_sscal(n, alpha, x, 1); -#else -#pragma omp parallel for - for (int c = 0; c < n; ++c) { - x[c] *= alpha; - } -#endif - return; -} - -template -SaberStatus SaberSoftmax::dispatch( - const std::vector& inputs, - std::vector& outputs, - SoftmaxParam& param) { -// LOG(INFO)<<"here!!!"; - int num = inputs[0] -> num(); - int channel = inputs[0]->channel(); - float *src_ptr = inputs[0]->mutable_data(); - float *dst_ptr = outputs[0]->mutable_data(); - -#pragma omp parallel for schedule(static) - for (int ou = 0; ou < num ; ou++) { - const float *src_data = src_ptr + ou * channel; - float *dst_data = dst_ptr + ou * channel; - float scalar = 0; - - _max(channel, src_data, &scalar); - _sub(channel, scalar, src_data, dst_data); - _exp(channel, dst_data, dst_data); - _sum(channel, dst_data, &scalar); - _scal(channel, float(1)/scalar, dst_data); - } - return SaberSuccess; -} - -} -} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_softmax.h b/saber/funcs/impl/x86/saber_softmax.h index 4d8899916..ed2010002 100644 --- a/saber/funcs/impl/x86/saber_softmax.h +++ b/saber/funcs/impl/x86/saber_softmax.h @@ 
-20,22 +20,14 @@ namespace anakin{ namespace saber { -template -class SaberSoftmax : public ImplBase< - Tensor, - Tensor, - Tensor, - SoftmaxParam > > { +template +class SaberSoftmax : + public ImplBase> +{ public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; SaberSoftmax() {} @@ -45,17 +37,23 @@ class SaberSoftmax& inputs, std::vector& outputs, - SoftmaxParam& param, - Context &ctx) override; + SoftmaxParam& param, + Context &ctx) override { + return SaberUnImplError; + } virtual SaberStatus create(const std::vector& inputs, std::vector& outputs, - SoftmaxParam& param, - Context &ctx) override; + SoftmaxParam& param, + Context &ctx) override { + return SaberUnImplError; + } virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, - SoftmaxParam ¶m) override; + SoftmaxParam ¶m) override { + return SaberUnImplError; + } private: void _max(int n, const float *x, float *max_data); diff --git a/saber/funcs/impl/x86/saber_sse_math.h b/saber/funcs/impl/x86/saber_sse_math.h new file mode 100644 index 000000000..c275532ae --- /dev/null +++ b/saber/funcs/impl/x86/saber_sse_math.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_SSE_MATH_H +#define ANAKIN_SABER_SSE_MATH_H +#if defined(__SSE4_2__) and defined(__FMA__) +namespace anakin { +namespace saber { + +#include + +static inline __m128 exp128_ps_fma(__m128 x) { + __m128 tmp = _mm_setzero_ps(), fx; + __m128i imm0; + __m128 one = _mm_set1_ps(1.f); + __m128 _ps128_exp_hi = _mm_set1_ps(88.3762626647949f); + __m128 _ps128_exp_lo = _mm_set1_ps(-88.3762626647949f); + x = _mm_min_ps(x, _ps128_exp_hi); + x = _mm_max_ps(x, _ps128_exp_lo); + + __m128 _ps128_cephes_LOG2EF = _mm_set1_ps(1.44269504088896341f); + fx = _mm_mul_ps(x, _ps128_cephes_LOG2EF); + __m128 _ps128_0p5 = _mm_set1_ps(0.5); + fx = _mm_add_ps(fx, _ps128_0p5); + + tmp = _mm_floor_ps(fx); + + __m128 mask = _mm_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm_and_ps(mask, one); + fx = _mm_sub_ps(tmp, mask); + + __m128 _ps128_cephes_exp_C1 = _mm_set1_ps(0.693359375f); + __m128 _ps128_cephes_exp_C2 = _mm_set1_ps(-2.12194440E-4f); + tmp = _mm_mul_ps(fx, _ps128_cephes_exp_C1); + __m128 z = _mm_mul_ps(fx, _ps128_cephes_exp_C2); + x = _mm_sub_ps(x, tmp); + x = _mm_sub_ps(x, z); + z = _mm_mul_ps(x, x); + + __m128 _ps128_cephes_exp_p0 = _mm_set1_ps(1.9875691500E-4f); + __m128 _ps128_cephes_exp_p1 = _mm_set1_ps(1.3981999507E-3f); + __m128 _ps128_cephes_exp_p2 = _mm_set1_ps(8.3334519073E-3f); + __m128 _ps128_cephes_exp_p3 = _mm_set1_ps(4.1665795894E-2f); + __m128 _ps128_cephes_exp_p4 = _mm_set1_ps(1.6666665459E-1f); + __m128 _ps128_cephes_exp_p5 = _mm_set1_ps(5.0000001201E-1f); + __m128 y = _ps128_cephes_exp_p0; + y = _mm_fmadd_ps(y, x, _ps128_cephes_exp_p1); + y = _mm_fmadd_ps(y, x, _ps128_cephes_exp_p2); + y = _mm_fmadd_ps(y, x, _ps128_cephes_exp_p3); + y = _mm_fmadd_ps(y, x, _ps128_cephes_exp_p4); + y = _mm_fmadd_ps(y, x, _ps128_cephes_exp_p5); + y = _mm_fmadd_ps(y, z, x); + y = _mm_add_ps(y, one); + /* build 2^n */ + imm0 = _mm_cvttps_epi32(fx); + // another two AVX2 instructions + __m128i _pi32_128_0x7f = _mm_set1_epi32(0x7f); + imm0 = _mm_add_epi32(imm0, 
_pi32_128_0x7f); + imm0 = _mm_slli_epi32(imm0, 23); + __m128 pow2n = _mm_castsi128_ps(imm0); + y = _mm_mul_ps(y, pow2n); + return y; +} + +} +} +#endif + +#endif \ No newline at end of file diff --git a/saber/funcs/impl/x86/sequence2batch.cpp b/saber/funcs/impl/x86/sequence2batch.cpp new file mode 100644 index 000000000..303c105a8 --- /dev/null +++ b/saber/funcs/impl/x86/sequence2batch.cpp @@ -0,0 +1,65 @@ +#include "sequence2batch.h" + +namespace anakin { +namespace saber { +namespace math { + +template +void CopyMatrixRowsFunctor::operator()( + ioTensor* src, + std::vector index_lod, ioTensor* dst, + bool is_src_index, int fragment_num) { + typedef typename DataTrait::Dtype dtype; + int* index = index_lod.data(); + auto src_shape = src->valid_shape(); + auto dst_shape = dst->valid_shape(); + /*if (src_shape.size() != 2) { + LOG(ERROR) << "The src must be matrix with rank 2."; + exit(-1); + } + if (dst_shape.size() != 2) { + LOG(ERROR) << "The dst must be matrix with rank 2."; + exit(-1); + } + if (dst_shape[1] != src_shape[1]) { + LOG(ERROR) << "The width of src and dst must be same."; + exit(-1); + }*/ + if (dst_shape[1] % fragment_num != 0 && src_shape[1] % fragment_num != 0) { + LOG(ERROR) << "hidden size should be divided with no remainder by fragment_num."; + exit(-1); + } + auto height = dst_shape[0]; + auto dst_width = dst_shape[1] / fragment_num; + auto src_width = src_shape[1] / fragment_num; + auto real_width = (dst_width > src_width ? 
src_width: dst_width); + const dtype* src_data = (const dtype*)src->data(); + dtype* dst_data = (dtype*)dst->mutable_data(); + if (is_src_index) { +#pragma omp parallel for collapse(2) + for (int i = 0; i < height; ++i) { + for (int j = 0; j < fragment_num; j++) { + memcpy(dst_data + i * fragment_num * dst_width + j * dst_width, src_data + index[i] * fragment_num * src_width + j * src_width, + real_width * sizeof(dtype)); + } + } + } else { +#pragma omp parallel for collapse(2) + for (int i = 0; i < height; ++i) { + for (int j = 0; j < fragment_num; j++) { + memcpy(dst_data + index[i] * fragment_num * dst_width + j * dst_width, src_data + i * fragment_num * src_width + j * src_width, + real_width * sizeof(dtype)); + } + } + } +} + +template class CopyMatrixRowsFunctor; + +template class Seq2BatchFunctor; +template class Batch2SeqFunctor; +template class ReorderInitState; + +} // namespace math +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/sequence2batch.h b/saber/funcs/impl/x86/sequence2batch.h new file mode 100644 index 000000000..926118cda --- /dev/null +++ b/saber/funcs/impl/x86/sequence2batch.h @@ -0,0 +1,368 @@ +#ifndef ANAKIN_SABER_FUNC_IMPL_X86_MATH_SEQUENCE_BATCH_H +#define ANAKIN_SABER_FUNC_IMPL_X86_MATH_SEQUENCE_BATCH_H + +#include +#include +#include "saber/core/tensor.h" + +#include "x86_utils.h" + +namespace anakin { +namespace saber { +namespace math { + +template +class CopyMatrixRowsFunctor { +public: + typedef Tensor ioTensor; + typedef typename DataTrait::Dtype dtype; + + // If is_src_index is true, + // copy the indexed rows of input src to the output dst. + // If is_src_index is false, + // copy the input src to the indexed rows of output dst. + // The indexed rows are based on the input index. 
+ void operator()(ioTensor* src, + std::vector index_lod, ioTensor* dst, + bool is_src_index, int fragment_num); +}; + +template +class Seq2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + // + struct SeqInfo { + SeqInfo(int start, int length, int seq_idx) + : start(start), length(length), seq_idx(seq_idx) {} + int start; + int length; + int seq_idx; + }; + +public: + typedef Tensor ioTensor; + void operator()(ioTensor* seq, + ioTensor* batch, std::vector>& seq_to_batch_meta, bool is_cal_batch_lod, + bool is_reverse = false, int fragment_num = 1) const { + if (!is_cal_batch_lod) { + if (seq_to_batch_meta.size() < 2) { + LOG(ERROR) << "The size of seq_to_batch_meta should inlcude at least 2-level sequence information."; + exit(-1); + } + + if (seq_to_batch_meta[1].size() != static_cast(seq->num())) { + LOG(ERROR) << "The seq_to_batch information should be consistent with the dims."; + exit(-1); + } + + CopyMatrixRowsFunctor to_batch; + to_batch(seq, seq_to_batch_meta[1], batch, true, fragment_num); + return; + } + + if (seq_to_batch_meta.size() != 1) { + LOG(ERROR) << "Only support one level sequence now."; + exit(-1); + } + + auto seq_meta = seq_to_batch_meta[0]; + + std::vector seq_info; + + for (int seq_id = 0; seq_id < seq_meta.size() - 1; ++seq_id) { + int length = seq_meta[seq_id + 1] - seq_meta[seq_id]; + seq_info.emplace_back(seq_meta[seq_id], length, seq_id); + //LOG(INFO) << "seq_meta[seq_id]:" << seq_meta[seq_id] << " length:" << length << " seq_id:" < b.length; + }); + + // Calculate the start position of each batch. 
+ // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // num_batch = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = len(b0) + // batch_start_positions[1] = len(b0) + len(b1) + // batch_start_positions[2] = len(b0) + len(b1) + len(b2) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + // The num_batch represents batch size after rearranging the + // input LodTensor. It is also the maximum length of input sequence. + + std::vector> batch_seq_meta; + batch_seq_meta.emplace_back(std::vector {0}); + batch_seq_meta.emplace_back(std::vector {0}); + batch_seq_meta.emplace_back(std::vector {0}); + + // batch_seq_meta[0] is the start positions for batch LoDTensor + int num_batch = seq_info[0].length; + batch_seq_meta[0].resize(static_cast(num_batch + 1)); + // batch_seq_meta[1] is the raw index in the input LoDTensor + batch_seq_meta[1].resize(static_cast(seq->num())); + // batch_seq_meta[2] is the sort order for the input LoDTensor. + batch_seq_meta[2].resize(seq_info.size()); + + int* batch_starts = batch_seq_meta[0].data(); + int* seq2batch_idx = batch_seq_meta[1].data(); + batch_starts[0] = 0; + + for (int n = 0; n < num_batch; n++) { + auto batch_id = static_cast(batch_starts[n]); + + for (int i = 0; i < seq_info.size(); ++i) { + int seq_len = seq_info[i].length; + int start = seq_info[i].start; + + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? 
start + seq_len - 1 - n : start + n; + batch_id++; + } else { + break; + } + } + + batch_starts[n + 1] = static_cast(batch_id); + } + + int* seq_order = batch_seq_meta[2].data(); + + for (int i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + + seq_to_batch_meta = batch_seq_meta; + + CopyMatrixRowsFunctor to_batch; + to_batch(seq, batch_seq_meta[1], batch, true, fragment_num); + } +}; + +template +class Batch2SeqFunctor { +public: + typedef Tensor ioTensor; + void operator()(ioTensor* batch, + ioTensor* seq, std::vector>& seq_to_batch_meta, int fragment_num = 1) const { + if (seq_to_batch_meta.size() < 2) { + LOG(ERROR) << "The size of seq_to_batch_meta should inlcude at least 2-level sequence information."; + exit(-1); + } + + if (seq_to_batch_meta[1].size() != static_cast(seq->num())) { + LOG(ERROR) << "The seq_to_batch information should be consistent with the dims."; + exit(-1); + } + + CopyMatrixRowsFunctor to_seq; + to_seq(batch, seq_to_batch_meta[1], seq, false, fragment_num); + } +}; + +template +class ReorderInitState { +public: + typedef Tensor ioTensor; + void operator()(ioTensor* src, std::vector ind_lod, ioTensor* dst, bool indexed_src, + int fragment_num = 1) { + math::CopyMatrixRowsFunctor row_shuffle; + row_shuffle(src, ind_lod, dst, indexed_src, fragment_num); + } +}; + + +/* + * This class can used to modify the matrix structure of sequence matrix into + * batch structure. + * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t] + * batch matrix: [C1_s ... C1_t | ...... | Cn_s ... Cn_t] + * Cn_s is the state for sequence s at time n. + * + * Exampel: sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}} + * s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + * batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}} + * b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + * + * Use: + * Input: seqMatrix, seqStarts(Sequence Start Positions) + * Output: batchMatrix + * 1. SequenceToBatch seq2batch; + * 2. 
seq2batch.resizeOrCreateBatch(seqStarts); // calculate seq2BatchIdx + * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix + * + */ + +class SequenceToBatch { +public: + SequenceToBatch() {}; + + template + void seq_2_bat(const Dtype* input, Dtype* output, int word_size) { + int word_sum = seq2BatchIdx_.size(); + #pragma omp parallel for if(thread_num > 1) + + for (int old_id = 0; old_id < word_sum; ++old_id) { + int word_start = old_id * word_size; + int maped_id = seq2BatchIdx_[old_id]; + int maped_start = maped_id * word_size; + + for (int word_offset = 0; word_offset < word_size; ++word_offset) { + output[word_start + word_offset] = input[maped_start + word_offset]; + } + } + } + + template + void hidden_2_bat(const Dtype* input, Dtype* output, int hidden_size) { + int batch_size = seqStartAndLength_.size(); + + for (int old_id = 0; old_id < batch_size; ++old_id) { + int word_start = old_id * hidden_size; + int maped_id = seqStartAndLength_[old_id].seqIdx_; + int maped_start = maped_id * hidden_size; + + for (int word_offset = 0; word_offset < hidden_size; ++word_offset) { + output[word_start + word_offset] = input[maped_start + word_offset]; + } + } + } + + template + void bat_2_seq(const Dtype* input, Dtype* output, int hidden_size) { + int word_sum = seq2BatchIdx_.size(); + #pragma omp parallel for if(thread_num > 1) + + for (int old_id = 0; old_id < word_sum; old_id++) { + int word_start = old_id * hidden_size; + int maped_id = seq2BatchIdx_[old_id]; + int maped_start = maped_id * hidden_size; + + for (int word_offset = 0; word_offset < hidden_size; word_offset++) { + output[maped_start + word_offset] = input[word_start + word_offset]; + } + } + } + + template + void bat_2_seq(const Dtype* input, Dtype* output, int hidden_size, int aligned_hidden_size) { + int word_sum = seq2BatchIdx_.size(); + #pragma omp parallel for if(thread_num > 1) + + for (int old_id = 0; old_id < word_sum; old_id++) { + int word_start = old_id * 
aligned_hidden_size; + int maped_id = seq2BatchIdx_[old_id]; + int maped_start = maped_id * hidden_size; + + for (int word_offset = 0; word_offset < hidden_size; word_offset++) { + output[maped_start + word_offset] = input[word_start + word_offset]; + } + } + } + + void get_batch_offset(std::vector& bat_offset) { + for (size_t i = 0; i < batchStartPositions_.size(); i++) { + bat_offset[i] = batchStartPositions_[i]; + } + } + + size_t get_batch_num() const { + return numBatch_; + } + + void create_batch(int batchSize, size_t numSequences, std::vector& seqStarts, + bool reversed) { + CHECK_EQ(seqStarts[numSequences], batchSize); + seq2BatchIdx_.resize(batchSize); + + /* + * calculate the length of each sequence & sort sequence index by the length + * Exampel: Sequences = {s0, s1, s2} + * s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + * seqStartAndLength_[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + */ + for (size_t seqId = 0; seqId < numSequences; ++seqId) { + int length = seqStarts[seqId + 1] - seqStarts[seqId]; + seqStartAndLength_.emplace_back(seqStarts[seqId], length, seqId); + } + + std::sort(seqStartAndLength_.begin(), seqStartAndLength_.end(), + [](SeqStartAndLength a, SeqStartAndLength b) { + return a.length_ > b.length_; + }); + + /* + * calculate the start position of each batch + * (numBatch equal the maxLength of sequences) + * Exampel: Sequences = {s0, s1, s2} + * s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + * numBatch = 5, + * batchIndex = {b0, b1, b2, b3, b4} + * b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + * batchStartPositions[6] = {0, 3, 6, 9, 11, 12} + */ + numBatch_ = (size_t)seqStartAndLength_[0].length_; + batchStartPositions_.resize(numBatch_ + 1); + batchStartPositions_[0] = 0; + + for (size_t n = 0; n < numBatch_; n++) { + int batchId = batchStartPositions_[n]; + + for (size_t i = 0; i < seqStartAndLength_.size(); ++i) { + size_t seqLength = seqStartAndLength_[i].length_; + int start = seqStartAndLength_[i].start_; + + if (n < seqLength) { + if 
(!reversed) { + seq2BatchIdx_[batchId] = start + n; + } else { + seq2BatchIdx_[batchId] = start + seqLength - 1 - n; + } + + batchId++; + } else { + break; + } + } + + batchStartPositions_[n + 1] = batchId; + } + } + + +protected: + struct SeqStartAndLength { + int start_; + int length_; + int seqIdx_; + SeqStartAndLength(int start, int length, int seqIdx) + : start_(start), length_(length), seqIdx_(seqIdx) {} + }; + std::vector seqStartAndLength_; + std::vector batchStartPositions_; + std::vector seq2BatchIdx_; + size_t numBatch_; + int thread_num = omp_get_max_threads(); +}; +} // namespace math +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/impl/x86/vender_conv.cpp b/saber/funcs/impl/x86/vender_conv.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/saber/funcs/impl/x86/vender_conv.h b/saber/funcs/impl/x86/vender_conv.h new file mode 100644 index 000000000..e69de29bb diff --git a/saber/funcs/impl/x86/vender_fc.cpp b/saber/funcs/impl/x86/vender_fc.cpp index d9b93e9dc..92474632e 100644 --- a/saber/funcs/impl/x86/vender_fc.cpp +++ b/saber/funcs/impl/x86/vender_fc.cpp @@ -1,97 +1,124 @@ #include "saber/funcs/impl/x86/vender_fc.h" +#include "saber/funcs/impl/x86/x86_utils.h" #include "mkl_cblas.h" +#include "mkl_vml_functions.h" -namespace anakin{ +namespace anakin { namespace saber { typedef MKL_INT cblas_int; -template class VenderFc; - -template -SaberStatus VenderFc - ::init(const std::vector& inputs, - std::vector& outputs, - FcParam ¶m, Context &ctx) -{ - - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; +template class VenderFc; + +template +SaberStatus VenderFc + ::init(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam ¶m, Context &ctx) { + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } -template -SaberStatus VenderFc - ::create(const std::vector& inputs, 
- std::vector& outputs, - FcParam ¶m, Context &ctx) -{ - typedef typename DataTensor_in::Dtype DataType_in; - typedef typename DataTensor_out::Dtype DataType_out; - typedef typename OpTensor::Dtype DataType_op; - this->_ctx = ctx; +template +SaberStatus VenderFc + ::create(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam ¶m, Context &ctx) { + + //check + CHECK_EQ(OpDtype, AK_FLOAT) << "vender fc only supports FP32 currently"; + + this->_ctx = &ctx; this->_param = ¶m; + MB = inputs[0]->count_valid(0, param.axis); + OC = outputs[0]->channel(); + + // weights + for (int i = packed_weights.size() - 1; i >= 0; i--) { + cblas_sgemm_free(packed_weights[i]); + } + std::vector ().swap(packed_weights); + + const float *weights = (const float*)param.weights->data(); + int total_IC = 0; + for (int i = 0; i < inputs.size(); i++) { + cblas_int IC = inputs[i]->count_valid(param.axis, inputs[i]->dims()); + packed_weights.push_back(cblas_sgemm_alloc(CblasAMatrix, OC, MB, IC)); + // LOG(INFO) << "anakin input[" << i << "] alloc passed"; + cblas_sgemm_pack(CblasColMajor, + CblasAMatrix, + param.is_transpose_weights ? 
CblasNoTrans : CblasTrans, + OC, MB, IC, + 1.0, + weights + total_IC * OC, IC, + packed_weights[i]); + total_IC += IC; + // LOG(INFO) << "anakin input[" << i << "] pack passed"; + } + return SaberSuccess; } -template -SaberStatus VenderFc - ::dispatch(const std::vector& inputs, - std::vector& outputs, - FcParam ¶m) -{ - if (inDtype == AK_FLOAT) { - const float* src = static_cast(inputs[0]->get_buf()->get_data()); - const float* weights = static_cast(param.weights->get_buf()->get_data()); - const float* bias = NULL; - if (param.bias) - bias = static_cast(param.bias->get_buf()->get_data()); - float* dst = static_cast(outputs[0]->get_buf()->get_data_mutable()); - - // TODO: consistency checks - int m = inputs[0]->count_valid(0, param.axis); - int k = inputs[0]->count_valid(param.axis, inputs[0]->dims()); - const cblas_int MB = m; - int channel_idx = outputs[0]->channel_index(); - Shape output_shape = outputs[0]->shape(); - const cblas_int OC = output_shape[channel_idx]; - const cblas_int IC = k; - - cblas_sgemm(CblasColMajor, param.is_transpose_weights ? 
CblasNoTrans : CblasTrans, - CblasNoTrans, OC, MB, IC, - 1.0, weights, IC, src, IC, 0.0, dst, OC); - if (bias) { -#pragma omp parallel for schedule(static) - for (cblas_int mb = 0; mb < MB; mb++) { - cblas_saxpy(OC, 1.0, bias, 1, dst + mb * OC, 1); - } +template +SaberStatus VenderFc + ::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam ¶m) { + + //check + CHECK_EQ(OpDtype, AK_FLOAT) << "vender fc only supports FP32 currently"; + + float* dst = (float *)outputs[0]->mutable_data(); + const float* bias = NULL; + + if (param.bias) { + bias = (const float*)param.bias->data(); + } + + for (int i = 0; i < inputs.size(); i++) { + const float* src = static_cast(inputs[i]->data()); + cblas_int IC = inputs[i]->count_valid(param.axis, inputs[i]->dims()); + if(i == 0) { + // C := alpha * op(A) * op(B) + beta * C + cblas_sgemm_compute(CblasColMajor, // Layout + CblasPacked, // a + CblasNoTrans, // b是否转置 + OC, MB, IC, // m, n, k + packed_weights[i], IC, // a, lda + src, IC, // b, ldb + 0.0, // beta + dst, OC); // c, ldc + } else { + cblas_sgemm_compute(CblasColMajor, // Layout + CblasPacked, // a + CblasNoTrans, // b是否转置 + OC, MB, IC, // m, n, k + packed_weights[i], IC, // a, lda + src, IC, // b, ldb + 1.0, // beta + dst, OC); // c, ldc } + //LOG(INFO) << "anakin compute[" << i << "] passed"; + + // LOG(INFO) << "inputs[]:dims: " << inputs[0]->dims(); + // LOG(INFO) << "inputs:size: " << inputs.size(); + // LOG(INFO) << "inputs:capacity: " << inputs.capacity(); + // LOG(INFO) << "output:size: " << outputs.size(); + // LOG(INFO) << "OC, MB, IC: " << OC << " "<< MB << " " << IC; } - outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); - return SaberSuccess; -} + if (bias) { + #pragma omp parallel for schedule(static) + for (cblas_int mb = 0; mb < MB; mb++) { + cblas_saxpy(OC, 1.0, bias, 1.0, dst + mb * OC, 1); + } + } + + return SaberSuccess; } +DEFINE_OP_TEMPLATE(VenderFc, FcParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(VenderFc, FcParam, X86, AK_INT8); 
+} // namespace saber } // namespace anakin diff --git a/saber/funcs/impl/x86/vender_fc.h b/saber/funcs/impl/x86/vender_fc.h index 2c6d5f5af..d6d0e34fb 100644 --- a/saber/funcs/impl/x86/vender_fc.h +++ b/saber/funcs/impl/x86/vender_fc.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -10,57 +10,66 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_VENDER_FC_H #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_VENDER_FC_H +#include + +#include "mkl_cblas.h" #include "saber/funcs/impl/impl_fc.h" -namespace anakin{ +namespace anakin { namespace saber { -template -class VenderFc : public ImplBase< - Tensor, - Tensor, - Tensor, - FcParam > > -{ +template +class VenderFc : public ImplBase > { public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - - VenderFc() {} - - ~VenderFc() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - FcParam ¶m, + typedef typename DataTrait::Dtype OpDataType; + + VenderFc() : bias_sum(nullptr) + {} + + ~VenderFc() { + if (bias_sum) { + free(bias_sum); + bias_sum = nullptr; + } + + for (int i = packed_weights.size() - 1; i >= 0; i--) { + OpDataType *pw = packed_weights[i]; + cblas_sgemm_free(pw); + pw = nullptr; + packed_weights.pop_back(); + } + std::vector ().swap(packed_weights); + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam ¶m, Context &ctx) override; - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - 
FcParam ¶m, + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam ¶m, Context &ctx) override; - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - FcParam ¶m) override; + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam ¶m) override; private: - + OpDataType *bias_sum; + int MB; + int OC; + std::vector packed_weights; }; -} -} -#endif \ No newline at end of file +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_SABER_VENDER_FC_H diff --git a/saber/funcs/impl/x86/vender_gemm.cpp b/saber/funcs/impl/x86/vender_gemm.cpp new file mode 100644 index 000000000..7443f2352 --- /dev/null +++ b/saber/funcs/impl/x86/vender_gemm.cpp @@ -0,0 +1,81 @@ + +#include "saber/funcs/impl/x86/vender_gemm.h" + +namespace anakin { + +namespace saber { + +template <> +SaberStatus Gemm::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx){ + + if (!(ctx == this->_ctx)) { + _ctx = ctx; + } + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + c_trans_a = trans_a ? CblasTrans: CblasNoTrans; + c_trans_b = trans_b ? 
CblasTrans: CblasNoTrans; + return SaberSuccess; +} + +template <> +SaberStatus Gemm::dispatch( + const float alpha, const float beta, + const float* ptr_a, const float* ptr_b, float* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_b != nullptr); + CHECK(ptr_c != nullptr); + cblas_sgemm(_layout, c_trans_a, c_trans_b, _m, _n, _k, + alpha, ptr_a, _lda, ptr_b, _ldb, beta, ptr_c, _ldc); + return SaberSuccess; +} + +template<> +SaberStatus Gemv::init(const bool trans, const int m, const int n, + const int incx, const int incy, + Context ctx) { + + if (!(ctx == this->_ctx)) { + this->_ctx = ctx; + } + + _lda = n; + CHECK_GT(m, 0); + CHECK_GT(n, 0); + CHECK_GT(incx, 0); + CHECK_GT(incy, 0); + _m = m; + _n = n; + _incx = incx; + _incy = incy; + _c_trans = trans ? CblasTrans : CblasNoTrans; + + return SaberSuccess; +} + +template<> +SaberStatus Gemv::dispatch( + const float alpha, const float beta, + const float* a, const float* b, + float* c) { + + CHECK(a != nullptr); + CHECK(b != nullptr); + CHECK(c != nullptr); + + cblas_sgemv(_layout, _c_trans, _m, _n, + alpha, a, _lda, b, _incx, beta, c, _incy); + + return SaberSuccess; +} + +}// namespace saber + +}// namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/x86/vender_gemm.h b/saber/funcs/impl/x86/vender_gemm.h new file mode 100644 index 000000000..8e46d740d --- /dev/null +++ b/saber/funcs/impl/x86/vender_gemm.h @@ -0,0 +1,71 @@ + +#ifndef SABER_FUNCS_IMPL_X86_VENDER_GEMM_H +#define SABER_FUNCS_IMPL_X86_VENDER_GEMM_H + +#include "saber/core/tensor.h" +#include "saber/funcs/gemm.h" +#include "mkl.h" + +namespace anakin { +namespace saber { + +template +class Gemm { + +public: + Gemm() = default; + ~Gemm() {} + + SaberStatus init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx); + + SaberStatus dispatch(const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c); + +private: + Context _ctx; + CBLAS_LAYOUT _layout = 
CblasRowMajor; + CBLAS_TRANSPOSE c_trans_a; + CBLAS_TRANSPOSE c_trans_b; + int _m{-1}; + int _n{-1}; + int _k{-1}; + int _lda{-1}; + int _ldb{-1}; + int _ldc{-1}; +}; + +template +class Gemv { + +public: + Gemv() = default; + ~Gemv() {} + + SaberStatus init(const bool trans, const int m, const int n, + const int incx, const int incy, + Context ctx); + + SaberStatus dispatch(const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c); + +private: + Context _ctx; + CBLAS_LAYOUT _layout = CblasRowMajor; + CBLAS_TRANSPOSE _c_trans; + int _incx{-1}; + int _incy{-1}; + int _m{-1}; + int _n{-1}; + int _lda{-1}; +}; + +} +} + +#endif \ No newline at end of file diff --git a/saber/funcs/impl/x86/vender_gru.cpp b/saber/funcs/impl/x86/vender_gru.cpp new file mode 100644 index 000000000..629c322f4 --- /dev/null +++ b/saber/funcs/impl/x86/vender_gru.cpp @@ -0,0 +1,418 @@ +#include "mkl_cblas.h" +#include "mkl_vml_functions.h" + +#include "saber/funcs/impl/x86/vender_gru.h" +#include "sequence2batch.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#include "tensor_op.h" +#include "saber/funcs/impl/x86/saber_normal_activation.h" + +namespace anakin { +namespace saber { + +template +SaberStatus VenderGru::init( + const std::vector& inputs, + std::vector& outputs, +GruParam& param, Context& ctx) { + this->_ctx = &ctx; + this->max_thread_num_ = omp_get_max_threads(); + hidden_size_ = outputs[0]->channel(); + word_size_ = inputs[0]->channel(); + + int aligned_size = 8; + aligned_hidden_size_ = (hidden_size_ % aligned_size) ? 
((hidden_size_ / aligned_size) + 1) * + aligned_size : hidden_size_; + +#if defined(__AVX2__) and defined(__FMA__) + avx2_available_ = jit::mayiuse(jit::avx2); +#else + avx2_available_ = false; +#endif + // LOG(ERROR) << "AVX2 available: " << avx2_available_; + + if (param.formula == GRU_ORIGIN) { + OpDataType* weights_data = static_cast(param.weight()->data()); + + OpDataType* wx = weights_data; + OpDataType* wch = wx + word_size_ * hidden_size_ * 3; + OpDataType* wh = wch + hidden_size_ * hidden_size_; + + OpDataType* aligned_wx = nullptr; + OpDataType* aligned_wch = nullptr; + OpDataType* aligned_wh = nullptr; + + int delta = aligned_hidden_size_ - hidden_size_; + + + if (aligned_bias_ == nullptr) { + aligned_bias_ = (OpDataType*)zmalloc(3 * aligned_hidden_size_ * sizeof(float), 4096); + const OpDataType* bias_data = static_cast(param.bias()->data()); + + for (int i = 0; i < 3; i++) { + memcpy(aligned_bias_ + i * aligned_hidden_size_, bias_data + i * hidden_size_, + hidden_size_ * sizeof(float)); + + if (delta > 0) { + memset(aligned_bias_ + i * aligned_hidden_size_ + hidden_size_, 0, delta * sizeof(float)); + } + } + } else { + CHECK(false) << "aligned bias in init should not be a non-nullptr"; + } + + + if (delta > 0) { + aligned_wx = (OpDataType*)zmalloc(word_size_ * aligned_hidden_size_ * 3 * sizeof(float), 4096); + aligned_wch = (OpDataType*)zmalloc(aligned_hidden_size_ * aligned_hidden_size_ * sizeof(float), + 4096); + aligned_wh = (OpDataType*)zmalloc(2 * aligned_hidden_size_ * aligned_hidden_size_ * sizeof(float), + 4096); + + for (int i = 0; i < word_size_; i++) { + float* aligned_row = aligned_wx + i * aligned_hidden_size_ * 3; + float* row = wx + i * hidden_size_ * 3; + + for (int j = 0; j < 3; j++) { + memcpy(aligned_row + j * aligned_hidden_size_, row + j * hidden_size_, + hidden_size_ * sizeof(float)); + memset(aligned_row + j * aligned_hidden_size_ + hidden_size_, 0, delta * sizeof(float)); + } + } + + for (int i = 0; i < aligned_hidden_size_; 
i++) { + float* aligned_row = aligned_wch + i * aligned_hidden_size_; + float* row = wch + i * hidden_size_; + + if (i < hidden_size_) { + memcpy(aligned_row, row, hidden_size_ * sizeof(float)); + memset(aligned_row + hidden_size_, 0, delta * sizeof(float)); + } else { + memset(aligned_row, 0, aligned_hidden_size_ * sizeof(float)); + } + } + + for (int i = 0; i < aligned_hidden_size_; i++) { + float* aligned_row = aligned_wh + i * aligned_hidden_size_ * 2; + float* row = wh + i * hidden_size_ * 2; + + if (i < hidden_size_) { + for (int j = 0; j < 2; j++) { + memcpy(aligned_row + j * aligned_hidden_size_, row + j * hidden_size_, + hidden_size_ * sizeof(float)); + memset(aligned_row + j * aligned_hidden_size_ + hidden_size_, 0, delta * sizeof(float)); + } + } else { + memset(aligned_row, 0, 2 * aligned_hidden_size_ * sizeof(float)); + } + } + } else { + aligned_wx = wx; + aligned_wch = wch; + aligned_wh = wh; + } + + if (weight_x_packed_) { + cblas_sgemm_free(weight_x_packed_); + weight_x_packed_ = nullptr; + } + + if (weight_ru_packed_) { + cblas_sgemm_free(weight_ru_packed_); + weight_ru_packed_ = nullptr; + } + + if (weight_c_packed_) { + cblas_sgemm_free(weight_c_packed_); + weight_c_packed_ = nullptr; + } + + weight_x_packed_ = cblas_sgemm_alloc(CblasBMatrix, inputs[0]->num(), 3 * aligned_hidden_size_, + word_size_); + + if (!weight_x_packed_) { + LOG(ERROR) << "cannot alloc weight_x_packed_ for gru"; + return SaberOutOfMem; + } + + cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans, inputs[0]->num(), + 3 * aligned_hidden_size_, word_size_, 1.0, + aligned_wx, 3 * aligned_hidden_size_, weight_x_packed_); + + weight_ru_packed_ = cblas_sgemm_alloc(CblasBMatrix, 1, 2 * aligned_hidden_size_, + aligned_hidden_size_); + + if (!weight_ru_packed_) { + LOG(ERROR) << "cannot alloc weight_ru_packed_ for gru"; + return SaberOutOfMem; + } + + cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans, 1, 2 * aligned_hidden_size_, + aligned_hidden_size_, 1.0, + 
aligned_wh, 2 * aligned_hidden_size_, weight_ru_packed_); + + weight_c_packed_ = cblas_sgemm_alloc(CblasBMatrix, 1, aligned_hidden_size_, aligned_hidden_size_); + + if (!weight_c_packed_) { + LOG(ERROR) << "cannot alloc weight_c_packed_ for gru"; + return SaberOutOfMem; + } + + cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans, 1, aligned_hidden_size_, + aligned_hidden_size_, 1.0, + aligned_wch, aligned_hidden_size_, weight_c_packed_); + + if (delta > 0) { + zfree(aligned_wx); + wx = nullptr; + zfree(aligned_wh); + wh = nullptr; + zfree(aligned_wch); + wch = nullptr; + } + } else { + LOG(ERROR) << "only support GRU_ORIGIN now"; + return SaberUnImplError; + } + + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus VenderGru::create( + const std::vector& inputs, + std::vector& outputs, + GruParam& param, +Context& ctx) { + + utils::try_expand_tensor(batched_h,inputs[0]->num() * aligned_hidden_size_ * param.num_direction); + utils::try_expand_tensor(batched_x,inputs[0]->num() * word_size_); + utils::try_expand_tensor(batched_xx,inputs[0]->num() * 3 * aligned_hidden_size_); + + return SaberSuccess; +} + +template +SaberStatus VenderGru::dispatch( + const std::vector& inputs, + std::vector& outputs, + GruParam& param) { + + const OpDataType* bias = static_cast(param.bias()->data()); + std::vector> seq_offset_vec_vec = inputs[0]->get_seq_offset(); + std::vector seq_offset = seq_offset_vec_vec[seq_offset_vec_vec.size()-1]; + int word_sum = inputs[0]->num(); + const OpDataType* x = static_cast(inputs[0]->data()); + OpDataType* out = static_cast(outputs[0]->mutable_data()); + bool is_reverse = param.is_reverse; + int batch_size = seq_offset.size() - 1; + + utils::try_expand_tensor(batched_h,inputs[0]->num() * aligned_hidden_size_ * param.num_direction); + OpDataType* batched_h_data = static_cast(batched_h.mutable_data()); + utils::try_expand_tensor(batched_x,inputs[0]->num() * word_size_); + utils::try_expand_tensor(batched_xx,inputs[0]->num() * 
3 * aligned_hidden_size_); + OpDataType* batched_xx_data = static_cast(batched_xx.mutable_data()); + + // input sequence to batch + math::SequenceToBatch batch_value; + batch_value.create_batch(word_sum, batch_size, seq_offset, is_reverse); + + int bat_length = batch_value.get_batch_num(); + std::vector bat_offset(bat_length + 1); + batch_value.get_batch_offset(bat_offset); + batch_value.seq_2_bat(x, static_cast(batched_x.mutable_data()), word_size_); + + int delta = aligned_hidden_size_ - hidden_size_; + + // init h + Shape h_init_shape({batch_size, aligned_hidden_size_, 1, 1}); + utils::try_expand_tensor(aligned_init_hidden,h_init_shape); + const OpDataType* h0 = nullptr; + + if (param.init_hidden() != nullptr) { + CHECK_EQ(param.init_hidden()->valid_shape().count(), + batch_size * hidden_size_) << "hidden init must match batch size"; + h0 = static_cast(param.init_hidden()->data()); + OpTensor h_init_tmp(h_init_shape); + OpDataType* aligned_init = static_cast(h_init_tmp.mutable_data()); + int delta = aligned_hidden_size_ - hidden_size_; + + if (delta > 0) { + for (int i = 0; i < batch_size; i++) { + OpDataType* aligned_row = aligned_init + i * aligned_hidden_size_; + const OpDataType* row = h0 + i * hidden_size_; + memcpy(aligned_row, row, hidden_size_ * sizeof(OpDataType)); + memset(aligned_row + hidden_size_, 0, delta * sizeof(OpDataType)); + } + + batch_value.hidden_2_bat(static_cast(h_init_tmp.data()), static_cast(aligned_init_hidden.mutable_data()), + aligned_hidden_size_); + h0 = static_cast(aligned_init_hidden.data()); + } else { + batch_value.hidden_2_bat(h0, static_cast(aligned_init_hidden.mutable_data()), aligned_hidden_size_); + h0 = static_cast(aligned_init_hidden.data()); + } + } else { + fill_tensor_const(aligned_init_hidden, 0); + h0 = static_cast(aligned_init_hidden.data()); + } + + // batched_xx = batched_x * [Wcx, Wrx, Wux] + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + word_sum, + 3 * aligned_hidden_size_, + word_size_, + 
static_cast(batched_x.data()), + word_size_, + weight_x_packed_, + 3 * aligned_hidden_size_, + 0.f, + batched_xx_data, + 3 * aligned_hidden_size_); + + // batched_xx += bias + int xx_num = inputs[0]->num(); + int hidden_stride = 3 * aligned_hidden_size_; + #pragma omp parallel for if(this->max_thread_num_ > 1) + + for (int i = 0; i < xx_num; i++) { + cblas_saxpy(hidden_stride, 1, aligned_bias_, 1, batched_xx_data + i * hidden_stride, 1); + } + + int c_offset = 0; + int r_offset = 1; + int u_offset = 2; + + for (int word_id = 0; word_id < bat_length; word_id++) { + int bat_word_id_start = bat_offset[word_id]; + int bat_word_id_end = bat_offset[word_id + 1]; + int bat_word_length = bat_word_id_end - bat_word_id_start; + const float* ht_1; + + if (word_id == 0) { + ht_1 = h0; + } else { + ht_1 = batched_h_data + bat_offset[word_id - 1] * aligned_hidden_size_; + } + + float* ht = batched_h_data + bat_offset[word_id] * aligned_hidden_size_; + + // xx = xx + ht_1 * Wh + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + bat_word_length, + 2 * aligned_hidden_size_, + aligned_hidden_size_, + ht_1, + aligned_hidden_size_, + weight_ru_packed_, + 2 * aligned_hidden_size_, + 1.f, + batched_xx_data + bat_word_id_start * hidden_stride + r_offset * aligned_hidden_size_, + hidden_stride); + + // compute reset gate output r and rh + if (avx2_available_) { +#if defined(__AVX2__) and defined(__FMA__) + for (int bat_word_id = bat_word_id_start; bat_word_id < bat_word_id_end; bat_word_id++) { + int intra_bat_offset = bat_word_id - bat_word_id_start; + __m256* r = (__m256*)(batched_xx_data + bat_word_id * hidden_stride + r_offset * + aligned_hidden_size_); + __m256* hit = (__m256*)(ht + intra_bat_offset * aligned_hidden_size_); + __m256* hit_1 = (__m256*)(ht_1 + intra_bat_offset * aligned_hidden_size_); + + for (int i = 0; i < aligned_hidden_size_ / 8; ++i) { + r[i] = Activate_inner(r[i], param.gate_activity); + hit[i] = r[i] * hit_1[i]; + } + } +#endif + } else { + for 
(int bat_word_id = bat_word_id_start; bat_word_id < bat_word_id_end; bat_word_id++) { + int intra_bat_offset = bat_word_id - bat_word_id_start; + float* r = (float*)(batched_xx_data + bat_word_id * hidden_stride + r_offset * + aligned_hidden_size_); + float* hit = (float*)(ht + intra_bat_offset * aligned_hidden_size_); + float* hit_1 = (float*)(ht_1 + intra_bat_offset * aligned_hidden_size_); + + for (int i = 0; i < aligned_hidden_size_; ++i) { + r[i] = Activate_inner(r[i], param.gate_activity); + hit[i] = r[i] * hit_1[i]; + } + } + } + + // xx = xx + rh * Wch + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + bat_word_length, + aligned_hidden_size_, + aligned_hidden_size_, + ht, + aligned_hidden_size_, + weight_c_packed_, + aligned_hidden_size_, + 1.f, + batched_xx_data + bat_word_id_start * hidden_stride + c_offset * aligned_hidden_size_, + hidden_stride); + + // compute candidate activation output and h + if (avx2_available_) { +#if defined(__AVX2__) and defined(__FMA__) + for (int bat_word_id = bat_word_id_start; bat_word_id < bat_word_id_end; bat_word_id++) { + int intra_bat_offset = bat_word_id - bat_word_id_start; + int h_word_id_offset = bat_word_id * hidden_stride; + __m256* u = (__m256*)(batched_xx_data + h_word_id_offset + u_offset * aligned_hidden_size_); + __m256* c = (__m256*)(batched_xx_data + h_word_id_offset + c_offset * aligned_hidden_size_); + __m256* hit = (__m256*)(ht + intra_bat_offset * aligned_hidden_size_); + __m256* hit_1 = (__m256*)(ht_1 + intra_bat_offset * aligned_hidden_size_); + + for (int i = 0; i < aligned_hidden_size_ / 8; ++i) { + u[i] = Activate_inner(u[i], param.gate_activity); + c[i] = Activate_inner(c[i], param.h_activity); + hit[i] = (c[i] - hit_1[i]) * u[i] + hit_1[i]; + } + } +#endif + } else { + for (int bat_word_id = bat_word_id_start; bat_word_id < bat_word_id_end; bat_word_id++) { + int intra_bat_offset = bat_word_id - bat_word_id_start; + int h_word_id_offset = bat_word_id * hidden_stride; + float* u 
= (float*)(batched_xx_data + h_word_id_offset + u_offset * aligned_hidden_size_); + float* c = (float*)(batched_xx_data + h_word_id_offset + c_offset * aligned_hidden_size_); + float* hit = (float*)(ht + intra_bat_offset * aligned_hidden_size_); + float* hit_1 = (float*)(ht_1 + intra_bat_offset * aligned_hidden_size_); + + for (int i = 0; i < aligned_hidden_size_; ++i) { + u[i]=Activate_inner(u[i], param.gate_activity); + c[i] = Activate_inner(c[i], param.h_activity); + hit[i] = (c[i] - hit_1[i]) * u[i] + hit_1[i]; + } + } + } + } + + // batch to sequence + batch_value.bat_2_seq(batched_h_data, out, hidden_size_, aligned_hidden_size_); + + return SaberSuccess; +} + +template +SaberStatus VenderGru::check_conf( + const std::vector& inputs, + std::vector& outputs, +GruParam& param) { + return SaberSuccess; +} + +template class VenderGru; +DEFINE_OP_TEMPLATE(VenderGru, GruParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(VenderGru, GruParam, X86, AK_INT8); +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/vender_gru.h b/saber/funcs/impl/x86/vender_gru.h new file mode 100644 index 000000000..b17b68eae --- /dev/null +++ b/saber/funcs/impl/x86/vender_gru.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_GRU_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_GRU_H + +#include "saber/saber_types.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_gru.h" +#include "saber/saber_funcs_param.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/impl/x86/x86_utils.h" + + +#include +#include + +namespace anakin { +namespace saber { + +template +class VenderGru: public ImplBase < + X86, OpDtype,GruParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + typedef Tensor OpTensor; + + VenderGru() : avx2_available_(false), aligned_bias_(nullptr), + max_thread_num_(1), + weight_x_packed_(nullptr), + weight_ru_packed_(nullptr), + weight_c_packed_(nullptr) { + LOG(INFO) << "init vender gru"; + } + + ~VenderGru() { + if (this->weight_x_packed_) { + cblas_sgemm_free(this->weight_x_packed_); + this->weight_x_packed_ = nullptr; + } + + if (this->weight_ru_packed_) { + cblas_sgemm_free(this->weight_ru_packed_); + this->weight_ru_packed_ = nullptr; + } + + if (this->weight_c_packed_) { + cblas_sgemm_free(this->weight_c_packed_); + this->weight_c_packed_ = nullptr; + } + + if (this->aligned_bias_) { + zfree(this->aligned_bias_); + this->aligned_bias_ = nullptr; + } + } + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + GruParam& gru_param, + Context& ctx) override; + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + GruParam& gru_param, + Context& ctx) override; + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + GruParam& param) override; + +private: + bool avx2_available_; + int max_thread_num_; + int word_size_; + int hidden_size_; + int aligned_hidden_size_; + + float* aligned_bias_; + OpTensor aligned_init_hidden; + + OpDataType* weight_x_packed_ = nullptr; + OpDataType* weight_ru_packed_ = nullptr; + OpDataType* weight_c_packed_ = nullptr; + OpTensor batched_h; + OpTensor batched_x; + OpTensor 
batched_xx; + + SaberStatus check_conf(const std::vector& inputs, + std::vector& outputs, + GruParam& param); +}; + +} // namespace saber +} // namespace anakin +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_GRU_H diff --git a/saber/funcs/impl/x86/vender_lstm.cpp b/saber/funcs/impl/x86/vender_lstm.cpp new file mode 100644 index 000000000..018e756d9 --- /dev/null +++ b/saber/funcs/impl/x86/vender_lstm.cpp @@ -0,0 +1,495 @@ +#include "saber/funcs/impl/x86/vender_lstm.h" +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#include "saber/funcs/impl/x86/saber_normal_activation.h" +namespace anakin { +namespace saber { + +template <> +void VenderLstm::compute_with_avx(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act) { +#if defined(__AVX2__) and defined(__FMA__) + #pragma omp parallel for if(this->max_thread_num_ > 1) collapse(2) + + for (int b = 0; b < batch_size; b++) { + for (int i = 0; i < hidden_size / 8; i++) { + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 prev_state_v = _mm256_set1_ps(0.0f); + int batch_offset = b * hidden_size; + __m256* value_ig = reinterpret_cast<__m256*>(value.gate_value + batch_offset * 4); + __m256* value_fg = reinterpret_cast<__m256*>(value.gate_value + batch_offset * 4 + hidden_size); + __m256* value_in = reinterpret_cast<__m256*>(value.gate_value + batch_offset * 4 + hidden_size * 2); + __m256* value_og = reinterpret_cast<__m256*>(value.gate_value + batch_offset * 4 + hidden_size * 3); + + __m256* state_active = reinterpret_cast<__m256*>(value.state_active_value + batch_offset); + __m256* state = reinterpret_cast<__m256*>(value.state_value + batch_offset); + __m256* output = reinterpret_cast<__m256*>(value.output_value + batch_offset); + + if (value.prev_state_value) { + prev_state_v = (reinterpret_cast<__m256*>(value.prev_state_value + 
batch_offset))[i]; + } + + if (value.check_ig) { + r_checkI = (reinterpret_cast(value.check_ig))[i]; + r_checkF = (reinterpret_cast(value.check_fg))[i]; + r_checkO = (reinterpret_cast(value.check_og))[i]; + } + + value_in[i] = Activate_inner(value_in[i], cand_act); + value_ig[i] = Activate_inner(_mm256_add_ps(value_ig[i], _mm256_mul_ps(prev_state_v, r_checkI)), gate_act); + value_fg[i] = Activate_inner(_mm256_add_ps(value_fg[i], _mm256_mul_ps(prev_state_v, r_checkF)), gate_act); + state[i] = _mm256_add_ps(_mm256_mul_ps(value_in[i],value_ig[i]), _mm256_mul_ps(prev_state_v, value_fg[i])); + value_og[i] = Activate_inner(_mm256_add_ps(value_og[i], _mm256_mul_ps(state[i], r_checkO)), gate_act); + state_active[i] = Activate_inner(state[i], cell_act); + output[i] = _mm256_mul_ps(value_og[i], state_active[i]); + } + } + +#endif +} + +template <> +void VenderLstm::compute(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act) { + #pragma omp parallel for if(this->max_thread_num_ > 1) collapse(2) + + for (int b = 0; b < batch_size; b++) { + for (int i = 0; i < hidden_size; i++) { + OpDataType* value_ig = value.gate_value + b * hidden_size * 4; + OpDataType* value_fg = value_ig + hidden_size; + OpDataType* value_in = value_ig + hidden_size * 2; + OpDataType* value_og = value_ig + hidden_size * 3; + OpDataType* state_active = value.state_active_value + b * hidden_size; + OpDataType* state = value.state_value + b * hidden_size; + OpDataType* output = value.output_value + b * hidden_size; + OpDataType prev_state_v = 0; + + if (value.prev_state_value) { + prev_state_v = *(value.prev_state_value + b * hidden_size + i); + } + + OpDataType r_checkI = value.check_ig ? value.check_ig[i] : 0; + OpDataType r_checkF = value.check_fg ? value.check_fg[i] : 0; + OpDataType r_checkO = value.check_og ? 
value.check_og[i] : 0; + + value_in[i]=Activate_inner(value_in[i],cand_act); + OpDataType tmp = value_ig[i] + prev_state_v * r_checkI; + value_ig[i]=Activate_inner(tmp,gate_act); + tmp = value_fg[i] + prev_state_v * r_checkF; + value_fg[i]=Activate_inner(tmp,gate_act); + state[i] = value_in[i] * value_ig[i] + prev_state_v * value_fg[i]; + tmp = value_og[i] + state[i] * r_checkO; + value_og[i]=Activate_inner(tmp,gate_act); + state_active[i]=Activate_inner(state[i],cell_act); + + output[i] = value_og[i] * state_active[i]; + } + } +} + + + +template <> +SaberStatus VenderLstm::create( + const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, Context& ctx) { + Tensor* input = inputs[0]; + Tensor* hidden_out = outputs[0]; + int hidden_size = hidden_out->channel(); + + // aligned hidden_size with AVX-512 + int aligned_size = 8; + this->aligned_hidden_size_ = (hidden_size % aligned_size) ? ((hidden_size / aligned_size) + 1) * + aligned_size : hidden_size; + Shape aligned_output_shape({hidden_out->num(), this->aligned_hidden_size_, 1, 1}, Layout_NCHW); + + // xx = x * [Wix, Wfx, Wcx, Wox] + Shape xx_shape({input->num(), hidden_size * 4, 1, 1}, Layout_NCHW); + Shape aligned_xx_shape({input->num(), this->aligned_hidden_size_ * 4, 1, 1}, Layout_NCHW); + // if current size < request size, realloc a buf + this->xx_ = request_buf_for_input(this->xx_, xx_shape); + this->batch_xx_ = request_buf_for_input(this->batch_xx_, aligned_xx_shape); + this->batch_hidden_ = request_buf_for_input(this->batch_hidden_, aligned_output_shape); + this->batch_cell_ = request_buf_for_input(this->batch_cell_, aligned_output_shape); + this->batch_cell_act_ = request_buf_for_input(this->batch_cell_act_, aligned_output_shape); + + return SaberSuccess; +} + +template <> +SaberStatus VenderLstm::init( + const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, Context& ctx) { + avx2_available_ = jit::mayiuse(jit::avx2); + + Tensor* input = inputs[0]; + + const Tensor* 
bias = param.bias(); + int frame_size = input->channel(); + int hidden_size = outputs[0]->channel(); + + // aligned hidden_size with 8 float + int aligned_size = 8; + this->aligned_hidden_size_ = (hidden_size % aligned_size) ? ((hidden_size / aligned_size) + 1) * + aligned_size : hidden_size; + + Tensor* aligned_weights_data_h = nullptr; + + if (this->aligned_hidden_size_ != hidden_size) { + Shape aligned_w_shape({this->aligned_hidden_size_, this->aligned_hidden_size_ * 4, 1, 1}, Layout_NCHW); + aligned_weights_data_h = new Tensor(aligned_w_shape); + } + + OpDataType* weights_data = (OpDataType*)(param.weight()->data()); + MatrixInfo* weight_x = nullptr; + MatrixInfo* weight_h = nullptr; + MatrixInfo* weight_h_tmp = nullptr; + + if (param.skip_input) { + // if skip_input is true, the weights just includes [Wih, Wfh, Wch, Wph] + weight_h = new MatrixInfo(weights_data, hidden_size, (hidden_size * 4)); + } else { + // split the weight to two parts: [Wix, Wfx, Wcx, Wox], [Wih, Wfh, Wch, Woh] + weight_x = new MatrixInfo(weights_data, frame_size, (hidden_size * 4)); + weight_h = new MatrixInfo((weights_data + frame_size * hidden_size * 4), hidden_size, + (hidden_size * 4)); + } + + if (this->aligned_hidden_size_ != hidden_size) { + weight_h_tmp = weight_h; + weight_h = new MatrixInfo((OpDataType *)aligned_weights_data_h->mutable_data(), + this->aligned_hidden_size_, (this->aligned_hidden_size_ * 4)); + // do weight align + int stride = 0; + int diff = this->aligned_hidden_size_ - hidden_size; + OpDataType* src = nullptr; + OpDataType* dst = nullptr; + + for (int i = 0; i < this->aligned_hidden_size_; i++) { + stride = 4 * (this->aligned_hidden_size_); + + dst = weight_h->buf() + i * stride; + + if (i >= hidden_size) { + memset(dst, 0, stride * sizeof(OpDataType)); + } else { + src = weight_h_tmp->buf() + i * 4 * hidden_size; + + for (int j = 0; j < 4; j++) { + memcpy(dst + j * this->aligned_hidden_size_, + src + j * hidden_size, hidden_size * sizeof(OpDataType)); + 
memset(dst + j * this->aligned_hidden_size_ + hidden_size, + 0, diff * sizeof(OpDataType)); + } + } + } + + delete weight_h_tmp; + } + + // clean the packed weight + safe_free(&(this->packed_w_x_)); + safe_free(&(this->packed_w_h_)); + + // pack weights for Wix, Wfx, Wcx, Wox] and [Wih, Wfh, Wch, Woh] + if (weight_x) { + int m = input->num(); + this->packed_w_x_ = new mkl_packed_weight(weight_x, m); + this->packed_w_x_->pack(); + } + + this->packed_w_h_ = new mkl_packed_weight(weight_h); + this->packed_w_h_->pack(); + + const Tensor* init_t0 = param.init_hidden(); + safe_free(&batch_c0_); + safe_free(&batch_h0_); + + // tensor for batched init cell and batched init hidden, they are both with size batch_size * hidden_size + if (init_t0) { + int batch_size = input->get_seq_offset().size() - 1; + Shape batched_state_shape({batch_size, this->aligned_hidden_size_, 1, 1}, Layout_NCHW); + + // create buf in create func, batch_size * hidden_size + batch_c0_ = new Tensor(batched_state_shape); + + // create buf in create func, batch_size * hidden_size + batch_h0_ = new Tensor(batched_state_shape); + } + + bool with_peephole = param.with_peephole; + + if (bias && with_peephole) { + const OpDataType* bias_data = (const OpDataType*)bias->data(); + // shape for Wic, Wfc, Woc + Shape weights_c_shape({1, this->aligned_hidden_size_, 1, 1}, Layout_NCHW); + safe_free(&(this->check_ig_)); + safe_free(&(this->check_fg_)); + safe_free(&(this->check_og_)); + this->check_ig_ = new Tensor(weights_c_shape); + this->check_fg_ = new Tensor(weights_c_shape); + this->check_og_ = new Tensor(weights_c_shape); + memcpy(this->check_ig_->mutable_data(), bias_data + 4 * hidden_size, + hidden_size * sizeof(OpDataType)); + memcpy(this->check_fg_->mutable_data(), bias_data + 5 * hidden_size, + hidden_size * sizeof(OpDataType)); + memcpy(this->check_og_->mutable_data(), bias_data + 6 * hidden_size, + hidden_size * sizeof(OpDataType)); + } + + safe_free(&weight_x); + safe_free(&weight_h); + 
safe_free(&aligned_weights_data_h); + + this->_ctx = &ctx; + this->max_thread_num_ = omp_get_max_threads(); + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderLstm::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) { + Tensor* input = inputs[0]; + Tensor* hidden_out = outputs[0]; + Tensor* cell_out = nullptr; + + if (outputs.size() >= 2) { + cell_out = outputs[1]; + } + + const Tensor* bias = param.bias(); + const Tensor* init_t0 = param.init_hidden(); + + int hidden_size = hidden_out->channel(); + int batch_size = input->get_seq_offset().size() - 1; + Shape offset({0, 0, 0, 0}, Layout_NCHW); + + // init state shape + Shape init_state_shape({batch_size, hidden_size, 1, 1}, Layout_NCHW); + math::ReorderInitState reorder; + + Tensor* xx = nullptr; + + if (param.skip_input) { + // if skip_input is true, the input memory layout should be + // total_seq_len * (4 * hidden_size) + xx = input; + } else { + // if skip_input is false, the input memory layout should be + // total_seq_len * input_size + // xx = x * [Wix, Wfx, Wcx, Wox] + Shape xx_shape({input->num(), hidden_size * 4, 1, 1}, Layout_NCHW); + + // if current size < request size, realloc a buf for using + xx = new Tensor(); + this->xx_ = request_buf_for_input(this->xx_, xx_shape); + xx->share_sub_buffer(*(this->xx_), xx_shape, offset); + + MatrixInfo src((OpDataType*)(input->mutable_data()), input->num(), input->channel()); + MatrixInfo dst((OpDataType*)(xx->mutable_data()), xx->num(), xx->channel()); + packed_w_x_->gemm_compute(src, &dst, 0.0f); + + // input activation + int cnt = xx->size(); + OpDataType* p = (OpDataType*)xx->mutable_data(); + + switch (param.input_activity) { + case Active_stanh: + case Active_tanh: + for(int i=0;i batch_xx; + Shape aligned_xx_shape({xx->num(), this->aligned_hidden_size_ * 4, 1, 1}, Layout_NCHW); + batch_xx.share_sub_buffer(*(this->batch_xx_), aligned_xx_shape, offset); + + Tensor batch_hidden; + Shape 
aligned_output_shape({hidden_out->num(), this->aligned_hidden_size_, 1, 1}, Layout_NCHW); + batch_hidden.share_sub_buffer(*(this->batch_hidden_), aligned_output_shape, offset); + + Tensor batch_cell; + batch_cell.share_sub_buffer(*(this->batch_cell_), aligned_output_shape, offset); + + Tensor batch_cell_act; + batch_cell_act.share_sub_buffer(*(this->batch_cell_act_), aligned_output_shape, offset); + + MatrixInfo xx_matrix((OpDataType *)xx->mutable_data(), xx->num(), xx->channel()); + MatrixInfo batch_xx_matrix((OpDataType *)batch_xx.mutable_data(), batch_xx.num(), + batch_xx.channel()); + MatrixInfo batch_hidden_matrix((OpDataType*)batch_hidden.mutable_data(), + batch_hidden.num(), + batch_hidden.channel()); + MatrixInfo batch_cell_matrix((OpDataType*)batch_cell.mutable_data(), batch_cell.num(), + batch_cell.channel()); + MatrixInfo batch_cell_act_matrix((OpDataType*)batch_cell_act.mutable_data(), + batch_cell_act.num(), + batch_cell_act.channel()); + + // handle bias info + if (bias) { + // row-wise-add bias to batch_xx, the layout of bias [bi, bf, bc, bo] + const OpDataType* bias_data = (const OpDataType*) bias->data(); + + for (int i = 0; i < input->num(); i++) { + int row_size = 4 * hidden_size; + cblas_saxpby(row_size, 1, bias_data, 1, 1, (xx_matrix.buf() + i * row_size), 1); + } + } + + // seq to batch meta data + std::vector> seq_to_batch_meta; + seq_to_batch_meta.push_back(input->get_seq_offset()[input->get_seq_offset().size() - 1]); + + // sequence to batch + bool is_reverse = param.is_reverse; + math::Seq2BatchFunctor to_batch; + to_batch(xx, &batch_xx, seq_to_batch_meta, true, is_reverse, 4); + + std::vector order(seq_to_batch_meta[2]); + LstmMetaValue lstm_value; + bool with_peephole = param.with_peephole; + + if (bias && with_peephole) { + // with peephole enable, [Wic, Wfc, Woc] is at the behind of bias + const OpDataType* bias_data = (const OpDataType*)bias->data(); + lstm_value.check_ig = (const OpDataType*)this->check_ig_->data(); + 
lstm_value.check_fg = (const OpDataType*)this->check_fg_->data(); + lstm_value.check_og = (const OpDataType*)this->check_og_->data(); + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + + lstm_value.prev_state_value = nullptr; + auto gate_act = param.gate_activity; + auto cell_act = param.cell_activity; + auto cand_act = param.candidate_activity; + + if (init_t0) { + // if have init cell info, fill it to lstm value + // get init_c0 from init_t0 and reorder it + Shape offset({batch_size, 0, 0, 0}, Layout_NCHW); + Tensor init_c0; + init_c0.share_sub_buffer(*init_t0, init_state_shape, offset); + reorder(&init_c0, order, batch_c0_, true); + + lstm_value.prev_state_value = (OpDataType*)batch_c0_->mutable_data(); + } + + auto batch_starts = seq_to_batch_meta[0]; + size_t num_batch = batch_starts.size() - 1; + + for (size_t n = 0; n < num_batch; n++) { + int bstart = batch_starts[n]; + int bend = batch_starts[n + 1]; + int cur_batch_size = bend - bstart; + + // xx += Ht-1 * [Wih, Wfh, Wch, Woh] according to batch number + MatrixInfo dst = batch_xx_matrix.subMatrixInfo(bstart, bend); + + if (n > 0) { + // if n > 0, get Ht-1 information from last calc, and convert it to src + int pre_h_start = batch_starts[n - 1]; + int pre_h_end = pre_h_start + cur_batch_size; + MatrixInfo src = batch_hidden_matrix.subMatrixInfo(pre_h_start, pre_h_end); + packed_w_h_->gemm_compute(src, &dst); + } else if (init_t0) { + // if this is the first time calc and the batch_h0_ is not NULL, then use the init hidden value as src + // get init_h0 from init_t0 and reorder it + Shape offset({0, 0, 0, 0}, Layout_NCHW); + Tensor init_h0; + init_h0.share_sub_buffer(*init_t0, init_state_shape, offset); + reorder(&init_h0, order, batch_h0_, true); + + MatrixInfo src((OpDataType*)(batch_h0_->mutable_data()), batch_h0_->num(), + batch_h0_->channel()); + packed_w_h_->gemm_compute(src, &dst); + } + + // calc [Wic*Ct-1, Wfc*Ct-1, Woc*Ct] and 
activation + // fill lstm value with the calc result before and the output buf + lstm_value.gate_value = dst.buf(); + lstm_value.output_value = batch_hidden_matrix.subMatrixInfo(bstart, bend).buf(); + lstm_value.state_value = batch_cell_matrix.subMatrixInfo(bstart, bend).buf(); + lstm_value.state_active_value = batch_cell_act_matrix.subMatrixInfo(bstart, bend).buf(); + + if (avx2_available_) { + compute_with_avx(lstm_value, this->aligned_hidden_size_, cur_batch_size, gate_act, cell_act, + cand_act); + } else { + compute(lstm_value, this->aligned_hidden_size_, cur_batch_size, gate_act, cell_act, cand_act); + } + + lstm_value.prev_state_value = lstm_value.state_value; + } + + // batch to sequence + math::Batch2SeqFunctor to_seq; + to_seq(&batch_hidden, hidden_out, seq_to_batch_meta); + + if (cell_out) { + to_seq(&batch_cell, cell_out, seq_to_batch_meta); + } + + if (!param.skip_input && xx) { + delete xx; + xx = nullptr; + } + + return SaberSuccess; +} + +template <> +void VenderLstm::compute(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act){} +template <> +void VenderLstm::compute(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act){} + +template <> +void VenderLstm::compute_with_avx(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act){} +template <> +void VenderLstm::compute_with_avx(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act){} +DEFINE_OP_TEMPLATE(VenderLstm, LstmParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(VenderLstm, LstmParam, X86, AK_INT8); + +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/vender_lstm.h b/saber/funcs/impl/x86/vender_lstm.h new file mode 
100644 index 000000000..0249287db --- /dev/null +++ b/saber/funcs/impl/x86/vender_lstm.h @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_LSTM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_LSTM_H + +#include "saber/saber_types.h" +#include "saber/saber_funcs_param.h" +#include "mkl_packed_weight.h" +#include "sequence2batch.h" + +#include "saber/funcs/impl/impl_lstm.h" + +namespace anakin { +namespace saber { + +template +struct LstmMetaValue { + T* gate_value; + T* prev_state_value; + T* state_value; + T* state_active_value; + T* output_value; + const T* check_ig; + const T* check_fg; + const T* check_og; +}; + +template +class VenderLstm: public ImplBase < + X86, OpDtype,LstmParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + VenderLstm() : + avx2_available_(false), max_thread_num_(1), + packed_w_x_(nullptr), packed_w_h_(nullptr), + batch_h0_(nullptr), batch_c0_(nullptr), check_ig_(nullptr), + check_fg_(nullptr), check_og_(nullptr), + xx_(nullptr), batch_xx_(nullptr), batch_hidden_(nullptr), + batch_cell_(nullptr), batch_cell_act_(nullptr), aligned_hidden_size_(0) { + // LOG(INFO)<<"vender construct"; + } + + ~VenderLstm() { + safe_free(&packed_w_x_); + safe_free(&packed_w_h_); + safe_free(&batch_h0_); + safe_free(&batch_c0_); + safe_free(&check_ig_); + safe_free(&check_fg_); + safe_free(&check_og_); + safe_free(&xx_); + 
safe_free(&batch_xx_); + safe_free(&batch_hidden_); + safe_free(&batch_cell_); + safe_free(&batch_cell_act_); + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) override; + + +private: + inline void safe_free(MatrixInfo** ptr) { + if (*ptr) { + delete (*ptr); + (*ptr) = nullptr; + } + } + + inline void safe_free(Tensor** ptr) { + if (*ptr) { + delete (*ptr); + (*ptr) = nullptr; + } + } + + inline void safe_free(mkl_packed_weight** ptr) { + if (*ptr) { + delete (*ptr); + (*ptr) = nullptr; + } + } + + inline Tensor* request_buf_for_input(Tensor* input, Shape required_shape) { + if (input) { + int len = 1; + + if (required_shape.size() == 0) { + len = 0; + } + + for (int i = 0; i < required_shape.size(); i++) { + len *= required_shape[i]; + } + + if (input->size() < len) { + input->re_alloc(required_shape,input->get_dtype()); + } + } else { + input = new Tensor(required_shape); + } + + return input; + } + + bool avx2_available_; + int max_thread_num_; + + mkl_packed_weight* packed_w_x_; + mkl_packed_weight* packed_w_h_; + Tensor* batch_h0_; + Tensor* batch_c0_; + Tensor* check_ig_; + Tensor* check_fg_; + Tensor* check_og_; + // buf for storing data after calculating x * [Wix, Wfx, Wcx, Wox] + Tensor* xx_; + // buf for storing data after xx calculating seq to batch + Tensor* batch_xx_; + + // buf for storing batch tmp data + Tensor* batch_hidden_; + Tensor* batch_cell_; + Tensor* batch_cell_act_; + /*aligned with 256bit(8 float)*/ + int aligned_hidden_size_; + + + virtual void compute(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act); + virtual void 
compute_with_avx(LstmMetaValue value, + int hidden_size, int batch_size, + const ActiveType& gate_act, + const ActiveType& cell_act, + const ActiveType& cand_act); +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTM_H diff --git a/saber/funcs/impl/x86/vender_mat_mul.cpp b/saber/funcs/impl/x86/vender_mat_mul.cpp new file mode 100644 index 000000000..264f181d6 --- /dev/null +++ b/saber/funcs/impl/x86/vender_mat_mul.cpp @@ -0,0 +1,29 @@ +#include "saber/funcs/impl/x86/vender_mat_mul.h" + + +namespace anakin{ + +namespace saber{ + +template +SaberStatus SaberMatMul::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + MatMulParam ¶m) { + + CHECK_EQ(OpDtype, AK_FLOAT) << "vender mat mul only support float now!"; + const OpDataType* src0 = (OpDataType*)inputs[0]->data(); + const OpDataType* src1 = (OpDataType*)inputs[1]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); + + for (int i = 0; i < batch; i++) { + + cblas_sgemm(layout, transa, transb, M, N, K, alpha, src0 + i * M * K, lda, src1 + i * K * N, ldb, beta, dst + i * M * N, ldc); + } +} + +template class SaberMatMul; + +} // namespace saber; + +} // namespace anakin; \ No newline at end of file diff --git a/saber/funcs/impl/x86/vender_mat_mul.h b/saber/funcs/impl/x86/vender_mat_mul.h new file mode 100644 index 000000000..faf2c9242 --- /dev/null +++ b/saber/funcs/impl/x86/vender_mat_mul.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_MAT_MUL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_MAT_MUL_H + +#include "saber/funcs/impl/impl_mat_mul.h" +#include "mkl.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberMatMul: public ImplBase > { + +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberMatMul() {} + + ~SaberMatMul() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + MatMulParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + MatMulParam ¶m, + Context &ctx) { + M = param._m; + N = param._n; + K = param._k; + batch = param._b; + + //row major. + layout = CblasRowMajor; + + //matrix A whether to tranpose. + //matrix A has size M by K. + if (param._is_transpose_X) { + transa = CblasTrans; + if (layout == CblasRowMajor) { + //A has changed its shape at mat_mul.h + lda = M; + }else { + lda = K; + } + }else { + transa = CblasNoTrans; + if (layout == CblasRowMajor) { + lda = K; + }else { + lda = M; + } + } + + //matrix B whether to transpose. + //matrix B has size K by N. + if (param._is_transpose_Y) { + transb = CblasTrans; + if (layout == CblasRowMajor) { + ldb = K; + }else { + ldb = N; + } + }else { + transb = CblasNoTrans; + if (layout == CblasRowMajor) { + ldb = N; + }else { + ldb = K; + } + } + + if (layout == CblasRowMajor) { + ldc = N; + }else { + ldc = M; + } + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + MatMulParam ¶m); + +private: + CBLAS_LAYOUT layout; //CblasRowMajor or CblasColMajor + CBLAS_TRANSPOSE transa; //matrix A whether to tranpose. + CBLAS_TRANSPOSE transb; //matrix B whether to tranpose. 
+ int batch; + int M; + int N; + int K; + int lda; //matrix A leading dimention. + int ldb; //matrix B leading dimention. + int ldc; //matrix C leading dimention. + float alpha{1.0f}; + float beta{0.0f}; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_MAT_MUL_H \ No newline at end of file diff --git a/saber/funcs/impl/x86/x86_utils.h b/saber/funcs/impl/x86/x86_utils.h index fbb028505..7e7355c88 100644 --- a/saber/funcs/impl/x86/x86_utils.h +++ b/saber/funcs/impl/x86/x86_utils.h @@ -1,27 +1,30 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -#ifndef X86_UTILS_H -#define X86_UTILS_H +#ifndef SABER_FUNCS_IMPL_X86_X86_UTILS_H +#define SABER_FUNCS_IMPL_X86_X86_UTILS_H #include #include #include #include +#include "saber/core/common.h" #include "saber/core/tensor.h" - +#include "saber/funcs/saber_util.h" namespace anakin { namespace saber { @@ -34,6 +37,7 @@ namespace saber { namespace utils { + /* a bunch of std:: analogues to be compliant with any msvs version * * Rationale: msvs c++ (and even some c) headers contain special pragma that @@ -43,35 +47,51 @@ namespace utils { * (since there is no any c++-rt dependent stuff, ideally...). */ /* SFINAE helper -- analogue to std::enable_if */ -class VectorPrint{ +class VectorPrint { public: template - static void print_float(Dtype *target){ - float* f=(float*)target; + static void print_float(Dtype* target) { + float* f = (float*)target; printf("size = %d\n", sizeof(Dtype)); - for(int i=0;i +static inline void try_expand_clean_tensor(opTensor& tensor,anakin::saber::Shape shape){ + if(utils::try_expand_tensor(tensor,shape)){ + memset(tensor.mutable_data(),0,tensor.valid_size()* type_length(tensor.get_dtype())); + }; +} -class AlignedUtils{ +class AlignedUtils { public: template - void aligned_last_dim(const Dtype* input,Dtype* output,int input_size, int ori_last_dim,int aligned_dim){ - for(int i=0;i(0); + } + } + + for (int i = 0; i < input_size; i++) { + int row = i / ori_last_dim; + int col = i % ori_last_dim; + output[row * aligned_dim + col] = input[i]; } } template - void unaligned_last_dim(const Dtype* input,Dtype* output,int output_size, int ori_last_dim,int aligned_dim){ - for(int i=0;i - void sorted_seq_2_seq(const Dtype* input, Dtype* output, int hidden_size,int alligned_hidden_size) { + void sorted_seq_2_seq(const Dtype* input, Dtype* output, int hidden_size, + int alligned_hidden_size) { int word_sum = _map_vec.size(); for (int ori_word_id = 0; ori_word_id < word_sum; ori_word_id++) { @@ -159,13 +180,13 @@ class SeqSortedseqTranseUtil { } } } 
-/** - * return whether need to transform - * @param offset_vec - * @param emit_offset_vec - * @param emit_length - * @return - */ + /** + * return whether need to transform + * @param offset_vec + * @param emit_offset_vec + * @param emit_length + * @return + */ bool get_sorted_map(std::vector& offset_vec, std::vector& emit_offset_vec, int& emit_length) { int batch_size = offset_vec.size() - 1; @@ -175,9 +196,12 @@ class SeqSortedseqTranseUtil { if (batch_size == 1) { emit_length = offset_vec[1] - offset_vec[0]; - emit_offset_vec.resize(emit_length+1); - for(int i=0;i<=emit_length;i++) - emit_offset_vec[i]=i; + emit_offset_vec.resize(emit_length + 1); + + for (int i = 0; i <= emit_length; i++) { + emit_offset_vec[i] = i; + } + return false; } @@ -194,7 +218,7 @@ class SeqSortedseqTranseUtil { if (max_len == 1) { emit_offset_vec.push_back(0); - emit_offset_vec.push_back(emit_length*batch_size); + emit_offset_vec.push_back(emit_length * batch_size); return false; } @@ -206,7 +230,8 @@ class SeqSortedseqTranseUtil { _map_vec.resize(word_sum); int target_word_id = 0; - std::vector length_vec_cnt=length_vec; + std::vector length_vec_cnt = length_vec; + for (int word_id_in_seq = 0; word_id_in_seq < max_len; word_id_in_seq++) { emit_offset_vec[word_id_in_seq] = target_word_id; @@ -214,13 +239,15 @@ class SeqSortedseqTranseUtil { int old_batch_id = _length_index[batch_id]; if (length_vec_cnt[old_batch_id] > 0) { - int inner_word_id_in_seq=word_id_in_seq; - if(_is_reverse){ - inner_word_id_in_seq=length_vec[old_batch_id]-1-word_id_in_seq; + int inner_word_id_in_seq = word_id_in_seq; + + if (_is_reverse) { + inner_word_id_in_seq = length_vec[old_batch_id] - 1 - word_id_in_seq; } + int old_word_id = offset_vec[old_batch_id] + inner_word_id_in_seq; _map_vec[old_word_id] = target_word_id; -// printf("map %d -> %d\n",old_word_id,target_word_id); + // printf("map %d -> %d\n",old_word_id,target_word_id); length_vec_cnt[old_batch_id]--; target_word_id++; } else { @@ -246,7 +273,7 
@@ class SeqSortedseqTranseUtil { }; inline int round_up(int k, int c) { - return k+(c-k%c); + return ((k + c - 1) / c) * c; } inline int div_up(int k, int c) { @@ -595,10 +622,15 @@ struct c_compatible { inline void yield_thread() { } // reorder weight layout from NCHW(oc, ic, kh, kw) to OIhw16i16o -inline void weight_reorder_OIhw16i16o(Tensor& input, - Tensor& output) { - Shape shape = input.shape(); +inline void weight_reorder_OIhw16i16o(Tensor& input, + Tensor& output) { + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = input.valid_shape(); int oc_value = shape[0], ic_value = shape[1], kh_value = shape[2], kw_value = shape[3]; + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); #pragma omp parallel for collapse(6) schedule(static) for (int oc_idx = 0; oc_idx < oc_value / 16; ++oc_idx) { @@ -615,7 +647,7 @@ inline void weight_reorder_OIhw16i16o(Tensor& input, kh * kw_value * 16 * 16 + kw * 16 * 16 + ic * 16 + oc; - *(output.mutable_data() + output_idx) = *(input.data() + input_idx); + *(output_ptr + output_idx) = *(input_ptr + input_idx); } } } @@ -625,9 +657,15 @@ inline void weight_reorder_OIhw16i16o(Tensor& input, } // reorder weight layout from NCHW(oc, ic, kh, kw) to OIhwi16o -inline void weight_reorder_OIhwi16o(Tensor& input, - Tensor& output) { +inline void weight_reorder_OIhwi16o(Tensor& input, + Tensor& output) { + + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; Shape shape = input.shape(); + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); #pragma omp parallel for collapse(5) schedule(static) for (int oc_idx = 0; oc_idx < shape[0] / 16; ++oc_idx) { @@ -643,7 +681,7 @@ inline void weight_reorder_OIhwi16o(Tensor& input, kw * shape[1] * 
16 + ic * 16 + oc; - *(output.mutable_data() + output_idx) = *(input.data() + input_idx); + *(output_ptr + output_idx) = *(input_ptr + input_idx); } } } @@ -653,10 +691,15 @@ inline void weight_reorder_OIhwi16o(Tensor& input, // reorder weight layout from NCHW(oc, ic, kh, kw) to OIhwi8o -inline void weight_reorder_OIhwi8o(Tensor& input, - Tensor& output) { +inline void weight_reorder_OIhwi8o(Tensor& input, + Tensor& output) { Shape shape = input.shape(); + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); #pragma omp parallel for collapse(5) schedule(static) for (int oc_idx = 0; oc_idx < shape[0] / 8; ++oc_idx) { @@ -672,7 +715,7 @@ inline void weight_reorder_OIhwi8o(Tensor& input, kw * shape[1] * 8 + ic * 8 + oc; - *(output.mutable_data() + output_idx) = *(input.data() + input_idx); + *(output_ptr + output_idx) = *(input_ptr + input_idx); } } } @@ -681,11 +724,17 @@ inline void weight_reorder_OIhwi8o(Tensor& input, } // reorder weight layout from NCHW to Goihw16g -static void weight_reorder_Goihw16g(Tensor& input, - Tensor& output) { +static void weight_reorder_Goihw16g(Tensor& input, + Tensor& output) { + + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; Shape shape = input.shape(); int g_value = shape[0], oc_value = shape[1], ic_value = shape[1], kh_value = shape[2], kw_value = shape[3]; + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); #pragma omp parallel for collapse(6) schedule(static) for (int g_idx = 0; g_idx < g_value / 16; ++g_idx) { @@ -703,7 +752,7 @@ static void weight_reorder_Goihw16g(Tensor& input, ic_idx * kh_value * kw_value * 16 + kh * kw_value * 16 + kw * 16 + g; - *(output.mutable_data() + 
output_idx) = *(input.data() + input_idx); + *(output_ptr + output_idx) = *(input_ptr + input_idx); } } } @@ -720,7 +769,7 @@ inline size_t datatype_size(DataType data_type) { case AK_INT32: return sizeof(int32_t); - case AK_INT16: + case AK_HALF: return sizeof(int16_t); case AK_INT8: @@ -740,23 +789,21 @@ inline size_t datatype_size(DataType data_type) { } // namespace saber } // namespace anakin -#if defined(_OPENMP) +#if defined(USE_OPENMP) #include #else -inline int omp_get_max_threads() { +static inline int omp_get_max_threads() { return 1; } -inline int omp_get_num_threads() { +static inline int omp_get_num_threads() { return 1; } -inline int omp_get_thread_num() { +static inline int omp_get_thread_num() { return 0; } -inline int omp_in_parallel() { +static inline int omp_in_parallel() { return 0; } #endif -#endif // X86_UTILS_H - -// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s +#endif // X86_UTILS_H \ No newline at end of file diff --git a/saber/funcs/layer_norm.h b/saber/funcs/layer_norm.h index fb6261be4..cddf30d8d 100644 --- a/saber/funcs/layer_norm.h +++ b/saber/funcs/layer_norm.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,12 +18,13 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_layer_norm.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_layer_norm.h" #endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_layer_norm.h" +#include "saber/funcs/impl/x86/saber_layer_norm.h" #endif #ifdef USE_ARM_PLACE //todo @@ -33,35 +34,26 @@ namespace anakin{ namespace saber{ -template +template class LayerNorm : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, LayerNormParam > { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - LayerNormParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + LayerNormParam>::BaseFunc; LayerNorm() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef LayerNormParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef LayerNormParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -78,13 +70,11 @@ class LayerNorm : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderLayerNorm ); + this->_impl.push_back(new VenderLayerNorm ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberLayerNorm ); + this->_impl.push_back(new SaberLayerNorm ); return SaberSuccess; default: @@ -100,12 +90,6 @@ class LayerNorm : public BaseFunc< this->_best_impl = this->_impl[0]; } - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! Normalize only has saber implementation - this->_best_impl = this->_impl[0]; - } - virtual void pick_best_specify(ImplEnum implenum) override { //! 
Normalize only has saber implementation this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/lrn.h b/saber/funcs/lrn.h index 3b29ac273..018b8b53a 100644 --- a/saber/funcs/lrn.h +++ b/saber/funcs/lrn.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_lrn.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_lrn.h" #endif @@ -25,39 +26,32 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_lrn.h" #endif - +#ifdef USE_ARM_PLACE +//todo +#include "saber/funcs/impl/impl_lrn.h" +#endif namespace anakin { namespace saber { -template +template class Lrn : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - LrnParam -> { + LrnParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - LrnParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + LrnParam>::BaseFunc; Lrn() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef LrnParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef LrnParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -76,13 +70,11 @@ class Lrn : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderLrn ); + this->_impl.push_back(new VenderLrn ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberLrn ); + this->_impl.push_back(new SaberLrn ); return SaberSuccess; default: @@ -97,8 +89,6 @@ class Lrn : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void 
pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/lstm.h b/saber/funcs/lstm.h new file mode 100644 index 000000000..c862cbe41 --- /dev/null +++ b/saber/funcs/lstm.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_LSTM_H +#define ANAKIN_SABER_FUNCS_LSTM_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_lstm.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_lstm.h" +//#include "saber/funcs/impl/cuda/vender_lstm.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_lstm.h" +#include "saber/funcs/impl/x86/vender_lstm.h" +#endif + +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/impl_lstm.h" +#endif + +namespace anakin { +namespace saber { +template +class Lstm : public BaseFunc < + TargetType, + OpDtype, + ImplBase, + LstmParam > { +public: + using BaseFunc < + TargetType, + OpDtype, + ImplBase, + LstmParam >::BaseFunc; + + Lstm() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef LstmParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, + Output_v& 
output, Param_t& param) override { + + int seqLength = input[0]->num(); + int hiddenSize = 0; + + if (param.with_peephole) { + hiddenSize = param.bias()->valid_size() / 7; + } else { + hiddenSize = param.bias()->valid_size() / 4; + } + + Shape output_shape = Shape({seqLength, hiddenSize, param.num_direction, 1},input[0]->get_layout()); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + + if (output.size() >= 2) { + output[1]->set_seq_offset(input[0]->get_seq_offset()); + } + + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + //this->_impl.push_back(new VenderLstm _impl.push_back(new VenderLstm ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberLstm ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + +} // namespace saber +} // namepace anakin + + +#endif // ANAKIN_SABER_FUNCS_LSTM_H + diff --git a/saber/funcs/mat_mul.h b/saber/funcs/mat_mul.h index e5ec84286..7ced3db09 100644 --- a/saber/funcs/mat_mul.h +++ b/saber/funcs/mat_mul.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,45 +18,36 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_mat_mul.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_mat_mul.h" #endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_mat_mul.h" +#include "saber/funcs/impl/x86/vender_mat_mul.h" #endif namespace anakin{ namespace saber{ -template +template class MatMul : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - MatMulParam -> { + MatMulParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - MatMulParam>::BaseFunc; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef MatMulParam Param_t; + TargetType, + OpDtype, + ImplBase, + MatMulParam>::BaseFunc; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef MatMulParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -90,23 +81,21 @@ class MatMul : public BaseFunc< } CHECK_EQ(K0, K1); - param._B = input[0]->num() * input[0]->channel(); - param._M = M; - param._N = N; - param._K = K0; - return output[0]->set_shape({input[0]->num(), input[0]->channel(), M, N}); + param._b = input[0]->num() * input[0]->channel(); + param._m = M; + param._n = N; + param._k = K0; + return output[0]->set_shape(Shape({input[0]->num(), input[0]->channel(), M, N})); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderMatMul ); + this->_impl.push_back(new VenderMatMul ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberMatMul ); + this->_impl.push_back(new SaberMatMul ); return SaberSuccess; default: @@ -121,12 +110,10 @@ class MatMul : public BaseFunc< this->_best_impl = this->_impl[0]; } - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! 
Fc only has saber implementation + virtual void pick_best_runtime(const Input_v input, Output_v output, Param_t& param, \ + Context &ctx) { this->_best_impl = this->_impl[0]; } - virtual void pick_best_specify(ImplEnum implenum) override { //! Fc only has saber implementation this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/multiclass_nms.h b/saber/funcs/multiclass_nms.h deleted file mode 100644 index 98f22edca..000000000 --- a/saber/funcs/multiclass_nms.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_MULTICLASS_NMS_H -#define ANAKIN_SABER_FUNCS_MULTICLASS_NMS_H - -#include "saber/funcs/base.h" -#include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_multiclass_nms.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_multiclass_nms.h" -#endif - -namespace anakin{ - -namespace saber{ - -template -class MultiClassNMS : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - MultiClassNMSParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - MultiClassNMSParam>::BaseFunc; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef MultiClassNMSParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - MultiClassNMS() = default; - - virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ - Param_t& param) override { - //! inputs[0]: bbox map, dims = 3 {N, boxes, 4(xmin, ymin, xmax, ymax)} - //! inputs[1]: score map, dims = 3 {N, classes, boxes} - //! output[0]: output detection result, dims = 2 {No., 6} - Shape sh1 = input[0]->valid_shape(); - Shape sh2 = input[1]->valid_shape(); - CHECK_EQ(sh1.dims(), 3) << "only support 3d (NHW) layout"; - Shape shape_out = output[0]->valid_shape(); - CHECK_EQ(shape_out.dims(), 2) << "only support 2d(NW) layout"; - int boxes = sh1[1]; - shape_out[0] = 1; - shape_out[1] = 7; - return output[0]->set_shape(shape_out); - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderMultiClassNMS ); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberMultiClassNMS ); - return SaberSuccess; - - default: - return SaberUnImplError; - } - } - -private: - - virtual void pick_best_static() override { - //! 
Fc only has saber implementation - this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! Fc only has saber implementation - this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_specify(ImplEnum implenum) override { - //! Fc only has saber implementation - this->_best_impl = this->_impl[0]; - } - -}; - -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_MULTICLASS_NMS_H diff --git a/saber/funcs/mvn.h b/saber/funcs/mvn.h index b85f9f330..b74cfe30a 100644 --- a/saber/funcs/mvn.h +++ b/saber/funcs/mvn.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,47 +18,41 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_mvn.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_mvn.h" #endif #ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_mvn.h" +#endif +#ifdef USE_ARM_PLACE +//todo #include "saber/funcs/impl/impl_mvn.h" #endif - namespace anakin{ namespace saber{ -template +template class Mvn : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - MvnParam -> { + MvnParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - MvnParam>::BaseFunc; + TargetType, + OpDtype, + ImplBase, + MvnParam>::BaseFunc; Mvn() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef MvnParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef MvnParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -68,19 +62,17 @@ class Mvn : public BaseFunc< //! 
support inplace computation, output shape = input shape Shape output_shape = input[0]->valid_shape(); - output[0]->set_shape(output_shape); + return output[0]->set_shape(output_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderMvn ); + this->_impl.push_back(new VenderMvn ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberMvn ); + this->_impl.push_back(new SaberMvn ); return SaberSuccess; default: @@ -96,12 +88,6 @@ class Mvn : public BaseFunc< this->_best_impl = this->_impl[0]; } - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! Mvn only has saber implementation - this->_best_impl = this->_impl[0]; - } - virtual void pick_best_specify(ImplEnum implenum) override { //! Mvn only has saber implementation this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/normalize.h b/saber/funcs/normalize.h index 5ece04b16..c56052e08 100644 --- a/saber/funcs/normalize.h +++ b/saber/funcs/normalize.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,51 +18,50 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_normalize.h" + #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_normalize.h" #endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_normalize.h" +#include "saber/funcs/impl/x86/saber_normalize.h" #endif +/* +#ifdef AMD_GPU +#include "saber/funcs/impl/impl_normalize.h" +*/ + namespace anakin{ namespace saber{ template + DataType OpDtype> class Normalize : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - NormalizeParam -> { + NormalizeParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, NormalizeParam>::BaseFunc; Normalize() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef NormalizeParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef NormalizeParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; + virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ Param_t& param) override { @@ -75,13 +74,11 @@ class Normalize : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderNormalize ); + this->_impl.push_back(new VenderNormalize ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberNormalize ); + this->_impl.push_back(new SaberNormalize ); return SaberSuccess; default: @@ -93,16 +90,9 @@ class Normalize : public BaseFunc< private: virtual void pick_best_static() override { - //! Normalize only has saber implementation - this->_best_impl = this->_impl[0]; - } - - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! Normalize only has saber implementation + //! 
Normalize only has saber implementations this->_best_impl = this->_impl[0]; } - virtual void pick_best_specify(ImplEnum implenum) override { //! Normalize only has saber implementation this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/pad.h b/saber/funcs/pad.h index cd64eb810..4257f33e0 100644 --- a/saber/funcs/pad.h +++ b/saber/funcs/pad.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,50 +16,39 @@ #ifndef ANAKIN_SABER_FUNCS_PAD_H #define ANAKIN_SABER_FUNCS_PAD_H +#include "saber/funcs/impl/impl_pad.h" + #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_pad.h" #endif -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_pad.h" -#endif namespace anakin { namespace saber { template class Pad : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PadParam > { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PadParam>::BaseFunc; Pad() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PadParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; + typedef PadParam Param_t; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ @@ -83,13 +72,11 @@ class Pad : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderPad ); + this->_impl.push_back(new VenderPad); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberPad ); + this->_impl.push_back(new SaberPad); return SaberSuccess; 
default: @@ -104,8 +91,6 @@ class Pad : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/permute.h b/saber/funcs/permute.h index 0e36191bd..966f84f28 100644 --- a/saber/funcs/permute.h +++ b/saber/funcs/permute.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,48 +18,39 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_permute.h" + #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_permute.h" #endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_permute.h" +#include "saber/funcs/impl/x86/saber_permute.h" +#endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_permute.h" #endif - namespace anakin { namespace saber { -template +template class Permute : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - PermuteParam -> { + PermuteParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PermuteParam>::BaseFunc; Permute() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PermuteParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; + typedef PermuteParam Param_t; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ @@ -85,13 +76,11 @@ class Permute : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - 
this->_impl.push_back(new VenderPermute ); + this->_impl.push_back(new VenderPermute ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberPermute ); + this->_impl.push_back(new SaberPermute ); return SaberSuccess; default: @@ -106,8 +95,6 @@ class Permute : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/permute_power.h b/saber/funcs/permute_power.h index 04116e226..0f5951b09 100644 --- a/saber/funcs/permute_power.h +++ b/saber/funcs/permute_power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,51 +16,39 @@ #ifndef ANAKIN_SABER_FUNCS_PERMUTE_POWER_H #define ANAKIN_SABER_FUNCS_PERMUTE_POWER_H +#include "saber/funcs/impl/impl_permute_power.h" + #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_permute_power.h" #include "saber/funcs/impl/cuda/vender_permute_power.h" #endif - #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_permute_power.h" +#include "saber/funcs/impl/x86/saber_permute_power.h" #endif - namespace anakin { namespace saber { template + DataType OpDtype> class PermutePower : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - PermutePowerParam -> { + PermutePowerParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PermutePowerParam>::BaseFunc; PermutePower() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PermutePowerParam Param_t; - typedef std::vector Input_v; - typedef 
std::vector Output_v; + typedef PermutePowerParam Param_t; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ @@ -87,13 +75,11 @@ class PermutePower : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderPermutePower ); + this->_impl.push_back(new VenderPermutePower ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberPermutePower ); + this->_impl.push_back(new SaberPermutePower ); return SaberSuccess; default: @@ -108,8 +94,6 @@ class PermutePower : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/pooling.h b/saber/funcs/pooling.h index 739d05851..f004e7923 100644 --- a/saber/funcs/pooling.h +++ b/saber/funcs/pooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_pooling.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_pooling.h" #include "saber/funcs/impl/cuda/vender_pooling.h" @@ -26,43 +27,32 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_pooling.h" #endif - -#ifdef USE_BM -#include "saber/funcs/impl/bm/vender_pooling.h" +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/impl_pooling.h" #endif - namespace anakin { namespace saber { template + DataType OpDtype> class Pooling : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - PoolingParam -> { + PoolingParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PoolingParam>::BaseFunc; Pooling() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PoolingParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef PoolingParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -130,12 +120,10 @@ class Pooling : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderPooling ); + this->_impl.push_back(new VenderPooling ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberPooling ); + this->_impl.push_back(new SaberPooling ); return SaberSuccess; default: return SaberUnImplError; diff --git a/saber/funcs/pooling_with_index.h b/saber/funcs/pooling_with_index.h index f2fc943cd..b3315a0f7 100644 --- a/saber/funcs/pooling_with_index.h +++ b/saber/funcs/pooling_with_index.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,50 +16,39 @@ #ifndef ANAKIN_SABER_FUNCS_POOLING_WITH_INDEX_H #define ANAKIN_SABER_FUNCS_POOLING_WITH_INDEX_H +#include "saber/funcs/impl/impl_pooling_with_index.h" + #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_pooling_with_index.h" #endif - #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_pooling_with_index.h" +#include "saber/funcs/impl/x86/saber_pooling_with_index.h" #endif - namespace anakin { namespace saber { template + DataType OpDtype> class PoolingWithIndex : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PoolingParam > { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PoolingParam>::BaseFunc; PoolingWithIndex() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PoolingParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; + typedef PoolingParam Param_t; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v& input, Output_v &output, \ @@ -111,13 +100,13 @@ class PoolingWithIndex : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderPoolingWithIndex ); + this->_impl.push_back(new VenderPoolingWithIndex ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberPoolingWithIndex ); + this->_impl.push_back(new SaberPoolingWithIndex ); return SaberSuccess; default: diff --git a/saber/funcs/power.h b/saber/funcs/power.h index e9773b44c..cb7337569 100644 --- a/saber/funcs/power.h +++ b/saber/funcs/power.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,48 +18,36 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_power.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_power.h" #endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_power.h" +#include "saber/funcs/impl/x86/saber_power.h" #endif - namespace anakin { namespace saber { template + DataType OpDtype> class Power : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - PowerParam -> { + PowerParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PowerParam>::BaseFunc; Power() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PowerParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; + typedef PowerParam Param_t; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ @@ -76,13 +64,11 @@ class Power : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderPower ); + this->_impl.push_back(new VenderPower ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberPower ); + this->_impl.push_back(new SaberPower ); return SaberSuccess; default: @@ -97,8 +83,6 @@ class Power : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } @@ -106,7 +90,6 @@ class Power : public BaseFunc< }; } - } #endif 
//ANAKIN_SABER_FUNCS_POWER_H diff --git a/saber/funcs/prelu.h b/saber/funcs/prelu.h deleted file mode 100644 index 1d2bde02e..000000000 --- a/saber/funcs/prelu.h +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_PRELU_H -#define ANAKIN_SABER_FUNCS_PRELU_H - -#include "saber/funcs/base.h" -#include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_prelu.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_prelu.h" -#endif - -namespace anakin{ - -namespace saber{ - -template -class Prelu : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - PreluParam -> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - PreluParam>::BaseFunc; - - Prelu() = default; - - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PreluParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ - Param_t& param) override { - - //! 
support inplace computation, output shape = input shape - Shape output_shape = input[0]->valid_shape(); - output[0]->set_shape(output_shape); - return SaberSuccess; - } - - virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderPrelu); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberPrelu); - return SaberSuccess; - - default: - return SaberUnImplError; - } - } - - -private: - - virtual void pick_best_static() override { - //! Prelu only has saber implementation - this->_best_impl = this->_impl[0]; - } - - //virtual void pick_best_runtime(Input_v input, Output_v output, \ - // Param_t& param, Context &ctx) override { - // //! Prelu only has saber implementation - // this->_best_impl = this->_impl[0]; - //} - - virtual void pick_best_specify(ImplEnum implenum) override { - //! Prelu only has saber implementation - this->_best_impl = this->_impl[0]; - } - -}; - -} //namespace saber - -} //namespace anakin - -#endif //ANAKIN_SABER_FUNCS_PRELU_H diff --git a/saber/funcs/priorbox.h b/saber/funcs/priorbox.h index cb42a2f2f..6def408a7 100644 --- a/saber/funcs/priorbox.h +++ b/saber/funcs/priorbox.h @@ -1,126 +1,261 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ - #ifndef ANAKIN_SABER_FUNCS_PRIORBOX_H #define ANAKIN_SABER_FUNCS_PRIORBOX_H #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_priorbox.h" -#endif - -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_priorbox.h" -#endif -namespace anakin{ +namespace anakin { -namespace saber{ +namespace saber { template + DataType OpDtype> class PriorBox : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - PriorBoxParam -> { + PriorBoxParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PriorBoxParam>::BaseFunc; - PriorBox() = default; + ~PriorBox() { + if (_cpu_data) { + fast_free(_cpu_data); + _cpu_data = nullptr; + } + } + typedef TargetWrapper API; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; + typedef PriorBoxParam Param_t; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PriorBoxParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v& input, - Output_v& output, \ - Param_t& param) override { - Shape shape_out = output[0]->valid_shape(); - CHECK_EQ(shape_out.dims(), 4) << "only support 4d (NCHW) layout"; - shape_out[0] = 1; - shape_out[1] = 1; - shape_out[2] = 2; - + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + //! priorbox layout NHW + //! N = 1, H = 2 + //! 
W = 4 * feature_map_width * feature_map_height * num_of_priors int win1 = input[0]->width(); int hin1 = input[0]->height(); - int wout = win1 * hin1 * param.prior_num * 4; - shape_out[3] = wout; - + Shape shape_out({1, 2, wout}, Layout_NHW); return output[0]->set_shape(shape_out); } virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderPriorBox); - return SaberSuccess; - - case SABER_IMPL: - this->_impl.push_back(new SaberPriorBox); - return SaberSuccess; - - default: - return SaberUnImplError; + return SaberSuccess; + } + + SaberStatus compute_priorbox_kernel(const Input_v& input, Output_v& output, Param_t& param) { + /* + unsigned long long out_size = output[0]->valid_size(); + if (_cpu_data == nullptr) { + _size = out_size; + _cpu_data = static_cast(fast_malloc(sizeof(float) * _size)); + } else { + if (out_size > _size) { + _size = out_size; + fast_free(_cpu_data); + _cpu_data = static_cast(fast_malloc(sizeof(float) * _size)); + } + } + _tensor_tmp.reshape(output[0]->valid_shape()); + + float* min_buf = (float*)fast_malloc(sizeof(float) * 4); + float* max_buf = (float*)fast_malloc(sizeof(float) * 4); + float* com_buf = (float*)fast_malloc(sizeof(float) * param.aspect_ratio.size() * 4); + + const int width = input[0]->width(); + const int height = input[0]->height(); + int img_width = param.img_w; + int img_height = param.img_h; + if (img_width == 0 || img_height == 0) { + img_width = input[1]->width(); + img_height = input[1]->height(); } + + float step_w = param.step_w; + float step_h = param.step_h; + if (step_w == 0 || step_h == 0) { + step_w = static_cast(img_width) / width; + step_h = static_cast(img_height) / height; + } + float offset = param.offset; + + int channel_size = height * width * param.prior_num * 4; + int idx = 0; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + float center_x = (w + offset) * step_w; + float center_y = (h + offset) * 
step_h; + float box_width; + float box_height; + for (int s = 0; s < param.min_size.size(); ++s) { + int min_idx = 0; + int max_idx = 0; + int com_idx = 0; + int min_size = param.min_size[s]; + //! first prior: aspect_ratio = 1, size = min_size + box_width = box_height = min_size; + //! xmin + min_buf[min_idx++] = (center_x - box_width / 2.f) / img_width; + //! ymin + min_buf[min_idx++] = (center_y - box_height / 2.f) / img_height; + //! xmax + min_buf[min_idx++] = (center_x + box_width / 2.f) / img_width; + //! ymax + min_buf[min_idx++] = (center_y + box_height / 2.f) / img_height; + + if (param.max_size.size() > 0) { + + int max_size = param.max_size[s]; + //! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) + box_width = box_height = sqrtf(min_size * max_size); + //! xmin + max_buf[max_idx++] = (center_x - box_width / 2.f) / img_width; + //! ymin + max_buf[max_idx++] = (center_y - box_height / 2.f) / img_height; + //! xmax + max_buf[max_idx++] = (center_x + box_width / 2.f) / img_width; + //! ymax + max_buf[max_idx++] = (center_y + box_height / 2.f) / img_height; + } + + //! rest of priors + for (int r = 0; r < param.aspect_ratio.size(); ++r) { + float ar = param.aspect_ratio[r]; + if (fabsf(ar - 1.f) < 1e-6f) { + continue; + } + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + //! xmin + com_buf[com_idx++] = (center_x - box_width / 2.f) / img_width; + //! ymin + com_buf[com_idx++] = (center_y - box_height / 2.f) / img_height; + //! xmax + com_buf[com_idx++] = (center_x + box_width / 2.f) / img_width; + //! 
ymax + com_buf[com_idx++] = (center_y + box_height / 2.f) / img_height; + } + + for (const auto &type : param.order) { + if (type == PRIOR_MIN) { + memcpy(_output_host + idx, min_buf, sizeof(float) * min_idx); + idx += min_idx; + } else if (type == PRIOR_MAX) { + memcpy(_output_host + idx, max_buf, sizeof(float) * max_idx); + idx += max_idx; + } else if (type == PRIOR_COM) { + memcpy(_output_host + idx, com_buf, sizeof(float) * com_idx); + idx += com_idx; + } + } + } + } + } + + fast_free(min_buf); + fast_free(max_buf); + fast_free(com_buf); + + //! clip the prior's coordidate such that it is within [0, 1] + if (param.is_clip) { + for (int d = 0; d < channel_size; ++d) { + _output_host[d] = std::min(std::max(_output_host[d], 0.f), 1.f); + } + } + //! set the variance. + + float* ptr = _output_host + channel_size; + int count = 0; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int i = 0; i < param.prior_num; ++i) { + for (int j = 0; j < 4; ++j) { + ptr[count] = param.variance[j]; + ++count; + } + } + } + } + //! 
copy data to tensor + typedef typename TargetTypeTraits::target_category target_category; + typedef typename IF::value, H2H, H2D>::Type copy_type; + API::sync_memcpy(_tensor_tmp.mutable_data(), 0, API::get_device_id(), \ + _cpu_data, 0, 0, sizeof(float) * out_size, copy_type()); +*/ + return SaberSuccess; + } + + //PriorBox do computation in init + virtual SaberStatus init(const Input_v& input, Output_v& output, Param_t& param, + SaberImplStrategy strategy, ImplEnum implenum, Context &ctx) { + if (output[0]->get_dtype() != AK_FLOAT) { + return SaberInvalidValue; + } else { + compute_priorbox_kernel(input, output, param); + } + return SaberSuccess; } + //copy data to output + virtual SaberStatus operator() (const Input_v& input, Output_v& output, Param_t& param, \ + Context &ctx) { + /*typename Tensor::API::stream_t stream = ctx.get_compute_stream(); + bool flag = (this->_param == param); + for (int i = 0; i < input.size(); ++i) { + flag = flag && input[i]->valid_shape() == this->_last_input_shape[i]; + } + if (!flag) { + this->_param = param; + this->_last_input_shape.clear(); + for (int i = 0; i < input.size(); ++i) { + this->_last_input_shape.push_back(input[i]->valid_shape()); + } + compute_output_shape(input, output, param); + compute_priorbox_kernel(input, output, param); + } + return output[0]->async_copy_from(_tensor_tmp, stream);*/ + return SaberSuccess; + } private: + float* _cpu_data{nullptr}; + Tensor _tensor_tmp; + unsigned long long _size{0}; virtual void pick_best_static() override { - //! PriorBox only has saber implementation - this->_best_impl = this->_impl[0]; + // do nothing + return; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param, \ - // Context &ctx) override { - // //! PriorBox only has saber implementation - // this->_best_impl = this->_impl[0]; - //} - virtual void pick_best_specify(ImplEnum implenum) override { - //! 
PriorBox only has saber implementation - this->_best_impl = this->_impl[0]; + //do nothing + return; } }; -} //namespace saber +} // namespace saber -} //namespace anakin +} // namespace anakin #endif //ANAKIN_SABER_FUNCS_PRIORBOX_H diff --git a/saber/funcs/reshape.h b/saber/funcs/reshape.h index 5dad6250d..34c33d642 100644 --- a/saber/funcs/reshape.h +++ b/saber/funcs/reshape.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,42 +18,30 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/saber_funcs_param.h" namespace anakin { namespace saber { -template +template class Reshape : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - ReshapeParam - > + ReshapeParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ReshapeParam >::BaseFunc; Reshape() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ReshapeParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; + typedef ReshapeParam Param_t; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v& input, \ @@ -62,10 +50,10 @@ class Reshape : public BaseFunc< Shape output_shape; output_shape.resize(param.shape_params.size()); - CHECK_EQ(input[0]->is_continue_mem(), true) << "input tensor must not have roi"; + CHECK_EQ(input[0] -> is_continue_mem(), true) << "input tensor must not have roi"; - Shape input_shape = input[0]->valid_shape(); - int valid_size = input[0]->valid_size(); + Shape input_shape = input[0] -> valid_shape(); + int valid_size = input[0] -> valid_size(); int infer_axis = -1; int count_axis = 1; 
for (int i = 0; i < param.shape_params.size(); ++i) { @@ -85,7 +73,7 @@ class Reshape : public BaseFunc< if (infer_axis >= 0){ output_shape[infer_axis] = valid_size / count_axis; } - return output[0]->set_shape(output_shape);//, output_shape, offset); + return output[0] -> set_shape(output_shape); } //Reshape ops do nothing virtual SaberStatus init_impl(ImplEnum implenum) override { @@ -102,20 +90,18 @@ class Reshape : public BaseFunc< //Reshape ops do nothing virtual SaberStatus operator()(const Input_v& input, Output_v& output, Param_t& param, \ Context &ctx) { - + return SaberSuccess; } private: virtual void pick_best_static() override { //saber impl - this->_best_impl = this->_impl[0]; + this -> _best_impl = this -> _impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { //saber impl - this->_best_impl = this->_impl[0]; + this -> _best_impl = this -> _impl[0]; } }; diff --git a/saber/funcs/resize.h b/saber/funcs/resize.h old mode 100644 new mode 100755 index 4bbceed1f..16fb002fa --- a/saber/funcs/resize.h +++ b/saber/funcs/resize.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ #ifndef ANAKIN_SABER_FUNCS_RESIZE_H #define ANAKIN_SABER_FUNCS_RESIZE_H +#include "saber/funcs/impl/impl_resize.h" #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" @@ -26,38 +27,38 @@ #include "saber/funcs/impl/impl_resize.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/impl_resize.h" +#endif + +#ifdef USE_ARM_PLACE +//todo +#include "saber/funcs/impl/impl_resize.h" +#endif namespace anakin{ namespace saber{ template + DataType OpDtype> class Resize : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ResizeParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ResizeParam >::BaseFunc; Resize() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ResizeParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ResizeParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -96,15 +97,17 @@ class Resize : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - return SaberUnImplError; + //return SaberUnImplError; + this->_impl.push_back(new VenderResize); + return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberResize); + this->_impl.push_back(new SaberResize); return SaberSuccess; default: return SaberUnImplError; } - return SaberSuccess; }; private: @@ -113,13 +116,6 @@ class Resize : public BaseFunc< //! resize only has saber implementation this->_best_impl = this->_impl[0]; } - - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! resize only has saber implementation - this->_best_impl = this->_impl[0]; - } - virtual void pick_best_specify(ImplEnum implenum) override { //! 
resize only has saber implementation this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/reverse_input.h b/saber/funcs/reverse_input.h new file mode 100644 index 000000000..f6f34f065 --- /dev/null +++ b/saber/funcs/reverse_input.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_REVERSE_INPUT_H +#define ANAKIN_SABER_FUNCS_REVERSE_INPUT_H +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_reverse_input.h" + + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_reverse_input.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_reverse_input.h" +#endif + +#ifdef USE_AMD + +#endif + +#ifdef USE_ARM_PLACE + +#endif + +namespace anakin { +namespace saber { + +template +class ReverseInput : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + EmptyParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + EmptyParam>::BaseFunc; + + ReverseInput() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef EmptyParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + for (int i = 0; i < input.size(); ++i) { + output[i]->set_shape(input[i]->valid_shape()); + 
output[i]->set_seq_offset(input[i]->get_seq_offset()); + } + return SaberSuccess; + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderReverseInput ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberReverseInput ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} +} +#endif //SABER_FUNCS_REVERSE_INPUT_H diff --git a/saber/funcs/reverse_sequence.h b/saber/funcs/reverse_sequence.h new file mode 100644 index 000000000..610e72d41 --- /dev/null +++ b/saber/funcs/reverse_sequence.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_REVERSE_SEQUENCE_H +#define ANAKIN_SABER_FUNCS_REVERSE_SEQUENCE_H +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_reverse_sequence.h" + + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_reverse_sequence.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_reverse_sequence.h" +#endif + + +#ifdef USE_AMD + +#endif + +#ifdef USE_ARM_PLACE + +#endif + +namespace anakin { +namespace saber { + +template +class ReverseSequence : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + EmptyParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + EmptyParam>::BaseFunc; + + ReverseSequence() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef EmptyParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + for (int i = 0; i < input.size(); ++i) { + output[i]->set_shape(input[i]->valid_shape()); + output[i]->set_seq_offset(input[i]->get_seq_offset()); + } + return SaberSuccess; + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderReverseSequence ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberReverseSequence ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} +} +#endif //ANAKIN_SABER_FUNCS_REVERSE_SEQUENCE_H diff --git a/saber/funcs/roi_pooling.h b/saber/funcs/roi_pooling.h index d4abb8d1d..82dec6f49 100644 --- a/saber/funcs/roi_pooling.h +++ 
b/saber/funcs/roi_pooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,47 +18,36 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_roi_pool.h" -#endif - -#ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_roi_pooling.h" +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_roi_pool.h" #endif namespace anakin { namespace saber { template class RoiPool : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - RoiPoolParam - > + TargetType, + OpDtype, + ImplBase, + RoiPoolParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - RoiPoolParam >::BaseFunc; + TargetType, + OpDtype, + ImplBase, + RoiPoolParam >::BaseFunc; RoiPool() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef RoiPoolParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef RoiPoolParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -84,12 +73,12 @@ class RoiPool : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderRoiPool ); + this->_impl.push_back(new VenderRoiPool ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberRoiPool ); + this->_impl.push_back(new SaberRoiPool ); return SaberSuccess; default: return SaberUnImplError; @@ -102,9 +91,6 @@ class RoiPool : public BaseFunc< if (true) // some condition? 
this->_best_impl = this->_impl[0]; } - - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/funcs/saber_util.h b/saber/funcs/saber_util.h new file mode 100644 index 000000000..ac2d22b9d --- /dev/null +++ b/saber/funcs/saber_util.h @@ -0,0 +1,44 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_SABER_UTIL_H +#define ANAKIN_SABER_FUNCS_IMPL_SABER_UTIL_H +namespace anakin { + +namespace saber { +namespace utils { + +#include "saber/core/common.h" +#include "saber/core/tensor.h" +#include "saber/core/shape.h" + +template +static inline bool try_expand_tensor(opTensor& x, anakin::saber::Shape shape) { + if (x.valid_size() < shape.count()) { + x.re_alloc(shape, x.get_dtype()); + return true; + } + return false; +} + +template +static inline bool try_expand_tensor(opTensor& x, int size) { + if (x.valid_size() < size) { + anakin::saber::Shape shape({1, 1, 1, size}, Layout_NCHW); + return try_expand_tensor(x, shape); + } + return false; +} + +template +static inline void transpose(const DataType* in,int height,int width,DataType*out){ + for(int i=0;i class Scale : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, - ScaleParam -> { + ScaleParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, ScaleParam>::BaseFunc; Scale() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef ScaleParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ScaleParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -85,14 +65,12 @@ class Scale : public BaseFunc< switch (implenum) { case VENDER_IMPL: this->_impl.push_back(new VenderScale ); + OpDtype>); return SaberSuccess; case SABER_IMPL: 
this->_impl.push_back(new SaberScale ); + OpDtype>); return SaberSuccess; default: diff --git a/saber/funcs/sequence_conv.h b/saber/funcs/sequence_conv.h new file mode 100644 index 000000000..ab3176fc5 --- /dev/null +++ b/saber/funcs/sequence_conv.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_SEQUENCE_CONV_H +#define ANAKIN_SABER_FUNCS_SEQUENCE_CONV_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_sequence_conv.h" + +#ifdef USE_X86_PLACE +//#include "saber/funcs/impl/x86/saber_sequence_conv.h" +#endif +namespace anakin { +namespace saber { + +template +class SequenceConv : public BaseFunc < + TargetType, + OpDtype, + ImplBase, + SequenceConvParam> { +public: + using BaseFunc ::BaseFunc; + + SequenceConv() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SequenceConvParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, \ + Output_v& output, Param_t& param) override { + + InDataTensor* input_tensor = input[0]; + Shape new_shape = input_tensor->valid_shape(); + new_shape.set_num(input_tensor->num()); + new_shape.set_channel(param.filter_tensor->width()); + new_shape.set_height(1); + 
new_shape.set_width(1); + return output[0]->set_shape(new_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSequenceConv ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberSequenceConv ); + return SaberSuccess; + default: + return SaberUnImplError; + } + } +private: + + virtual void pick_best_static() override { + if (true) { // some condition? + this->_best_impl = this->_impl[0]; + } + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; +} +} + + +#endif diff --git a/saber/funcs/sequence_pool.h b/saber/funcs/sequence_pool.h index 1a5f4a196..3e6ba3b03 100644 --- a/saber/funcs/sequence_pool.h +++ b/saber/funcs/sequence_pool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,47 +19,37 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #include "saber/saber_funcs_param.h" - -#ifdef NVIDIA_GPU #include "saber/funcs/impl/impl_sequence_pool.h" -#endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/x86/saber_sequence_pool.h" +//#include "saber/funcs/impl/x86/saber_sequence_pool.h" #endif namespace anakin { namespace saber { template class SequencePool : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, SequencePoolParam > { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, SequencePoolParam>::BaseFunc; SequencePool() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef SequencePoolParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SequencePoolParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -68,21 +58,29 @@ class SequencePool : public BaseFunc< Output_v &output, Param_t& param) override { Shape output_shape = (input[0]->valid_shape()); int num_idx = input[0]->num_index(); - std::vector offset = input[0]->get_seq_offset(); - CHECK_GT(offset.size(), 1) << "seq num error! " << offset.size(); - output_shape[num_idx] = offset.size() - 1; + std::vector > offset = input[0]->get_seq_offset(); + //CHECK_GT(offset.size(), 1) << "seq num error! 
" << offset.size(); + int output_shape_num=0; + if (offset[0].size() > 1) { + output_shape_num = offset[0].size() - 1; + } else { + output_shape_num = input[0]->num(); + } + output_shape[num_idx]=output_shape_num; + + output[0]->set_seq_offset(input[0]->get_seq_offset()); return output[0]->set_shape(output_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderSequencePool ); + this->_impl.push_back(new VenderSequencePool ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberSequencePool ); + this->_impl.push_back(new SaberSequencePool ); return SaberSuccess; default: return SaberUnImplError; diff --git a/saber/funcs/slice.h b/saber/funcs/slice.h index 99cc62107..feeadce2c 100644 --- a/saber/funcs/slice.h +++ b/saber/funcs/slice.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_slice.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_slice.h" #endif @@ -25,40 +26,25 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_slice.h" #endif - +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_slice.h" +#endif namespace anakin{ namespace saber{ -template -class Slice : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - SliceParam> +template +class Slice : public BaseFunc { public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - SliceParam >::BaseFunc; + using BaseFunc::BaseFunc; Slice() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef SliceParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SliceParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -99,8 +85,9 @@ class Slice : public BaseFunc< Shape sh = shape_in; sh[param.axis] = step; output[0]->set_shape(sh); + param.slice_points.clear(); for (int i = 1; i < top_size; ++i) { - param.slice_points[i - 1] = i * step; + param.slice_points.push_back(i * step); status = output[i]->set_shape(sh); if (status != SaberSuccess) { return status; @@ -113,12 +100,10 @@ class Slice : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderSlice ); + this->_impl.push_back(new VenderSlice ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberSlice ); + this->_impl.push_back(new SaberSlice ); return SaberSuccess; default: return SaberUnImplError; @@ -132,12 +117,6 @@ class Slice : public BaseFunc< this->_best_impl = this->_impl[0]; } - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! 
slice only has saber implementation - this->_best_impl = this->_impl[0]; - } - virtual void pick_best_specify(ImplEnum implenum) override { //! slice only has saber implementation this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h index 1ad324908..8556c6d39 100644 --- a/saber/funcs/softmax.h +++ b/saber/funcs/softmax.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_softmax.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_softmax.h" #include "saber/funcs/impl/cuda/vender_softmax.h" @@ -26,50 +27,31 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_softmax.h" #endif - -#ifdef USE_BM -#include "saber/funcs/impl/bm/vender_softmax.h" +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_softmax.h" #endif - namespace anakin{ namespace saber{ -template -class Softmax : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - SoftmaxParam> +template +class Softmax : public BaseFunc { public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - SoftmaxParam >::BaseFunc; + using BaseFunc::BaseFunc; Softmax() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef SoftmaxParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SoftmaxParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; virtual SaberStatus compute_output_shape(const Input_v& input,\ Output_v &output, Param_t& param) override { - + output[0]->set_seq_offset(input[0]->get_seq_offset()); //! 
"input" only has one input tensor return output[0]->set_shape(input[0]->valid_shape()); } @@ -77,12 +59,10 @@ class Softmax : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderSoftmax ); + this->_impl.push_back(new VenderSoftmax ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberSoftmax ); + this->_impl.push_back(new SaberSoftmax ); return SaberSuccess; default: return SaberUnImplError; diff --git a/saber/funcs/spp.h b/saber/funcs/spp.h index 5ff5d58cb..7b70cea4c 100644 --- a/saber/funcs/spp.h +++ b/saber/funcs/spp.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,46 +18,34 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_spp.h" -#endif - -#ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_spp.h" +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_spp.h" #endif namespace anakin { namespace saber { template + DataType OpDtype +> class Spp : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - SPPParam - > -{ + TargetType, + OpDtype, + ImplBase, + SPPParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - SPPParam >::BaseFunc; + TargetType, + OpDtype, + ImplBase, + SPPParam >::BaseFunc; Spp() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef SPPParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SPPParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -82,12 +70,12 @@ class Spp : public BaseFunc< virtual SaberStatus init_impl(ImplEnum 
implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderSpp ); + this->_impl.push_back(new VenderSpp ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberSpp ); + this->_impl.push_back(new SaberSpp ); return SaberSuccess; default: return SaberUnImplError; diff --git a/saber/funcs/timer.h b/saber/funcs/timer.h index e5014a9cb..c32e90605 100644 --- a/saber/funcs/timer.h +++ b/saber/funcs/timer.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,10 +19,8 @@ #include "anakin_config.h" //#include #include -#ifdef USE_CUDA -#include -#endif #include +#include #include "saber/core/common.h" #include "saber/core/context.h" @@ -32,11 +30,6 @@ namespace saber{ template class SaberTimer final { -}; - -template <> -class SaberTimer final { - public: SaberTimer() {} @@ -46,11 +39,11 @@ class SaberTimer final { ms_time.clear(); } - void start(Context &ctx) { + void start(Context &ctx) { tstart = std::chrono::system_clock::now(); } - void end(Context &ctx) { + void end(Context &ctx) { tend = std::chrono::system_clock::now(); auto ts = std::chrono::duration_cast(tend - tstart); float elapse_ms = 1000.f * float(ts.count()) * std::chrono::microseconds::period::num / \ @@ -173,29 +166,53 @@ class SaberTimer final { }; #endif -#ifdef USE_BM + +#ifdef AMD_GPU + +typedef TargetWrapper AMD_API; + template <> -class SaberTimer final { +class SaberTimer final { public: - SaberTimer() {} + SaberTimer() { + Env::env_init(); + AMD_API::create_event(_e_start); + AMD_API::create_event(_e_end); + } - ~SaberTimer() {} + ~SaberTimer() { + AMD_API::destroy_event(_e_start); + AMD_API::destroy_event(_e_end); + } void clear() { ms_time.clear(); } - void start(Context &ctx) { - tstart = std::chrono::system_clock::now(); + void 
start(Context &ctx) { + AMD_API::destroy_event(_e_start); + AMD_API::record_event(_e_start, ctx.get_compute_stream()); } - void end(Context &ctx) { - tend = std::chrono::system_clock::now(); - auto ts = std::chrono::duration_cast(tend - tstart); - float elapse_ms = 1000.f * float(ts.count()) * std::chrono::microseconds::period::num / \ - std::chrono::microseconds::period::den; - ms_time.push_back(elapse_ms); + void end(Context &ctx) { + if(_e_start == nullptr) { + LOG(ERROR) << "please call start() befoer call end()"; + return; + } + + AMD_API::destroy_event(_e_end); + AMD_API::record_event(_e_end, ctx.get_compute_stream()); + AMD_API::sync_event(_e_end); + + cl_ulong start; + clGetEventProfilingInfo(_e_start, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &start,NULL); + + cl_ulong end; + clGetEventProfilingInfo(_e_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); + + float executionTime = 1e-6 * (end - start); + ms_time.push_back(executionTime); } float get_average_ms() { @@ -209,8 +226,22 @@ class SaberTimer final { return sum / ms_time.size(); } - // return tile (0-99) time. - float get_tile_time(float tile) { + float get_best_ms(){ + if (ms_time.size() == 0) { + return 0.f; + } +#if 0 + for(auto time : ms_time) + LOG(INFO) << time; +#endif + ms_time.sort(); + LOG(INFO) << ms_time.front() <<" - " << ms_time.back(); + + return ms_time.front(); + } + + // return tile (0-99) time. + float get_tile_time(float tile) { if (tile <0 || tile > 100) { return -1.f; @@ -233,11 +264,11 @@ class SaberTimer final { } private: - std::chrono::time_point tstart; - std::chrono::time_point tend; + cl_event _e_start, _e_end; std::list ms_time; }; -#endif // USE_BM +#endif + } } diff --git a/saber/funcs/transpose.h b/saber/funcs/transpose.h index c776668f3..9b33619f4 100644 --- a/saber/funcs/transpose.h +++ b/saber/funcs/transpose.h @@ -1,28 +1,27 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. +*/ #ifndef ANAKIN_SABER_FUNCS_TRANSPOSE_H #define ANAKIN_SABER_FUNCS_TRANSPOSE_H #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_transpose.h" -#endif - -#ifdef USE_X86_PLACE #include "saber/funcs/impl/impl_transpose.h" + +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_transpose.h" #endif namespace anakin{ @@ -30,33 +29,26 @@ namespace anakin{ namespace saber{ template class Transpose : public BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, TransposeParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - TransposeParam >::BaseFunc; + TargetType, + OpDtype, + ImplBase, + TransposeParam >::BaseFunc; Transpose() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef TransposeParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef TransposeParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -66,26 +58,10 @@ class Transpose : public BaseFunc< Shape output_shape = input[0]->valid_shape(); - int num_idx = input[0]->num_index(); - int channel_idx = input[0]->channel_index(); - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - 
CHECK_GE(height_idx, 0) << "no height dim in tensor"; - CHECK_GE(width_idx, 0) << "no width dim in tensor"; - - if (num_idx > -1) { - output_shape[num_idx] = input[0]->num(); // N - } - if (channel_idx > -1) { - output_shape[channel_idx] = input[0]->channel(); // C - } - if (height_idx > -1) { - output_shape[height_idx] = input[0]->width(); - } - if (width_idx > -1) { - output_shape[width_idx] = input[0]->height(); - } + output_shape.set_num(input[0]->num()); // N + output_shape.set_channel(input[0]->channel()); // C + output_shape.set_height(input[0]->width()); + output_shape.set_width(input[0]->height()); return output[0]->set_shape(output_shape); @@ -94,12 +70,12 @@ class Transpose : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderTranspose ); + this->_impl.push_back(new VenderTranspose ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberTranspose ); + this->_impl.push_back(new SaberTranspose ); return SaberSuccess; default: return SaberUnImplError; @@ -113,12 +89,6 @@ class Transpose : public BaseFunc< this->_best_impl = this->_impl[0]; } - virtual void pick_best_runtime(Input_v input, Output_v output, \ - Param_t& param, Context &ctx) override { - //! Transpose only has saber implementation - this->_best_impl = this->_impl[0]; - } - virtual void pick_best_specify(ImplEnum implenum) override { //! Transpose only has saber implementation this->_best_impl = this->_impl[0]; @@ -130,4 +100,4 @@ class Transpose : public BaseFunc< } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_TRANSPOSE_H +#endif //ANAKIN_SABER_FUNCS_TRANSPOSE_H \ No newline at end of file diff --git a/saber/funcs/unpool.h b/saber/funcs/unpool.h index 088454c67..be2de8f2e 100644 --- a/saber/funcs/unpool.h +++ b/saber/funcs/unpool.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,45 +17,33 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_unpool.h" #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_unpool.h" #endif -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_unpool.h" -#endif - namespace anakin { namespace saber { template + DataType OpDtype> class Unpool : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, - PoolingParam - > -{ + TargetType, + OpDtype, + ImplBase, + PoolingParam> { public: using BaseFunc< - Tensor, - Tensor, - Tensor, + TargetType, + OpDtype, ImplBase, PoolingParam >::BaseFunc; Unpool() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef PoolingParam Param_t; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef PoolingParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -78,12 +66,12 @@ class Unpool : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderUnpool ); + this->_impl.push_back(new VenderUnpool ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberUnpool ); + this->_impl.push_back(new SaberUnpool ); return SaberSuccess; default: return SaberUnImplError; @@ -97,8 +85,6 @@ class Unpool : public BaseFunc< this->_best_impl = this->_impl[0]; } - //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} - virtual void pick_best_specify(ImplEnum implenum) override { this->_best_impl = this->_impl[0]; } diff --git a/saber/core/impl/arm/arm_device.h b/saber/lite/core/arm_device.h similarity index 76% rename from saber/core/impl/arm/arm_device.h rename to saber/lite/core/arm_device.h index 
c92cc9674..0e04f1119 100644 --- a/saber/core/impl/arm/arm_device.h +++ b/saber/lite/core/arm_device.h @@ -1,9 +1,23 @@ -#ifndef ANAKIN2_SABER_ARM_DEVICES_H -#define ANAKIN2_SABER_ARM_DEVICES_H +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_LITE_CORE_ARM_DEVICE_H +#define ANAKIN_SABER_LITE_CORE_ARM_DEVICE_H #include #include -#include "device.h" #ifdef PLATFORM_ANDROID #include @@ -29,8 +43,7 @@ #endif #ifdef USE_ARM_PLACE -static int arm_get_cpucount() -{ +static int arm_get_cpucount() { #ifdef PLATFORM_ANDROID // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); @@ -40,8 +53,7 @@ static int arm_get_cpucount() int count = 0; char line[1024]; - while (!feof(fp)) - { + while (!feof(fp)) { char* s = fgets(line, 1024, fp); if (!s) { break; @@ -74,8 +86,7 @@ static int arm_get_cpucount() #endif } -static int arm_get_meminfo() -{ +static int arm_get_meminfo() { #ifdef PLATFORM_ANDROID // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/meminfo", "rb"); @@ -85,8 +96,7 @@ static int arm_get_meminfo() int memsize = 0; char line[1024]; - while (!feof(fp)) - { + while (!feof(fp)) { char* s = fgets(line, 1024, fp); if (!s) { break; @@ -104,8 +114,7 @@ static int arm_get_meminfo() } #ifdef PLATFORM_ANDROID -static int get_max_freq_khz(int cpuid) -{ +static int get_max_freq_khz(int cpuid) { // first try, for all possible cpu char path[256]; snprintf(path, sizeof(path), 
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state",\ @@ -113,15 +122,13 @@ static int get_max_freq_khz(int cpuid) FILE* fp = fopen(path, "rb"); - if (!fp) - { + if (!fp) { // second try, for online cpu snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",\ cpuid); fp = fopen(path, "rb"); - if (!fp) - { + if (!fp) { // third try, for online cpu snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",\ cpuid); @@ -141,8 +148,7 @@ static int get_max_freq_khz(int cpuid) } int max_freq_khz = 0; - while (!feof(fp)) - { + while (!feof(fp)) { int freq_khz = 0; int nscan = fscanf(fp, "%d %*d", &freq_khz); if (nscan != 1) { @@ -172,42 +178,17 @@ static int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector& cpui cpu_freq.resize(cpu_count); cluster_ids.resize(cpu_count); - for (int i = 0; i < cpu_count; i++) - { + for (int i = 0; i < cpu_count; i++) { int max_freq_khz = get_max_freq_khz(i); //printf("%d max freq = %d khz\n", i, max_freq_khz); cpuids[i] = i; cpu_freq[i] = max_freq_khz / 1000; } - // sort cpuid as big core first - // simple bubble sort - /* - for (int i = 0; i < cpu_count; i++) - { - for (int j = i+1; j < cpu_count; j++) - { - if (cpu_freq[i] < cpu_freq[j]) - { - // swap - int tmp = cpuids[i]; - cpuids[i] = cpuids[j]; - cpuids[j] = tmp; - - tmp = cpu_freq[i]; - cpu_freq[i] = cpu_freq[j]; - cpu_freq[j] = tmp; - } - } - }*/ - // SMP int mid_max_freq_khz = (cpu_freq.front() + cpu_freq.back()) / 2; - //if (mid_max_freq_khz == cpu_freq.back()) - // return 0; - for (int i = 0; i < cpu_count; i++) - { + for (int i = 0; i < cpu_count; i++) { if (cpu_freq[i] >= mid_max_freq_khz) { cluster_ids[i] = 0; } @@ -239,30 +220,26 @@ static int sort_cpuid_by_max_frequency(int cpu_count, std::vector& cpuids, #endif #ifdef PLATFORM_ANDROID -static int set_sched_affinity(const std::vector& cpuids) -{ +static int set_sched_affinity(const std::vector& cpuids) { // cpu_set_t definition // ref 
http://stackoverflow.com/questions/16319725/android-set-thread-affinity - typedef struct - { + typedef struct { unsigned long mask_bits[1024 / __NCPUBITS__]; - }cpu_set_t; + } cpu_set_t; // set affinity for thread pid_t pid = gettid(); cpu_set_t mask; __CPU_ZERO(&mask); - for (int i = 0; i < (int)cpuids.size(); i++) - { + for (int i = 0; i < (int)cpuids.size(); i++) { __CPU_SET(cpuids[i], &mask); } int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); - if (syscallret) - { - LOG(ERROR) << "syscall error " << syscallret; + if (syscallret) { + //LOG(ERROR) << "syscall error " << syscallret; return -1; } @@ -275,15 +252,12 @@ static int set_cpu_affinity(const std::vector& cpuids){ omp_set_num_threads(num_threads); std::vector ssarets(num_threads, 0); #pragma omp parallel for - for (int i = 0; i < num_threads; i++) - { + for (int i = 0; i < num_threads; i++) { ssarets[i] = set_sched_affinity(cpuids); } - for (int i = 0; i < num_threads; i++) - { - if (ssarets[i] != 0) - { - LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[i]; + for (int i = 0; i < num_threads; i++) { + if (ssarets[i] != 0) { + //LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[i]; return -1; } } @@ -291,9 +265,8 @@ static int set_cpu_affinity(const std::vector& cpuids){ std::vector cpuid1; cpuid1.push_back(cpuids[0]); int ssaret = set_sched_affinity(cpuid1); - if (ssaret != 0) - { - LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[0]; + if (ssaret != 0) { + //LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[0]; return -1; } #endif @@ -303,4 +276,4 @@ static int set_cpu_affinity(const std::vector& cpuids){ #endif //USE_ARM_PLACE -#endif //ANAKIN2_SABER_ARM_DEVICES_H +#endif //ANAKIN_SABER_LITE_CORE_ARM_DEVICE_H diff --git a/saber/lite/core/buffer_lite.cpp b/saber/lite/core/buffer_lite.cpp new file mode 100644 index 000000000..b6f6b9a87 --- /dev/null +++ b/saber/lite/core/buffer_lite.cpp @@ -0,0 +1,106 @@ +#include "saber/lite/core/buffer_lite.h" 
+namespace anakin{ + +namespace saber{ + +namespace lite{ + +template <> +void Buffer::clean() { + if (_own_data){ + fast_free(_data); + } + _own_data = false; + _data = nullptr; + _capacity = 0; +} + +template <> +const Buffer::dtype* Buffer::get_data() { + return _data; +} + +template <> +Buffer::Buffer() { + _capacity = 0; + _data = nullptr; + _own_data = false; +} + +template <> +Buffer::Buffer(size_t size) { + _own_data = true; + _data = fast_malloc(size); +} + +template <> +Buffer::Buffer(dtype* data, size_t size) { + _own_data = false; + _data = data; + _capacity = size; +} + +template <> +Buffer::~Buffer() { + clean(); +} + +template <> +Buffer& Buffer::operator=(Buffer& buf) { + this->_capacity = buf._capacity; + this->_own_data = false; + this->_data = buf._data; + return *this; +} + +template <> +void Buffer::re_alloc(size_t size) { + if(_own_data && size < _capacity) { + return; + } else { + clean(); + _capacity = size; + _own_data = true; + _data = fast_malloc(_capacity); + } +} + +template <> +void Buffer::alloc(size_t size) { + clean(); + _capacity = size; + _own_data = true; + _data = fast_malloc(_capacity); +} + +template <> +void Buffer::copy_from(Buffer &buf) { + if (buf.get_data() == _data) { + return; + } + memcpy(_data, buf.get_data(), _capacity); +} + +template <> +Buffer::dtype* Buffer::get_data_mutable() { + return _data; +} + +template <> +void Buffer::mem_set(int c, size_t size) { + if (size > _capacity) { + size = _capacity; + } + memset(_data, c, size); +} + +template +size_t Buffer::get_capacity() { + return _capacity; +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin diff --git a/saber/lite/core/buffer_lite.h b/saber/lite/core/buffer_lite.h new file mode 100644 index 000000000..120d9b235 --- /dev/null +++ b/saber/lite/core/buffer_lite.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_LITE_CORE_BUFFER_LITE_H +#define ANAKIN_SABER_LITE_CORE_BUFFER_LITE_H + +#include "saber/lite/core/common_lite.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +template +class Buffer{ +public: + typedef typename TargetTrait::bdtype dtype; + /** + * \brief constructor + */ + Buffer(); + /** + * \brief constructor, allocate data + */ + explicit Buffer(size_t size); + + /** + * \brief construct from existence data + * @param data + * @param size + */ + Buffer(dtype* data, size_t size); + + /** + * \brief assigned function + */ + Buffer& operator = (Buffer& buf); + + /** + * \brief destructor + */ + ~Buffer(); + + /** + * \brief deep copy function + */ + void copy_from(Buffer& buf); + + /** + * \brief set _data to (c) with length of (size) + */ + void mem_set(int c, size_t size); + + /** + * \brief re-alloc memory + */ + void re_alloc(size_t size); + + /** + * \brief alloc memory + */ + void alloc(size_t size); + + /** + * \brief free memory + */ + void clean(); + + /** + * \brief return const data pointer + */ + const dtype* get_data(); + + /** + * \brief return mutable data pointer + */ + dtype* get_data_mutable(); + + /** + * \brief return total size of memory, in bytes + */ + size_t get_capacity(); + +protected: + dtype* _data; + bool _own_data; + size_t _capacity; + +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif //ANAKIN_SABER_LITE_CORE_BUFFER_LITE_H diff 
--git a/saber/lite/core/common_lite.h b/saber/lite/core/common_lite.h new file mode 100644 index 000000000..93d3f6d2a --- /dev/null +++ b/saber/lite/core/common_lite.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_LITE_CORE_COMMON_H +#define ANAKIN_SABER_LITE_CORE_COMMON_H + +#include +#include +#include +#include +#include +#include "anakin_config.h" +#include "saber/saber_types.h" + +#ifdef USE_ARM_PLACE +#include +#ifdef USE_OPENMP +#include +#endif //openmp +#endif //ARM + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +#if defined WIN32 || defined _WIN32 || defined WINCE || defined __CYGWIN__ +# define LITE_EXPORT __declspec(dllexport) +#elif defined(__GNUC__) && (__GNUC__ >= 4) +# define LITE_EXPORT __attribute__ ((visibility ("default"))) +#else +# define LITE_EXPORT +#endif + +//#define CHECK_EQ(a, b) std::cout +//#define CHECK_LE(a, b) std::cout +//#define CHECK_LT(a, b) std::cout +//#define CHECK_GE(a, b) std::cout +//#define CHECK_GT(a, b) std::cout +//#define LOG(a) std::cout + +#define LCHECK_EQ(a, b, out) \ +do { if (a != b) { printf("%s\n", out); assert(0);} } while (0) + +#define LCHECK_GE(a, b, out) \ +do { if (a < b) { printf("%s\n", out); assert(0);} } while (0) + +#define LCHECK_GT(a, b, out) \ +do { if (a <= b) { printf("%s\n", out); assert(0);} } while (0) + +#define LCHECK_LE(a, b, out) \ +do { if (a > b) { printf("%s\n", out); 
assert(0);} } while (0) + +#define LCHECK_LT(a, b, out) \ +do { if (a >= b) { printf("%s\n", out); assert(0);} } while (0) + +#define LITE_CHECK(condition) \ + do { \ + SaberStatus error = condition; \ + if (error != SaberSuccess) { \ + printf("SaberLite runtime error type %s\n", get_error_string_lite(error)); \ + assert(0);\ + } \ +} while (0) + +inline const char* get_error_string_lite(SaberStatus error_code) { + switch (error_code) { + case SaberSuccess: + return "ANAKIN_SABER_STATUS_SUCCESS"; + case SaberNotInitialized: + return "ANAKIN_SABER_STATUS_NOT_INITIALIZED"; + case SaberInvalidValue: + return "ANAKIN_SABER_STATUS_INVALID_VALUE"; + case SaberMemAllocFailed: + return "ANAKIN_SABER_STATUS_MEMALLOC_FAILED"; + case SaberUnKownError: + return "ANAKIN_SABER_STATUS_UNKNOWN_ERROR"; + case SaberOutOfAuthority: + return "ANAKIN_SABER_STATUS_OUT_OF_AUTHORITH"; + case SaberOutOfMem: + return "ANAKIN_SABER_STATUS_OUT_OF_MEMORY"; + case SaberUnImplError: + return "ANAKIN_SABER_STATUS_UNIMPL_ERROR"; + case SaberWrongDevice: + return "ANAKIN_SABER_STATUS_WRONG_DEVICE"; + default: + return "ANAKIN SABER UNKOWN ERRORS"; + } +} +#if 0 //add support for opencl device memory +template +struct CLDtype{ + CLDtype(){ + offset = 0; + ptr = nullptr; + } + + CLDtype& operator++(){ + offset++; + return *this; + } + CLDtype operator++(int){ + + } + int offset; + cl_mem ptr; +}; +#endif + +enum ARMType{ + CPU = 0, + GPU = 1, + DSP = 2 +}; + +template +struct DataTrait{ + typedef void dtype; +}; + + +template +struct DataTrait{ + typedef float dtype; + typedef float Dtype; +}; + +template +struct DataTrait{ + typedef char dtype; + typedef char Dtype; +}; + +template +struct TargetTrait{ + typedef void* stream_t; + typedef void* event_t; + typedef void bdtype; + int get_device_count() { return 1;} + int get_device_id(){ return 0;} + void set_device_id(int id){} +}; + +//! 
the alignment of all the allocated buffers +const int MALLOC_ALIGN = 16; + +static void* fast_malloc(size_t size) { + size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; + char* p; + p = static_cast(malloc(offset + size)); + if (!p) { + return nullptr; + } + void* r = reinterpret_cast(reinterpret_cast(p + offset) & (~(MALLOC_ALIGN - 1))); + static_cast(r)[-1] = p; + return r; +} + +static void fast_free(void* ptr) { + if (ptr){ + free(static_cast(ptr)[-1]); + } +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_LITE_CORE_COMMON_H + diff --git a/saber/lite/core/context_lite.cpp b/saber/lite/core/context_lite.cpp new file mode 100644 index 000000000..9010743e5 --- /dev/null +++ b/saber/lite/core/context_lite.cpp @@ -0,0 +1,634 @@ +#include "saber/lite/core/context_lite.h" + +#ifdef PLATFORM_ANDROID +#include +#include + +#define __NCPUBITS__ (8 * sizeof (unsigned long)) + +#define __CPU_SET(cpu, cpusetp) \ + ((cpusetp)->mask_bits[(cpu) / __NCPUBITS__] |= (1UL << ((cpu) % __NCPUBITS__))) + +#define __CPU_ZERO(cpusetp) \ + memset((cpusetp), 0, sizeof(cpu_set_t)) +#endif //android + +#if __APPLE__ +#include "TargetConditionals.h" +#if TARGET_OS_IPHONE +#include +#include +#include +#define __IOS__ +#endif +#endif //apple + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +int arm_get_cpucount() { +#ifdef PLATFORM_ANDROID + // get cpu count from /proc/cpuinfo + FILE* fp = fopen("/proc/cpuinfo", "rb"); + if (!fp) { + return 1; + } + + int count = 0; + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + + if (memcmp(line, "processor", 9) == 0) { + count++; + } + } + + fclose(fp); + + if (count < 1) { + count = 1; + } + + return count; +#elif __IOS__ + int count = 0; + size_t len = sizeof(count); + sysctlbyname("hw.ncpu", &count, &len, NULL, 0); + + if (count < 1) { + count = 1; + } + + return count; +#else + return 1; +#endif +} + +int arm_get_meminfo() { +#ifdef 
PLATFORM_ANDROID +// get cpu count from /proc/cpuinfo + FILE* fp = fopen("/proc/meminfo", "rb"); + if (!fp) { + return 1; + } + + int memsize = 0; + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + sscanf(s, "MemTotal: %d kB", &memsize); + } + + fclose(fp); + + return memsize; +#elif __IOS__ + // to be implemented + return 0; +#endif +} + +#ifdef PLATFORM_ANDROID +int get_max_freq_khz(int cpuid) { + // first try, for all possible cpu + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state",\ + cpuid); + + FILE* fp = fopen(path, "rb"); + + if (!fp) { + // second try, for online cpu + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",\ + cpuid); + fp = fopen(path, "rb"); + + if (!fp) { + // third try, for online cpu + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",\ + cpuid); + fp = fopen(path, "rb"); + + if (!fp) { + return -1; + } + + int max_freq_khz = -1; + fscanf(fp, "%d", &max_freq_khz); + + fclose(fp); + + return max_freq_khz; + } + } + + int max_freq_khz = 0; + while (!feof(fp)) { + int freq_khz = 0; + int nscan = fscanf(fp, "%d %*d", &freq_khz); + if (nscan != 1) { + break; + } + + if (freq_khz > max_freq_khz) { + max_freq_khz = freq_khz; + } + } + + fclose(fp); + + return max_freq_khz; +} + +int sort_cpuid_by_max_frequency(int cpu_count, std::vector& cpuids, \ + std::vector& cpu_freq, std::vector& cluster_ids) { + //const int cpu_count = cpuids.size(); + + if (cpu_count == 0) { + return 0; + } + + //std::vector cpu_max_freq_khz; + cpuids.resize(cpu_count); + cpu_freq.resize(cpu_count); + cluster_ids.resize(cpu_count); + + for (int i = 0; i < cpu_count; i++) { + int max_freq_khz = get_max_freq_khz(i); + //printf("%d max freq = %d khz\n", i, max_freq_khz); + cpuids[i] = i; + cpu_freq[i] = max_freq_khz / 1000; + } + + // SMP + int mid_max_freq_khz = (cpu_freq.front() + 
cpu_freq.back()) / 2; + + for (int i = 0; i < cpu_count; i++) { + if (cpu_freq[i] >= mid_max_freq_khz) { + cluster_ids[i] = 0; + } + else{ + cluster_ids[i] = 1; + } + } + + return 0; +} +#endif // __ANDROID__ + +#ifdef __IOS__ +int sort_cpuid_by_max_frequency(int cpu_count, std::vector& cpuids, \ + std::vector& cpu_freq, std::vector& cluster_ids){ + if (cpu_count == 0) { + return 0; + } + cpuids.resize(cpu_count); + cpu_freq.resize(cpu_count); + cluster_ids.resize(cpu_count); + for (int i = 0; i < cpu_count; ++i) { + cpuids[i] = i; + cpu_freq[i] = 1000; + cluster_ids[i] = 0; + } +} +#endif + +#ifdef PLATFORM_ANDROID +int set_sched_affinity(const std::vector& cpuids) { + // cpu_set_t definition + // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity + + typedef struct { + unsigned long mask_bits[1024 / __NCPUBITS__]; + } cpu_set_t; + + // set affinity for thread + pid_t pid = gettid(); + + cpu_set_t mask; + __CPU_ZERO(&mask); + for (int i = 0; i < (int)cpuids.size(); i++) { + __CPU_SET(cpuids[i], &mask); + } + + int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); + if (syscallret) { + //LOG(ERROR) << "syscall error " << syscallret; + return -1; + } + + return 0; +} + +int set_cpu_affinity(const std::vector& cpuids){ +#ifdef USE_OPENMP + int num_threads = cpuids.size(); + omp_set_num_threads(num_threads); + std::vector ssarets(num_threads, 0); +#pragma omp parallel for + for (int i = 0; i < num_threads; i++) { + ssarets[i] = set_sched_affinity(cpuids); + } + for (int i = 0; i < num_threads; i++) { + if (ssarets[i] != 0) { + //LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[i]; + return -1; + } + } +#else + std::vector cpuid1; + cpuid1.push_back(cpuids[0]); + int ssaret = set_sched_affinity(cpuid1); + if (ssaret != 0) { + //LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[0]; + return -1; + } +#endif + return 0; +} +#endif //PLATFORN_ANDROID + +//template <> +void Env::get_info(DeviceInfo& dev) { + //! 
set to const value, need to fetch from device + dev._L1_cache = 31000; + dev._L2_cache = 2000000; + dev._L3_cache = 0; + + dev._compute_core_num = arm_get_cpucount(); + dev._max_memory = arm_get_meminfo(); + + //_max_stream = _info._compute_core_num; + + std::vector max_freq; + + sort_cpuid_by_max_frequency(dev._compute_core_num, dev._core_ids, max_freq, dev._cluster_ids); + + printf("ARM multiprocessors number: %d\n", dev._compute_core_num); + for (int i = 0; i < dev._compute_core_num; ++i) { + printf("ARM multiprocessors ID: %d, frequence: %d, cluster ID: %d\n", \ + dev._core_ids[i], max_freq[dev._core_ids[i]], dev._cluster_ids[dev._core_ids[i]]); + } + //LOG(INFO) << "L1 DataCache size: " << L1_cache << "B"; + //LOG(INFO) << "L2 Cache size: " << L2_cache << "B"; + printf("Total memory: %d kB\n", dev._max_memory); + + dev._max_frequence = max_freq[0]; + for (int j = 1; j < dev._compute_core_num; ++j) { + if(dev._max_frequence < max_freq[j]){ + dev._max_frequence = max_freq[j]; + } + } +} +#if 0 +template <> +void Device::create_stream() { + _compute_stream.resize(_max_stream); + _data_stream.resize(_max_stream); + for (int i = 0; i < _max_stream; ++i) { + _compute_stream[i] = nullptr; + _data_stream[i] = nullptr; + } +} + +template <> +Device::Device(int max_stream){ + _max_stream = max_stream; + get_info(); + create_stream(); +} +#endif + +void Context::set_cache(size_t l1size, size_t l2size, size_t l3size) { + DeviceInfo& dev = Env::cur_env(); + dev._L1_cache = l1size; + dev._L2_cache = l2size; + dev._L3_cache = l3size; +} + +//template <> +Context::Context() { + //! 
1 thread, big core + _act_ids = {0}; + _mode = SABER_POWER_HIGH; +} + +PowerMode Context::get_mode(int& threads) { + threads = _act_ids.size(); + return _mode; +} + +std::vector Context::get_act_ids() { + return _act_ids; +} + +Context::Context(const Context& ctx){ + _mode = ctx._mode; + _act_ids = ctx._act_ids; +} + +void Context::bind_dev() { + set_cpu_affinity(_act_ids); +} + +void Context::set_run_mode(PowerMode mode, int threads) { + DeviceInfo& dev = Env::cur_env(); + std::vector big_cores; + std::vector small_cores; + for (int i = 0; i < dev._cluster_ids.size(); ++i) { + if (dev._cluster_ids[i] == 0) { + big_cores.push_back(dev._core_ids[i]); + } else { + small_cores.push_back(dev._core_ids[i]); + } + } + int big_core_size = big_cores.size(); + int small_core_size = small_cores.size(); + if (threads > big_core_size + small_core_size) { + threads = big_core_size + small_core_size; + } + switch (mode) { + case SABER_POWER_FULL: + _mode = mode; + _act_ids.clear(); + for (int i = 0; i < threads; ++i) { + if (i < big_core_size) { + _act_ids.push_back(big_cores[i]); + } else { + _act_ids.push_back(small_cores[i - big_core_size]); + } + } + break; + case SABER_POWER_HIGH: + _act_ids.clear(); + if (big_core_size > 0) { + _mode = SABER_POWER_HIGH; + if (threads > big_core_size) { + printf("threads: %d, exceed the big cores size: %d\n", threads, big_core_size); + _act_ids = big_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(big_cores[i]); + } + } + } else { + _mode = SABER_POWER_LOW; + printf("HIGH POWER MODE is not support, switch to small cores\n"); + if(threads > small_core_size) { + _act_ids = small_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + + } + break; + case SABER_POWER_LOW: + _act_ids.clear(); + if (small_core_size > 0) { + _mode = SABER_POWER_LOW; + if (threads > small_core_size) { + printf("threads: %d, exceed the small cores size: %d\n", threads, small_core_size); + 
_act_ids = small_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + } else { + _mode = SABER_POWER_HIGH; + printf("LOW POWER MODE is not support, switch to big cores\n"); + if(threads > big_core_size) { + _act_ids = big_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + + } + break; + } + + bind_dev(); +} + +#if 0 +template <> +Context::Context(int device_id, int data_stream_id, int compute_stream_id) { + typename Env::Devs& devs = Env::cur_env(); + LCHECK_GT(devs.size(), 0, "Env is not initialized or current target is not exit!"); + if (device_id >= devs.size()){ + printf("device index exceeds the number of devices, set to default device(0)!\n"); + _device_id = 0; + } else { + _device_id = device_id; + } + if (data_stream_id >= devs[_device_id]._max_stream) { + printf("data stream index exceeds the maximum stream number, set to default stream(0)!\n"); + data_stream_id = 0; + } + _stream_data = devs[_device_id]._data_stream[data_stream_id]; + _data_stream_id = data_stream_id; + + if (compute_stream_id >= devs[_device_id]._max_stream) { + printf("compute stream index exceeds the maximum stream number, set to default stream(0)!\n"); + compute_stream_id = 0; + } + _stream_compute = devs[_device_id]._compute_stream[compute_stream_id]; + _compute_stream_id = compute_stream_id; + _act_ids = {0}; + _mode = SABER_POWER_HIGH; +} + +template <> +Context::Context(const Context& ctx){ + _device_id = ctx._device_id; + _data_stream_id = ctx._data_stream_id; + _compute_stream_id = ctx._compute_stream_id; + _stream_compute = ctx._stream_compute; + _stream_data = ctx._stream_data; + _mode = ctx._mode; + _act_ids = ctx._act_ids; +} + +template <> +Context& Context::operator=(const Context& ctx){ + this->_device_id = ctx._device_id; + this->_data_stream_id = ctx._data_stream_id; + this->_compute_stream_id = ctx._compute_stream_id; + this->_stream_data = ctx._stream_data; + 
this->_stream_compute = ctx._stream_compute; + return *this; +} + +template<> +bool Context::operator==(const Context &right) { + bool comp_eq = true; + comp_eq = comp_eq && (_device_id == right._device_id); + comp_eq = comp_eq && (_data_stream_id == right._data_stream_id); + comp_eq = comp_eq && (_compute_stream_id == right._compute_stream_id); + return comp_eq; +} + +/** + * \brief get device id of current context + * @return + */ +template <> +int Context::get_device_id() { + return _device_id; +} + +/** + * \brief get data process stream + * @return + */ +template <> +typename TargetTrait::stream_t Context::get_data_stream(){ + return _stream_data; +} + +/** + * \brief get compute process stream + * @return + */ +template <> +typename TargetTrait::stream_t Context::get_compute_stream(){ + return _stream_compute; +} + +template <> +void Context::bind_dev() { + set_cpu_affinity(_act_ids); +} + +template <> +void Context::set_run_mode(PowerMode mode, int threads) { + typename Env::Devs& devs = Env::cur_env(); + Device dev = devs[_device_id]; + std::vector big_cores; + std::vector small_cores; + for (int i = 0; i < dev._info._cluster_ids.size(); ++i) { + if (dev._info._cluster_ids[i] == 0) { + big_cores.push_back(dev._info._core_ids[i]); + } else { + small_cores.push_back(dev._info._core_ids[i]); + } + } + int big_core_size = big_cores.size(); + int small_core_size = small_cores.size(); + if (threads > big_core_size + small_core_size) { + threads = big_core_size + small_core_size; + } + switch (mode) { + case SABER_POWER_FULL: + _mode = mode; + _act_ids.clear(); + for (int i = 0; i < threads; ++i) { + if (i < big_core_size) { + _act_ids.push_back(big_cores[i]); + } else { + _act_ids.push_back(small_cores[i - big_core_size]); + } + } + break; + case SABER_POWER_HIGH: + _act_ids.clear(); + if (big_core_size > 0) { + _mode = SABER_POWER_HIGH; + if (threads > big_core_size) { + printf("threads: %d, exceed the big cores size: %d\n", threads, big_core_size); + _act_ids = 
big_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(big_cores[i]); + } + } + } else { + _mode = SABER_POWER_LOW; + printf("HIGH POWER MODE is not support, switch to small cores\n"); + if(threads > small_core_size) { + _act_ids = small_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + + } + break; + case SABER_POWER_LOW: + _act_ids.clear(); + if (small_core_size > 0) { + _mode = SABER_POWER_LOW; + if (threads > small_core_size) { + printf("threads: %d, exceed the small cores size: %d\n", threads, small_core_size); + _act_ids = small_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + } else { + _mode = SABER_POWER_HIGH; + printf("LOW POWER MODE is not support, switch to big cores\n"); + if(threads > big_core_size) { + _act_ids = big_cores; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(small_cores[i]); + } + } + + } + break; + } + + bind_dev(); +} +// +//void set_act_cores(std::vector ids) { +// Device dev = devs[_device_id]; +// if (ids.size() == 0){ +// _act_ids.resize(1); +// _act_ids[0] = dev._info._core_ids[0]; +// }else { +// _act_ids.clear(); +// for (int i = 0; i < ids.size(); ++i) { +// if (ids[i] < dev._info._core_ids.size()){ +// _act_ids.push_back(ids[i]); +// } +// } +// } +// bind_dev(); +//} + +template <> +PowerMode Context::get_mode(int& threads) { + threads = _act_ids.size(); + return _mode; +} +template <> +std::vector Context::get_act_ids() { + return _act_ids; +} +#endif +} //namespace lite + +} //namespace saber + +} //namespace anakin + diff --git a/saber/lite/core/context_lite.h b/saber/lite/core/context_lite.h new file mode 100644 index 000000000..063d2d300 --- /dev/null +++ b/saber/lite/core/context_lite.h @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_LITE_CORE_DEVICE_LITE_H +#define ANAKIN_SABER_LITE_CORE_DEVICE_LITE_H +#include "saber/lite/core/common_lite.h" +namespace anakin{ + +namespace saber{ + +namespace lite{ + +struct DeviceInfo{ + std::string _device_name; + int _max_frequence; + int _min_frequence; + int _generate_arch; + int _compute_core_num; + int _max_memory; + int _sharemem_size; + int _L1_cache; + int _L2_cache; + int _L3_cache; + std::vector _core_ids; + std::vector _cluster_ids; +}; + +//template +//class Device { +//public: +// Device(int max_stream = 4); +// void get_info(); +// void create_stream(); +// +// DeviceInfo _info; +// int _max_stream; +// std::vector::stream_t> _data_stream; +// std::vector::stream_t> _compute_stream; +//}; + + +//template +class Env { +public: + //typedef std::vector> Devs; + static DeviceInfo& cur_env() { + static DeviceInfo* _g_env = new DeviceInfo(); + return *_g_env; + } + static void env_init(int max_stream = 4) { + DeviceInfo& devs = cur_env(); + get_info(devs); + } + +private: + static void get_info(DeviceInfo& dev); + Env(){} +}; + +//template +class Context { +public: + Context(); + /** + * \brief context constructor, set device id, data stream id and compute stream id + * @param device_id + * @param data_stream_id + * @param compute_stream_id + */ + Context(PowerMode mode, int threads); + + Context(const Context& ctx); + +#if 0 + /** + * \brief get device id of current context + * @return + */ 
+ int get_device_id(); + + /** + * \brief get data process stream + * @return + */ + typename TargetTrait::stream_t get_data_stream(); + + /** + * \brief get compute process stream + * @return + */ + typename TargetTrait::stream_t get_compute_stream(); +#endif + void set_run_mode(PowerMode mode, int threads); + //void set_act_cores(std::vector ids); + void bind_dev(); + PowerMode get_mode(int& threads); + std::vector get_act_ids(); + void set_cache(size_t l1size, size_t l2size, size_t l3size); +private: +#if 0 + //! current stream to process + typename TargetTrait::stream_t _stream_data; + typename TargetTrait::stream_t _stream_compute; + //! current device id + int _device_id; + int _data_stream_id; + int _compute_stream_id; +#endif + //! SABER_POWER_HIGH stands for using big cores, + //! SABER_POWER_LOW stands for using small core, + //! SABER_POWER_FULL stands for using all cores + PowerMode _mode; + std::vector _act_ids; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_LITE_CORE_DEVICE_LITE_H diff --git a/saber/lite/core/shape_lite.h b/saber/lite/core/shape_lite.h new file mode 100644 index 000000000..2e712a80d --- /dev/null +++ b/saber/lite/core/shape_lite.h @@ -0,0 +1,256 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_LITE_CORE_SHAPE_LITE_H +#define ANAKIN_SABER_LITE_CORE_SHAPE_LITE_H +#include +#include "saber/lite/core/common_lite.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//! default layout is NCHW, CHW, HW, W +//! maximum dim is 4 + +class Shape : public std::vector { +public: + using vector = std::vector; + + Shape():vector(){} + template + Shape(First first, Args... res) { + init_dims(first, res...); + } + + int num() const { + if (dims() == 0) { + return 0; + } + int i = 1; + if (dims() == 4) { + i = data()[0]; + } + return i; + } + int channel() const { + if (dims() == 0) { + return 0; + } + int i = 1; + if (dims() >= 3) { + i = data()[dims() - 3]; + } + return i; + } + + int height() const { + if (dims() == 0) { + return 0; + } + int i = 1; + if (dims() >= 2) { + i = data()[dims() - 2]; + } + return i; + } + + int width() const { + if (dims() == 0) { + return 0; + } + return data()[dims() - 1]; + } + + Shape stride() const { + assert(dims() > 0); + //CHECK_GT(dims(), 0) << "shape is empty"; + Shape sh(dims()); + for (int i = 0; i < dims(); ++i) { + sh[i] = count(i + 1); + } + return sh; + } + + void set_num(int num) { + assert(dims() > 0); + //CHECK_GT(dims(), 0) << "shape is empty"; + if (dims() == 4) { + data()[0] = num; + } + } + + void set_channel(int channel) { + assert(dims() > 0); + //CHECK_GT(dims(), 0) << "shape is empty"; + if (dims() >= 3) { + data()[dims() - 3] = channel; + } + } + + void set_height(int height) { + assert(dims() > 0); + //CHECK_GT(dims(), 0) << "shape is empty"; + if (dims() >= 2) { + data()[dims() - 2] = height; + } + } + + void set_width(int width) { + assert(dims() > 0); + //CHECK_GT(dims(), 0) << "shape is empty"; + if (dims() >= 1) { + data()[dims() - 1] = width; + } + } + + Shape operator+(const Shape& shape) { + assert(dims() == shape.dims()); + Shape tmp_shape(*this); + int* p = data(); + for (size_t i = 0; i < size(); i++) { + tmp_shape[i] = p[i] + shape[i]; + } + return tmp_shape; 
+ } + + Shape operator-(const Shape& shape) { + assert(dims() == shape.dims()); + Shape tmp_shape(*this); + int* p = data(); + for (size_t i = 0; i < size(); i++) { + tmp_shape[i] = p[i] - shape[i]; + } + return tmp_shape; + } + + bool operator<(const Shape& shape) const { + bool flag = size() == shape.size(); + if (!flag) { + return false; + } + const int* p = data(); + for (size_t i = 0; i < size(); i++) { + flag &= (p[i] < shape[i]); + } + return flag; + } + + bool operator<=(const Shape& shape) const{ + bool flag = size() == shape.size(); + if (!flag) { + return false; + } + const int* p = data(); + for (size_t i = 0; i < size(); i++) { + flag &= (p[i] <= shape[i]); + } + return flag; + } + + bool operator==(const Shape& shape) const{ + bool flag = size() == shape.size(); + if (!flag) { + return false; + } + const int* p = data(); + for (size_t i = 0; i < size(); i++) { + flag &= (p[i] == shape[i]); + } + return flag; + } + + bool is_continue(const Shape real_shape) const { + if (real_shape.size() != this->size()){ + return false; + } + const int* p = data(); + for (int i = this->size() - 1; i >= 0; i--) { + if (p[i] != real_shape[i]) { + int size = this->count() / this->count(i); + return size == 1; + } + } + return true; + } + + int count(int start = 0) const { + if (this->size() == 0) { + return 0; + } + int sum = 1; + for (int i = 0; i < this->size(); i++) { + sum *= data()[i]; + } + return sum; + } + + int count(int start, int end) { + int dim = dims(); + if (start > end || start > dim) { + return 0; + } + if (start < 0) { + start = 0; + } + if (end > dim) { + end = dim; + } + int sum = 1; + for (int i = start; i < end; ++i) { + sum *= this->data()[i]; + } + return sum; + } + + int dims() const { + return this->size(); + } + + static Shape zero(int dims){ + Shape sh; + for (int i = 0; i < dims; ++i) { + sh.push_back(0); + } + return sh; + } + + static Shape minusone(int dims){ + Shape sh; + for (int i = 0; i < dims; ++i) { + sh.push_back(-1); + } + return 
sh; + } + +private: + template + void init_dims(First head, Args...args){ + push_back(head); + init_dims(args...); + } + void init_dims(){}; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_LITE_CORE_SHAPE_LITE_H + + diff --git a/saber/lite/core/tensor_lite.cpp b/saber/lite/core/tensor_lite.cpp new file mode 100644 index 000000000..1a8a50370 --- /dev/null +++ b/saber/lite/core/tensor_lite.cpp @@ -0,0 +1,321 @@ +#include "saber/lite/core/tensor_lite.h" +#include +namespace anakin{ + +namespace saber{ + +namespace lite{ + +template +Tensor::Tensor() { + _buf = std::make_shared>(); + _is_subbuf = false; + _is_shared = false; +} + +template +Tensor::Tensor(Shape shape) { + _shape = shape; + _valid_shape = shape; + _offset = Shape::zero(shape.dims()); + _buf = std::make_shared>(shape.count() * _type_len); + _is_shared = false; + _is_subbuf = false; + } + +template +Tensor::Tensor(Dtype* data_ptr, Shape shape) { + _shape = shape; + _valid_shape = shape; + _offset = Shape::zero(shape.dims()); + _buf = std::make_shared>(data_ptr, shape.count() * _type_len); + _is_shared = false; + _is_subbuf = false; +} + +template +Tensor::Tensor(const Tensor& tensor){ + _shape = tensor._shape; + _valid_shape = tensor._valid_shape; + _offset = tensor._offset; + _buf = tensor._buf; + _is_shared = tensor._is_shared; + _is_subbuf = tensor._is_subbuf; +} + +template +SaberStatus Tensor::set_shape(Shape valid_shape, Shape shape, Shape offset) { + + if (shape.dims() > 0) { + LCHECK_EQ(shape.dims(), valid_shape.dims(), "input shape dims should be the same"); + _shape = shape; + } + if (offset.dims() > 0 && _is_subbuf) { + LCHECK_EQ(offset.dims(), valid_shape.dims(), "input shape dims should be the same"); + _offset = offset; + } + _valid_shape = valid_shape; + if (!_is_subbuf) { + if (_shape.count() <= _valid_shape.count()) { + _shape = _valid_shape; + } + _offset = Shape::zero(valid_shape.dims()); + } else { + auto shape_zero = 
Shape::zero(valid_shape.dims()); + if (_shape == shape_zero) { + _shape = valid_shape; + } + LCHECK_EQ(_valid_shape + _offset <= _shape, true, "valid_shape + offet should <= shape"); + } + return SaberSuccess; +} + +template +SaberStatus Tensor::re_alloc(Shape shape){ + LCHECK_EQ(_is_shared || _is_subbuf, false, "shared tensor could not re_alloc"); + _shape = shape; + _valid_shape = _shape; + _offset = Shape::zero(_shape.dims()); + _buf->alloc(_shape.count() * _type_len); + return SaberSuccess; +} + + +template +SaberStatus Tensor::reshape(Shape valid_shape, Shape shape, Shape offset) { + + if (shape.dims() > 0) { + LCHECK_EQ(shape.dims(), valid_shape.dims(), "shape dims must be the same"); + _shape = shape; + } + if (offset.dims() > 0 && _is_subbuf) { + LCHECK_EQ(offset.dims(), valid_shape.dims(), "shape dims must be the same"); + _offset = offset; + } + _valid_shape = valid_shape; + if (!_is_subbuf) { + if (_shape.count() < _valid_shape.count()) { + _shape = _valid_shape; + } + _offset = Shape::zero(valid_shape.dims()); + } else { + LCHECK_EQ(_valid_shape + _offset <= _shape, true, "valid_shape + offet should <= shape"); + } + bool exceed_flag = _shape.count() * _type_len > _buf->get_capacity() \ + && (_is_subbuf || _is_shared); + LCHECK_EQ(exceed_flag, false, "shared tensor shape exceed origin data buffer size"); + _buf->re_alloc(_shape.count() * _type_len); + return SaberSuccess; +} + +template +bool Tensor::is_continue_mem() const { + if (!_is_subbuf) { + return true; + } + return _valid_shape.is_continue(_shape); +} + +template +int Tensor::count(int start, int end) const { + + LCHECK_GE(start, 0, "start index shold >= 0!"); + LCHECK_LE(end, _shape.size(), "end index shold <= shape dims!"); + LCHECK_LE(start, end, "start index should < end index!"); + int sum = 1; + for (int i = start; i < end; ++i) { + sum *= _shape[i]; + } + return sum; +} + +template +int Tensor::count_valid(int start, int end) const { + + start = std::max(start, 0); + start = 
std::min(start, _valid_shape.dims()); + end = std::max(start, end); + end = std::min(end, _valid_shape.dims()); + + int sum = 1; + for (int i = start; i < end; ++i) { + sum *= _valid_shape[i]; + } + return sum; +} + +template +int Tensor::size() const { + return _shape.count(); +} + +template +int Tensor::valid_size() const{ + return _valid_shape.count(); +} + +template +int Tensor::dims() const { + return _valid_shape.dims(); +} + +template +Shape Tensor::shape() const{ + return _shape; +} + +template +Shape Tensor::valid_shape() const { + return _valid_shape; +} + +template +Shape Tensor::get_stride() const { + Shape data_stride = Shape::zero(dims()); + if (_is_subbuf) { + for (int i = 0; i < dims(); ++i) { + data_stride[i] = _shape.count(i + 1); + } + } else { + for (int i = 0; i < dims(); ++i) { + data_stride[i] = _valid_shape.count(i + 1); + } + } + return data_stride; +} + +template +Shape Tensor::offset() const { + return _offset; +} + +template +int Tensor::num() const { + return _valid_shape.num(); +} + +template +void Tensor::set_num(int num) { + return _valid_shape.set_num(num); +}; + +template +int Tensor::channel() const { + return _valid_shape.channel(); +} + +template +void Tensor::set_channel(int channel) { + return _valid_shape.set_channel(channel); +} + +template +int Tensor::height() const { + return _valid_shape.height(); +} + +template +void Tensor::set_height(int h) { + return _valid_shape.set_height(h); +} + +template +int Tensor::width() const { + return _valid_shape.width(); +} + +template +void Tensor::set_width(int w) { + return _valid_shape.set_width(w); +} + +template +typename Tensor::Dtype* Tensor::mutable_data(int index) { + if (_buf->get_capacity() == 0){ + return nullptr; + } + return static_cast(_buf->get_data_mutable()) + start_index() + index; +} + +template +const typename Tensor::Dtype * Tensor::data(int index) const { + if (_buf->get_capacity() == 0){ + return nullptr; + } + return static_cast(_buf->get_data()) + 
start_index() + index; +} + +template +const std::shared_ptr>& Tensor::get_buf() const { + return _buf; +} + +template +template +SaberStatus Tensor::share_from(const Tensor_t& tensor) { + + LCHECK_EQ(_shape.dims() > 0, true, "current tensor is not initialized (no shape info, use set_shape)"); + typedef typename Tensor_t::Dtype_real dtype_real_t; + LCHECK_LE(size() * _type_len, tensor.size() * sizeof(dtype_real_t), "current tensor size should <= input tensor size"); + _buf = tensor.get_buf(); + _is_shared = true; + _is_subbuf = false; + return SaberSuccess; +} + +template +SaberStatus Tensor::share_sub_buffer(const Tensor& tensor, \ + Shape valid_shape, Shape offset) { + + LCHECK_EQ(true, (offset + valid_shape) <= tensor.shape(), "offset + valid_shape <= shape"); + _valid_shape = valid_shape; + _offset = offset; + _shape = tensor.shape(); + _buf = tensor.get_buf(); + _is_subbuf = true; + _is_shared = true; + return SaberSuccess; +} + +template +template +SaberStatus Tensor::copy_from(const Tensor_t& tensor) { + + size_t cap_dst = valid_size() * _type_len; + typedef typename Tensor_t::Dtype_real dtype_real_t; + size_t cap_src = tensor.valid_size() * sizeof(dtype_real_t); + LCHECK_EQ(cap_dst, cap_src, "sizes of two valid shapes must be the same"); + _buf->copy_from(*tensor.get_buf()); + return SaberSuccess; +} + +template +void Tensor::sync() { + //!fixme +} + +template +void Tensor::record_event(stream_t* stream) { + //! 
fixme +} + +template +int Tensor::start_index() const { + if (!_is_subbuf) { + return 0; + } + Shape stride = get_stride(); + int idx = 0; + for (int i = 0; i < stride.size(); ++i) { + idx += _offset[i] * stride[i]; + } + return idx; +} +template class Tensor; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + diff --git a/saber/lite/core/tensor_lite.h b/saber/lite/core/tensor_lite.h new file mode 100644 index 000000000..8729ba8a2 --- /dev/null +++ b/saber/lite/core/tensor_lite.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_LITE_CORE_TENSOR_LITE_H +#define ANAKIN_SABER_LITE_CORE_TENSOR_LITE_H + +#include "saber/lite/core/shape_lite.h" +#include "saber/lite/core/buffer_lite.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +template +class Tensor { +public: + typedef typename DataTrait::dtype Dtype;//float, char or CLMEM + typedef typename DataTrait::Dtype Dtype_real;//float, char + typedef typename TargetTrait::event_t event_t; + typedef typename TargetTrait::stream_t stream_t; + /** + * \brief Default constructor + */ + Tensor(); + + /** + * \brief Constructor with shape, memory is alloced according to shape. + */ + Tensor(Shape shape); + + /** + * \brief Constructor with allocated data ptr and entire memory shape. + */ + Tensor(Dtype* data_ptr, Shape shape); + + /** + * \brief Copy constructor, shallow copy. 
+ */ + Tensor(const Tensor& tensor); + + /** + * \brief only change the shape and valid shape, do nothing to memory + * \param shape + * \param valid_shape + * \param offset + */ + SaberStatus set_shape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape()); + + /** + * \brief Free old buffer and alloc a new tensor buffer. + */ + SaberStatus re_alloc(Shape shape); + + + /** + * \brief Change tensor shape, + * if input shape's count is bigger than the capacity of buffer, alloc a new buffer. + */ + SaberStatus reshape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape()); + + bool is_continue_mem() const; + + /** + * \brief Return shape count, from start index to end index(end index is excluded). + * \param start Input start index. + * \param end Input end index (exclude in calculation). + * \return the size from start index to end index. + */ + int count(int start, int end) const; + + /** + * \brief return valid_shape count, from start index to end index(end index is excluded). + * \param start input start index. + * \param end input end index (exclude in calculation). + * \return the size from start index to end index. + */ + int count_valid(int start, int end) const; + + /** + * \brief Return tensor shape size, not the valid shape size. + */ + int size() const; + + /** + * \brief Return the valid shape size. + * \return Return the valid shape size. + */ + int valid_size() const; + + /** + * \brief Return tensor shape dims. + */ + int dims() const; + + /** + * \brief Return tensor shape, entire memory buffer shape. + */ + Shape shape() const; + + /** + * \brief Return valid shape of tensor + */ + Shape valid_shape() const; + + /** + * \brief compute data stride. + */ + Shape get_stride() const; + /** + * \brief Return tensor offset, which holds the offset in each dim. + */ + Shape offset() const; + + /** + * \brief Return number + */ + int num() const; + + /** + * \brief Return number index in shape. 
+ */ + void set_num(int num); + + /** + * \brief Return channel. + */ + int channel() const; + + /** + * \brief Return channel index in shape. + * \return + */ + void set_channel(int channel); + + /** + * \brief Return height. + * \return + */ + int height() const; + + /** + * \brief Return height index in shape. + * \return + */ + void set_height(int h); + + /** + * \brief Return width. + * \return + */ + int width() const; + + /** + * \brief Return height index in shape. + * \return + */ + void set_width(int w); + + /** + * \brief Return tensor mutable data pointer, with data type of current tensor (Dtype*). + */ + Dtype* mutable_data(int index = 0); + + /** + * \brief Return tensor data pointer, with data type of current tensor (Dtype*). + */ + const Dtype * data(int index = 0) const; + + /** + * \brief Return reference shared_ptr of tensor. + */ + const std::shared_ptr>& get_buf() const; + + /** + * \brief Share from same layout_type and same date type tensor, + * if shared tensor target is the same with current tensor target, buffer is shared; + * otherwise, tensor buffer is deep copied. + * only shared buffer ptr, current tensor will have continuous memory, + * only if current shape and valid shape are the same, and offset is all set to 0. + */ + + template + SaberStatus share_from(const Tensor_t& tensor); + + SaberStatus share_sub_buffer(const Tensor& tensor, \ + Shape valid_shape, Shape offset); + + /** + * \brief Deep copy data within region of interest from input tensor. + */ + template + SaberStatus copy_from(const Tensor_t& tensor); + + /** + * \brief Synchronize the event tree, wait util all events are done. + */ + void sync(); + + /** + * \brief record Event to current tensor. + * \param stream Input processing stream. + */ + void record_event(stream_t* stream); + + +private: + ///< Length of datatype. + size_t _type_len{sizeof(Dtype_real)}; + + ///< Represent the raw mem shape. 
+ Shape _shape; + + ///< Represent the mem you have right to access shape. + Shape _valid_shape; + + ///< Represent the offset idx between _shape and _real_shape. + Shape _offset; + + ///< Buffer shared ptr, hold the data pointer, and buffer capacity. + std::shared_ptr> _buf{nullptr}; + + ///< share sub-buffer flag. + bool _is_subbuf{false}; + bool _is_shared{false}; + + ///< event + event_t _event; + + /// Get data real start index. + int start_index() const; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + + +#endif //ANAKIN_SABER_LITE_CORE_TENSOR_LITE_H + diff --git a/saber/lite/core/tensor_op_lite.cpp b/saber/lite/core/tensor_op_lite.cpp new file mode 100644 index 000000000..333025d03 --- /dev/null +++ b/saber/lite/core/tensor_op_lite.cpp @@ -0,0 +1,92 @@ +#include "tensor_op_lite.h" +#include +#include +#include + +namespace anakin { + +namespace saber { + +namespace lite{ + +template <> +void fill_tensor_const(Tensor& tensor, float value) { + float* data_ptr = tensor.mutable_data(); + int size = tensor.valid_size(); + for (int i = 0; i < size; ++i) { + data_ptr[i] = value; + } +} + +template <> +void fill_tensor_rand(Tensor& tensor) { + float* data_ptr = tensor.mutable_data(); + for (int i = 0; i < tensor.size(); ++i) { + data_ptr[i] = static_cast(rand()); + } +} + +template <> +void fill_tensor_rand(Tensor& tensor, float vstart, float vend) { + float* data_ptr = tensor.mutable_data(); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(0, 1.f); + int size = tensor.size(); + for (int i = 0; i < size; ++i) { + float random_num = vstart + (vend - vstart) * dis(gen); + data_ptr[i] = random_num; + } +} + +template <> +void print_tensor(Tensor& tensor) { + printf("host tensor data size: %d\n", tensor.size()); + const float* data_ptr = tensor.mutable_data(); + int size = tensor.size(); + for (int i = 0; i < size; ++i) { + printf("%.2f ", data_ptr[i]); + if ((i + 1) % tensor.width() == 0) { + 
/**
 * \brief Element-wise comparison of two host buffers.
 *
 * Scans both buffers and reports the largest absolute difference together
 * with the relative difference 2*|a-b|/(a+b+eps) measured AT the position of
 * that largest absolute difference (not the global max ratio).
 *
 * \param src1      first buffer (host memory)
 * \param src2      second buffer (host memory)
 * \param size      number of elements to compare
 * \param max_ratio out: relative difference at the max-diff element
 * \param max_diff  out: maximum absolute element-wise difference
 *
 * FIX: previously read src1[0]/src2[0] unconditionally, which is undefined
 * behavior for size <= 0; out-params are now zeroed and the scan skipped.
 */
template <typename Dtype>
void tensor_cmp_host(const Dtype* src1, const Dtype* src2, \
    int size, double& max_ratio, double& max_diff) {

    max_ratio = 0.0;
    max_diff = 0.0;
    if (size <= 0 || src1 == nullptr || src2 == nullptr) {
        return;
    }

    //! eps guards against division by zero when a+b == 0
    const double eps = 1e-6f;
    max_diff = std::fabs(static_cast<double>(src1[0]) - static_cast<double>(src2[0]));
    max_ratio = 2.0 * max_diff / (src1[0] + src2[0] + eps);

    for (int i = 1; i < size; ++i) {
        double diff = std::fabs(static_cast<double>(src1[i]) - static_cast<double>(src2[i]));
        if (max_diff < diff) {
            max_diff = diff;
            //! ratio is recomputed only when a new max diff is found, so it
            //! always corresponds to the max-diff position (original contract)
            max_ratio = 2.0 * max_diff / (src1[i] + src2[i] + eps);
        }
    }
}
+ * \param tensor The reference of input tensor. + */ +template +void fill_tensor_const(Tensor& tensor, typename DataTrait::Dtype value); + + +/** + * \brief Fill the host tensor buffer with rand value. + * \param The reference of input tensor. + */ +template +void fill_tensor_rand(Tensor& tensor); + + +/** + * \brief Fill the host tensor buffer with rand value from vstart to vend. + * \param tensor The reference of input tensor. + */ +template +void fill_tensor_rand(Tensor& tensor, \ + typename DataTrait::Dtype vstart, \ + typename DataTrait::Dtype vend); + +/** + * \brief Print the data in host tensor. + * \param tensor The reference of input tensor. + */ +template +void print_tensor(Tensor& tensor); + +template +void print_tensor_valid(Tensor& tensor); + +template +void tensor_cmp_host(const Dtype* src1, const Dtype* src2, int size, double& max_ratio, double& max_diff); + +} //namespace lite + +} // namespace saber + +} // namespace anakin + +#endif //ANAKIN_SABER_LITE_CORE_TENSOR_OP_H diff --git a/saber/funcs/impl/detection_helper.cpp b/saber/lite/funcs/detection_lite.cpp similarity index 98% rename from saber/funcs/impl/detection_helper.cpp rename to saber/lite/funcs/detection_lite.cpp index bc00e38a2..9563e8203 100644 --- a/saber/funcs/impl/detection_helper.cpp +++ b/saber/lite/funcs/detection_lite.cpp @@ -1,8 +1,11 @@ -#include "saber/funcs/impl/detection_helper.h" +#include "detection_lite.h" +#include +#include +namespace anakin{ -namespace anakin { +namespace saber{ -namespace saber { +namespace lite{ template static bool sort_score_pair_descend(const std::pair& pair1, \ @@ -242,6 +245,8 @@ template void nms_detect(const float* bbox_cpu_data, const float* conf_cpu_data, int keep_topk, int nms_topk, float conf_thresh, float nms_thresh, float nms_eta, bool share_location); +} //namespace lite + } //namespace saber } //namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/detection_helper.h b/saber/lite/funcs/detection_lite.h similarity 
index 68% rename from saber/funcs/impl/detection_helper.h rename to saber/lite/funcs/detection_lite.h index 239baeeba..d6f391fb1 100644 --- a/saber/funcs/impl/detection_helper.h +++ b/saber/lite/funcs/detection_lite.h @@ -13,17 +13,17 @@ limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_DETECTION_HELPER_H -#define ANAKIN_SABER_FUNCS_DETECTION_HELPER_H +#ifndef ANAKIN_SABER_LITE_FUNCS_DETECTION_LITE_H +#define ANAKIN_SABER_LITE_FUNCS_DETECTION_LITE_H -#include "saber/core/common.h" -#include -#include +#include "saber/lite/core/common_lite.h" namespace anakin{ namespace saber{ +namespace lite{ + template dtype jaccard_overlap(const dtype* bbox1, const dtype* bbox2); @@ -39,16 +39,15 @@ void nms_detect(const dtype* bbox_cpu_data, int keep_topk, int nms_topk, float conf_thresh, float nms_thresh, float nms_eta, bool share_location); -#ifdef USE_CUDA -template -void decode_bboxes(const int nthreads, const Dtype* loc_data, const Dtype* prior_data, \ - const CodeType code_type, const bool variance_encoded_in_target, \ - const int num_priors, const bool share_location, \ - const int num_loc_classes, const int background_label_id, \ - Dtype* bbox_data, cudaStream_t stream); -#endif +void decode_bboxes(const int batch_num, const float* loc_data, const float* prior_data, \ + const CodeType code_type, const bool variance_encoded_in_target, \ + const int num_priors, const bool share_location, \ + const int num_loc_classes, const int background_label_id, \ + float* bbox_data); + +} //namespace lite } //namespace saber } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_DETECTION_HELPER_H +#endif //ANAKIN_SABER_LITE_FUNCS_DETECTION_LITE_H diff --git a/saber/lite/funcs/neon/impl/conv3x3s1_direct.cpp b/saber/lite/funcs/neon/impl/conv3x3s1_direct.cpp new file mode 100644 index 000000000..0b1e4f2db --- /dev/null +++ b/saber/lite/funcs/neon/impl/conv3x3s1_direct.cpp @@ -0,0 +1,1335 @@ +#include "saber/lite/funcs/neon/impl/conv_arm_impl.h" +#ifdef USE_ARM_PLACE + +namespace 
anakin{ + +namespace saber{ + +namespace lite{ + +void conv_3x3s1_direct(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + const float zero[6] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + int w_stride = chin * 9; + + int tile_w = (win + 3) >> 2; + int tile_h = (hin + 1) >> 1; + int w_in_twice = win << 1; + int cnt_col = tile_w - 2; + + int size_pad_right = 1 + (tile_w << 2) - win; + int size_pad_bottom = 1 + (tile_h << 1) - hin; + + int cremain = chout - ((chout >> 1) << 1); + + uint32x4_t vmask_rp = vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(size_pad_right)); + unsigned int pmask_rp[4]; + vst1q_u32(pmask_rp, vmask_rp); + int right_pad_sub = (size_pad_right - 1) * sizeof(float); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * chin * size_in_channel; + float *dout_batch = dout + n * chin * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < chout - 1; c += 2) { + + float* dout_c0 = dout_batch + c * size_out_channel; + float* dout_c1 = dout_c0 + size_out_channel; + + if (flag_bias) { + fill_bias(dout_c0, &bias[c], 1, size_out_channel); + fill_bias(dout_c1, &bias[c + 1], 1, size_out_channel); + } else { + fill_bias(dout_c0, zero, 1, size_out_channel); + fill_bias(dout_c1, zero, 1, size_out_channel); + } + + //float* dout_c2 = dout_c1 + size_out_channel; + //float* dout_c3 = dout_c2 + size_out_channel; + + const float* wc0 = weights + c * w_stride; + const float* wc1 = wc0 + w_stride; + + //const float* wc2 = wc0 + 
w_stride; + //const float* wc3 = wc0 + w_stride; + + for (int i = 0; i < chin; ++i) { + + int relu = 0; + if ((i == chin - 1) && flag_relu) { + relu = 1; + } + + const float *din_channel = din_batch + i * size_in_channel; + + const float* wcin0 = wc0 + i * 9; + const float* wcin1 = wc1 + i * 9; + float32x4_t wr00 = vld1q_f32(wcin0); + float32x4_t wr01 = vld1q_f32(wcin0 + 3); + float32x4_t wr02 = vld1q_f32(wcin0 + 6); + + float32x4_t wr10 = vld1q_f32(wcin1); + float32x4_t wr11 = vld1q_f32(wcin1 + 3); + float32x4_t wr12 = vld1q_f32(wcin1 + 6); + + float *doutc0r0 = dout_c0; + float *doutc0r1 = doutc0r0 + wout; + + float *doutc1r0 = dout_c1; + float *doutc1r1 = doutc1r0 + wout; + + const float *dr0 = din_channel; + const float *dr1 = dr0 + win; + const float *dr2 = dr1 + win; + const float *dr3 = dr2 + win; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + const float *din3_ptr = dr3; + + float* ptr_zero = const_cast(zero); + + //! deal with top pad + int h = 0; + { + //! process + if (1) { +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + + float tmp1[4]; + float* ptr1 = tmp1; + float tmp2[4]; + float* ptr2 = tmp2; + + asm volatile( + //! process left pad + "pld [%[doutc0r0], #192] @ preload data\n" + "pld [%[doutc0r1], #192] @ preload data\n" + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + "pld [%[doutc1r0], #192] @ preload data\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! 
@ load din r1\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + + "vmla.f32 q13, q10, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q10, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + "vmov.u32 q15, #0 @ dump zero\n" + "vext.32 q12, q15, q10, #3 @ shift right r1\n" + "vmla.f32 q13, q12, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][0] @ mul weight1 00, out1r1\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r2\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + "vext.32 q12, q15, q10, #3 @ shift right r2\n" + "vmla.f32 q13, q12, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][0] @ mul weight1 10, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]]! @ load din r3\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "vmla.f32 q14, q10, %e[wr02][1] @ mul weight0 21, out0r1\n" + "vmla.f32 q9, q10, %e[wr12][1] @ mul weight1 21, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r3\n" + "vmla.f32 q14, q12, %f[wr02][0] @ mul weight0 22, out0r1\n" + "vmla.f32 q9, q12, %f[wr12][0] @ mul weight1 22, out1r1\n" + + "vext.32 q12, q15, q10, #3 @ shift right r3\n" + "vmla.f32 q14, q12, %e[wr02][0] @ mul weight0 20, out0r1\n" + "vmla.f32 q9, q12, %e[wr12][0] @ mul weight1 20, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_tl @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_tl: @ store top left result\n" + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + "pld [%[doutc0r0], #192] @ preload data\n" + "vst1.32 {d28-d29}, [%[doutc0r1]]! 
@ store result, add pointer\n" + "pld [%[doutc0r1], #192] @ preload data\n" + + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + "pld [%[doutc1r0], #192] @ preload data\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! @ store result, add pointer\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_top_right @ jump to main loop start point\n" + "start_top_mid: @ main loop start point\n" + + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]] @ load din r1\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][0] @ mul weight1 00, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r2\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][0] @ mul weight1 10, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]]! @ load din r3\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "vmla.f32 q14, q10, %e[wr02][0] @ mul weight0 20, out0r1\n" + "vmla.f32 q9, q10, %e[wr12][0] @ mul weight1 20, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r3\n" + "vmla.f32 q14, q12, %e[wr02][1] @ mul weight0 21, out0r1\n" + "vmla.f32 q9, q12, %e[wr12][1] @ mul weight1 21, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r3\n" + "vmla.f32 q14, q12, %f[wr02][0] @ mul weight0 22, out0r1\n" + "vmla.f32 q9, q12, %f[wr12][0] @ mul weight1 22, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_tm @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_tm: @ store top mid result\n" + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + "pld [%[doutc0r0], #192] @ preload data\n" + "vst1.32 {d28-d29}, [%[doutc0r1]]! 
@ store result, add pointer\n" + "pld [%[doutc0r1], #192] @ preload data\n" + + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + "pld [%[doutc1r0], #192] @ preload data\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! @ store result, add pointer\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_top_mid @ jump to main loop start point\n" + + //! process right pad + "start_top_right: @ right pad entry\n" + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][0] @ mul weight1 00, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r2\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][0] @ mul weight1 10, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]]! 
@ load din r3\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q14, q10, %e[wr02][0] @ mul weight0 20, out0r1\n" + "vmla.f32 q9, q10, %e[wr12][0] @ mul weight1 20, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r3\n" + "vmla.f32 q14, q12, %e[wr02][1] @ mul weight0 21, out0r1\n" + "vmla.f32 q9, q12, %e[wr12][1] @ mul weight1 21, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r3\n" + "vmla.f32 q14, q12, %f[wr02][0] @ mul weight0 22, out0r1\n" + "vmla.f32 q9, q12, %f[wr12][0] @ mul weight1 22, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_tr @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_tr: @ store top mid result\n" + + "vld1.32 {d20-d21}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d22-d23}, [%[doutc0r1]] @ load dout0r1\n" + + "vmvn.32 q12, q15 @ \n" + "vext.32 q15, q12, %q[vmask_rp], #3 @ shift mask right 1\n" + "vbif q13, q10, q15 @ bit select\n" + "vbif q14, q11, q15 @ bit select\n" + + "vld1.32 {d20-d21}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d22-d23}, [%[doutc1r1]] @ load dout1r1\n" + + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + "vst1.32 {d28-d29}, [%[doutc0r1]]! @ store result, add pointer\n" + + + "vbif q8, q10, q15 @ bit select\n" + "vbif q9, q11, q15 @ bit select\n" + + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! 
@ store result, add pointer\n" + + "sub %[doutc0r0], %[doutc0r0], %[right_pad_sub] @ sub \n" + "sub %[doutc0r1], %[doutc0r1], %[right_pad_sub] @ sub \n" + "sub %[doutc1r0], %[doutc1r0], %[right_pad_sub] @ sub \n" + "sub %[doutc1r1], %[doutc1r1], %[right_pad_sub] @ sub \n" + + :[doutc0r0] "+r"(doutc0r0), [doutc0r1] "+r"(doutc0r1), \ + [doutc1r0] "+r" (doutc1r0), [doutc1r1] "+r" (doutc1r1),\ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [cnt] "+r"(cnt) + :[wr00] "w"(wr00), [wr01] "w"(wr01), [wr02] "w"(wr02), \ + [wr10] "w"(wr10), [wr11] "w"(wr11), [wr12] "w"(wr12), \ + [vmask_rp] "w" (vmask_rp), [right_pad_sub] "r" (right_pad_sub), \ + [relu] "r"(relu) + :"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + } + //! after process, increase pointer + doutc0r0 += wout; + doutc0r1 = doutc0r0 + wout; + doutc1r0 += wout; + doutc1r1 = doutc1r0 + wout; + + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + win; + dr3 = dr2 + win; + } //! end of process top row + + + //! process mid row + for (h = 1; h < tile_h - 1; h++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + + { +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + asm volatile ( + //! process left pad + "pld [%[doutc0r0], #192] @ preload data\n" + "pld [%[doutc0r1], #192] @ preload data\n" + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "pld [%[din3_ptr], #192] @ preload data\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + "pld [%[doutc1r0], #192] @ preload data\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! 
@ load din r0\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "vext.32 q12, q10, q11, #1 @ shift left r0\n" + "vmla.f32 q13, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q8, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + "vmov.u32 q15, #0 @ dump zero\n" + "vext.32 q12, q15, q10, #3 @ shift right r1\n" + "vmla.f32 q13, q12, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vmla.f32 q8, q12, %e[wr10][0] @ mul weight1 00, out1r0\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q10, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + "vext.32 q12, q15, q10, #3 @ shift right r1\n" + "vmla.f32 q13, q12, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][0] @ mul weight1 00, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]]! 
@ load din r2\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + "vext.32 q12, q15, q10, #3 @ shift right r2\n" + "vmla.f32 q13, q12, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][0] @ mul weight1 10, out1r1\n" + + //! 4rd row + "vld1.32 {d20-d22}, [%[din3_ptr]]! @ load din r3\n" + "pld [%[din3_ptr], #192] @ preload data\n" + "vmla.f32 q14, q10, %e[wr02][1] @ mul weight0 21, out0r1\n" + "vmla.f32 q9, q10, %e[wr12][1] @ mul weight1 21, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r3\n" + "vmla.f32 q14, q12, %f[wr02][0] @ mul weight0 22, out0r1\n" + "vmla.f32 q9, q12, %f[wr12][0] @ mul weight1 22, out1r1\n" + + "vext.32 q12, q15, q10, #3 @ shift right r3\n" + "vmla.f32 q14, q12, %e[wr02][0] @ mul weight0 20, out0r1\n" + "vmla.f32 q9, q12, %e[wr12][0] @ mul weight1 20, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_ml @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_ml: @ store top mid result\n" + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + "pld [%[doutc0r0], #192] @ preload data\n" + "vst1.32 {d28-d29}, [%[doutc0r1]]! 
@ store result, add pointer\n" + "pld [%[doutc0r1], #192] @ preload data\n" + + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + "pld [%[doutc1r0], #192] @ preload data\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! @ store result, add pointer\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_mid_right @ jump to main loop start point\n" + "start_mid_mid: @ main loop start point\n" + + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr10][0] @ mul weight1 00, out1r0\n" + + "vext.32 q12, q10, q11, #1 @ shift left r0\n" + "vmla.f32 q13, q12, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vmla.f32 q8, q12, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "vext.32 q12, q10, q11, #2 @ shift left r0\n" + "vmla.f32 q13, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q8, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r1\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vmla.f32 q8, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][0] @ mul weight1 00, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]]! @ load din r2\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "vmla.f32 q13, q10, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][0] @ mul weight1 10, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift right r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + //! 4rd row + "vld1.32 {d20-d22}, [%[din3_ptr]]! 
@ load din r3\n" + "pld [%[din3_ptr], #192] @ preload data\n" + "vmla.f32 q14, q10, %e[wr02][0] @ mul weight0 20, out0r1\n" + "vmla.f32 q9, q10, %e[wr12][0] @ mul weight1 20, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r3\n" + "vmla.f32 q14, q12, %e[wr02][1] @ mul weight0 21, out0r1\n" + "vmla.f32 q9, q12, %e[wr12][1] @ mul weight1 21, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r3\n" + "vmla.f32 q14, q12, %f[wr02][0] @ mul weight0 22, out0r1\n" + "vmla.f32 q9, q12, %f[wr12][0] @ mul weight1 22, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_mm @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_mm: @ store top mid result\n" + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + "pld [%[doutc0r0], #192] @ preload data\n" + "vst1.32 {d28-d29}, [%[doutc0r1]]! @ store result, add pointer\n" + "pld [%[doutc0r1], #192] @ preload data\n" + + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + "pld [%[doutc1r0], #192] @ preload data\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! @ store result, add pointer\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din3_ptr], #8 @ 2 float data overlap with previous data\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_mid_mid @ jump to main loop start point\n" + + //! process right pad + "start_mid_right: @ right pad entry\n" + + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! 
@ load din r0\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr10][0] @ mul weight1 00, out1r0\n" + + "vext.32 q12, q10, q11, #1 @ shift left r0\n" + "vmla.f32 q13, q12, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vmla.f32 q8, q12, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "vext.32 q12, q10, q11, #2 @ shift left r0\n" + "vmla.f32 q13, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q8, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vmla.f32 q8, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][0] @ mul weight1 00, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]]! 
@ load din r2\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][0] @ mul weight1 10, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift right r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + //! 4rd row + "vld1.32 {d20-d22}, [%[din3_ptr]]! 
@ load din r3\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + + "vmla.f32 q14, q10, %e[wr02][0] @ mul weight0 20, out0r1\n" + "vmla.f32 q9, q10, %e[wr12][0] @ mul weight1 20, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r3\n" + "vmla.f32 q14, q12, %e[wr02][1] @ mul weight0 21, out0r1\n" + "vmla.f32 q9, q12, %e[wr12][1] @ mul weight1 21, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r3\n" + "vmla.f32 q14, q12, %f[wr02][0] @ mul weight0 22, out0r1\n" + "vmla.f32 q9, q12, %f[wr12][0] @ mul weight1 22, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_mr @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_mr: @ store top mid result\n" + + "vld1.32 {d20-d21}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d22-d23}, [%[doutc0r1]] @ load dout0r1\n" + + "vmvn.32 q12, q15 @ \n" + "vext.32 q15, q12, %q[vmask_rp], #3 @ shift mask right 1\n" + "vbif q13, q10, q15 @ bit select\n" + "vbif q14, q11, q15 @ bit select\n" + + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + + "vst1.32 {d28-d29}, [%[doutc0r1]]! @ store result, add pointer\n" + + "vld1.32 {d20-d21}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d22-d23}, [%[doutc1r1]] @ load dout1r1\n" + + "vbif q8, q10, q15 @ bit select\n" + "vbif q9, q11, q15 @ bit select\n" + + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! 
@ store result, add pointer\n" + + "sub %[doutc0r0], %[doutc0r0], %[right_pad_sub] @ sub \n" + "sub %[doutc0r1], %[doutc0r1], %[right_pad_sub] @ sub \n" + "sub %[doutc1r0], %[doutc1r0], %[right_pad_sub] @ sub \n" + "sub %[doutc1r1], %[doutc1r1], %[right_pad_sub] @ sub \n" + + :[doutc0r0] "+r"(doutc0r0), [doutc0r1] "+r"(doutc0r1), \ + [doutc1r0] "+r" (doutc1r0), [doutc1r1] "+r" (doutc1r1),\ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [din3_ptr] "+r"(din3_ptr), \ + [cnt] "+r"(cnt) + :[wr00] "w"(wr00), [wr01] "w"(wr01), [wr02] "w"(wr02), \ + [wr10] "w"(wr10), [wr11] "w"(wr11), [wr12] "w"(wr12), \ + [vmask_rp] "w" (vmask_rp), [right_pad_sub] "r" (right_pad_sub), \ + [relu] "r"(relu) + :"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + } + doutc0r0 += wout; + doutc0r1 = doutc0r0 + wout; + doutc1r0 += wout; + doutc1r1 = doutc1r0 + wout; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + win; + dr3 = dr2 + win; + } //! end of processing mid rows + + //! deal with bottom pad + if (1) { + + din0_ptr = dr0; + din1_ptr = dr1; + if (size_pad_bottom == 2) { +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + asm volatile ( + //! process left pad + "pld [%[doutc0r0]] @ preload data\n" + "pld [%[doutc1r0]] @ preload data\n" + "vld1.32 {d12-d13}, [%[doutc0r0]] @ load dout0r0\n" + "pld [%[din0_ptr]] @ preload data\n" + "pld [%[din1_ptr]] @ preload data\n" + "vld1.32 {d14-d15}, [%[doutc1r0]] @ load dout1r0\n" + + //! 1st row + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r1\n" + + "vmla.f32 q6, q8, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vmla.f32 q7, q8, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "pld [%[din0_ptr]] @ preload data\n" + + "vext.32 q12, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q6, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q7, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + "vmov.u32 q15, #0 @ dump zero\n" + "vext.32 q12, q15, q8, #3 @ shift right r1\n" + "vmla.f32 q6, q12, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vmla.f32 q7, q12, %e[wr10][0] @ mul weight1 00, out1r0\n" + + //! 2nd row + "vmla.f32 q6, q10, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q7, q10, %e[wr11][1] @ mul weight1 11, out1r0\n" + + "pld [%[din1_ptr]] @ preload data\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q6, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q7, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + + "vext.32 q12, q15, q10, #3 @ shift right r1\n" + "vmla.f32 q6, q12, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q7, q12, %e[wr11][0] @ mul weight1 10, out1r0\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_bl_1 @ jump to store without relu\n" + "vmax.f32 q6, q6, q15 @ relu\n" + "vmax.f32 q7, q7, q15 @ relu\n" + + "store_bl_1: @ store top mid result\n" + "vst1.32 {d12-d13}, [%[doutc0r0]]! @ store result, add pointer\n" + "vst1.32 {d14-d15}, [%[doutc1r0]]! @ store result, add pointer\n" + + "pld [%[doutc0r0]] @ preload data\n" + "pld [%[doutc1r0]] @ preload data\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt conv3x3_bot_right @ jump to main loop start point\n" + "conv3x3_bot_mid: @ main loop start point\n" + + "vld1.32 {d12-d13}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d14-d15}, [%[doutc1r0]] @ load dout1r0\n" + + //! 1st row + "vld1.32 {d16-d18}, [%[din0_ptr]]! 
@ load din r0\n" + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + + "vmla.f32 q6, q8, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vmla.f32 q7, q8, %e[wr10][0] @ mul weight1 00, out1r0\n" + + "pld [%[din0_ptr]] @ preload data\n" + + "vext.32 q12, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q6, q12, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vmla.f32 q7, q12, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "vext.32 q12, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q6, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q7, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + //! 2nd row + "vmla.f32 q6, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q7, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + + "pld [%[din1_ptr]] @ preload data\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q6, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q7, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q6, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q7, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_bm_1 @ jump to store without relu\n" + "vmax.f32 q6, q6, q15 @ relu\n" + "vmax.f32 q7, q7, q15 @ relu\n" + + "store_bm_1: @ store top mid result\n" + "vst1.32 {d12-d13}, [%[doutc0r0]]! @ store result, add pointer\n" + "vst1.32 {d14-d15}, [%[doutc1r0]]! @ store result, add pointer\n" + "pld [%[doutc0r0]] @ preload data\n" + "pld [%[doutc1r0]] @ preload data\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne conv3x3_bot_mid @ jump to main loop start point\n" + + //! process right pad + "conv3x3_bot_right: @ right pad entry\n" + + "vld1.32 {d12-d13}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d14-d15}, [%[doutc1r0]] @ load dout1r0\n" + + //! 1st row + "vld1.32 {d16-d18}, [%[din0_ptr]]! 
@ load din r0\n" + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + + "vbif d17, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d18, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + + "vmla.f32 q6, q8, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vmla.f32 q7, q8, %e[wr10][0] @ mul weight1 00, out1r0\n" + + "vext.32 q12, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q6, q12, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vmla.f32 q7, q12, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "vext.32 q12, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q6, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q7, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + //! 2nd row + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q6, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q7, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q6, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q7, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q6, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q7, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_br_1 @ jump to store without relu\n" + "vmax.f32 q6, q6, q15 @ relu\n" + "vmax.f32 q7, q7, q15 @ relu\n" + + "store_br_1: @ store top mid result\n" + + "vld1.32 {d16-d17}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d18-d19}, [%[doutc1r0]] @ load dout0r0\n" + + "vmvn.32 q12, q15 @ \n" + "vext.32 q15, q12, %q[vmask_rp], #3 @ shift mask right 1\n" + "vbif q6, q8, q15 @ bit select\n" + "vbif q7, q9, q15 @ bit select\n" + + "vst1.32 {d12-d13}, [%[doutc0r0]] @ store result, add pointer\n" + "vst1.32 {d14-d15}, [%[doutc1r0]] @ store result, add pointer\n" + :[doutc0r0] "+r"(doutc0r0), [doutc1r0] "+r" (doutc1r0),\ + [din0_ptr] "+r"(din0_ptr), 
[din1_ptr] "+r"(din1_ptr), \ + [cnt] "+r"(cnt) + :[wr00] "w"(wr00), [wr01] "w"(wr01), \ + [wr10] "w"(wr10), [wr11] "w"(wr11), \ + [vmask_rp] "w" (vmask_rp), [relu] "r"(relu) + :"q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + + } else { // write 2 rows + din2_ptr = dr2; +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + asm volatile ( + //! process left pad + "pld [%[doutc0r0], #192] @ preload data\n" + "pld [%[doutc0r1], #192] @ preload data\n" + "pld [%[doutc1r0], #192] @ preload data\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "vmla.f32 q13, q10, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q12, q10, q11, #1 @ shift left r0\n" + "vmla.f32 q13, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q8, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + "vmov.u32 q15, #0 @ dump zero\n" + "vext.32 q12, q15, q10, #3 @ shift right r1\n" + "vmla.f32 q13, q12, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vmla.f32 q8, q12, %e[wr10][0] @ mul weight1 00, out1r0\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r1\n" + "vmla.f32 q13, q10, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q10, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + "vext.32 q12, q15, q10, #3 @ shift right r1\n" + "vmla.f32 q13, q12, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][0] @ mul weight1 00, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]] @ load din r2\n" + "vmla.f32 q13, q10, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + "vext.32 q12, q15, q10, #3 @ shift right r2\n" + "vmla.f32 q13, q12, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][0] @ mul weight1 10, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_bl_2 @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 
q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_bl_2: @ store top mid result\n" + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + + "pld [%[doutc0r0], #192] @ preload data\n" + "pld [%[doutc1r0], #192] @ preload data\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + + "vst1.32 {d28-d29}, [%[doutc0r1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! @ store result, add pointer\n" + "add %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "pld [%[doutc0r1], #192] @ preload data\n" + "pld [%[doutc1r1], #192] @ preload data\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt conv3x3_bot_right_2 @ jump to main loop start point\n" + "conv3x3_bot_mid_2: @ main loop start point\n" + + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "vmla.f32 q13, q10, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr10][0] @ mul weight1 00, out1r0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q12, q10, q11, #1 @ shift left r0\n" + "vmla.f32 q13, q12, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vmla.f32 q8, q12, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "vext.32 q12, q10, q11, #2 @ shift left r0\n" + "vmla.f32 q13, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q8, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r1\n" + "vmla.f32 q13, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vmla.f32 q8, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][0] @ mul weight1 00, out1r1\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]] @ load din r2\n" + "vmla.f32 q13, q10, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][0] @ mul weight1 10, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift right r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, %f[wr11][0] @ mul weight1 12, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_bm_2 @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 
q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_bm_2: @ store top mid result\n" + "vst1.32 {d26-d27}, [%[doutc0r0]]! @ store result, add pointer\n" + "vst1.32 {d16-d17}, [%[doutc1r0]]! @ store result, add pointer\n" + "pld [%[doutc0r0], #192] @ preload data\n" + "pld [%[doutc1r0], #192] @ preload data\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + + "vst1.32 {d28-d29}, [%[doutc0r1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[doutc1r1]]! @ store result, add pointer\n" + "pld [%[doutc0r1], #192] @ preload data\n" + "pld [%[doutc1r1], #192] @ preload data\n" + "add %[din2_ptr], #16 @ point to 4 data ahead\n" + "pld [%[din2_ptr], #192] @ preload data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne conv3x3_bot_mid_2 @ jump to main loop start point\n" + + //! process right pad + "conv3x3_bot_right_2: @ right pad entry\n" + + "vld1.32 {d26-d27}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d28-d29}, [%[doutc0r1]] @ load dout0r1\n" + + //! 1st row + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr00][0] @ mul weight0 00, out0r0\n" + "vld1.32 {d16-d17}, [%[doutc1r0]] @ load dout1r0\n" + "vld1.32 {d18-d19}, [%[doutc1r1]] @ load dout1r1\n" + "vmla.f32 q8, q10, %e[wr10][0] @ mul weight1 00, out1r0\n" + + "vext.32 q12, q10, q11, #1 @ shift left r0\n" + "vmla.f32 q13, q12, %e[wr00][1] @ mul weight0 01, out0r0\n" + "vmla.f32 q8, q12, %e[wr10][1] @ mul weight1 01, out1r0\n" + + "vext.32 q12, q10, q11, #2 @ shift left r0\n" + "vmla.f32 q13, q12, %f[wr00][0] @ mul weight0 02, out0r0\n" + "vmla.f32 q8, q12, %f[wr10][0] @ mul weight1 02, out1r0\n" + + //! 2nd row + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r1\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr01][0] @ mul weight0 10, out0r0\n" + "vmla.f32 q14, q10, %e[wr00][0] @ mul weight0 00, out0r1\n" + "vmla.f32 q8, q10, %e[wr11][0] @ mul weight1 10, out1r0\n" + "vmla.f32 q9, q10, %e[wr10][0] @ mul weight1 00, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q13, q12, %e[wr01][1] @ mul weight0 11, out0r0\n" + "vmla.f32 q14, q12, %e[wr00][1] @ mul weight0 01, out0r1\n" + "vmla.f32 q8, q12, %e[wr11][1] @ mul weight1 11, out1r0\n" + "vmla.f32 q9, q12, %e[wr10][1] @ mul weight1 01, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q13, q12, %f[wr01][0] @ mul weight0 12, out0r0\n" + "vmla.f32 q14, q12, %f[wr00][0] @ mul weight0 02, out0r1\n" + "vmla.f32 q8, q12, %f[wr11][0] @ mul weight1 12, out1r0\n" + "vmla.f32 q9, q12, %f[wr10][0] @ mul weight1 02, out1r1\n" + + //! 3rd row + "vld1.32 {d20-d22}, [%[din2_ptr]] @ load din r2\n" + "vbif d21, d31, %e[vmask_rp] @ bit select, deal with right pad\n" + "vbif d22, d31, %f[vmask_rp] @ bit select, deal with right pad\n" + "vmla.f32 q13, q10, %e[wr02][0] @ mul weight0 20, out0r0\n" + "vmla.f32 q14, q10, %e[wr01][0] @ mul weight0 10, out0r1\n" + "vmla.f32 q8, q10, %e[wr12][0] @ mul weight1 20, out1r0\n" + "vmla.f32 q9, q10, %e[wr11][0] @ mul weight1 10, out1r1\n" + + "vext.32 q12, q10, q11, #1 @ shift left r2\n" + "vmla.f32 q13, q12, %e[wr02][1] @ mul weight0 21, out0r0\n" + "vmla.f32 q14, q12, %e[wr01][1] @ mul weight0 11, out0r1\n" + "vmla.f32 q8, q12, %e[wr12][1] @ mul weight1 21, out1r0\n" + "vmla.f32 q9, q12, %e[wr11][1] @ mul weight1 11, out1r1\n" + + "vext.32 q12, q10, q11, #2 @ shift right r2\n" + "vmla.f32 q13, q12, %f[wr02][0] @ mul weight0 22, out0r0\n" + "vmla.f32 q14, q12, %f[wr01][0] @ mul weight0 12, out0r1\n" + "vmla.f32 q8, q12, %f[wr12][0] @ mul weight1 22, out1r0\n" + "vmla.f32 q9, q12, 
%f[wr11][0] @ mul weight1 12, out1r1\n" + + "cmp %[relu], #1 @ check whether has mid cols\n" + "blt store_br_2 @ jump to store without relu\n" + "vmax.f32 q13, q13, q15 @ relu\n" + "vmax.f32 q14, q14, q15 @ relu\n" + "vmax.f32 q8, q8, q15 @ relu\n" + "vmax.f32 q9, q9, q15 @ relu\n" + + "store_br_2: @ store top mid result\n" + + "vld1.32 {d20-d21}, [%[doutc0r0]] @ load dout0r0\n" + "vld1.32 {d22-d23}, [%[doutc1r0]] @ load dout0r1\n" + + "vmvn.32 q12, q15 @ \n" + "vext.32 q15, q12, %q[vmask_rp], #3 @ shift mask right 1\n" + "vbif q13, q10, q15 @ bit select\n" + "vbif q8, q11, q15 @ bit select\n" + + "vst1.32 {d26-d27}, [%[doutc0r0]] @ store result, add pointer\n" + "vst1.32 {d16-d17}, [%[doutc1r0]] @ store result, add pointer\n" + + "vld1.32 {d20-d21}, [%[doutc0r1]] @ load dout0r0\n" + "vld1.32 {d22-d23}, [%[doutc1r1]] @ load dout0r1\n" + + "vbif q14, q10, q15 @ bit select\n" + "vbif q9, q11, q15 @ bit select\n" + + "vst1.32 {d28-d29}, [%[doutc0r1]] @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[doutc1r1]] @ store result, add pointer\n" + + :[doutc0r0] "+r"(doutc0r0), [doutc0r1] "+r"(doutc0r1), \ + [doutc1r0] "+r" (doutc1r0), [doutc1r1] "+r" (doutc1r1),\ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [cnt] "+r"(cnt) + :[wr00] "w"(wr00), [wr01] "w"(wr01), [wr02] "w"(wr02), \ + [wr10] "w"(wr10), [wr11] "w"(wr11), [wr12] "w"(wr12), \ + [vmask_rp] "w" (vmask_rp), [relu] "r" (relu) + :"q8", "q9", "q10", \ + "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + } + } // end of processing bottom pad + } // end of processing channels + } //end of processing output channel + if (cremain > 0) { + for (int c = 0; c < cremain; ++c) { + + int cidx = chout - cremain + c; + float* dout_c = dout_batch + cidx * size_out_channel; + + if (flag_bias) { + fill_bias(dout_c, &bias[cidx], 1, size_out_channel); + } else { + fill_bias(dout_c, zero, 1, size_out_channel); + } + + const float* wc0 = weights + cidx * w_stride; + + for (int 
i = 0; i < chin; ++i) { + + bool relu = (i == chin - 1) && flag_relu; + + const float* din_channel = din_batch + i * size_in_channel; + for (int h = 0; h < hout; ++h) { + + int hstart = h - pad_h; + int hend = hstart + 3; + hstart = std::max(hstart, 0); + hend = std::min(hend, hin); + + int khstart = hend < kernel_h? kernel_h - hend : 0; + + float* dout_row = dout_c + h * wout; + + for (int w = 0; w < wout; ++w) { + int wstart = w - pad_w; + int wend = wstart + 3; + wstart = std::max(wstart, 0); + wend = std::min(wend, win); + int kwstart = wend < kernel_w? kernel_w - wend : 0; + + for (int kh = hstart; kh < hend; ++kh) { + for (int kw = wstart; kw < wend; ++kw) { + dout_row[w] += din_channel[kh * win + kw] * \ + wc0[(khstart + kh - hstart) * 3 + kwstart + kw - wstart]; + } + } + if (relu) { + dout_row[w] = dout_row[w] > 0.f? dout_row[w] : 0.f; + } + } + } + wc0 += 9; + } + } + } // end of remain out channel + + } // end of processing batchs +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif diff --git a/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp new file mode 100644 index 000000000..1a344e82e --- /dev/null +++ b/saber/lite/funcs/neon/impl/conv_arm_depthwise.cpp @@ -0,0 +1,2456 @@ +#include "saber/lite/funcs/neon/impl/conv_arm_impl.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void conv_depthwise_3x3s1p1_bias(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out); + +void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out); + +void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \ + const float* weights, 
const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out); + +void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out); + + +void conv_depthwise_3x3(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { + + //! only support stride = 1 or 2 + //CHECK_EQ(stride_h, stride_w) << "stride w and h must = 1 or 2"; + + if (stride_h == 1) { + if (flag_relu) { + conv_depthwise_3x3s1p1_bias_relu(dout, din, weights, bias, flag_bias, \ + num, chin, hin, win, hout, wout); + } else { + conv_depthwise_3x3s1p1_bias(dout, din, weights, bias, flag_bias, \ + num, chin, hin, win, hout, wout); + } + } else { //! stride = 2 + if (flag_relu) { + conv_depthwise_3x3s2p1_bias_relu(dout, din, weights, bias, flag_bias, \ + num, chin, hin, win, hout, wout); + } else { + conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, flag_bias, \ + num, chin, hin, win, hout, wout); + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias + */ +void conv_depthwise_3x3s1p1_bias(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + const float zero[6] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + int tile_w = (w_in + 3) >> 2; + int tile_h = (h_in + 1) >> 1; + int w_in_twice = w_in << 1; + int cnt_col = tile_w - 2; + + int size_pad_right = 1 + (tile_w << 2) - w_in; + int size_pad_bottom = 1 + (tile_h << 1) - h_in; + + uint32x4_t vmask_rp = vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(size_pad_right)); + int right_pad_sub = (size_pad_right - 1) * sizeof(float); + + //printf("size_pad_right: %d, right_pad_sub: %d, cnt_col: %d\n", size_pad_right, right_pad_sub, cnt_col); + unsigned int tmp1[4]; + vst1q_u32(tmp1, vmask_rp); + //printf("mask_rp: %d, %d, %d, %d\n", tmp1[0], tmp1[1], tmp1[2], tmp1[3]); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float *din_channel = din_batch + i * size_in_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + float *dout_channel = dout_batch + i * size_out_channel; + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + const float *din3_ptr = dr3; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + float* ptr_zero = const_cast(zero); + + //! deal with top pad + int h = 0; + //! process +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + asm volatile( + //! 
process left pad + "pld [%[din0_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][1] @ mul weight 00, outr1\n" + "vmul.f32 q4, q10, %e[wr1][1] @ mul weight 10, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][1] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][1] @ mul weight 20, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + "vld1.32 {d28-d30}, [%[din2_ptr]]! @ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][1] @ mul weight 20, outr1\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @ mul weight 21, outr0\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @ mul weight 21, outr1\n" + + "vmov.u32 d31, #0 @ zero\n" + + "vext.32 d12, d31, d20, #1 @ shift right r1\n" + "vext.32 d13, d20, d21, #1 @ shift right r1\n" + "vmla.f32 q5, q6, %e[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %e[wr1][0] @ mul weight 12, outr0\n" + + "vext.32 d12, d31, d24, #1 @ shift right r0\n" + "vext.32 d13, d24, d25, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr1][0] @ mul weight 12, outr1\n" + "vmla.f32 q4, q6, %e[wr2][0] @ mul weight 22, outr0\n" + + "vext.32 d12, d31, d28, #1 @ shift right r0\n" + "vext.32 d13, d28, d29, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr2][0] @ mul weight 22, outr1\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_top_right @ jump to main loop start point\n" + "start_top_mid: @ main loop start point\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmul.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + "vld1.32 {d28-d30}, [%[din2_ptr]]! @ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @ mul weight 21, outr0\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @ mul weight 21, outr1\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 12, outr0\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @ mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @ mul weight 22, outr0\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @ mul weight 22, outr1\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! 
@ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_top_mid @ jump to main loop start point\n" + + //! process right pad + "start_top_right: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmul.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r2\n" + "vbif d25, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + "vld1.32 {d28-d30}, [%[din2_ptr]]! 
@ load din r3\n" + "vbif d29, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vbif d22, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "vbif d26, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @ mul weight 21, outr0\n" + + "vbif d30, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @ mul weight 21, outr1\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 12, outr0\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @ mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @ mul weight 22, outr0\n" + + "pld [%[dout_ptr1], #128] @ preload data\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @ mul weight 22, outr1\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vld1.32 {d20-d21}, [%[dout_ptr1]] @ load dout r0\n" + + "vmvn.32 d24, d31 @ \n" + "vmvn.32 d25, d31 @ \n" + "vext.32 q13, q12, %q[mask], #3 @ shift mask right 1\n" + "vbif q8, q10, q13 @ bit select\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + "sub %[dout_ptr2], %[dout_ptr2], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [dout_ptr2] "+r"(doutr1), \ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [pad_right] "+r" (right_pad_sub), \ + [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask] "w" (vmask_rp) + :"q4", "q5", "q6", "q8", "q9", \ + "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif //__aarch64__ + + //! after process, increase pointer + doutr0 += w_out; + doutr1 = doutr0 + w_out; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! end of process top row + + //! process mid row + for (h = tile_h - 2; h > 0; h--) { + + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + +#ifdef __aarch64__ + // todo +#else + cnt = cnt_col; + asm volatile( + //! process left pad + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "pld [%[din3_ptr], #192] @ preload data\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][1] @mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][1] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][1] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][1] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][1] @mul weight 20, outr0\n" + + + "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][1] @mul weight 20, outr1\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 11, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 21, outr0\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @mul weight 21, outr1\n" + + "vmov.u32 d31, #0 @ zero\n" + "vext.32 d12, d31, d16, #1 @ shift right r0\n" + "vext.32 d13, d16, d17, #1 @ shift right r0\n" + "vmla.f32 q4, q6, %e[wr0][0] @mul weight 02, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d20, #1 @ shift right r1\n" + "vext.32 d13, d20, d21, #1 @ shift right r1\n" + "vmla.f32 q5, q6, %e[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %e[wr1][0] @mul weight 12, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d24, #1 @ shift right r0\n" + "vext.32 d13, d24, d25, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %e[wr2][0] @mul weight 22, outr0\n" + + "pld [%[din3_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d28, #1 @ shift right r0\n" + "vext.32 d13, d28, d29, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr2][0] @mul weight 22, outr1\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_mid_right @ jump to main loop start point\n" + "start_mid_mid: @ main loop start point\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][0] @ mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @ mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @ mul weight 21, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @ mul weight 21, outr1\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @ mul weight 02, outr0\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 12, outr0\n" + + "pld [%[din3_ptr], #192] @ preload data\n" 
+ + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @mul weight 22, outr1\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din3_ptr], #8 @ 2 float data overlap with previous data\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_mid_mid @ jump to main loop start point\n" + + //! process right pad + "start_mid_right: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vbif d17, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q4, q8, %e[wr0][0] @ mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" + "vbif d25, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" + "vbif d29, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vbif d18, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @ mul weight 01, outr0\n" + + "vbif d22, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "vbif d26, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @mul weight 21, outr0\n" + + "vbif d30, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @mul weight 21, outr1\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 02, outr0\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 12, outr0\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "pld [%[dout_ptr1], #128] @ preload data\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @mul weight 22, outr1\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vld1.32 {d20-d21}, [%[dout_ptr1]] @ load dout r0\n" + + "vmvn.32 d22, d31 @ \n" + "vmvn.32 d23, d31 @ \n" + "vext.32 q12, q11, %q[mask], #3 @ shift mask right 1\n" + "vbif q8, q10, q12 @ bit select\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + "sub %[dout_ptr2], %[dout_ptr2], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [dout_ptr2] "+r"(doutr1), \ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [din3_ptr] "+r"(din3_ptr), \ + [pad_right] "+r" (right_pad_sub), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask] "w" (vmask_rp) + :"q4", "q5", "q6", "q8", "q9", \ + "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + + doutr0 += w_out; + doutr1 = doutr0 + w_out; + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } //! end of processing mid rows + + //! deal with bottom pad + din0_ptr = dr0; + din1_ptr = dr1; + if (size_pad_bottom == 2){ + din2_ptr = ptr_zero; + } else { + din2_ptr = dr2; + } +#ifdef __aarch64__ + // todo +#else + cnt = cnt_col; + asm volatile( + // process left pad + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][1] @mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][1] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][1] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]] @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][1] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][1] @mul weight 20, outr0\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 11, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 21, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vmov.u32 d31, #0 @ zero\n" + "vext.32 d12, d31, d16, #1 @ shift right r0\n" + "vext.32 d13, d16, d17, #1 @ shift right r0\n" + "vmla.f32 q4, q6, %e[wr0][0] @mul weight 02, outr0\n" + + "vext.32 d12, d31, d20, #1 @ shift right r1\n" + "vext.32 d13, d20, d21, #1 @ shift right r1\n" + "vmla.f32 q5, q6, %e[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %e[wr1][0] @mul weight 12, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d24, #1 @ shift right r2\n" + "vext.32 d13, d24, d25, #1 @ shift right r2\n" + "vmla.f32 q5, q6, %e[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %e[wr2][0] @mul weight 22, outr0\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "sub %[din0_ptr], #12 @ 1pad + 2 data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 data overlap\n" + "cmp %[bot_pad], #2 @ check if bottom pad is 2\n" + "beq start_bot_mid @ jump to next block\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + "add %[din2_ptr], #12 @ 1pad + 2 data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_bot_right @ jump to main loop start point\n" + "start_bot_mid: @ main loop start point\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][0] @mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][0] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]] @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][0] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @mul weight 20, outr0\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @mul weight 21, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 02, outr0\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 12, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! 
@ store result, add pointer\n" + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + + "cmp %[bot_pad], #2 @ check if bottom pad is 2\n" + "beq end_bot_mid @ jump to check point\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + "add %[din2_ptr], #16 @ point to 4 data ahead\n" + + "end_bot_mid: @ check point\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_bot_mid @ jump to main loop start point\n" + + // process right pad + "start_bot_right: @ right pad process\n" + "vmov.u32 d31, #0 @ zero buf\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vbif d17, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q4, q8, %e[wr0][0] @mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q5, q10, %e[wr0][0] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]] @ load din r2\n" + "vbif d25, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q12, %e[wr1][0] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @mul weight 20, outr0\n" + + "vbif d18, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @mul weight 01, outr0\n" + + "vbif d22, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @mul weight 11, outr0\n" + + "vbif d26, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @mul weight 21, outr0\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 02, outr0\n" + + "vext.32 q6, 
q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 12, outr0\n" + + "pld [%[dout_ptr1], #128] @ preload data\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "pld [%[dout_ptr2], #128] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + + "vld1.32 {d20-d21}, [%[dout_ptr1]] @ load dout r0\n" + "vld1.32 {d22-d23}, [%[dout_ptr2]] @ load dout r1\n" + + "vmvn.32 d24, d31 @ \n" + "vmvn.32 d25, d31 @ \n" + "vext.32 q13, q12, %q[mask], #3 @ shift mask right 1\n" + "vbif q8, q10, q13 @ bit select\n" + "vbif q9, q11, q13 @ bit select\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "cmp %[bot_pad], #2 @ check if bottom pad is 2\n" + "beq end @ jump to end point\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + "end: @ end\n" + + :[dout_ptr1] "+r"(doutr0), [dout_ptr2] "+r"(doutr1), \ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [pad_right] "+r"(right_pad_sub), \ + [bot_pad] "+r"(size_pad_bottom), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask] "w"(vmask_rp) + //, [test] "r"(data_test_ptr) + :"q4", "q5", "q6", "q8", "q9", \ + "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + //! end of processing bottom pad + } // end of processing channels + } // end of processing batchs +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + //! 
3x3s2 depthwise convolution, pad 1 is done implicit + int right_pad_idx[4] = {0, 0, 0, 0}; + int right_w_idx[4] = {2, 1, 2, 1}; + int size_pad_right = w_out * 2 - w_in; + int size_pad_bottom = h_out * 2 - h_in; + int size_right_remain = (((w_out + 1) >> 1) << 1) - w_out; + int cnt_col = ((w_out + 1) >> 1) - 2; + if (size_right_remain == 0 || size_pad_right == 0) { + right_pad_idx[0] = 1; + } + if (size_right_remain == 0) { + right_pad_idx[1] = 1; + if (size_pad_right == 0) { + right_pad_idx[2] = 1; + } + } + uint32x4_t mask_rp = vcgtq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(0)); + uint32x4_t mask_w = vcgtq_s32(vld1q_s32(right_w_idx), vdupq_n_s32(size_right_remain)); + + size_right_remain *= sizeof(float); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + wr0 = vsetq_lane_f32(0.f, wr0, 3); + wr1 = vsetq_lane_f32(0.f, wr1, 3); + wr2 = vsetq_lane_f32(0.f, wr2, 3); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + + float *doutr0 = dout_channel; + + //! 
top pad +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + asm volatile( + // process left pad + "pld [%[din0_ptr], #128] @ preload data\n" + "pld [%[din1_ptr], #128] @ preload data\n" + "vmov.u32 q11, #0 @ for left pad\n" + "vld1.32 {d20-d21}, [%[din0_ptr]]! @ load din r1\n" + "vext.32 q7, q11, q10, #3 @ shift right 1 data\n" + "vmul.f32 q8, q7, %q[wr1] @ mul weight 1, out0\n" + + "vext.32 q7, q10, q11, #1 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr1] @ mul weight 1, out1\n" + + "vld1.32 {d24-d25}, [%[din1_ptr]]! @ load din r2\n" + "vext.32 q7, q11, q12, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr2] @ mul weight 2, out0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q7, q12, q11, #1 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 2, out1\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #4 @ 1pad + 2 float data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid loop\n" + "blt s2_top_right @ jump to rightpad\n" + "s2_top_mid: @ main loop start point\n" + + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vmul.f32 q8, q10, %q[wr1] @ mul weight 1, out0\n" + + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr1] @ mul weight 1, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! 
@ load din r2\n" + "vmla.f32 q8, q12, %q[wr2] @ mul weight 2, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q7, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 2, outr1\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 1 float data overlap and 1 redundant\n" + "sub %[din1_ptr], #8 @ 1 float data overlap and 1 redundant\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne s2_top_mid @ jump to main loop start point\n" + + //! process right pad + "s2_top_right: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmul.f32 q8, q10, %q[wr1] @ mul weight 1, out0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vbif d22, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr1] @ mul weight 1, out1\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[dout_ptr1], #64] @ preload data\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! 
@ load din r2\n" + "vbif d25, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmla.f32 q8, q12, %q[wr2] @ mul weight 2, outr0\n" + + "vbif d26, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q12, q13, #2 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 2, outr1\n" + + "vld1.32 {d20}, [%[dout_ptr1]] @ load dout\n" + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + //"vst1.32 {d17}, [%[tmp_ptr]]! \n" + "vbif d17, d20, %e[mask_w] @ bit select\n" + //"vst1.32 {d17}, [%[tmp_ptr]] \n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [din0_ptr] "+r"(din0_ptr), \ + [din1_ptr] "+r"(din1_ptr), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask_din] "w" (mask_rp), \ + [mask_w] "w" (mask_w), \ + [pad_right] "r" (size_right_remain) + :"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif //__aarch64__ + + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + + //! process mid rows + for (int j = h_out - size_pad_bottom - 1; j > 0; j--) { +#ifdef __aarch64__ + // todo +#else + + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + cnt = cnt_col; + asm volatile( + // process left pad + "pld [%[din0_ptr], #128] @ preload data\n" + "pld [%[din1_ptr], #128] @ preload data\n" + "pld [%[din2_ptr], #128] @ preload data\n" + + "vmov.u32 q11, #0 @ for left pad\n" + "vld1.32 {d20-d21}, [%[din0_ptr]]! @ load din r0\n" + "vext.32 q7, q11, q10, #3 @ shift right 1 data\n" + "vmul.f32 q8, q7, %q[wr0] @ mul weight 00, outr0\n" + + "vext.32 q7, q10, q11, #1 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @ mul weight 00, outr1\n" + + "vld1.32 {d24-d25}, [%[din1_ptr]]! 
@ load din r1\n" + "vext.32 q7, q11, q12, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr1] @ mul weight 10, outr0\n" + + "pld [%[din0_ptr], #128] @ preload data\n" + + "vext.32 q7, q12, q11, #1 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @ mul weight 10, outr1\n" + + "pld [%[din1_ptr], #128] @ preload data\n" + + "vld1.32 {d28-d29}, [%[din2_ptr]]! @ load din r2\n" + "vext.32 q7, q11, q14, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr2] @ mul weight 20, outr0\n" + + "pld [%[din2_ptr], #128] @ preload data\n" + + "vext.32 q7, q14, q11, #1 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 20, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #4 @ 1pad + 2 float data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid loop\n" + "blt s2_mid_right @ jump to rightpad\n" + "s2_mid_mid: @ main loop start point\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q8, q10, %q[wr0] @ mul weight 00, outr0\n" + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr0] @ mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vmla.f32 q8, q12, %q[wr1] @ mul weight 10, outr0\n" + "vext.32 q7, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr1] @ mul weight 10, outr1\n" + + "pld [%[din0_ptr], #128] @ preload data\n" + + "vld1.32 {d28-d30}, [%[din2_ptr]]! 
@ load din r2\n" + "vmla.f32 q8, q14, %q[wr2] @mul weight 10, outr0\n" + + "pld [%[din1_ptr], #128] @ preload data\n" + + "vext.32 q7, q14, q15, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @mul weight 10, outr1\n" + + "pld [%[din2_ptr], #128] @ preload data\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne s2_mid_mid @ jump to main loop start point\n" + + // process right pad + "s2_mid_right: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmul.f32 q8, q10, %q[wr0] @mul weight 00, outr0\n" + "vbif d22, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q10, q11, #2 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vbif d25, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmla.f32 q8, q12, %q[wr1] @mul weight 10, outr0\n" + "vbif d26, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q12, q13, #2 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "pld [%[dout_ptr1], #64] @ preload ouput data\n" + + "vld1.32 {d28-d30}, [%[din2_ptr]]! 
@ load din r1\n" + "vbif d29, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmla.f32 q8, q14, %q[wr2] @mul weight 20, outr0\n" + "vbif d30, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q14, q15, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @mul weight 20, outr1\n" + + "vld1.32 {d20}, [%[dout_ptr1]] @ load dout\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vbif d17, d20, %e[mask_w] @ bit select\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [din0_ptr] "+r"(din0_ptr), \ + [din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr), \ + [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask_din] "w" (mask_rp), \ + [mask_w] "w" (mask_w), \ + [pad_right] "r" (size_right_remain) + :"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif //__aarch64__ + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } // end of process mid rows + + // process bottom pad if needed + if (size_pad_bottom) { +#ifdef __aarch64__ + // todo +#else + cnt = cnt_col; + asm volatile( + // process left pad + "pld [%[din0_ptr], #128] @ preload data\n" + "pld [%[din1_ptr], #128] @ preload data\n" + + "vmov.u32 q11, #0 @ for left pad\n" + "vld1.32 {d20-d21}, [%[din0_ptr]]! @ load din r0\n" + "vext.32 q7, q11, q10, #3 @ shift right 1 data\n" + "vmul.f32 q8, q7, %q[wr0] @mul weight 00, outr0\n" + "vext.32 q7, q10, q11, #1 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d25}, [%[din1_ptr]]! 
@ load din r1\n" + "vext.32 q7, q11, q12, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr1] @mul weight 10, outr0\n" + "vext.32 q7, q12, q11, #1 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #4 @ 1pad + 2 float data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid loop\n" + "blt s2_bot_right @ jump to rightpad\n" + "s2_bot_mid: @ main loop start point\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q8, q10, %q[wr0] @mul weight 00, outr0\n" + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vmla.f32 q8, q12, %q[wr1] @mul weight 10, outr0\n" + "vext.32 q7, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne s2_bot_mid @ jump to main loop start point\n" + + // process right pad + "s2_bot_right: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! 
@ load din r1\n" + "vbif d21, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmul.f32 q8, q10, %q[wr0] @mul weight 00, outr0\n" + "vbif d22, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q10, q11, #2 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vbif d25, d31, %e[mask_din] @ bit select, deal with right pad\n" + + "pld [%[dout_ptr1], #64] @ preload data\n" + + "vmla.f32 q8, q12, %q[wr1] @mul weight 10, outr0\n" + "vbif d26, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q12, q13, #2 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "vld1.32 {d20}, [%[dout_ptr1]] @ load dout\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vbif d17, d20, %e[mask_w] @ bit select\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [din0_ptr] "+r"(din0_ptr), \ + [din1_ptr] "+r"(din1_ptr), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask_din] "w" (mask_rp), \ + [mask_w] "w" (mask_w), \ + [pad_right] "r" (size_right_remain) + :"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif //__aarch64__ + } // end of process bottom pad + + } + } +} + +void conv_depthwise_3x3s1p1_bias_relu(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + const float zero[6] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + int tile_w = (w_in + 3) >> 2; + int tile_h = (h_in + 1) >> 1; + int w_in_twice = w_in << 1; + int cnt_col = tile_w - 2; + + int size_pad_right = 1 + (tile_w << 2) - w_in; + int size_pad_bottom = 1 + (tile_h << 1) - h_in; + + uint32x4_t vmask_rp = vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(size_pad_right)); + int right_pad_sub = (size_pad_right - 1) * sizeof(float); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float *din_channel = din_batch + i * size_in_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + float *dout_channel = dout_batch + i * size_out_channel; + + const float *dr0 = din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + const float *din3_ptr = dr3; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + float* ptr_zero = const_cast(zero); + + int h = 0; + //! deal with top pad + //! process +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + asm volatile( + //! process left pad + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + + "vld1.32 {d20-d22}, [%[din0_ptr]]! 
@ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][1] @ mul weight 00, outr1\n" + "vmul.f32 q4, q10, %e[wr1][1] @ mul weight 10, outr0\n" + + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][1] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][1] @ mul weight 20, outr0\n" + + + "vld1.32 {d28-d30}, [%[din2_ptr]]! @ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][1] @ mul weight 20, outr1\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @ mul weight 21, outr0\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @ mul weight 21, outr1\n" + + "vmov.u32 d31, #0 @ zero\n" + + "vext.32 d12, d31, d20, #1 @ shift right r1\n" + "vext.32 d13, d20, d21, #1 @ shift right r1\n" + "vmla.f32 q5, q6, %e[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %e[wr1][0] @ mul weight 12, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d24, #1 @ shift right r0\n" + "vext.32 d13, d24, d25, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr1][0] @ mul weight 12, outr1\n" + "vmla.f32 q4, q6, %e[wr2][0] @ mul weight 22, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d28, #1 @ shift right r0\n" + "vext.32 d13, d28, d29, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr2][0] @ mul weight 22, outr1\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_top_right_relu @ jump to right pad\n" + "start_top_mid_relu: @ main loop start point\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmul.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "vld1.32 {d28-d30}, [%[din2_ptr]]! @ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @ mul weight 21, outr0\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @ mul weight 21, outr1\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 12, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @ mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @ mul weight 22, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @ mul weight 22, outr1\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, 
q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_top_mid_relu @ jump to main loop start point\n" + + //! process right pad + "start_top_right_relu: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmul.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r2\n" + "vbif d25, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "vld1.32 {d28-d30}, [%[din2_ptr]]! 
@ load din r3\n" + "vbif d29, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vbif d22, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "vbif d26, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @ mul weight 21, outr0\n" + + "vbif d30, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @ mul weight 21, outr1\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 12, outr0\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @ mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @ mul weight 22, outr0\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @ mul weight 22, outr1\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + "pld [%[dout_ptr1], #128] @ preload data\n" + "vld1.32 {d20-d21}, [%[dout_ptr1]] @ load dout r0\n" + + "vmvn.32 d24, d31 @ \n" + "vmvn.32 d25, d31 @ \n" + "vext.32 q13, q12, %q[mask], #3 @ shift mask right 1\n" + "vbif q8, q10, q13 @ bit select\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + "sub %[dout_ptr2], %[dout_ptr2], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [dout_ptr2] "+r"(doutr1), \ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [pad_right] "+r" (right_pad_sub), \ + [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask] "w" (vmask_rp) + :"q4", "q5", "q6", "q8", "q9", \ + "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif //__aarch64__ + + //! after process, increase pointer + doutr0 += w_out; + doutr1 = doutr0 + w_out; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! end of process top row + + //! process mid row + for (h = tile_h - 2; h > 0; h--) { + + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + +#ifdef __aarch64__ + // todo +#else + cnt = cnt_col; + asm volatile( + //! process left pad + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "pld [%[din3_ptr], #192] @ preload data\n" + + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][1] @mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][1] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][1] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][1] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][1] @mul weight 20, outr0\n" + + "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][1] @mul weight 20, outr1\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 21, outr0\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @mul weight 21, outr1\n" + + "vmov.u32 d31, #0 @ zero\n" + "vext.32 d12, d31, d16, #1 @ shift right r0\n" + "vext.32 d13, d16, d17, #1 @ shift right r0\n" + "vmla.f32 q4, q6, %e[wr0][0] @mul weight 02, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d20, #1 @ shift right r1\n" + "vext.32 d13, d20, d21, #1 @ shift right r1\n" + "vmla.f32 q5, q6, %e[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %e[wr1][0] @mul weight 12, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d24, #1 @ shift right r0\n" + "vext.32 d13, d24, d25, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %e[wr2][0] @mul weight 22, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d28, #1 @ shift right r0\n" + "vext.32 d13, d28, d29, #1 @ shift right r0\n" + "vmla.f32 q5, q6, %e[wr2][0] @mul weight 22, outr1\n" + + "pld [%[din3_ptr], #192] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" + + //! process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_mid_right_relu @ @ jump to right pad\n" + "start_mid_mid_relu: @ main loop start point\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][0] @ mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @ mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @ mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @ mul weight 21, outr0\n" + + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @ mul weight 21, outr1\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @ mul weight 02, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @ mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @ mul weight 12, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 
12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @mul weight 22, outr1\n" + + "pld [%[din3_ptr], #192] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din3_ptr], #8 @ 2 float data overlap with previous data\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_mid_mid_relu @ jump to main loop start point\n" + + //! process right pad + "start_mid_right_relu: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vbif d17, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q4, q8, %e[wr0][0] @ mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q5, q10, %e[wr0][0] @ mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @ mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" + "vbif d25, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q12, %e[wr1][0] @ mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @ mul weight 20, outr0\n" + + "vld1.32 {d28-d30}, [%[din3_ptr]]! 
@ load din r3\n" + "vbif d29, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q14, %e[wr2][0] @ mul weight 20, outr1\n" + + "vbif d18, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @ mul weight 01, outr0\n" + + "vbif d22, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @ mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @ mul weight 11, outr0\n" + + "vbif d26, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @mul weight 21, outr0\n" + + "vbif d30, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q14, q15, #1 @ shift left r3\n" + "vmla.f32 q5, q6, %e[wr2][1] @mul weight 21, outr1\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 02, outr0\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 12, outr0\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "vext.32 q6, q14, q15, #2 @ shift right r3\n" + "vmla.f32 q5, q6, %f[wr2][0] @mul weight 22, outr1\n" + + "pld [%[dout_ptr1], #128] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + "vld1.32 {d20-d21}, [%[dout_ptr1]] @ load dout r0\n" + + "vmvn.32 d22, d31 @ \n" + "vmvn.32 d23, d31 @ \n" + "vext.32 q12, q11, %q[mask], #3 @ shift mask right 1\n" + "vbif q8, q10, q12 @ bit select\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! 
@ store result, add pointer\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + "sub %[dout_ptr2], %[dout_ptr2], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [dout_ptr2] "+r"(doutr1), \ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [din3_ptr] "+r"(din3_ptr), \ + [pad_right] "+r" (right_pad_sub), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask] "w" (vmask_rp) + :"q4", "q5", "q6", "q8", "q9", \ + "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + + doutr0 += w_out; + doutr1 = doutr0 + w_out; + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } //! end of processing mid rows + + //! deal with bottom pad + din0_ptr = dr0; + din1_ptr = dr1; + if (size_pad_bottom == 2){ + din2_ptr = ptr_zero; + } else { + din2_ptr = dr2; + } + //! process +#ifdef __aarch64__ + // todo +#else + cnt = cnt_col; + asm volatile( + //! process left pad + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][1] @mul weight 00, outr0\n" + + + "vld1.32 {d20-d22}, [%[din1_ptr]]! 
@ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][1] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][1] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]] @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][1] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][1] @mul weight 20, outr0\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 21, outr0\n" + + "vmov.u32 d31, #0 @ zero\n" + "vext.32 d12, d31, d16, #1 @ shift right r0\n" + "vext.32 d13, d16, d17, #1 @ shift right r0\n" + "vmla.f32 q4, q6, %e[wr0][0] @mul weight 02, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d20, #1 @ shift right r1\n" + "vext.32 d13, d20, d21, #1 @ shift right r1\n" + "vmla.f32 q5, q6, %e[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %e[wr1][0] @mul weight 12, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 d12, d31, d24, #1 @ shift right r2\n" + "vext.32 d13, d24, d25, #1 @ shift right r2\n" + "vmla.f32 q5, q6, %e[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %e[wr2][0] @mul weight 22, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "sub %[din0_ptr], #12 @ 1pad + 2 data overlap\n" + "sub %[din1_ptr], #12 @ 1pad + 2 data overlap\n" + "cmp %[bot_pad], #2 @ check if bottom pad is 2\n" + "beq start_bot_mid_relu @ jump to next block\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + "add %[din2_ptr], #12 @ 1pad + 2 data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid cols\n" + "blt start_bot_right_relu @ jump to right pad\n" + "start_bot_mid_relu: @ main loop start point\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q4, q8, %e[wr0][0] @mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vmul.f32 q5, q10, %e[wr0][0] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]] @ load din r2\n" + "vmla.f32 q5, q12, %e[wr1][0] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @mul weight 20, outr0\n" + + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @mul weight 01, outr0\n" + + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @mul weight 11, outr0\n" + + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @mul weight 21, outr0\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 02, outr0\n" + + "pld [%[din0_ptr], #192] @ preload data\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 12, outr0\n" + + "pld [%[din1_ptr], #192] @ preload data\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "pld [%[din2_ptr], #192] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! 
@ store result, add pointer\n" + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + + "cmp %[bot_pad], #2 @ check if bottom pad is 2\n" + "beq end_bot_mid_relu @ jump to check point\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + "add %[din2_ptr], #16 @ point to 4 data ahead\n" + + "end_bot_mid_relu: @ check point\n" + "subs %[cnt], #1 @ loop count minus 1\n" + "bne start_bot_mid_relu @ jump to main loop start point\n" + + // process right pad + "start_bot_right_relu: @ right pad process\n" + "vmov.u32 d31, #0 @ zero buf\n" + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" + "vbif d17, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q4, q8, %e[wr0][0] @mul weight 00, outr0\n" + + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask] @ bit select, deal with right pad\n" + "vmul.f32 q5, q10, %e[wr0][0] @mul weight 00, outr1\n" + "vmla.f32 q4, q10, %e[wr1][0] @mul weight 10, outr0\n" + + "vld1.32 {d24-d26}, [%[din2_ptr]] @ load din r2\n" + "vbif d25, d31, %e[mask] @ bit select, deal with right pad\n" + "vmla.f32 q5, q12, %e[wr1][0] @mul weight 10, outr1\n" + "vmla.f32 q4, q12, %e[wr2][0] @mul weight 20, outr0\n" + + "vbif d18, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q8, q9, #1 @ shift left r0\n" + "vmla.f32 q4, q6, %e[wr0][1] @mul weight 01, outr0\n" + + "vbif d22, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q10, q11, #1 @ shift left r1\n" + "vmla.f32 q5, q6, %e[wr0][1] @mul weight 01, outr1\n" + "vmla.f32 q4, q6, %e[wr1][1] @mul weight 11, outr0\n" + + "vbif d26, d31, %f[mask] @ bit select, deal with right pad\n" + "vext.32 q6, q12, q13, #1 @ shift left r2\n" + "vmla.f32 q5, q6, %e[wr1][1] @mul weight 11, outr1\n" + "vmla.f32 q4, q6, %e[wr2][1] @mul weight 21, outr0\n" + + "vext.32 q6, q8, q9, #2 @ shift left r0\n" + "vmla.f32 q4, q6, %f[wr0][0] @mul weight 02, 
outr0\n" + + "vext.32 q6, q10, q11, #2 @ shift left r1\n" + "vmla.f32 q5, q6, %f[wr0][0] @mul weight 02, outr1\n" + "vmla.f32 q4, q6, %f[wr1][0] @mul weight 12, outr0\n" + + "pld [%[dout_ptr1], #128] @ preload data\n" + + "vext.32 q6, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q5, q6, %f[wr1][0] @mul weight 12, outr1\n" + "vmla.f32 q4, q6, %f[wr2][0] @mul weight 22, outr0\n" + + "pld [%[dout_ptr2], #128] @ preload data\n" + + "vadd.f32 q8, q4, %q[bias] @ add bias \n" + "vadd.f32 q9, q5, %q[bias] @ add bias \n" + + "vmov.u32 q4, #0 @ dump zero to q4 for relu\n" + "vmax.f32 q8, q8, q4 @ relu\n" + "vmax.f32 q9, q9, q4 @ relu\n" + + + "vld1.32 {d20-d21}, [%[dout_ptr1]] @ load dout r0\n" + "vld1.32 {d22-d23}, [%[dout_ptr2]] @ load dout r1\n" + + "vmvn.32 d24, d31 @ \n" + "vmvn.32 d25, d31 @ \n" + "vext.32 q13, q12, %q[mask], #3 @ shift mask right 1\n" + "vbif q8, q10, q13 @ bit select\n" + "vbif q9, q11, q13 @ bit select\n" + + "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "cmp %[bot_pad], #2 @ check if bottom pad is 2\n" + "beq end_relu @ jump to end point\n" + "vst1.32 {d18-d19}, [%[dout_ptr2]]! @ store result, add pointer\n" + "end_relu: @ end\n" + + :[dout_ptr1] "+r"(doutr0), [dout_ptr2] "+r"(doutr1), \ + [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), \ + [din2_ptr] "+r"(din2_ptr), [pad_right] "+r"(right_pad_sub), \ + [bot_pad] "+r"(size_pad_bottom), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask] "w"(vmask_rp) + :"q4", "q5", "q6", "q8", "q9", \ + "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + //! 
end of processing bottom pad + } // end of processing channels + } // end of processing batchs +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, with reulu + */ +void conv_depthwise_3x3s2p1_bias_relu(float* dout, const float* din, \ + const float* weights, const float* bias, bool flag_bias, \ + const int num, const int ch_in, const int h_in, const int w_in, \ + const int h_out, const int w_out) { + //! 3x3s2 depthwise convolution, pad 1 is done implicit + int right_pad_idx[4] = {0, 0, 0, 0}; + int right_w_idx[4] = {2, 1, 2, 1}; + int size_pad_right = w_out * 2 - w_in; + int size_pad_bottom = h_out * 2 - h_in; + int size_right_remain = (((w_out + 1) >> 1) << 1) - w_out; + int cnt_col = ((w_out + 1) >> 1) - 2; + + if (size_right_remain == 0 || size_pad_right == 0) { + right_pad_idx[0] = 1; + } + if (size_right_remain == 0) { + right_pad_idx[1] = 1; + if (size_pad_right == 0) { + right_pad_idx[2] = 1; + } + } + uint32x4_t mask_rp = vcgtq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(0)); + uint32x4_t mask_w = vcgtq_s32(vld1q_s32(right_w_idx), vdupq_n_s32(size_right_remain)); + + size_right_remain *= sizeof(float); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + wr0 = vsetq_lane_f32(0.f, wr0, 3); + wr1 = vsetq_lane_f32(0.f, wr1, 3); + wr2 = vsetq_lane_f32(0.f, wr2, 3); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + const float *dr0 = 
din_channel; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + + const float *din0_ptr = dr0; + const float *din1_ptr = dr1; + const float *din2_ptr = dr2; + + float *doutr0 = dout_channel; + + float32x4_t vzero = vdupq_n_f32(0.f); + + //! top pad + +#ifdef __aarch64__ + // todo +#else + int cnt = cnt_col; + asm volatile( + // process left pad + "pld [%[din0_ptr], #128] @ preload data\n" + "pld [%[din1_ptr], #128] @ preload data\n" + "vmov.u32 q11, #0 @ for left pad\n" + "vld1.32 {d20-d21}, [%[din0_ptr]]! @ load din r1\n" + "vext.32 q7, q11, q10, #3 @ shift right 1 data\n" + "vmul.f32 q8, q7, %q[wr1] @ mul weight 1, out0\n" + + "vext.32 q7, q10, q11, #1 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr1] @ mul weight 1, out1\n" + + "vld1.32 {d24-d25}, [%[din1_ptr]]! @ load din r2\n" + "vext.32 q7, q11, q12, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr2] @ mul weight 2, out0\n" + + "vext.32 q7, q12, q11, #1 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 2, out1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #4 @ 1pad + 2 float data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid loop\n" + "blt s2_top_right_relu @ jump to rightpad\n" + "s2_top_mid_relu: @ main loop start point\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vmul.f32 q8, q10, %q[wr1] @ mul weight 1, out0\n" + + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr1] @ mul weight 1, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! 
@ load din r2\n" + "vmla.f32 q8, q12, %q[wr2] @ mul weight 2, outr0\n" + + "vext.32 q7, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 2, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 1 float data overlap and 1 redundant\n" + "sub %[din1_ptr], #8 @ 1 float data overlap and 1 redundant\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne s2_top_mid_relu @ jump to main loop start point\n" + + //! process right pad + "s2_top_right_relu: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmul.f32 q8, q10, %q[wr1] @ mul weight 1, out0\n" + + "vbif d22, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr1] @ mul weight 1, out1\n" + + "pld [%[dout_ptr1], #64] @ preload data\n" + "vld1.32 {d24-d26}, [%[din1_ptr]]! 
@ load din r2\n" + "vbif d25, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmla.f32 q8, q12, %q[wr2] @ mul weight 2, outr0\n" + + "vbif d26, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q12, q13, #2 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 2, outr1\n" + + "vld1.32 {d20}, [%[dout_ptr1]] @ load dout\n" + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vbif d17, d20, %e[mask_w] @ bit select\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [din0_ptr] "+r"(din0_ptr), \ + [din1_ptr] "+r"(din1_ptr), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask_din] "w" (mask_rp), \ + [mask_w] "w" (mask_w), [vzero] "w" (vzero), \ + [pad_right] "r" (size_right_remain) + :"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif //__aarch64__ + + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + //! end of top pad + + //! process mid rows + for (int j = h_out - size_pad_bottom - 1; j > 0; j--) { +#ifdef __aarch64__ + // todo +#else + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + cnt = cnt_col; + asm volatile( + // process left pad + "pld [%[din0_ptr], #128] @ preload data\n" + "pld [%[din1_ptr], #128] @ preload data\n" + "pld [%[din2_ptr], #128] @ preload data\n" + + "vmov.u32 q11, #0 @ for left pad\n" + "vld1.32 {d20-d21}, [%[din0_ptr]]! @ load din r0\n" + "vext.32 q7, q11, q10, #3 @ shift right 1 data\n" + "vmul.f32 q8, q7, %q[wr0] @ mul weight 00, outr0\n" + + "vext.32 q7, q10, q11, #1 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @ mul weight 00, outr1\n" + + "vld1.32 {d24-d25}, [%[din1_ptr]]! 
@ load din r1\n" + "vext.32 q7, q11, q12, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr1] @ mul weight 10, outr0\n" + + "vext.32 q7, q12, q11, #1 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @ mul weight 10, outr1\n" + + "vld1.32 {d28-d29}, [%[din2_ptr]]! @ load din r2\n" + "vext.32 q7, q11, q14, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr2] @ mul weight 20, outr0\n" + + "vext.32 q7, q14, q11, #1 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @ mul weight 20, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din2_ptr], #4 @ 1pad + 2 float data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid loop\n" + "blt s2_mid_right_relu @ jump to rightpad\n" + "s2_mid_mid_relu: @ main loop start point\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "pld [%[din2_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q8, q10, %q[wr0] @ mul weight 00, outr0\n" + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr0] @ mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vmla.f32 q8, q12, %q[wr1] @ mul weight 10, outr0\n" + "vext.32 q7, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr1] @ mul weight 10, outr1\n" + + "vld1.32 {d28-d30}, [%[din2_ptr]]! 
@ load din r2\n" + "vmla.f32 q8, q14, %q[wr2] @mul weight 10, outr0\n" + "vext.32 q7, q14, q15, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @mul weight 10, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din2_ptr], #8 @ 2 float data overlap with previous data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne s2_mid_mid_relu @ jump to main loop start point\n" + + // process right pad + "s2_mid_right_relu: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmul.f32 q8, q10, %q[wr0] @mul weight 00, outr0\n" + "vbif d22, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q10, q11, #2 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vbif d25, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmla.f32 q8, q12, %q[wr1] @mul weight 10, outr0\n" + "vbif d26, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q12, q13, #2 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "pld [%[dout_ptr1], #64] @ preload ouput data\n" + + "vld1.32 {d28-d30}, [%[din2_ptr]]! 
@ load din r1\n" + "vbif d29, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmla.f32 q8, q14, %q[wr2] @mul weight 20, outr0\n" + "vbif d30, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q14, q15, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr2] @mul weight 20, outr1\n" + + "vld1.32 {d20}, [%[dout_ptr1]] @ load dout\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vbif d17, d20, %e[mask_w] @ bit select\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [din0_ptr] "+r"(din0_ptr), \ + [din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr), \ + [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias),[mask_din] "w" (mask_rp), \ + [mask_w] "w" (mask_w), [vzero] "w" (vzero), \ + [pad_right] "r" (size_right_remain) + :"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif //__aarch64__ + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } // end of process mid rows + + // process bottom pad if needed + if (size_pad_bottom) { +#ifdef __aarch64__ + // todo +#else + cnt = cnt_col; + asm volatile( + // process left pad + "pld [%[din0_ptr], #128] @ preload data\n" + "pld [%[din1_ptr], #128] @ preload data\n" + + "vmov.u32 q11, #0 @ for left pad\n" + "vld1.32 {d20-d21}, [%[din0_ptr]]! @ load din r0\n" + "vext.32 q7, q11, q10, #3 @ shift right 1 data\n" + "vmul.f32 q8, q7, %q[wr0] @mul weight 00, outr0\n" + "vext.32 q7, q10, q11, #1 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d25}, [%[din1_ptr]]! 
@ load din r1\n" + "vext.32 q7, q11, q12, #3 @ shift right 1 data\n" + "vmla.f32 q8, q7, %q[wr1] @mul weight 10, outr0\n" + "vext.32 q7, q12, q11, #1 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! @ store result, add pointer\n" + + "sub %[din0_ptr], #4 @ 1pad + 2 float data overlap\n" + "sub %[din1_ptr], #4 @ 1pad + 2 float data overlap\n" + + // process mid cols + "cmp %[cnt], #1 @ check whether has mid loop\n" + "blt s2_bot_right_relu @ jump to rightpad\n" + "s2_bot_mid_relu: @ main loop start point\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r0\n" + "vmul.f32 q8, q10, %q[wr0] @mul weight 00, outr0\n" + "vext.32 q7, q10, q11, #2 @ shift left r1\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vmla.f32 q8, q12, %q[wr1] @mul weight 10, outr0\n" + "vext.32 q7, q12, q13, #2 @ shift left r2\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! 
@ store result, add pointer\n" + + "sub %[din0_ptr], #8 @ 2 float data overlap with previous data\n" + "sub %[din1_ptr], #8 @ 2 float data overlap with previous data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne s2_bot_mid_relu @ jump to main loop start point\n" + + // process right pad + "s2_bot_right_relu: @ right pad entry\n" + "vmov.u32 d31, #0 @ zero buf\n" + "pld [%[din0_ptr], #192] @ preload data\n" + "pld [%[din1_ptr], #192] @ preload data\n" + "vld1.32 {d20-d22}, [%[din0_ptr]]! @ load din r1\n" + "vbif d21, d31, %e[mask_din] @ bit select, deal with right pad\n" + "vmul.f32 q8, q10, %q[wr0] @mul weight 00, outr0\n" + "vbif d22, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q10, q11, #2 @ shift left r0\n" + "vmul.f32 q9, q7, %q[wr0] @mul weight 00, outr1\n" + + "vld1.32 {d24-d26}, [%[din1_ptr]]! @ load din r1\n" + "vbif d25, d31, %e[mask_din] @ bit select, deal with right pad\n" + + "pld [%[dout_ptr1], #64] @ preload data\n" + + "vmla.f32 q8, q12, %q[wr1] @mul weight 10, outr0\n" + "vbif d26, d31, %f[mask_din] @ bit select, deal with right pad\n" + "vext.32 q7, q12, q13, #2 @ shift left r1\n" + "vmla.f32 q9, q7, %q[wr1] @mul weight 10, outr1\n" + + "vld1.32 {d20}, [%[dout_ptr1]] @ load dout\n" + + "vpadd.f32 d22, d16, d17 @ pair add of out0 \n" + "vpadd.f32 d23, d18, d19 @ pair add of out1 \n" + "vpadd.f32 d16, d22, d23 @ get finnal out0,1\n" + + "vadd.f32 d17, d16, %e[bias] @ add bias \n" + "vmax.f32 d17, d17, %e[vzero] @ relu\n" + + "vbif d17, d20, %e[mask_w] @ bit select\n" + + "vst1.32 {d17}, [%[dout_ptr1]]! 
@ store result, add pointer\n" + + "sub %[dout_ptr1], %[dout_ptr1], %[pad_right] @ sub \n" + + :[dout_ptr1] "+r"(doutr0), [din0_ptr] "+r"(din0_ptr), \ + [din1_ptr] "+r"(din1_ptr), [cnt] "+r"(cnt) + :[wr0] "w"(wr0), [wr1] "w"(wr1), [wr2] "w"(wr2), \ + [bias] "w"(wbias), [mask_din] "w" (mask_rp), \ + [mask_w] "w" (mask_w), [vzero] "w" (vzero), \ + [pad_right] "r" (size_right_remain) + :"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif //__aarch64__ + } // end of process bottom pad + } + } +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/impl/conv_arm_impl.cpp b/saber/lite/funcs/neon/impl/conv_arm_impl.cpp new file mode 100644 index 000000000..c1b9777e8 --- /dev/null +++ b/saber/lite/funcs/neon/impl/conv_arm_impl.cpp @@ -0,0 +1,321 @@ +#include "saber/lite/funcs/neon/impl/conv_arm_impl.h" +#ifdef USE_ARM_PLACE +#include "saber/lite/funcs/neon/impl/sgemv_arm.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ +/** + * \brief neon implementation to add bias + * @param tensor + * @param bias + * @param channel + * @param channel_size + */ +void fill_bias(float* tensor, const float* bias, int channel, int channel_size) { + + float* data = tensor; + + for (int j = 0; j < channel; ++j) { + float32x4_t vdata = vdupq_n_f32(bias[j]); + int i = 0; + for (; i < channel_size - 3; i += 4) { + vst1q_f32(data + i, vdata); + } + for (; i < channel_size; i++) { + data[i] = bias[j]; + } + data += channel_size; + } +} +/** + * \brief basic direct convolution function + */ +void conv_arm_basic(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { + + const int 
size_kernel = kernel_h * kernel_w; + + int kernel_ext_w = (kernel_w - 1) * dila_w + 1; + int kernel_ext_h = (kernel_h - 1) * dila_h + 1; + + const int ch_out_g = chout / group; + const int ch_in_g = chin / group; + const int size_in_channel = win * hin; + const int size_in_batch = size_in_channel * chin; + const int size_out_channel = wout * hout; + const int size_out_batch = size_out_channel * chout; + + for (int b = 0; b < num; ++b) { + float *outptr_batch = dout + b * size_out_batch; + const float* data_in_batch = din + b * size_in_batch; +#pragma omp parallel for collapse(2) + for (int g = 0; g < group; ++g) { + for (int c = 0; c < ch_out_g; ++c) { + const float *inptr_group = data_in_batch + g * ch_in_g * size_in_channel; + float *outptr_ch = outptr_batch + (g * ch_out_g + c) * size_out_channel; + const float *weight_ch = weights + (g * ch_out_g + c) * ch_in_g * size_kernel; + + float bias_value = flag_bias? bias[g * ch_out_g + c] : 0.f; + fill_bias(outptr_ch, &bias_value, 1, wout * hout); + + for (int i = 0; i < hout; ++i) { + for (int j = 0; j < wout; ++j) { + + const float *weight_ch_in = weight_ch; + + int hstart = i * stride_h - pad_h; + int wstart = j * stride_w - pad_w; + int hend = std::min(hstart + kernel_ext_h, hin); + int wend = std::min(wstart + kernel_ext_w, win); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + + int khstart = hend < kernel_ext_h? (kernel_ext_h - hend) / dila_h : 0; + int kwstart = wend < kernel_ext_w? 
(kernel_ext_w - wend) / dila_w : 0; + + //printf("channel: %d, index: %d, %d, %d, %d, %d, %d\n", c, hstart, wstart, hend, wend, khstart, kwstart); + const float* inptr_ch = inptr_group + hstart * win + wstart; + + for (int k = 0; k < ch_in_g; ++k) { + const float* inptr_kernel = inptr_ch; + int khidx = khstart; + for (int idxh = hstart; idxh < hend; idxh += dila_h, khidx++) { + const float* inptr_kernel_w = inptr_kernel; + int kwidx = kwstart; + for (int idxw = wstart; idxw < wend; idxw += dila_w, kwidx++) { + outptr_ch[j] += weight_ch_in[khidx * kernel_w + kwidx] * inptr_kernel_w[0]; + inptr_kernel_w += dila_w; + } + inptr_kernel += dila_h * win; + } + inptr_ch += size_in_channel; + weight_ch_in += size_kernel; + } + if (flag_relu) { + outptr_ch[j] = outptr_ch[j] > 0? outptr_ch[j] : 0.f; + } + } + outptr_ch += wout; + } + } + } + } +} + +/** + * \brief inline funcs used in im2col + * @param a + * @param b + * @return + */ +inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { + return static_cast(a) < static_cast(b); +} + +/** + * \brief normal im2col function for gemm conv + * @tparam dtype + * @param data_im + * @param channels + * @param height + * @param width + * @param kernel_size + * @param pad + * @param stride + * @param data_col + */ +template +void im2col(const Dtype* data_im, const int channels, const int height, const int width, \ + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, \ + const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, \ + Dtype* data_col) { + + const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channel_size = height * width; + for (int channel = channels; channel--; data_im += channel_size) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int 
input_row = -pad_h + kernel_row * dilation_h; + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + for (int output_cols = output_w; output_cols; output_cols--) { + *(data_col++) = 0; + } + } else { + int input_col = -pad_w + kernel_col * dilation_w; + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + *(data_col++) = data_im[input_row * width + input_col]; + } else { + *(data_col++) = 0; + } + input_col += stride_w; + } + } + input_row += stride_h; + } + } + } + } +} + +/** + * \brief specify im2col for kernel size 1x1, and stride = 2 + * @param data_im + * @param channels + * @param height + * @param width + * @param data_col + */ +void im2col1x1s2(const float* data_im, const int channels, const int height, \ + const int width, float* data_col){ + float32x4x2_t vdin; + int size = height * width; + int width_out = (width - 1) / 2 + 1; + int height_out = (height - 1) / 2 + 1; + int size_out = width_out * height_out; +//#pragma omp parallel for + for (int i = 0; i < channels; ++i) { + float* dout = data_col + i * size_out; + const float* din = data_im + i * size; + for (int j = 0; j < height - 1; j += 2) { + + const float* dinh = din + j * width; + int k = 0; + for (; k < width - 7; k += 8) { + vdin = vld2q_f32(dinh + k); + vst1q_f32(dout, vdin.val[0]); + dout += 4; + } + for (; k < width; k += 2) { + *(dout++) = *(din + k); + } + } + } +} + +/** + * \brief convolution function for kernel size 1x1, stride size 1, gemm implementation + */ +void conv1x1s1_gemm(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { + + int channel_size_out = wout * hout; + int 
channel_size_in = win * hin; + + const int m = chout / group; + const int n = hout * wout; + const int k = chin / group; + + int weights_size_per_group = chout * chin / (group * group); + + //! use gemv when the output channel size = 1 + if (n == 1) { + for (int b = 0; b < num; ++b) { + for (int g = 0; g < group; ++g) { + float* dout_group = dout + (b * chout + g * m) * channel_size_out; + const float* din_group = din + (b * chin + g * k)* channel_size_in; + const float* weights_group = weights + g * weights_size_per_group; + const float* bias_group = bias + g * m; + if (flag_bias){ + if (flag_relu) { + sgemv_bias_relu(false, m, k, weights_group, din_group, dout_group, bias_group); + } else { + sgemv_bias(false, m, k, weights_group, din_group, dout_group, bias_group); + } + + } else { + if (flag_relu) { + sgemv_relu(false, m, k, weights_group, din_group, dout_group); + } else { + sgemv(false, m, k, weights_group, din_group, dout_group); + } + } + } + } + + } else { + for (int b = 0; b < num; ++b) { + // dC + for (int g = 0; g < group; ++g) { + float* dout_group = dout + (b * chout + g * m) * channel_size_out; + const float* din_group = din + (b * chin + g * k) * channel_size_in; + const float* weights_group = weights + g * weights_size_per_group; + const float* bias_group = bias + g * m; + float beta = 0.f; + if (flag_bias) { + fill_bias(dout_group, bias_group, m, wout * hout); + beta = 1.f; + } + gemmer(weights_group, k, din_group, n, dout_group, n, 1.f, beta, flag_relu); + } + + } + } +} + +/** + * \brief convolution function for kernel size 3x3, stride size 2, gemm implementation + */ +void conv_im2col_gemm(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { + + const int m = 
chout / group; + const int n = hout * wout; + const int k = chin * kernel_h * kernel_w / group; + + const int chin_per_group = chin / group; + + int channel_size_out = wout * hout; + int channel_size_in = win * hin; + + int weights_size_per_group = chout * chin * kernel_w * kernel_h / (group * group); + + for (int b = 0; b < num; ++b) { + // dC + for (int g = 0; g < group; ++g) { + float* dout_group = dout + (b * chout + g * m) * channel_size_out; + const float* din_group = din + (b * chin + g * chin_per_group) * channel_size_in; + const float* weights_group = weights + g * weights_size_per_group; + const float* bias_group = bias + g * m; + float* dB = (float*)work_space; + if (kernel_w == 1 && pad_w == 0) { + im2col1x1s2(din_group, chin_per_group, hin, win, dB); + } else { + im2col(din_group, chin_per_group, hin, win, kernel_h, kernel_w, \ + pad_h, pad_w, stride_h, stride_w, dila_h, dila_w, dB); + } + float beta = 0.f; + if (flag_bias) { + fill_bias(dout_group, bias_group, m, wout * hout); + beta = 1.f; + } + + gemmer(weights_group, k, dB, n, dout_group, n, 1.f, beta, flag_relu); + } + } +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/impl/conv_arm_impl.h b/saber/lite/funcs/neon/impl/conv_arm_impl.h new file mode 100644 index 000000000..541048648 --- /dev/null +++ b/saber/lite/funcs/neon/impl/conv_arm_impl.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ +#ifndef ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_CONV_ARM_IMPL_H +#define ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_CONV_ARM_IMPL_H + +#include "saber/lite/core/tensor_lite.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/neon/impl/sgemm_arm.h" +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void conv_arm_basic(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space); + +void conv_3x3s1_direct(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space); + +void conv1x1s1_gemm(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space); + +void conv_im2col_gemm(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space); + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias + */ +void conv_depthwise_3x3(const float* din, float* 
dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space); + +void conv_arm_winograd3x3(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space); + +void winograd_transform_weights(float* dout, const float* din, int ch_out, \ + int ch_in, void* work_space); + +void fill_bias(float* tensor, const float* bias, int channel, int channel_size); +#if 0 +class ConvWinogradF63 { +public: + ConvWinogradF63(); + ~ConvWinogradF63(); + bool init(const size_t l1_cache, const size_t l2_cache, \ + const int chout, const int chin, const int hin, const int win, const int threads = 4); + bool operator()(const float* trans_weights, const float* din, float* dout, \ + void* workspace = nullptr); + +private: + + unsigned int _k_block{0}; + unsigned int _x_block{0}; + unsigned int _Mround{0}; + + unsigned int _loop_count{0}; + unsigned int _cblock_size{0}; + int _thread_num{1}; + + void* _work_space_ptr{nullptr}; + + size_t _work_size{0}; + size_t _a_worksize{0}; + size_t _b_worksize{0}; + load_data _load_a; + load_data _load_b; + + bool _init_flag{false}; +}; +#endif + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_CONV_ARM_IMPL_H diff --git a/saber/lite/funcs/neon/impl/conv_arm_winograd.cpp b/saber/lite/funcs/neon/impl/conv_arm_winograd.cpp new file mode 100644 index 000000000..848d977f5 --- /dev/null +++ b/saber/lite/funcs/neon/impl/conv_arm_winograd.cpp @@ -0,0 +1,431 
@@ +#include "saber/lite/funcs/neon/impl/conv_arm_impl.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void transpose(float* data_out, const float* data_in, int w_in, int h_in); +void transform_input_f6x6(float* dout, const float* din); +void transform_output_f6x6(float* output, const float* din, float bias); +#if 0 +ConvWinogradF63::ConvWinogradF63() { + +} + +ConvWinogradF63::~ConvWinogradF63() { + +} + +bool ConvWinogradF63::init(const size_t l1_cache, const size_t l2_cache, \ + const int chout, const int chin, const int hin, \ + const int win, const int threads) { + + return true; +} + +bool ConvWinogradF63::operator()(const float *trans_weights, const float *din, \ + float *dout, void *workspace) { + + return true; +} +#endif +void conv_arm_winograd3x3(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space) { + + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + + //! transform input + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = chin > chout? chin : chout; + + //! tmp data buffer for input transform + float* tmp_data1 = (float*)work_space; + //! tmp data buffer for dot mul + float* tmp_data2 = tmp_data1 + size_trans_channel * max_ch; + + //SaberTimer t1; + //Context ctx1; + + for (int i = 0; i < num; ++i) { + + const float* din_batch = din + i * chin * size_in_channel; + float* dout_batch = dout + i * chout * size_out_channel; + + //t1.start(ctx1); + //! 
transform input Bt * data * B +#pragma omp parallel for + for (int j = 0; j < chin; ++j) { + + const float* din_channel = din_batch + j * size_in_channel; + float* data_trans_channel = tmp_data1 + j * size_trans_channel; + + for (int h = 0; h < tile_h; h++) { + + for (int w = 0; w < tile_w; w ++) { + //! prepare data 8x8 + //! row 8 + float data_in_tmp[8][8] = {0.f}; + //memset(data_in_tmp[0], 0, sizeof(float) * 64); + for (int j = 0; j < 8; ++j) { + int start_row = h * 6 + j - pad_h; + if (start_row >= 0 && start_row < hin){ + for (int k = 0; k < 8; ++k) { + int start_col = w * 6 + k - pad_w; + if (start_col >= 0 && start_col < win) { + data_in_tmp[j][k] = din_channel[start_row * win + start_col]; + } + } + } + } + transform_input_f6x6(data_trans_channel, data_in_tmp[0]); + data_trans_channel += 64; + } + } + } + //! end of transform input + +#if 1 + //////////////////////////////////////////////////////////////////////////////// + //! dot mul + //! transpose input, convert from ch_in * tile_h * tile_w * 64 to + //! 64 * ch_in * tile_h * tile_w + int stride_a = chout * chin; + int stride_b = chin * size_tile; + int stride_c = chout * size_tile; + transpose(tmp_data2, tmp_data1, 64, stride_b); + + //t1.end(ctx1); + //LOG(INFO) << "winograd conv transform input time: " << t1.get_average_ms(); + + //t1.clear(); + //t1.start(ctx1); + + //! gemm +//#pragma omp parallel for + for (int l = 0; l < 64; ++l) { + const float* ptr_a = weights + l * stride_a; + const float* ptr_b = tmp_data2 + l * stride_b; + float* ptr_c = tmp_data1 + l * stride_c; + gemmer(ptr_a, chin, ptr_b, size_tile, ptr_c, size_tile, 1.f, 0.f, false); + } + + //! transpose output, convert from 64 * ch_out * tile_h * tile_w to + //! ch_out * tile_h * tile_w * 64 + transpose(tmp_data2, tmp_data1, stride_c, 64); + //! 
end of dot mul +#endif + //t1.end(ctx1); + //LOG(INFO) << "winograd conv dot mul time: " << t1.get_average_ms(); + + + //t1.clear(); + //t1.start(ctx1); +#if 1 + /////////////////////////////////////////////////////////////////////////////// + //! transform output +#pragma omp parallel for + for (int i = 0; i < chout; ++i) { + + float bias_value = flag_bias? bias[i] : 0.f; + float* dout_tmp = tmp_data2 + i * size_trans_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + for (int h = 0; h < tile_h; ++h) { + for (int w = 0; w < tile_w; ++w) { + + float out_tmp[6][6]; + + transform_output_f6x6(out_tmp[0], dout_tmp, bias_value); + dout_tmp += 64; + + for (int j = 0; j < 6; ++j) { + int end_row = h * 6 + j; + if (end_row < hout) { + for (int k = 0; k < 6; ++k) { + int end_col = w * 6 + k; + if (end_col < wout){ + dout_channel[end_row * wout + end_col] = out_tmp[j][k]; + } + } + } + } + } + } + } + //! end of transform output +#endif + //t1.end(ctx1); + //LOG(INFO) << "winograd conv transform output time: " << t1.get_average_ms(); + } +} + +/** + * \brief transpose with arm neon optimization + * @param data_out + * @param data_in + * @param w_in + * @param h_in + */ +void transpose(float* data_out, const float* data_in, int w_in, int h_in) { + + int nw = w_in >> 2; + int nh = h_in >> 2; +#pragma omp parallel for + for (int i = 0; i < nh; i++) { + for (int j = 0; j < nw; j++) { + const float *ptr = data_in + i * 4 * w_in + j * 4; + float *outptr = data_out + j * 4 * h_in + i * 4; + + const float *in0 = ptr; + const float *in1 = in0 + w_in; + const float *in2 = in1 + w_in; + const float *in3 = in2 + w_in; + + float *out0 = outptr; + float *out1 = out0 + h_in; + float *out2 = out1 + h_in; + float *out3 = out2 + h_in; +#ifdef __aarch64__ +#else + asm( "vld1.32 {d0, d1}, [%[in0]] \n" + "vld1.32 {d2, d3}, [%[in1]] \n" + "vld1.32 {d4, d5}, [%[in2]] \n" + "vld1.32 {d6, d7}, [%[in3]] \n" + "vtrn.32 q0, q1 \n" + "vtrn.32 q2, q3 \n" + "vswp d1, d4 \n" + "vswp 
d3, d6 \n" + "vst1.32 {d0, d1}, [%[out0]] \n" + "vst1.32 {d2, d3}, [%[out1]] \n" + "vst1.32 {d4, d5}, [%[out2]] \n" + "vst1.32 {d6, d7}, [%[out3]] \n" + : + : [out0] "r" (out0), [out1] "r" (out1), [out2] "r" (out2), [out3] "r" (out3), + [in0] "r" (in0), [in1] "r" (in1), [in2] "r" (in2), [in3] "r" (in3) + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); +#endif //__aarch64__ + } + } + //! process remains + for (int i = 0; i < nw * 4; i++) { + for (int j = nh * 4; j < h_in; j++) { + const float *ptr = data_in + j * w_in + i; + float *outptr = data_out + i * h_in + j; + *outptr = *ptr; + } + } + for (int i = nw * 4; i < w_in; i++) { + for (int j = 0; j < h_in; j++) { + const float *ptr = data_in + w_in * j + i; + float *outptr = data_out + i * h_in + j; + *outptr = *ptr; + } + } +} + +/** + * \brief winograd transform conv3x3 weights, f63 + * this is done in op initialization or creation, only do once + * dout = G * g * GT, where G is the transform coeff, g is the input weights + * @param dout + * @param din + * @param ch_out + * @param ch_in + * @param work_space + */ +void winograd_transform_weights(float* dout, const float* din, int ch_out, \ + int ch_in, void* work_space) { + const float coeff[8][3] = { + { 1.0f, 0.0f, 0.0f}, + { -2.0f / 9, -2.0f / 9, -2.0f / 9}, + { -2.0f / 9, 2.0f / 9, -2.0f / 9}, + { 1.0f / 90, 1.0f / 45, 2.0f / 45}, + { 1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + { 0.0f, 0.0f, 1.0f} + }; + + float* ptr_out = (float*)work_space; + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = din + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! 
h + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + for (int i = 0; i < 8; i++) { + ptr_channel[j*8 + i] = tmpp[0] * coeff[i][0] + tmpp[1] * coeff[i][1] + \ + tmpp[2] * coeff[i][2]; + } + } + } + } + transpose(dout, ptr_out, 64, ch_out * ch_in); +} + +/** + * \brief winograd conv, transform input, f6x3 + * dout = BT * d * B, whrer B is the transform + * BT = 1 0 -21/4 0 21/4 0 -1 0 + * 0 1 1 -17/4 -17/4 1 1 0 + * 0 -1 1 17/4 -17/4 -1 1 0 + * 0 1/2 1/4 -5/2 -5/4 2 1 0 + * 0 -1/2 1/4 5/2 -5/4 -2 1 0 + * 0 2 4 -5/2 -5 1/2 1 0 + * 0 -2 4 5/2 -5 -1/2 1 0 + * 0 -1 0 21/4 0 -21/4 0 1 + * @param dout + * @param din + */ +void transform_input_f6x6(float* dout, const float* din) { + float tmp[8][8]; + //! 
BT * d + for (int m = 0; m < 8; m++) { + tmp[0][m] = din[0] - din[6] + (din[4] - din[2]) * 5.25f; + tmp[7][m] = din[7] - din[1] + (din[3] - din[5]) * 5.25f; + + float tmp12a = din[2] + din[6] - din[4] * 4.25f; + float tmp12b = din[1] + din[5] - din[3] * 4.25f; + + tmp[1][m] = tmp12a + tmp12b; + tmp[2][m] = tmp12a - tmp12b; + + float tmp34a = din[6] + din[2] * 0.25f - din[4] * 1.25f; + float tmp34b = din[1] * 0.5f - din[3] * 2.5f + din[5] * 2.f; + + tmp[3][m] = tmp34a + tmp34b; + tmp[4][m] = tmp34a - tmp34b; + + float tmp56a = din[6] + (din[2] - din[4] * 1.25f) * 4.f; + float tmp56b = din[1] * 2.f - din[3] * 2.5f + din[5] * 0.5f; + + tmp[5][m] = tmp56a + tmp56b; + tmp[6][m] = tmp56a - tmp56b; + + din += 8; + } + + for (int m = 0; m < 8; m++) { + const float* tmp0 = tmp[m]; + + dout[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f; + dout[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f; + + float tmp12a = tmp0[2] + tmp0[6] - tmp0[4] * 4.25f; + float tmp12b = tmp0[1] + tmp0[5] - tmp0[3] * 4.25f; + + dout[1] = tmp12a + tmp12b; + dout[2] = tmp12a - tmp12b; + + float tmp34a = tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f; + float tmp34b = tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f; + + dout[3] = tmp34a + tmp34b; + dout[4] = tmp34a - tmp34b; + + float tmp56a = tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f; + float tmp56b = tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f; + + dout[5] = tmp56a + tmp56b; + dout[6] = tmp56a - tmp56b; + + dout += 8; + } +} + +/** + * \brief winograd conv, transform output, f63 + * out = AT * din * A + * AT = 1 1 1 1 1 1 1 0 + * 0 1 -1 2 -2 1/2 -1/2 0 + * 0 1 1 4 4 1/4 1/4 0 + * 0 1 -1 8 -8 1/8 -1/8 0 + * 0 1 1 16 16 1/16 1/16 0 + * 0 1 -1 32 -32 1/32 -1/32 1 + * @param output + * @param din + * @param bias + */ +void transform_output_f6x6(float* output, const float* din, float bias) { + float tmp[6][8]; + for (int m = 0; m < 8; m++) { + float tmp024a = din[1] + din[2]; + float tmp135a = din[1] - din[2]; + + float tmp024b = din[3] + 
din[4]; + float tmp135b = din[3] - din[4]; + + float tmp024c = din[5] + din[6]; + float tmp135c = din[5] - din[6]; + + tmp[0][m] = din[0] + tmp024a + tmp024b + tmp024c; + tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 0.25f; + tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c * 0.0625f; + + tmp[1][m] = tmp135a + tmp135b * 2 + tmp135c * 0.5f; + tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 0.125f; + tmp[5][m] = din[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; + + din += 8; + } + + for (int m = 0; m < 6; m++) { + const float* tmp0 = tmp[m]; + + float tmp024a = tmp0[1] + tmp0[2]; + float tmp135a = tmp0[1] - tmp0[2]; + + float tmp024b = tmp0[3] + tmp0[4]; + float tmp135b = tmp0[3] - tmp0[4]; + + float tmp024c = tmp0[5] + tmp0[6]; + float tmp135c = tmp0[5] - tmp0[6]; + + output[0] = bias + tmp0[0] + tmp024a + tmp024b + tmp024c; + output[2] = bias + tmp024a + tmp024b * 4 + tmp024c * 0.25f; + output[4] = bias + tmp024a + tmp024b * 16 + tmp024c * 0.0625f; + + output[1] = bias + tmp135a + tmp135b * 2 + tmp135c * 0.5f; + output[3] = bias + tmp135a + tmp135b * 8 + tmp135c * 0.125f; + output[5] = bias + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; + + output += 6; + } +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/impl/neon_mathfun.h b/saber/lite/funcs/neon/impl/neon_mathfun.h new file mode 100644 index 000000000..b36cb1735 --- /dev/null +++ b/saber/lite/funcs/neon/impl/neon_mathfun.h @@ -0,0 +1,324 @@ +/* NEON implementation of sin, cos, exp and log + * + * Inspired by Intel Approximate Math library, and based on the + * corresponding algorithms of the cephes math library + */ + +/* Copyright (C) 2011 Julien Pommier + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * (this is the zlib license) + */ +#ifndef ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H +#define ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H + +#include "saber/lite/core/common_lite.h" + +#ifdef USE_ARM_PLACE + +#define c_inv_mant_mask ~0x7f800000u +#define c_cephes_SQRTHF 0.707106781186547524 +#define c_cephes_log_p0 7.0376836292E-2 +#define c_cephes_log_p1 - 1.1514610310E-1 +#define c_cephes_log_p2 1.1676998740E-1 +#define c_cephes_log_p3 - 1.2420140846E-1 +#define c_cephes_log_p4 + 1.4249322787E-1 +#define c_cephes_log_p5 - 1.6668057665E-1 +#define c_cephes_log_p6 + 2.0000714765E-1 +#define c_cephes_log_p7 - 2.4999993993E-1 +#define c_cephes_log_p8 + 3.3333331174E-1 +#define c_cephes_log_q1 -2.12194440e-4 +#define c_cephes_log_q2 0.693359375 + +/* natural logarithm computed for 4 simultaneous float + * return NaN for x <= 0 + */ +static inline float32x4_t log_ps(float32x4_t x) +{ + float32x4_t one = vdupq_n_f32(1); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + int32x4_t ux = vreinterpretq_s32_f32(x); + + int32x4_t emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); + x 
= vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); + float32x4_t e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, one); + + /* part2: + * if( x < SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ + uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); + float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, one); + e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); + x = vaddq_f32(x, tmp); + + float32x4_t z = vmulq_f32(x,x); + + float32x4_t y = vdupq_n_f32(c_cephes_log_p0); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); + y = vaddq_f32(y, tmp); + + + tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + +#define c_exp_hi 88.3762626647949f +#define c_exp_lo -88.3762626647949f + +#define c_cephes_LOG2EF 1.44269504088896341 +#define c_cephes_exp_C1 0.693359375 +#define c_cephes_exp_C2 -2.12194440e-4 + +#define c_cephes_exp_p0 1.9875691500E-4 +#define c_cephes_exp_p1 1.3981999507E-3 +#define c_cephes_exp_p2 8.3334519073E-3 +#define c_cephes_exp_p3 4.1665795894E-2 
+#define c_cephes_exp_p4 1.6666665459E-1 +#define c_cephes_exp_p5 5.0000001201E-1 + +/* exp() computed for 4 float at once */ +static inline float32x4_t exp_ps(float32x4_t x) +{ + float32x4_t tmp, fx; + + float32x4_t one = vdupq_n_f32(1); + x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); + x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + uint32x4_t mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); + + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); + float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 }; + float32x4_t y = vld1q_dup_f32(cephes_exp_p+0); + float32x4_t c1 = vld1q_dup_f32(cephes_exp_p+1); + float32x4_t c2 = vld1q_dup_f32(cephes_exp_p+2); + float32x4_t c3 = vld1q_dup_f32(cephes_exp_p+3); + float32x4_t c4 = vld1q_dup_f32(cephes_exp_p+4); + float32x4_t c5 = vld1q_dup_f32(cephes_exp_p+5); + + y = vmulq_f32(y, x); + z = vmulq_f32(x, x); + + y = vaddq_f32(y, c1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, one); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); + mm = vshlq_n_s32(mm, 23); + float32x4_t pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +#define c_minus_cephes_DP1 -0.78515625 +#define c_minus_cephes_DP2 -2.4187564849853515625e-4 +#define c_minus_cephes_DP3 
-3.77489497744594108e-8 +#define c_sincof_p0 -1.9515295891E-4 +#define c_sincof_p1 8.3321608736E-3 +#define c_sincof_p2 -1.6666654611E-1 +#define c_coscof_p0 2.443315711809948E-005 +#define c_coscof_p1 -1.388731625493765E-003 +#define c_coscof_p2 4.166664568298827E-002 +#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI + +/* evaluation of 4 sines & cosines at once. + * + * The code is the exact rewriting of the cephes sinf function. + * Precision is excellent as long as x < 8192 (I did not bother to + * take into account the special handling they have for greater values + * -- it does not return garbage for arguments over 8192, though, but + * the extra precision is missing). + * + * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + * surprising but correct result. + * + * Note also that when you compute sin(x), cos(x) is available at + * almost no extra price so both sin_ps and cos_ps make use of + * sincos_ps.. + */ +static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos) +{ + // any x + float32x4_t xmm1, xmm2, xmm3, y; + + uint32x4_t emm2; + + uint32x4_t sign_mask_sin, sign_mask_cos; + sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); + x = vabsq_f32(x); + + /* scale by 4/Pi */ + y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); + + /* store the integer part of y in mm0 */ + emm2 = vcvtq_u32_f32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); + emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); + y = vcvtq_f32_u32(emm2); + + /* get the polynom selection mask + * there is one polynom for 0 <= x <= Pi/4 + * and another one for Pi/4 +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void pooling_basic(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + //no need to pad input tensor, 
border is zero pad inside this function + int size_channel_in = win * hin; + int size_channel_out = wout * hout; + + float* data_out = dout; + const float* data_in = din; + + if (global) { + switch (type) { + case Pooling_max: + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; ++c) { + const float* data_in_channel = data_in_batch + c * size_channel_in;//in address + data_out_batch[c] = data_in_channel[0]; + for (int i = 0; i < size_channel_in; ++i) { + data_out_batch[c] = data_out_batch[c] > data_in_channel[i] ? \ + data_out_batch[c] : data_in_channel[i]; + } + } + } + break; + + case Pooling_average_include_padding: + + case Pooling_average_exclude_padding: + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; ++c) { + const float* data_in_channel = data_in_batch + c * size_channel_in;//in address + float sum = 0.f; + for (int i = 0; i < size_channel_in; ++i) { + sum += data_in_channel[i]; + } + data_out_batch[c] = sum / size_channel_in; + } + } + break; + default: + printf("not support\n"); + } + return; + } + + switch (type) { + case Pooling_max: + for (int n = 0; n < num; ++n) { + float* data_out_channel = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int q = 0; q < chout; q++) { + + float* data_out_row = data_out_channel + q * size_channel_out; + const float* data_in_channel = data_in_batch + q * size_channel_in; + + for (int i = 0; i < hout; i++) { + for (int j = 0; j < wout; j++) { + int hstart = i * stride_h - pad_h; + int wstart = j * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, hin + pad_h); + int wend = 
std::min(wstart + kernel_w, win + pad_w); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, hin); + wend = std::min(wend, win); + + data_out_row[j] = data_in_channel[hstart * win + wstart]; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + data_out_row[j] = data_out_row[j] > \ + data_in_channel[h * win + w] ? \ + data_out_row[j] : data_in_channel[h * win + w]; + } + } + } + data_out_row += wout; + } + } + } + break; + + case Pooling_average_include_padding: + for (int n = 0; n < num; ++n) { + int pool_size = kernel_w * kernel_h;//(hend - hstart) * (wend - wstart);//problem + float* data_out_channel = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int q = 0; q < chout; q++) { + + float* data_out_row = data_out_channel + q * size_channel_out; + const float* data_in_channel = data_in_batch + q * size_channel_in; + for (int i = 0; i < hout; i++) { + for (int j = 0; j < wout; j++) { + int hstart = i * stride_h - pad_h; + int wstart = j * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, hin + pad_h); + int wend = std::min(wstart + kernel_w, win + pad_w); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, hin); + wend = std::min(wend, win); + + data_out_row[j] = data_in_channel[hstart * win + wstart]; + float sum = 0.f; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += data_in_channel[h * win + w]; + } + } + data_out_row[j] = sum / pool_size; + } + data_out_row += wout; + } + } + } + break; + case Pooling_average_exclude_padding: + for (int n = 0; n < num; ++n) { + float* data_out_channel = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int q = 0; q < chout; q++) { + + float* data_out_row = data_out_channel + q * size_channel_out; 
+ const float* data_in_channel = data_in_batch + q * size_channel_in; + for (int i = 0; i < hout; i++) { + for (int j = 0; j < wout; j++) { + int hstart = i * stride_h - pad_h; + int wstart = j * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, hin + pad_h); + int wend = std::min(wstart + kernel_w, win + pad_w); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, hin); + wend = std::min(wend, win); + + data_out_row[j] = data_in_channel[hstart * win + wstart]; + float sum = 0.f; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += data_in_channel[h * win + w]; + } + } + int pool_size = (hend - hstart) * (wend - wstart); + data_out_row[j] = sum / pool_size; + } + data_out_row += wout; + } + } + } + break; + default: + printf("not support\n"); + } +} + +void pooling_global(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + + int size_channel_in = win * hin; + float* data_out = dout; + const float* data_in = din; + + int cnt = size_channel_in / 8; + + for (int n = 0; n < num; ++n) { + float* data_out_batch = data_out + n * chout; + const float* data_in_batch = data_in + n * chin * size_channel_in; + if (type == Pooling_max) { +#pragma omp parallel for + for (int c = 0; c < chout; ++c) { + const float* data_in_channel = data_in_batch + c * size_channel_in; + int i = 0; + float32x4_t vmax = vdupq_n_f32(std::numeric_limits::min()); +#ifdef __aarch64__ + for(; i < cnt; i++) { + float32x4_t vdin1 = vld1q_f32(data_in_channel); + vmax = vmaxq_f32(vdin1, vmax); + float32x4_t vdin2 = vld1q_f32(data_in_channel + 4); + vmax = vmaxq_f32(vmax, vdin2); + data_in_channel += 8; + } +#else + int num = cnt; + if (num > 0) { + asm volatile( + "max_loop: @main loop\n" + "vld1.f32 {d0-d1}, [%[data_in_channel]]! 
@load q1, data_in_channel\n" + "vmax.f32 %q[vmax], %q[vmax], q0 @max vmax, vmax, data_in_channel\n" + "vld1.f32 {d2-d3}, [%[data_in_channel]]! @ load 2nd 4 data" + "vmax.f32 %q[vmax], %q[vmax], q1 @ compare 2nd 4 datas\n" + "subs %[num], #1 @subs num, 1\n" + "bne max_loop @bne num\n" + :[data_in_channel] "+r" (data_in_channel), [num] "+r" (num), [vmax] "+w" (vmax) + :"r" (data_in_channel), "r" (num) + : "q0" + ); + } +#endif //__aarch64__ + float32x2_t vmax_tmp = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax)); + float tmp1 = vget_lane_f32(vmax_tmp, 0); + float tmp2 = vget_lane_f32(vmax_tmp, 1); + float max_tmp = tmp1 > tmp2? tmp1 : tmp2; + for (i = cnt * 8; i < size_channel_in; ++i) { + /* code */ + max_tmp = max_tmp > data_in_channel[0] ? max_tmp : data_in_channel[0]; + data_in_channel++; + } + data_out_batch[c] = max_tmp; + } + } + else { +#pragma omp parallel for + for(int c = 0;c < chout; c++){ + const float* data_in_channel = data_in_batch + c * size_channel_in;//in address + int i = 0; + float32x4_t vsum = vdupq_n_f32(0.0f); +#ifdef __aarch64__ + for(; i < cnt; i++){// + vsum = vaddq_f32(vld1q_f32(data_in_channel),vsum); + data_in_channel += 4; + } +#else + int num = cnt; + if (num > 0) { + asm volatile( + "add_loop: @main loop\n" + "vld1.f32 {d0-d1}, [%[data_in_channel]]! 
@load q1, data_in_channel\n" + "vadd.f32 %q[vsum], %q[vsum], q0 @add vmax, vmax, data_in_channel\n" + "subs %[num], #1 @subs num, 1\n" + "bne add_loop @bne num\n" + :[data_in_channel] "+r" (data_in_channel), [num] "+r" (num), [vsum] "+w" (vsum) + :"r" (data_in_channel), "r" (num), "w" (vsum) + : "q0" + ); + } +#endif //__aarch64__ + float32x2_t vsum_tmp = vadd_f32(vget_low_f32(vsum),vget_high_f32(vsum)); + float sum = vget_lane_f32(vsum_tmp,0) + vget_lane_f32(vsum_tmp,1); + for(i = cnt * 4;i < size_channel_in; i++) { + sum += data_in_channel[0]; + data_in_channel++; + } + data_out_batch[c] = sum / size_channel_in; + } + } + } +} + +void pooling2x2s2_max(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + + if (global) { + printf("not supported in this funcs, instead, use the basic func\n"); + return; + } + + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = dout; + const float* data_in = din; + + int w_even = (win >> 1) << 1; + //int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = (hin >> 1) << 1; + //int h_remains = h_in - h_even; // should be 0 or 1 + int w_unroll_size = (w_even >> 3) << 3; + //int w_unroll_remian = w_even - w_unroll_size; + int w_in_2 = win << 1; + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + win; + int h = 0; + for (; h < h_even; h += 2) { + int w = 0; +#ifdef __aarch64__ + for (; w < 
w_unroll_size; w += 8) { + prefetch_2x(r0); + prefetch_2x(r1); + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); + float32x4_t dr10 = vld1q_f32(&r1[w]); + float32x4_t dr11 = vld1q_f32(&r1[w + 4]); + float32x4_t dmax1 = vmaxq_f32(dr00, dr10); + float32x4_t dmax2 = vmaxq_f32(dr01, dr11); + #ifdef __aarch64__ + float32x4_t dmax = vpmaxq_f32(dmax1, dmax2); + #else + float32x2_t dmaxl = vpmax_f32(vget_low_f32(dmax1), vget_high_f32(dmax1)); + float32x2_t dmaxh = vpmax_f32(vget_low_f32(dmax2), vget_high_f32(dmax2)); + float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); + #endif + vst1q_f32(&data_out_channel[w >> 1], dmax); + + } +#else + w = w_unroll_size; + int num = w_unroll_size >> 3; + float* dr0 = (float *)r0; + float* dr1 = (float *)r1; + float* dr_out = data_out_channel; + //printf("c: %d, num: %d, dr0: %x, dr1: %x, dr_out: %x\n",c,num,dr0,dr1,dr_out); + if (num > 0){ + asm volatile( + "s2_max_loop: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load q0, dr0\n" + "vld1.f32 {d4-d7}, [%[dr1]]! @load q1, dr1\n" + "vmax.f32 q0, q0, q2 @max q0, q0, q2\n" + "vmax.f32 q1, q1, q3 @max q1, q1, q2\n" + "vpmax.f32 d4, d0, d1 @max d4, d0, d1\n" + "vpmax.f32 d5, d2, d3 @max d5, d2, d3\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2, dr_out\n" + "subs %[num], #1 @subs num, 1\n" + "bne s2_max_loop @bne num\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [num] "+r" (num) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r"(num) + :"q0", "q1", "q2", "q3" + ); + } +#endif //__aarch64__ + //printf("c: %d, w: %d,num: %d\n",c,w,num); + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = std::max(std::max(r0[w], r0[w + 1]), \ + std::max(r1[w], r1[w + 1])); + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = std::max(r0[w], r1[w]); + } + r0 += w_in_2;// << 1; + r1 += w_in_2;// << 1; + data_out_channel += wout; + } + // process remain row (odd, last row) + for (; h < hin; h++) { //run 0 or 1 time + int w = 0; +#ifdef __aarch64__ + for (; w < w_unroll_size; w += 8) { + prefetch_2x(r0); + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); + #ifdef __aarch64__ + float32x4_t dmax = vpmaxq_f32(dr00, dr01); + #else + float32x2_t dmaxl = vpmax_f32(vget_low_f32(dr00), vget_high_f32(dr00)); + float32x2_t dmaxh = vpmax_f32(vget_low_f32(dr01), vget_high_f32(dr01)); + float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); + #endif + float32x4_t dmax_cmp_zero = vmaxq_f32(dmax, vzero); + vst1q_f32(&data_out_channel[w >> 1], dmax_cmp_zero); + + } +#else + w = w_unroll_size; + int num = w_unroll_size >> 3; + float* dr0 = (float *)r0; + float* dr_out = data_out_channel; + if (num > 0){ + asm volatile( + "s2_max_loop1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load q0, dr0\n" + "vpmax.f32 d4, d0, d1 @max d4, d0, d1\n" + "vpmax.f32 d5, d2, d3 @max d5, d2, d3\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2, dr_out\n" + "subs %[num], #1 @subs num, 1\n" + "bne s2_max_loop1 @bne num\n" + :[dr0] "+r" (dr0), [dr_out] "+r" (dr_out), [num] "+r" (num) + :"r" (dr0), "r" (dr_out), "r"(num) + :"q0", "q1", "q2" + ); + } +#endif //__aarch64__ + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = std::max(std::max(r0[w], r0[w + 1]), 0.f); + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = std::max(r0[w], 0.f); + } + } + } + + } +} + +void pooling2x2s2_ave(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = dout; + const float* data_in = din; + + int w_even = (win >> 1) << 1; + //int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = (hin >> 1) << 1; + //int h_remains = h_in - h_even; // should be 0 or 1 + int w_unroll_size = (w_even >> 3) << 3; + //int w_unroll_remian = w_even - w_unroll_size; + int w_in_2 = win << 1; + float32x4_t vcoef = vdupq_n_f32(0.25f); //divided by 4 + + for (int n = 0; n < num; ++n) { + + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + win; + int h = 0; + for (; h < h_even; h += 2) { + int w = 0; + #ifdef __aarch64__ + for (; w < w_unroll_size; w += 8) { + prefetch_2x(r0); + prefetch_2x(r1); + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); + float32x4_t dr10 = vld1q_f32(&r1[w]); + float32x4_t dr11 = vld1q_f32(&r1[w + 4]); + float32x4_t dsum1 = vaddq_f32(dr00, dr10); 
+ float32x4_t dsum2 = vaddq_f32(dr01, dr11); + #ifdef __aarch64__ + float32x4_t dsum = vpaddq_f32(dsum1, dsum2); + #else + float32x2_t dsuml = vpadd_f32(vget_low_f32(dsum1), vget_high_f32(dsum1)); + float32x2_t dsumh = vpadd_f32(vget_low_f32(dsum2), vget_high_f32(dsum2)); + float32x4_t dsum = vcombine_f32(dsuml, dsumh); + #endif + float32x4_t res = vmulq_f32(dsum, vcoef); + vst1q_f32(&data_out_channel[w >> 1], res); + + } + #else + w = w_unroll_size; + int num = w_unroll_size >> 3; + float* dr0 = (float *)r0; + float* dr1 = (float *)r1; + float* dr_out = data_out_channel; + //printf("c: %d, num: %d, dr0: %x, dr1: %x, dr_out: %x\n",c,num,dr0,dr1,dr_out); + if (num > 0){ + asm volatile( + "s2_ave_loop: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load q0, dr0\n" + "vld1.f32 {d4-d7}, [%[dr1]]! @load q1, dr1\n" + "vadd.f32 q0, q0, q2 @add q0, q0, q2\n" + "vadd.f32 q1, q1, q3 @add q1, q1, q2\n" + "vpadd.f32 d4, d0, d1 @add d4, d0, d1\n" + "vpadd.f32 d5, d2, d3 @add d5, d2, d3\n" + "vmul.f32 q2, q2, %q[vcoef] @mul q2, q2, vcoef\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2, dr_out\n" + "subs %[num], #1 @subs num, 1\n" + "bne s2_ave_loop @bne num\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [vcoef] "+w" (vcoef), [num] "+r" (num) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r" (num), "w" (vcoef) + :"q0", "q1", "q2", "q3" + ); + } + #endif //__aarch64__ + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = (r0[w] + r0[w + 1] + r1[w] + r1[w + 1]) / 4.f; + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = (r0[w] + r1[w]) / 4.f; + } + r0 += w_in_2;// << 1; + r1 += w_in_2;// << 1; + data_out_channel += wout; + } + // process remain row (odd, last row) + for (; h < hin; h++) { //run 0 or 1 time + int w = 0; + #ifdef __aarch64__ + for (; w < w_unroll_size; w += 8) { + prefetch_2x(r0); + float32x4_t dr00 = vld1q_f32(&r0[w]); + float32x4_t dr01 = vld1q_f32(&r0[w + 4]); + #ifdef __aarch64__ + float32x4_t dsum = vpaddq_f32(dr00, dr01); + #else + float32x2_t dsuml = vpadd_f32(vget_low_f32(dr00), vget_high_f32(dr00)); + float32x2_t dsumh = vpadd_f32(vget_low_f32(dr01), vget_high_f32(dr01)); + float32x4_t dsum = vcombine_f32(dsuml, dsumh); + #endif + float32x4_t res = vmulq_f32(dsum, vcoef); + vst1q_f32(&data_out_channel[w >> 1], res); + + } + #else + w = w_unroll_size; + int num = w_unroll_size >> 3; + float* dr0 = (float *)r0; + float* dr_out = data_out_channel; + //printf("c: %d, num: %d, dr0: %x, dr1: %x, dr_out: %x\n",c,num,dr0,dr1,dr_out); + if (num > 0){ + asm volatile( + "s2_ave_loop1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load q0, dr0\n" + "vpadd.f32 d4, d0, d1 @add d4, d0, d1\n" + "vpadd.f32 d5, d2, d3 @add d5, d2, d3\n" + "vmul.f32 q2, q2, %q[vcoef] @mul q2, q2, vcoef\n" + "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2, dr_out\n" + "subs %[num], #1 @subs num, 1\n" + "bne s2_ave_loop @bne num\n" + :[dr0] "+r" (dr0), [dr_out] "+r" (dr_out), [vcoef] "+w" (vcoef), [num] "+r" (num) + :"r" (dr0), "r" (dr_out), "r" (num), "w" (vcoef) + :"q0", "q1", "q2" + ); + } + #endif //__aarch64__ + for (; w < w_even; w += 2) { + data_out_channel[w >> 1] = (r0[w] + r0[w + 1]) / 4.f; + } + for (; w < win; ++w) { // run 0 or 1 time + data_out_channel[w >> 1] = r0[w] / 4.f; + } + } + } + } +} + +void pooling3x3s2_max(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + + if (global) { + printf("not supported in this funcs, instead, use the basic func\n"); + return; + } + + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = dout; + const float* data_in = din; + + int pad_top = pad_h; + int pad_left = pad_w; + int w_needed = wout * 2 + 1; + int h_needed = hout * 2 + 1; + int pad_right = w_needed - win - pad_left; + int pad_bottom = h_needed - hin - pad_top; + int w_even = (win >> 1) << 1; + //int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = (hin >> 1) << 1; + //int h_remains = h_in - h_even; // should be 0 or 1 + //int w_unroll_size = (w_even >> 3) << 3; + //int w_unroll_remian = w_even - w_unroll_size; + int w_in_2 = win << 1; + float minval = std::numeric_limits::lowest(); + float32x4_t vzero = vdupq_n_f32(minval); //zero pad + //printf("minval: %.2f\n", minval); + + for (int n = 0; n < num; ++n) { + + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; 
+ const float* r1 = r0 + win; + const float* r2 = r1 + win; + int cnt_num = win / 8; + //w = w_in - 8; + int cnt_num1 = (win - cnt_num * 8 - 1) / 2; + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + int w = 1; + int cnt = 1; + data_out_channel[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); + // first row with zero pad +#ifdef __aarch64__ + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678,1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112,1); + float32x2_t vmax_12_34 = vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt],vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2],vmax_567_789); + cnt += 4; + } + for(; w < w_even - 1; w += 2){ + float32x4_t vr0= vld1q_f32(&r0[w]); + float32x4_t vr1= vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + float32x2_t vmax2 = vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + vmax2 = vpmax_f32(vmax2, vmax2); + 
data_out_channel[cnt] = vget_lane_f32(vmax2, 0); + cnt ++; + } +#else + dr0 = dr0 + 1; + dr1 = dr1 + 1; + dr_out = dr_out + 1; + // printf("cnt_num: %d, cnt_num1: %d \n",cnt_num, cnt_num1); + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop2 @ble exit\n" + "s3_max_loop: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, dr1\n" + "vmax.f32 q6, q0, q3 @max r0_1234,r1_1234\n" + "vmax.f32 q7, q1, q4 @max r0_5678,r1_5678\n" + "vmax.f32 q8, q2, q5 @max r0_9101112,r1_9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q7, q8, #1 @vext max_6789\n" + "vpmax.f32 d4, d12, d13 @pmax d4, vmax_1234, vmax_1234\n" + "vpmax.f32 d6, d14, d15 @pmax d6, vmax_5678, vmax_5678\n" + "vpmax.f32 d5, d0, d1 @pmax d5, vmax_2345, vmax_2345\n" + "vpmax.f32 d7, d2, d3 @pmax d7, vmax_6789, vmax_6789\n" + "vmax.f32 d8, d4, d5 @max d2, vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d6, d7 @max d2, vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "bne s3_max_loop @bne s3_max_loop\n" + "loop2: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit @ble exit\n" + "s3_max_loop_1: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmax.f32 q0, q0, q1 @max q0, q0, q1\n" + "vpmax.f32 d0, d0, d1 @pmax d0, d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, d0\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "bne s3_max_loop_1 @bne s3_max_loop_1\n" + "exit: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [cnt_num] "+r" (cnt_num), [cnt_num1] "+r" (cnt_num1) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8", "q9" + ); + } + // printf("cnt_num: %d, cnt_num1: %d \n",cnt_num, cnt_num1); +#endif + //int w = w_even - 1; + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart];//std::numeric_limits::min(); + for(int i = wstart; i < wend; i++){//only run 1 or 2 times + tmp = std::max(tmp,std::max(r0[i],r1[i])); + } + data_out_channel[w_even >> 1] = tmp; + //cnt ++; + } + + r0 = r1; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + int h = 2; + for (; h < h_even; h += 2) { + // deal with left pad + float maxr0 = std::max(r0[0], r0[1]); + float maxr1 = std::max(r1[0], r1[1]); + float maxr2 = std::max(r2[0], r2[1]); + data_out_channel[0] = std::max(std::max(maxr0, maxr1), maxr2); +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vr2_1234 = vld1q_f32(&r2[w]); + float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); + float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); + float32x4_t vmax_9101112 = 
vmaxq_f32(vr0_9101112, vr1_9101112); + vmax_9101112 = vmaxq_f32(vmax_9101112, vr2_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678,1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112,1); + float32x2_t vmax_12_34 = vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt],vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2],vmax_567_789); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + float32x4_t vr2 = vld1q_f32(&r2[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + vr2 = vsetq_lane_f32(minval, vr2, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + vmax1 = vmaxq_f32(vmax1, vr2); + float32x2_t vmax2 = vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + float32x2_t vmax = vpmax_f32(vmax2, vmax2); + data_out_channel[cnt] = vget_lane_f32(vmax, 0); + cnt ++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + dr2 = (r2 + 1); + cnt_num = win / 8; + cnt_num1 = (win - cnt_num * 8 - 1) / 2; + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop3 @ble exit\n" + "s3_max_loop_mid: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d16-d17}, [%[dr2]]! 
@load d4-d7, dr1\n" + "vmax.f32 q9, q0, q3 @max q0,q0,q2\n" + "vmax.f32 q10, q1, q4 @max q1,q1,q3\n" + "vmax.f32 q11, q2, q5 @max q1,q1,q3\n" + "vmax.f32 q0, q9, q6 @max q0,q0,q2 1234\n" + "vmax.f32 q3, q10, q7 @max q1,q1,q3 5678\n" + "vmax.f32 q1, q11, q8 @max q1,q1,q3 9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q4, q0, q3, #1 @vext 2345\n" + "vext.f32 q2, q3, q1, #1 @vext 6789\n" + "vpmax.f32 d10, d0, d1 @pmax d10, vmax_1234, vmax_1234\n" + "vpmax.f32 d12, d6, d7 @pmax d12, vmax_5678, vmax_5678\n" + "vpmax.f32 d11, d8, d9 @pmax d11, vmax_2345, vmax_2345\n" + "vpmax.f32 d13, d4, d5 @pmax d13, vmax_6789, vmax_6789\n" + "vmax.f32 d0, d10, d11 @pmax d0, vmax_12_34, vmax_23_45\n" + "vmax.f32 d1, d12, d13 @pmax d1, vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "sub %[dr2], #16 @add w, 8\n" + "vst1.f32 d0, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d1, [%[dr_out]]! @vst1 d0, dr_out\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "bne s3_max_loop_mid @bne s3_max_loop_mid\n" + "loop3: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit1 @ble exit1\n" + "s3_max_loop_mid_1: @mid loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" + "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3, dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmov.f32 s11,s10 @movs11, s10\n" + "vmax.f32 q0, q0, q1 @max q0, q0, q1\n" + "vmax.f32 q0, q0, q2 @max q0, q0, q2\n" + "vpmax.f32 d0, d0, d1 @pmax d0, d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, d0\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "sub %[dr2], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "bne s3_max_loop_mid_1 @bne s3_max_loop_mid_1\n" + "exit1: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr2] "+r" (dr2), [dr_out] "+r" (dr_out), \ + [cnt_num] "+r" (cnt_num), [cnt_num1] "+r" (cnt_num1) + :"r" (dr0), "r" (dr1), "r" (dr2), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8", "q9", "q10", "q11", "q12" + ); + } +#endif + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart];//std::numeric_limits::min(); + for(int i = wstart; i < wend; i++){ + tmp = std::max(tmp,std::max(r0[i],r1[i])); + tmp = std::max(tmp,r2[i]); + } + data_out_channel[w_even >> 1] = tmp; + //cnt ++; + } + r0 = r2; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + } + + if (pad_bottom) { + //deal with bottom pad + // first row with zero pad + int hstart = (h >> 1) * stride_h - pad_h; + int hend = std::min(std::min(hstart + kernel_h, hin + pad_h), hin); + + if(hstart == hend - 1){//only one lline + data_out_channel[0] = std::max(r0[0], r0[1]); +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < w_in - 8; w += 8) { + float32x4_t vmax_1234 = vld1q_f32(&r0[w]); + float32x4_t vmax_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vmax_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678,1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112,1); + float32x2_t vmax_12_34 = vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + 
float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); + vst1_f32(&data_out_channel[cnt],vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2],vmax_567_789); + cnt += 4; + } + for(; w < w_even - 1; w += 2){ + float32x4_t vr0= vld1q_f32(&r0[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + float32x2_t vmax = vpmax_f32(vget_low_f32(vr0), vget_high_f32(vr0)); + vmax = vpmax_f32(vmax, vmax); + data_out_channel[cnt] = vget_lane_f32(vmax, 0); + cnt ++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + cnt_num = win / 8; + cnt_num1 = (win - cnt_num * 8 - 1) / 2; + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop4 @ble exit\n" + "s3_max_loop_bot: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d3, dr0\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3, dr0\n" + "vext.f32 q4, q0, q1, #1 @vext q4, q0, q1, 1 2345\n" + "vext.f32 q5, q1, q2, #1 @vext q5, q0, q1, 1 6789\n" + "vpmax.f32 d12, d0, d1 @pmax d12, vmax_1234, vmax_1234\n" + "vpmax.f32 d14, d2, d3 @pmax d14, vmax_5678, vmax_5678\n" + "vpmax.f32 d13, d8, d9 @pmax d13, vmax_2345, vmax_2345\n" + "vpmax.f32 d15, d10, d11 @pmax d15, vmax_6789, vmax_6789\n" + "vmax.f32 d0, d12, d13 @max d0, vmax_12_34,vmax_23_45\n" + "vmax.f32 d1, d14, d15 @pmax d2, vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 6\n" + "vst1.f32 d0, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d1, [%[dr_out]]! @vst1 d0, dr_out\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "bne s3_max_loop_bot @bne s3_max_loop_bot\n" + "loop4: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit3 @ble exit\n" + "s3_max_loop_bot_1: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vpmax.f32 d0, d0, d1 @pmax d0, d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, d0\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], dr_out\n" + "sub %[dr0], #8 @add w, 2\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "bne s3_max_loop_bot_1 @bne s3_max_loop_bot_1\n" + "exit3: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [cnt_num] "+r" (cnt_num), [cnt_num1] "+r" (cnt_num1) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8" + ); + } +#endif + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart];//std::numeric_limits::min(); + for(int i = wstart; i < wend; i++){ + tmp = std::max(tmp,r0[i]); + } + data_out_channel[w_even >> 1] = tmp; + } + }else{//two lines + data_out_channel[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); + float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); + float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); + float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678,1); + float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112,1); + float32x2_t vmax_12_34 = vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); + float32x2_t vmax_23_45 = vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); + float32x2_t vmax_56_78 = vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); + float32x2_t vmax_67_89 = vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); + float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); + float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, 
vmax_67_89); + vst1_f32(&data_out_channel[cnt],vmax_123_345); + vst1_f32(&data_out_channel[cnt + 2],vmax_567_789); + cnt += 4; + } + for(; w < w_even - 1; w += 2){ + float32x4_t vr0= vld1q_f32(&r0[w]); + float32x4_t vr1= vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(minval, vr0, 3); + vr1 = vsetq_lane_f32(minval, vr1, 3); + float32x4_t vmax1 = vmaxq_f32(vr0, vr1); + float32x2_t vmax2 = vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); + vmax2 = vpmax_f32(vmax2, vmax2); + data_out_channel[cnt] = vget_lane_f32(vmax2, 0); + cnt ++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + cnt_num = win / 8; + cnt_num1 = (win - cnt_num * 8 - 1) / 2; + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop5 @ble exit\n" + "s3_max_loop_bot1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3, dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, dr1\n" + "vmax.f32 q6, q0, q3 @max q0,q0,q2 1234\n" + "vmax.f32 q7, q1, q4 @max q1,q1,q3 5678\n" + "vmax.f32 q8, q2, q5 @max q1,q1,q3 9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext q0, 2345\n" + "vext.f32 q1, q7, q8, #1 @vext q1, 6789\n" + "vpmax.f32 d4, d12, d13 @pmax d4, vmax_1234, vmax_1234\n" + "vpmax.f32 d6, d14, d15 @pmax d6, vmax_5678, vmax_5678\n" + "vpmax.f32 d5, d0, d1 @pmax d5, vmax_2345, vmax_2345\n" + "vpmax.f32 d7, d2, d3 @pmax d7, vmax_6789, vmax_6789\n" + "vmax.f32 d8, d4, d5 @max d2, vmax_12_34, vmax_23_45\n" + "vmax.f32 d9, d6, d7 @max d2, vmax_56_78, vmax_67_89\n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0, dr_out\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "bne s3_max_loop_bot1 @bne s3_max_loop_bot\n" + "loop5: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit4 @ble exit\n" + "s3_max_loop_bot1_1: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" + "vmov.f32 s3,s2 @movs3, s2\n" + "vmov.f32 s7,s6 @movs7, s6\n" + "vmax.f32 q0, q0, q1 @max q0, q0, q1\n" + "vpmax.f32 d0, d0, d1 @pmax d0, d0,d1\n" + "vpmax.f32 d0, d0, d0 @pmax d0, d0, d0\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], dr_out\n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "bne s3_max_loop_bot1_1 @bne s3_max_loop_bot_1\n" + "exit4: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [cnt_num] "+r" (cnt_num), [cnt_num1] "+r" (cnt_num1) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8", "q9" + ); + } +#endif + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = r0[wstart];//std::numeric_limits::min(); + for(int i = wstart; i < wend; i++){//only run 1 or 2 times + tmp = std::max(tmp,std::max(r0[i],r1[i])); + } + data_out_channel[w_even >> 1] = tmp; + } + } + } + + } + } +} + +void pooling3x3s2_ave(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + + //no need to pad input tensor, pad_size is not used, default border is zero padded + + if (global) { + printf("not supported in this funcs, instead, use the basic func\n"); + return; + } + + int size_channel_out = wout * hout; + int size_channel_in = win * hin; + float* data_out = dout; + const float* data_in = din; + + int 
pad_top = pad_h; + int pad_left = pad_w; + int w_needed = wout * 2 + 1; + int h_needed = hout * 2 + 1; + int pad_right = w_needed - win - pad_left; + int pad_bottom = h_needed - hin - pad_top; + int w_even = (win >> 1) << 1; + //int w_remains = w_in - w_even; // should be 0 or 1 + int h_even = (hin >> 1) << 1; + //int h_remains = h_in - h_even; // should be 0 or 1 + //int w_unroll_size = (w_even >> 3) << 3; + //int w_unroll_remian = w_even - w_unroll_size; + int w_in_2 = win << 1; + //float minval = std::numeric_limits::lowest(); + //printf("minval: %.2f\n", minval); + int w_unroll_size = win >> 3; + int w_unroll_remian = (w_even - w_unroll_size * 8 - 1) / 2; + + for (int n = 0; n < num; ++n) { + + float* data_out_batch = data_out + n * chout * size_channel_out; + const float* data_in_batch = data_in + n * chin * size_channel_in; +#pragma omp parallel for + for (int c = 0; c < chout; c++) { + float* data_out_channel = data_out_batch + c * size_channel_out; + const float* data_in_channel = data_in_batch + c * size_channel_in; + const float* r0 = data_in_channel; + const float* r1 = r0 + win; + const float* r2 = r1 + win; + int cnt_num = w_unroll_size; + int cnt_num1 = w_unroll_remian; + float* dr_out = data_out_channel; + const float* dr0 = r0; + const float* dr1 = r1; + const float* dr2 = r2; + int w = 1; + int cnt = 1; + float32x4_t vcoef = vdupq_n_f32(1.f/9.f); + float32x4_t vzero = vdupq_n_f32(0.f); + data_out_channel[0] = (r0[0] + r0[1] + r1[0] + r1[1])/9.f; + // first row with zero pad +#ifdef __aarch64__ + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + float32x4_t vsum_9101112 = 
vaddq_f32(vr0_9101112, vr1_9101112); + + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678,1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678,2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678,3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112,1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_123_345,2), vsum_123_345, 1); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,1), vsum_123_345, 2); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,3), vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt], vrst); + cnt += 4; + } + for(; w < w_even - 1; w += 2){ + float32x4_t vr0= vld1q_f32(&r0[w]); + float32x4_t vr1= vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + vr1 = vsetq_lane_f32(0.f, vr1, 3); + float32x4_t vsum1 = vaddq_f32(vr0, vr1); + float32x2_t vsum2 = vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); + vsum2 = vpadd_f32(vsum2, vsum2); + float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef)); + data_out_channel[cnt] = vget_lane_f32(vrst, 0); + cnt ++; + } +#else + dr0 = dr0 + 1; + dr1 = dr1 + 1; + dr_out = dr_out + 1; + // printf("cnt_num: %d, cnt_num1: %d \n",cnt_num, cnt_num1); + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop2_ave @ble exit\n" + "s3_ave_loop: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7, dr1\n" + "vadd.f32 q6, q0, q3 @max r0_1234,r1_1234\n" + "vadd.f32 q7, q1, q4 @max r0_5678,r1_5678\n" + "vadd.f32 q8, q2, q5 @max r0_9101112,r1_9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 \n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 \n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 \n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "bne s3_ave_loop @bne s3_max_loop\n" + "loop2_ave: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit_ave @ble exit\n" + "s3_ave_loop_1: @main loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], dr_out\n" + "bne s3_ave_loop_1 @bne s3_max_loop_1\n" + "exit_ave: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [cnt_num] "+r" (cnt_num), \ + [cnt_num1] "+r" (cnt_num1), [vcoef] "+w" (vcoef), [vzero] "+w" (vzero) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8", "q9" + ); + } + // printf("cnt_num: %d, cnt_num1: %d \n",cnt_num, cnt_num1); +#endif + //int w = w_even - 1; + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f;//std::numeric_limits::min(); + for(int i = wstart; i < wend; i++){//only run 1 or 2 times + tmp += (r0[i] + r1[i]); + } + data_out_channel[w_even >> 1] = tmp / 9.f; + //cnt ++; + } + + r0 = r1; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + int h = 2; + for (; h < h_even; h += 2) { + // deal with left pad + float sum0 = r0[0] + r0[1]; + float sum1 = r1[0] + r1[1]; + float sum2 = r2[0] + r2[1]; + data_out_channel[0] = (sum0 + sum1 + sum2) / 9.f; +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + float32x4_t vr2_1234 = vld1q_f32(&r2[w]); + float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); + float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); + vsum_1234 = vaddq_f32(vsum_1234, vr2_1234); + vsum_5678 = vaddq_f32(vsum_5678, vr2_5678); + vsum_9101112 = vaddq_f32(vsum_9101112, vr2_9101112); + + float32x4_t vsum_2345 = 
vextq_f32(vsum_1234, vsum_5678,1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678,2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678,3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112,1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_123_345,2), vsum_123_345, 1); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,1), vsum_123_345, 2); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,3), vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt],vrst); + cnt += 4; + } + for (; w < w_even - 1; w += 2) { + float32x4_t vr0 = vld1q_f32(&r0[w]); + float32x4_t vr1 = vld1q_f32(&r1[w]); + float32x4_t vr2 = vld1q_f32(&r2[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + vr1 = vsetq_lane_f32(0.f, vr1, 3); + vr2 = vsetq_lane_f32(0.f, vr2, 3); + float32x4_t vsum1 = vaddq_f32(vr0, vr1); + vsum1 = vaddq_f32(vsum1, vr2); + float32x2_t vsum2 = vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); + float32x2_t vsum = vpadd_f32(vsum2, vsum2); + data_out_channel[cnt] = vget_lane_f32(vsum, 0) / 9.f; + cnt ++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + dr2 = (r2 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop3_ave @ble exit\n" + "s3_ave_loop_mid: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d16-d17}, [%[dr2]]! 
@load d4-d7, dr1\n" + "vadd.f32 q9, q0, q3 @max q0,q0,q2\n" + "vadd.f32 q10, q1, q4 @max q1,q1,q3\n" + "vadd.f32 q11, q2, q5 @max q1,q1,q3\n" + "vadd.f32 q6, q9, q6 @max q0,q0,q2 1234\n" + "vadd.f32 q7, q10, q7 @max q1,q1,q3 5678\n" + "vadd.f32 q8, q11, q8 @max q1,q1,q3 9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 \n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 \n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 \n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "sub %[dr2], #16 @add w, 8\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "bne s3_ave_loop_mid @bne s3_max_loop_mid\n" + "loop3_ave: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit1_ave @ble exit1\n" + "s3_ave_loop_mid_1: @mid loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" + "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3, dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext v1_0123\n" + "vext.f32 q2, %q[vzero], q2, #3 @ ext v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, q1\n" + "vadd.f32 q0, q0, q2 @add q0, q0, q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "sub %[dr2], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], dr_out\n" + "bne s3_ave_loop_mid_1 @bne s3_max_loop_mid_1\n" + "exit1_ave: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr2] "+r" (dr2), [dr_out] "+r" (dr_out), \ + [cnt_num] "+r" (cnt_num), [cnt_num1] "+r" (cnt_num1), [vcoef] "+w" (vcoef), [vzero] "+w" (vzero) + :"r" (dr0), "r" (dr1), "r" (dr2), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8", "q9", "q10", "q11", "q12" + ); + } +#endif + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f; + for(int i = wstart; i < wend; i++){ + tmp += (r0[i]+r1[i]+r2[i]); + } + data_out_channel[w_even >> 1] = tmp / 9.f; + //cnt ++; + } + r0 = r2; + r1 = r0 + win; + r2 = r1 + win; + data_out_channel += wout; + } + + if (pad_bottom) { + //deal with bottom pad + // first row with zero pad + int hstart = (h >> 1) * stride_h - pad_h; + int hend = std::min(std::min(hstart + kernel_h, hin + pad_h), hin); + + if(hstart == hend - 1){//only one lline + data_out_channel[0] = (r0[0] + r0[1]) / 9.f; +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < w_in - 8; w += 8) { + float32x4_t vsum_1234 = vld1q_f32(&r0[w]); + float32x4_t vsum_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vsum_9101112 = vld1q_f32(&r0[w + 8]); + + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678,1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678,2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678,3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112,1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_123_345,2), vsum_123_345, 1); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,1), vsum_123_345, 2); + 
vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,3), vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt],vrst); + cnt += 4; + } + for(; w < w_even - 1; w += 2){ + float32x4_t vr0= vld1q_f32(&r0[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + float32x2_t vsum = vpadd_f32(vget_low_f32(vr0), vget_high_f32(vr0)); + vsum = vpadd_f32(vsum, vsum); + data_out_channel[cnt] = vget_lane_f32(vsum, 0) / 9.f; + cnt ++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop4_ave @ble exit\n" + "s3_ave_loop_bot: @main loop\n" + "vld1.f32 {d12-d15}, [%[dr0]]! @load d0-d3, dr0\n" + "vld1.f32 {d16-d17}, [%[dr0]]! @load d0-d3, dr0\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 \n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 \n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 \n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 6\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "bne s3_ave_loop_bot @bne s3_max_loop_bot\n" + "loop4_ave: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit3_ave @ble exit\n" + "s3_ave_loop_bot_1: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! 
@load d0-d1, dr0\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vpadd.f32 d0, d0, d1 @padd d0, d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 2\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0], dr_out\n" + "bne s3_ave_loop_bot_1 @bne s3_max_loop_bot_1\n" + "exit3_ave: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [cnt_num] "+r" (cnt_num), \ + [cnt_num1] "+r" (cnt_num1), [vcoef] "+w" (vcoef), [vzero] "+w" (vzero) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8" + ); + } +#endif + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f; + for(int i = wstart; i < wend; i++){ + tmp += r0[i]; + } + data_out_channel[w_even >> 1] = tmp / 9.f; + } + }else{//two lines + data_out_channel[0] =(r0[0] + r0[1] + r1[0] + r1[1]) / 9.f; +#ifdef __aarch64__ + w = 1; + cnt = 1; + for (; w < w_in - 8; w += 8) { + float32x4_t vr0_1234 = vld1q_f32(&r0[w]); + float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); + float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); + float32x4_t vr1_1234 = vld1q_f32(&r1[w]); + float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); + float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); + + float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); + float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); + float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); + float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678,1); + float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678,2); + float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678,3); + float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112,1); + float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); + vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); + float32x4_t 
vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); + vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_123_345,2), vsum_123_345, 1); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,1), vsum_123_345, 2); + vsum_123_345 = vsetq_lane_f32(vgetq_lane_f32(vsum_567_789,3), vsum_123_345, 3); + float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); + vst1q_f32(&data_out_channel[cnt],vrst); + cnt += 4; + } + for(; w < w_even - 1; w += 2){ + float32x4_t vr0= vld1q_f32(&r0[w]); + float32x4_t vr1= vld1q_f32(&r1[w]); + vr0 = vsetq_lane_f32(0.f, vr0, 3); + vr1 = vsetq_lane_f32(0.f, vr1, 3); + float32x4_t vsum1 = vaddq_f32(vr0, vr1); + float32x2_t vsum2 = vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); + vsum2 = vpadd_f32(vsum2, vsum2); + float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef)); + data_out_channel[cnt] = vget_lane_f32(vrst, 0); + cnt ++; + } +#else + dr_out = data_out_channel + 1; + dr0 = (r0 + 1); + dr1 = (r1 + 1); + cnt_num = w_unroll_size; + cnt_num1 = w_unroll_remian; + if (cnt_num > 0 || cnt_num1 > 0){ + asm volatile( + "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" + "ble loop5_ave @ble exit\n" + "s3_ave_loop_bot1: @main loop\n" + "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" + "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" + "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3, dr0\n" + "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7, dr1\n" + "vmax.f32 q6, q0, q3 @max q0,q0,q2 1234\n" + "vmax.f32 q7, q1, q4 @max q1,q1,q3 5678\n" + "vmax.f32 q8, q2, q5 @max q1,q1,q3 9101112\n" + //"vmov.f32 s7,s6 @mov s7, s6\n" + "vext.f32 q0, q6, q7, #1 @vext max_2345\n" + "vext.f32 q1, q6, q7, #3 @vext max_4567\n" + "vext.f32 q2, q6, q7, #2 @vext max_3456\n" + "vext.f32 q3, q7, q8, #1 @vext max_6789\n" + "vadd.f32 q4, q6, q0 @add 1234, 2345 \n" + "vadd.f32 q5, q7, q1 @add 5678, 4567 \n" + "vadd.f32 q4, q4, q2 @add 3456, sum1 \n" + "vadd.f32 q5, q5, q3 @add 6789, sum2 \n" + "vmov.f32 s17, s18 @mov \n" + "vmov.f32 s18, s21 @mov \n" + "vmov.f32 s19, s23 @mov \n" + "vmul.f32 q4, q4, %q[vcoef] @mul \n" + "sub %[dr0], #16 @add w, 8\n" + "sub %[dr1], #16 @add w, 8\n" + "subs %[cnt_num], #1 @subs cnt_num, #1\n" + "vst1.f32 d8, [%[dr_out]]! @vst1 d0, dr_out\n" + "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" + "bne s3_ave_loop_bot1 @bne s3_max_loop_bot\n" + "loop5_ave: @loop \n" + "cmp %[cnt_num1], #0 @cmp cnt_num, 0\n" + "ble exit4_ave @ble exit\n" + "s3_ave_loop_bot1_1: @bot loop\n" + "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1, dr0\n" + "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3, dr1\n" + "vext.f32 q0, %q[vzero], q0, #3 @ ext v0_0123\n" + "vext.f32 q1, %q[vzero], q1, #3 @ ext v1_0123\n" + "vadd.f32 q0, q0, q1 @add q0, q0, q1\n" + "vpadd.f32 d0, d0, d1 @padd d0, d0,d1\n" + "vpadd.f32 d0, d0, d0 @padd d0, d0, d0\n" + "vmul.f32 d0, d0, %e[vcoef] @mul \n" + "sub %[dr0], #8 @add w, 6\n" + "sub %[dr1], #8 @add w, 6\n" + "subs %[cnt_num1], #1 @subs cnt_num, #1\n" + "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0], dr_out\n" + "bne s3_ave_loop_bot1_1 @bne s3_max_loop_bot_1\n" + "exit4_ave: @exit\n" + :[dr0] "+r" (dr0), [dr1] "+r" (dr1), [dr_out] "+r" (dr_out), [cnt_num] "+r" (cnt_num), \ + [cnt_num1] "+r" (cnt_num1), [vcoef] "+w" (vcoef), [vzero] "+w" (vzero) + :"r" (dr0), "r" (dr1), "r" (dr_out), "r"(cnt_num), "r" (cnt_num1) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6","q7", "q8", "q9" + ); + } +#endif + if (pad_right){ + // deal with right pad + int wstart = (w_even >> 1) * stride_w - pad_w; + int wend = std::min(std::min(wstart + kernel_w, win + pad_w), win); + float tmp = 0.f; + for(int i = wstart; i < wend; i++){//only run 1 or 2 times + tmp += (r0[i] + r1[i]); + } + data_out_channel[w_even >> 1] = tmp / 9.f; + } + } + } + + } + } +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE diff --git a/saber/lite/funcs/neon/impl/pooling_arm_impl.h b/saber/lite/funcs/neon/impl/pooling_arm_impl.h new file mode 100644 index 000000000..ae28178b0 --- /dev/null +++ b/saber/lite/funcs/neon/impl/pooling_arm_impl.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_POOLING_ARM_IMPL_H +#define ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_POOLING_ARM_IMPL_H + +#include "saber/lite/core/tensor_lite.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void pooling_basic(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + +void pooling_global(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + +void pooling2x2s2_max(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + +void pooling2x2s2_ave(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + +void pooling3x3s2_max(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + +void pooling3x3s2_ave(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_POOLING_ARM_IMPL_H diff --git a/saber/lite/funcs/neon/impl/sgemm_arm.cpp 
b/saber/lite/funcs/neon/impl/sgemm_arm.cpp new file mode 100644 index 000000000..d09df0972 --- /dev/null +++ b/saber/lite/funcs/neon/impl/sgemm_arm.cpp @@ -0,0 +1,2748 @@ +#include "saber/lite/funcs/neon/impl/sgemm_arm.h" +#ifdef USE_ARM_PLACE +#include +#include "saber/lite/core/buffer_lite.h" +namespace anakin{ + +namespace saber{ + +namespace lite{ + +#ifdef __aarch64__ +const int A_INTERLEAVE = 8; +const int B_INTERLEAVE = 12; +const int OUT_WIDTH = 12; +const int OUT_HEIGHT = 8; +#else +const int A_INTERLEAVE = 6; +const int B_INTERLEAVE = 8; +const int OUT_WIDTH = 8; +const int OUT_HEIGHT = 6; +#endif //__aarch64 +const bool A_TRANSPOSE = false; +const bool B_TRANSPOSE = true; + +const int GEMM_ALIGN = 4096; +const int ALLOC_ROUND = 128; +#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) + +inline void *mem_align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space) { + std::uintptr_t pn = reinterpret_cast(ptr); + std::uintptr_t aligned = (pn + alignment - 1) & -alignment; + std::size_t padding = aligned - pn; + if (space < size + padding) { + return nullptr; + } + space -= padding; + return ptr = reinterpret_cast(aligned); +} + +void sgemm_impl(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, \ + int bblocks, int K, long int row_jump=0, long int block_jump=0); + +void load_apanel_no_trans(float* out, const float* in, const int ldin, const int m0, \ + const int mmax, const int k0, const int kmax); +void load_apanel_trans(float* out, const float* in, const int ldin, const int m0, \ + const int mmax, const int k0, const int kmax); +void load_bpanel_no_trans(float* out, const float* in, const int ldin, const int k0, \ + const int kmax, const int n0, const int nmax); +void load_bpanel_trans(float* out, const float* in, const int ldin, const int k0, \ + const int kmax, const int n0, const int nmax); + +void merge_float_basic(float *out, const float *in, const int ldout, const int y0, \ + const int 
ymax, const int x0, const int xmax, const float alpha, const float beta); +void merge_float_basic_relu(float *out, const float *in, const int ldout, const int y0, \ + const int ymax, const int x0, const int xmax, const float alpha, const float beta); +void merge_float_alpha1_beta1(float *out, const float *in, const int ldout, const int y0, \ + const int ymax, const int x0, const int xmax); +void merge_float_alpha1_beta1_relu(float *out, const float *in, const int ldout, const int y0, \ + const int ymax, const int x0, const int xmax); + +Sgemm::Sgemm() {} + +Sgemm::~Sgemm() { + if (_work_space_ptr != nullptr) { + fast_free(_work_space_ptr); + _work_space_ptr = nullptr; + } +} + +void Sgemm::init(unsigned int L1_cache, unsigned int L2_cache, unsigned int M, unsigned int N, \ + unsigned int K, bool trA, bool trB, int thread_num) { + + _M = M; + _NN = N; + _K = K; + _trA = trA ^ A_TRANSPOSE; + if (_trA) { + //_load_a = Transform; + _load_a = load_apanel_trans; + } else { + //_load_a = Transform; + _load_a = load_apanel_no_trans; + } + _trB = trB ^ B_TRANSPOSE; + if (_trB) { + //_load_b = Transform; + _load_b = load_bpanel_trans; + } else { + //_load_b = Transform; + _load_b = load_bpanel_no_trans; + } + + _thread_num = thread_num; + + unsigned int L1_size = L1_cache; + if (L1_size <= 0) { + //! 32K + L1_size = 32000; + } + //! A72/A53 L1 data cache 32k//ci->L1_size; + unsigned int L2_size = L2_cache; + if (L2_size <= 0) { + //! 2M + L2_size = 2000000; + } + //! rockchip rk3399, with two A72, and four A53 + //! A72, 1M on big core, shared by two core, + //! A53, 512K on little core, shared by four core //ci->L2_size; + + //! Work out blocking parameters + //! k_block: Each iteration will consume (out_width + out_height) + //! operands - so how many iterations will fill the L1? + _k_block = L1_size / (sizeof(float) * (OUT_WIDTH + OUT_HEIGHT)); + + int num_k_blocks = (K + (_k_block - 1)) / _k_block; + _k_block = (K + num_k_blocks - 1) / num_k_blocks; + + //! 
x_block: Work out how many rows (of length k_block) will fit in the L2 + _x_block = L2_size / (sizeof(float) * _k_block); + _x_block /= OUT_WIDTH; + _x_block *= OUT_WIDTH; + int num_x_blocks = (N + (_x_block - 1)) / _x_block; + _x_block = (N + num_x_blocks - 1) / num_x_blocks; + _x_block = (_x_block + OUT_WIDTH - 1) / OUT_WIDTH; + _x_block *= OUT_WIDTH; + + //! Work out the rounded size of M - needed for some buffers. + _Mround = (M + (OUT_HEIGHT - 1)) / OUT_HEIGHT; + _Mround *= OUT_HEIGHT; + + _a_worksize = ROUND_UP(sizeof(float) * _k_block * _Mround); + _b_worksize = ROUND_UP(sizeof(float) * _x_block * _k_block); + //_c_worksize_per_thread = ROUND_UP(sizeof(float) * _x_block * OUT_HEIGHT); + //_c_worksize = _thread_num * _c_worksize_per_thread; + _cblock_size = ROUND_UP(sizeof(float) * _x_block * OUT_HEIGHT) / sizeof(float); + + _work_size = _a_worksize + _b_worksize + _cblock_size * sizeof(float) * _thread_num; + + _work_space_ptr = fast_malloc(_work_size + GEMM_ALIGN); + _align_ptr = _work_space_ptr; + size_t size_gemm_align = _work_size + GEMM_ALIGN - 1; + if (mem_align(GEMM_ALIGN, _work_size, _align_ptr, \ + size_gemm_align) == nullptr) { + LCHECK_EQ(0, 1, "Not enough space to align buffer!"); + } + _loop_count = (_K - 1) / _k_block; + _init_flag = true; +} + +void Sgemm::operator()(const float *A, const int lda, \ + const float *B, const int ldb, \ + float *C, const int ldc, \ + const float alpha, const float beta, bool flag_relu) { + + LCHECK_EQ(_init_flag, true, "gemm is not init"); + + bool flag_beta = (fabsf(beta - 1.f) < 1e-6f); + bool flag_alpha = (fabsf(alpha -1.f) < 1e-6f); + + int8_t *working_space_bytes = reinterpret_cast(_align_ptr); + intptr_t working_space_int = reinterpret_cast(working_space_bytes); + size_t diff = 0; + + if (working_space_int & 0xF) { + diff = 0x10 - (working_space_int & 0xF); + } + + float* const a_panel = reinterpret_cast(working_space_bytes + diff); + float* const b_panel = reinterpret_cast(working_space_bytes + _a_worksize 
+ diff); + float* const c_panel = reinterpret_cast(working_space_bytes + _a_worksize + _b_worksize + diff); + + int index = 0; + + for (unsigned int k0 = 0; k0 < _K; k0 += _k_block) { + unsigned int kmax = k0 + _k_block; + if (kmax > _K) { + kmax = _K; + } + int kern_k = kmax - k0; + _load_a(a_panel, A, lda, 0, _M, k0, kmax); + for (unsigned int x0 = 0; x0 < _NN; x0 += _x_block) { + + unsigned int xmax = x0 + _x_block; + if (xmax > _NN) { + xmax = _NN; + } + int bblocks = (xmax - x0 + OUT_WIDTH - 1) / OUT_WIDTH; + _load_b(b_panel, B, ldb, k0, kmax, x0, xmax); +#pragma omp parallel for + for (unsigned int y = 0; y < _M; y += OUT_HEIGHT) { + unsigned int ymax = y + OUT_HEIGHT; + if (ymax > _M) { + ymax = _M; + } +#ifdef USE_OPENMP + float* cpan1 = c_panel + omp_get_thread_num() * _cblock_size; +#else + float* cpan1 = c_panel; +#endif + sgemm_impl(a_panel + (y * kern_k), b_panel, cpan1, 1, bblocks, kern_k); + //! bias must add before gemm + if (flag_relu && (index == _loop_count)) { + if ((k0 > 0) || flag_beta) { + merge_float_alpha1_beta1_relu(C, cpan1, ldc, y, ymax, x0, xmax); + } else { + merge_float_basic_relu(C, cpan1, ldc, y, ymax, x0, xmax, alpha, (k0 == 0 ? beta : 1.f)); + } + } else { + if (flag_alpha && (k0 > 0) || flag_beta) { + merge_float_alpha1_beta1(C, cpan1, ldc, y, ymax, x0, xmax); + } else { + merge_float_basic(C, cpan1, ldc, y, ymax, x0, xmax, alpha, (k0 == 0 ? beta : 1.f)); + } + } + } + } + index++; + } +} + +#ifdef __aarch64__ +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 8x12), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. 
+void sgemm_impl(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, \ + int bblocks, int K, long int row_jump, long int block_jump) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb(out); + const uint32_t *inptr = reinterpret_cast(in); + + +#ifdef __aarch64__ + // todo +#else + uint32_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + //! data A is not transposed, transpose A to k * 6 + for (int y = m0; y < mmax; y += 6) { + const uint32_t *inptr0 = inptr + y * ldin + k0; + const uint32_t *inptr1 = inptr0 + ldin; + const uint32_t *inptr2 = inptr1 + ldin; + const uint32_t *inptr3 = inptr2 + ldin; + const uint32_t *inptr4 = inptr3 + ldin; + const uint32_t *inptr5 = inptr4 + ldin; + + int x = kmax - k0; + + for (; x > 7; x -= 8) { + //! cope with row index exceed real size, set to zero buffer + if ((y + 5) >= mmax) { + switch ((y + 5) - mmax) { + case 4: + inptr1 = zerobuff; + case 3: + inptr2 = zerobuff; + case 2: + inptr3 = zerobuff; + case 1: + inptr4 = zerobuff; + case 0: + inptr5 = zerobuff; + default: + break; + } + } + //! zip load 8 elements (2 neon Q registers) from each of 6 rows + asm volatile ( +#if 0 + "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" + "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" + "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; q2=r04,r14,r05,r15\n" + "vst1.32 {d0}, [%[outptr]]! @ write d0(q0,low),r00,r10\n" + + "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" + "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" + "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; q6=r24,r34,r25,r35\n" + "vst1.32 {d8}, [%[outptr]]! @ write d8(q4,low),r20,r30\n" + + "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" + "vld4.32 {d20-d23}, [%[inptr5]]! 
@ zip load r5, q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" + "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; q10=r44,r54,r45,r55\n" + "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" + + //"pld [%[inptr0], #128] @ preload r0 data to cache, fill pipeline\n" + "vst1.32 {d1}, [%[outptr]]! @ write d1(q0,high),r01,r11\n" + "vst1.32 {d9}, [%[outptr]]! @ write d9(q4,high),r21,r31\n" + "vst1.32 {d17}, [%[outptr]]! @ write d17(q8,high),r41,r51\n" + + "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; q3=r06,r16,r07,r17\n" + "vst1.32 {d2}, [%[outptr]]! @ write d2(q1,low),r02,r12\n" + "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; q7=r26,r36,r27,r37\n" + "vst1.32 {d10}, [%[outptr]]! @ write d10(q5,low),r22,r32\n" + "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; q11=r46,r56,r47,r57\n" + "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" + + //"pld [%[inptr1], #128] @ preload r1 data to cache, fill pipeline\n" + "vst1.32 {d3}, [%[outptr]]! @ write d3(q1,high),r03,r13\n" + "vst1.32 {d11}, [%[outptr]]! @ write d11(q5,high),r23,r33\n" + "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" + + //"pld [%[inptr2], #128] @ preload r2 data to cache, fill pipeline\n" + "vst1.32 {d4}, [%[outptr]]! @ write d4(q2,low),r04,r14\n" + "vst1.32 {d12}, [%[outptr]]! @ write d12(q6,low),r24,r34\n" + "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" + + //"pld [%[inptr3], #128] @ preload r3 data to cache, fill pipeline\n" + "vst1.32 {d5}, [%[outptr]]! @ write d5(q2,high),r05,r15\n" + "vst1.32 {d13}, [%[outptr]]! @ write d13(q6,high),r25,r35\n" + "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" + + //"pld [%[inptr4], #128] @ preload r4 data to cache, fill pipeline\n" + "vst1.32 {d6}, [%[outptr]]! @ write d6(q3,low),r06,r16\n" + "vst1.32 {d14}, [%[outptr]]! @ write d14(q7,low),r26,r36\n" + "vst1.32 {d22}, [%[outptr]]! 
@ write d22(q11,low),r46,r56\n" + + //"pld [%[inptr5], #128] @ preload r5 data to cache, fill pipeline\n" + "vst1.32 {d7}, [%[outptr]]! @ write d7(q3,high),r07,r17\n" + "vst1.32 {d15}, [%[outptr]]! @ write d15(q7,high),r27,r37\n" + "vst1.32 {d23}, [%[outptr]]! @ write d23(q11,high),r47,r57\n" +#else + "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" + "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" + "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" + "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" + "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" + "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" + + "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; q2=r04,r14,r05,r15\n" + "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; q6=r24,r34,r25,r35\n" + "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; q10=r44,r54,r45,r55\n" + + "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; q4=r01,r11,r21,r31\n" + "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" + "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" + "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" + "vst1.32 {d17}, [%[outptr]]! @ write d16(q8,high),r41,r51\n" + + "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; q3=r06,r16,r07,r17\n" + "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; q7=r26,r36,r27,r37\n" + "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; q11=r46,r56,r47,r57\n" + + "vswp d3, d10 @ swap d3, d10, q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" + "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" + "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" + "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" + "vst1.32 {d19}, [%[outptr]]! 
@ write d19(q9,high),r43,r53\n" + + "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; q6=r05,r15,r25,r35\n" + "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" + "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" + "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" + "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" + + "vswp d7, d14 @ swap d7, d14, q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" + "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" + "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" + "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" + "vst1.32 {d23}, [%[outptr]]! @ write d23(q11,high),r47,r57\n" +#endif + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), \ + [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), \ + [outptr] "+r" (outptr) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" + ); + } + + for (; x > 0; x--) { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + } + } +#endif //__aarch64__ +} + +void load_bpanel_trans(float* out, const float* in, const int ldin, const int k0, \ + const int kmax, const int n0, const int nmax) { + + uint32_t *outptr = reinterpret_cast(out); + const uint32_t *inptr = reinterpret_cast(in) + k0 * ldin + n0; + + uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + int x_len = nmax - n0; + int y_len = kmax - k0; + int right_remain = x_len - 8 * (x_len / 8); + int right_pad = 8 - right_remain; + const size_t copy_len_remain = sizeof(float) * right_remain; + const size_t copy_len_pad = sizeof(float) * right_pad; + const size_t size_ldin = sizeof(float) * ldin; + + uint32_t *outptr_row =outptr; + int stride_out = 8 * y_len; + + uint32x4_t vzero = vdupq_n_u32(0); + uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); + uint32x4_t vmask2 = 
vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); + +#pragma omp parallel for + for (int y = 0; y < y_len - 3; y += 4) { + + const uint32_t* ptr0 = inptr + y * ldin; + const uint32_t* ptr1 = ptr0 + ldin; + const uint32_t* ptr2 = ptr1 + ldin; + const uint32_t* ptr3 = ptr2 + ldin; +#if 0 + const uint32_t* ptr4 = ptr3 + ldin; + const uint32_t* ptr5 = ptr4 + ldin; + const uint32_t* ptr6 = ptr5 + ldin; + const uint32_t* ptr7 = ptr6 + ldin; + const uint32_t* ptr8 = ptr7 + ldin; + const uint32_t* ptr9 = ptr8 + ldin; +#endif + uint32_t *outptr_row_col = outptr_row + y * 8; + int i = 0; + for (; i < x_len - 7; i += 8) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" + + "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" +#if 0 + "vld1.32 {d0-d3}, [%[ptr4]]! @ load r4, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr5]]! @ load r5, 8 elements\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" + + "vld1.32 {d0-d3}, [%[ptr6]]! @ load r6, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr7]]! @ load r7, 8 elements\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" + + "vld1.32 {d0-d3}, [%[ptr8]]! @ load r8, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr9]]! @ load r9, 8 elements\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d7}, [%[outptr]]! 
@ write to output ptr\n" +#endif + : [outptr] "+r" (ptr_out), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), + [ptr2] "+r" (ptr2), [ptr3] "+r" (ptr3) +#if 0 + , [ptr4] "+r" (ptr4), [ptr5] "+r" (ptr5), \ + [ptr6] "+r" (ptr6), [ptr7] "+r" (ptr7), [ptr8] "+r" (ptr8), \ + [ptr9] "+r" (ptr9) +#endif + : + : "q0", "q1", "q2", "q3", "memory" + ); + outptr_row_col += stride_out; + } + if (right_pad > 0) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" + + "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" +#if 0 + "vld1.32 {d0-d3}, [%[ptr4]]! @ load r4, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr5]]! @ load r5, 8 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d4-d7}, [%[outptr]]! 
@ write to output ptr\n" + + "vld1.32 {d0-d3}, [%[ptr6]]! @ load r6, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr7]]! @ load r7, 8 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" + + "vld1.32 {d0-d3}, [%[ptr8]]! @ load r8, 8 elements\n" + "vld1.32 {d4-d7}, [%[ptr9]]! @ load r9, 8 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" +#endif + : [outptr] "+r" (ptr_out), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), + [ptr2] "+r" (ptr2), [ptr3] "+r" (ptr3) +#if 0 + , [ptr4] "+r" (ptr4), [ptr5] "+r" (ptr5), \ + [ptr6] "+r" (ptr6), [ptr7] "+r" (ptr7), [ptr8] "+r" (ptr8), \ + [ptr9] "+r" (ptr9) +#endif + : [vmask1] "w" (vmask1), [vmask2] "w" (vmask2), \ + [vzero] "w" (vzero) + : "q0", "q1", "q2", "q3", "memory" + ); + } + //outptr_row += 32; + } + +#pragma omp parallel for + for (int y = 4 * (y_len / 4); y < y_len; ++y) { + + const uint32_t* ptr0 = inptr + y * ldin; + uint32_t *outptr_row_col = outptr_row + y * 8; + int i = 0; + for (; i < x_len - 7; i += 8) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + : [ptr0] "+r" (ptr0), [outptr] "+r" (ptr_out) + : + : "q0", "q1", "memory" + ); + outptr_row_col += stride_out; + } + if (right_pad > 0) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d3}, [%[ptr0]]! 
@ load r0, 8 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" + : [ptr0] "+r" (ptr0), [outptr] "+r" (ptr_out) + : [vmask1] "w" (vmask1), [vmask2] "w" (vmask2), \ + [vzero] "w" (vzero) + : "q0", "q1", "memory" + ); + } + //outptr_row += 8; + } +} + +void load_apanel_trans(float* out, const float* in, const int ldin, const int m0, \ + const int mmax, const int k0, const int kmax) { + uint32_t *outptr = reinterpret_cast(out); + const uint32_t *inptr = reinterpret_cast(in) + k0 * ldin + m0; + + uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + int x_len = mmax - m0; + int y_len = kmax - k0; + int right_remain = x_len - 6 * (x_len / 6); + int right_pad = 6 - right_remain; + + uint32_t *outptr_row = outptr; + int stride_out = 6 * y_len; + + uint32x4_t vzero = vdupq_n_u32(0); + uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); + uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); + +#pragma omp parallel for + for (int y = 0; y < y_len - 3; y += 4) { + + const uint32_t* ptr0 = inptr + y * ldin; + const uint32_t* ptr1 = ptr0 + ldin; + const uint32_t* ptr2 = ptr1 + ldin; + const uint32_t* ptr3 = ptr2 + ldin; + + uint32_t *outptr_row_col = outptr_row + y * 6; + int i = 0; + for (; i < x_len - 5; i += 6) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" + "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" + "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" + + "vld1.32 {d0-d2}, [%[ptr2]]! @ load r2, 6 elements\n" + "vld1.32 {d4-d6}, [%[ptr3]]! @ load r3, 6 elements\n" + "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d6}, [%[outptr]]! 
@ write to output ptr\n" + : [outptr] "+r" (ptr_out), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), + [ptr2] "+r" (ptr2), [ptr3] "+r" (ptr3) + : + : "q0", "q1", "q2", "q3", "memory" + ); + outptr_row_col += stride_out; + } + if (right_pad > 0) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" + "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" + "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif d6, %e[vzero], %e[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" + + "vld1.32 {d0-d2}, [%[ptr2]]! @ load r2, 8 elements\n" + "vld1.32 {d4-d6}, [%[ptr3]]! @ load r3, 8 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" + "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif d6, %e[vzero], %e[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" + "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" + : [outptr] "+r" (ptr_out), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), + [ptr2] "+r" (ptr2), [ptr3] "+r" (ptr3) + : [vmask1] "w" (vmask1), [vmask2] "w" (vmask2), \ + [vzero] "w" (vzero) + : "q0", "q1", "q2", "q3", "memory" + ); + } + } + +#pragma omp parallel for + for (int y = 4 * (y_len / 4); y < y_len; ++y) { + + const uint32_t* ptr0 = inptr + y * ldin; + uint32_t *outptr_row_col = outptr_row + y * 6; + int i = 0; + for (; i < x_len - 5; i += 6) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" + "vst1.32 {d0-d2}, [%[outptr]]! 
@ write to output ptr\n" + : [ptr0] "+r" (ptr0), [outptr] "+r" (ptr_out) + : + : "q0", "q1", "memory" + ); + outptr_row_col += stride_out; + } + if (right_pad > 0) { + uint32_t *ptr_out = outptr_row_col; + asm volatile( + "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" + "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" + "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" + "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" + : [ptr0] "+r" (ptr0), [outptr] "+r" (ptr_out) + : [vmask1] "w" (vmask1), [vmask2] "w" (vmask2), \ + [vzero] "w" (vzero) + : "q0", "q1", "memory" + ); + } + } +} + +void load_bpanel_no_trans(float* out, const float* in, const int ldin, const int k0, \ + const int kmax, const int n0, const int nmax) { + uint32_t *outptr = reinterpret_cast(out); + const uint32_t *inptr = reinterpret_cast(in); +#ifdef __aarch64__ + // todo +#else + uint32_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + //! data B is not transposed, transpose B to k * 8 + for (int y = n0; y < nmax; y += 8) { + const uint32_t *inptr0 = inptr + y * ldin + k0; + const uint32_t *inptr1 = inptr0 + ldin; + const uint32_t *inptr2 = inptr1 + ldin; + const uint32_t *inptr3 = inptr2 + ldin; + const uint32_t *inptr4 = inptr3 + ldin; + const uint32_t *inptr5 = inptr4 + ldin; + const uint32_t *inptr6 = inptr5 + ldin; + const uint32_t *inptr7 = inptr6 + ldin; + + int x = kmax - k0; + + for (; x > 7; x -= 8) { + //! cope with row index exceed real size, set to zero buffer + if ((y + 7) >= nmax) { + switch ((y + 7) - nmax) { + case 6: + inptr1 = zerobuff; + case 5: + inptr2 = zerobuff; + case 4: + inptr3 = zerobuff; + case 3: + inptr4 = zerobuff; + case 2: + inptr5 = zerobuff; + case 1: + inptr6 = zerobuff; + case 0: + inptr7 = zerobuff; + default: + break; + } + } + //! zip load 8 elements (2 neon Q registers) from each of 8 rows + asm volatile ( + "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" + "vld4.32 {d4-d7}, [%[inptr1]]! 
@ zip load r1, q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" + "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; q2=r04,r14,r05,r15\n" + "vst1.32 {d0}, [%[outptr]]! @ write d0(q0,low),r00,r10\n" + + "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" + "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" + "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; q6=r24,r34,r25,r35\n" + "vst1.32 {d8}, [%[outptr]]! @ write d8(q4,low),r20,r30\n" + + "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" + "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" + "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; q10=r44,r54,r45,r55\n" + "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" + + "vld4.32 {d24-d27}, [%[inptr6]]! @ zip load r6, q12,q13=r60,r64,r61,r65,r62,r66,r63,r67\n" + "vld4.32 {d28-d31}, [%[inptr7]]! @ zip load r7, q14,q15=r70,r74,r71,r75,r72,r76,r73,r77\n" + "vtrn.32 q12, q14 @ trans data:q12=r60,r70,r61,r71; q14=r64,r74,r65,r75\n" + "vst1.32 {d24}, [%[outptr]]! @ write d24(q8,low),r60,r70\n" + + //"pld [%[inptr0], #128] @ preload r0 data to cache, fill pipeline\n" + "vst1.32 {d1}, [%[outptr]]! @ write d1(q0,high),r01,r11\n" + "vst1.32 {d9}, [%[outptr]]! @ write d9(q4,high),r21,r31\n" + "vst1.32 {d17}, [%[outptr]]! @ write d17(q8,high),r41,r51\n" + "vst1.32 {d25}, [%[outptr]]! @ write d25(q12,high),r61,r71\n" + + "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; q3=r06,r16,r07,r17\n" + "vst1.32 {d2}, [%[outptr]]! @ write d2(q1,low),r02,r12\n" + "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; q7=r26,r36,r27,r37\n" + "vst1.32 {d10}, [%[outptr]]! @ write d10(q5,low),r22,r32\n" + "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; q11=r46,r56,r47,r57\n" + "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" + "vtrn.32 q13, q15 @ trans data:q13=r62,r72,r63,r73; q15=r66,r76,r67,r77\n" + "vst1.32 {d26}, [%[outptr]]! 
@ write d18(q9,low),r62,r72\n" + + //"pld [%[inptr1], #128] @ preload r1 data to cache, fill pipeline\n" + "vst1.32 {d3}, [%[outptr]]! @ write d3(q1,high),r03,r13\n" + "vst1.32 {d11}, [%[outptr]]! @ write d11(q5,high),r23,r33\n" + "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" + "vst1.32 {d27}, [%[outptr]]! @ write d27(q13,high),r63,r73\n" + + //"pld [%[inptr2], #128] @ preload r2 data to cache, fill pipeline\n" + "vst1.32 {d4}, [%[outptr]]! @ write d4(q2,low),r04,r14\n" + "vst1.32 {d12}, [%[outptr]]! @ write d12(q6,low),r24,r34\n" + "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" + "vst1.32 {d28}, [%[outptr]]! @ write d28(q14,low),r64,r74\n" + + //"pld [%[inptr3], #128] @ preload r3 data to cache, fill pipeline\n" + "vst1.32 {d5}, [%[outptr]]! @ write d5(q2,high),r05,r15\n" + "vst1.32 {d13}, [%[outptr]]! @ write d13(q6,high),r25,r35\n" + "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" + "vst1.32 {d29}, [%[outptr]]! @ write d29(q14,high),r65,r75\n" + + //"pld [%[inptr4], #128] @ preload r4 data to cache, fill pipeline\n" + "vst1.32 {d6}, [%[outptr]]! @ write d6(q3,low),r06,r16\n" + "vst1.32 {d14}, [%[outptr]]! @ write d14(q7,low),r26,r36\n" + "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" + "vst1.32 {d30}, [%[outptr]]! @ write d30(q15,low),r66,r76\n" + + //"pld [%[inptr5], #128] @ preload r5 data to cache, fill pipeline\n" + "vst1.32 {d7}, [%[outptr]]! @ write d7(q3,high),r07,r17\n" + "vst1.32 {d15}, [%[outptr]]! @ write d15(q7,high),r27,r37\n" + "vst1.32 {d23}, [%[outptr]]! @ write d23(q11,high),r47,r57\n" + "vst1.32 {d31}, [%[outptr]]! 
@ write d31(q15,high),r67,r77\n" + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), \ + [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), \ + [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7),[outptr] "+r" (outptr) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", + "q13", "q14", "q15" + ); + } + + for (; x > 0; x--) { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } +#endif //__aarch64__ +} + +void merge_float_basic(float *out, const float *in, const int ldout, const int y0, \ + const int ymax, const int x0, const int xmax, const float alpha, const float beta) { + const float *inptr = in; + + float32x4_t av = vdupq_n_f32(alpha); + float32x4_t bv = vdupq_n_f32(beta); + +#ifdef __aarch64__ + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + float *outptr6 = outptr5 + ldout; + float *outptr7 = outptr6 + ldout; + + for (int i = x0; i < xmax; i += 12) { + float dummyres[12]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. */ + if ((y+7) >= ymax) { + switch ((y + 7) - ymax) { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + default: + break; + } + } + + /* For ragged X, manually copy over the valid results. 
*/ + if ((i + 11) >= xmax) { + for (int xi = 0; xi < 12; xi++) { + if ((i + xi) < xmax) { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); + outptr6++; + *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); + outptr7++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + asm volatile ( + // Rows 0-1 + "LDP q16, q17, [%[outptr0]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr0], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr1]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr1], #32]\n" + "pld [%[inptr], #768]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr]]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #32]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[inptr], #832]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr0]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr0]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[inptr], #896]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr1]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr1]], #16\n" + + // Rows 2-3 + "LDP q16, q17, [%[outptr2]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr2], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr3]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr3], #32]\n" + "pld [%[inptr], #960]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], 
#96]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #128]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[inptr], #1024]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr2]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr2]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[inptr], #1088]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr3]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr3]], #16\n" + + // Rows 4-5 + "pld [%[outptr0], #80]\n" + "LDP q16, q17, [%[outptr4]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr4], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr5]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr5], #32]\n" + "pld [%[outptr1], #80]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #192]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #224]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[outptr2], #80]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr4]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr4]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[outptr3], #80]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr5]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr5]], #16\n" + + // Rows 6-7 + "pld [%[outptr4], #80]\n" + "LDP q16, q17, [%[outptr6]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr6], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr7]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr7], #32]\n" + "pld [%[outptr5], #80]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #288]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #320]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, 
[%[inptr], #352]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[outptr6], #128]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr6]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr6]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[outptr7], #128]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr7]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr7]], #16\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), \ + [inptr] "+r" (inptr) + : [av] "w" (av), [bv] "w" (bv) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", \ + "q18", "q19", "q20", "q21" + ); + } + } + } +#else + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + + for (int i=x0; i= ymax) { + switch ((y + 5) - ymax) { + case 4: + outptr1 = dummyres; + case 3: + outptr2 = dummyres; + case 2: + outptr3 = dummyres; + case 1: + outptr4 = dummyres; + case 0: + outptr5 = dummyres; + default: + break; + } + } + if ((i + 7) >= xmax) { + for (int xi = 0; xi < 8; xi++) { + if ((i+xi) < xmax) { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta); + outptr5++; + } + } + inptr += 48; + } else { + asm volatile ( + //! 
Rows 0-1 + "VLD1.32 {d8-d11}, [%[outptr0]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr1]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" + "pld [%[inptr], #352]\n" + "VMLA.f32 q5, q1, %q[av]\n" + "VST1.32 {d8-d11}, [%[outptr0]]!\n" + "pld [%[inptr], #416]\n" + "VMLA.f32 q6, q2, %q[av]\n" + "pld [%[inptr], #480]\n" + "VMLA.f32 q7, q3, %q[av]\n" + "VST1.32 {d12-d15}, [%[outptr1]]!\n" + + // Rows 2-3 + "VLD1.32 {d8-d11}, [%[outptr2]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr3]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" + "pld [%[outptr0], #96]\n" + "VMLA.f32 q5, q1, %q[av]\n" + "VST1.32 {d8-d11}, [%[outptr2]]!\n" + "pld [%[outptr1], #96]\n" + "VMLA.f32 q6, q2, %q[av]\n" + "pld [%[outptr2], #96]\n" + "VMLA.f32 q7, q3, %q[av]\n" + "VST1.32 {d12-d15}, [%[outptr3]]!\n" + + // Rows 4-5 + "VLD1.32 {d8-d11}, [%[outptr4]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr5]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" + "pld [%[outptr3], #96]\n" + "VMLA.f32 q5, q1, %q[av]\n" + "VST1.32 {d8-d11}, [%[outptr4]]!\n" + "pld [%[outptr4], #96]\n" + "VMLA.f32 q6, q2, %q[av]\n" + "pld [%[outptr5], #128]\n" + "VMLA.f32 q7, q3, %q[av]\n" + "VST1.32 {d12-d15}, [%[outptr5]]!\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [inptr] "+r" (inptr) + : [av] "w" (av), [bv] "w" (bv) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); + } + } + } +#endif // end of 
__aarch64__ +} + +void merge_float_basic_relu(float *out, const float *in, const int ldout, const int y0, \ + const int ymax, const int x0, const int xmax, const float alpha, const float beta) { + const float *inptr = in; + + float32x4_t av = vdupq_n_f32(alpha); + float32x4_t bv = vdupq_n_f32(beta); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + float *outptr6 = outptr5 + ldout; + float *outptr7 = outptr6 + ldout; + + for (int i = x0; i < xmax; i += 12) { + float dummyres[12]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. */ + if ((y+7) >= ymax) { + switch ((y + 7) - ymax) { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + default: + break; + } + } + + /* For ragged X, manually copy over the valid results. 
*/ + if ((i + 11) >= xmax) { + for (int xi = 0; xi < 12; xi++) { + if ((i + xi) < xmax) { + + outptr0[0] = alpha * inptr[xi] + beta * outptr0[0]; + outptr0[0] = fmaxf(outptr0[0], 0.f); + outptr0++; + + outptr1[0] = alpha * inptr[xi + 12] + beta * outptr1[0]; + outptr1[0] = fmaxf(outptr1[0], 0.f); + outptr1++; + + outptr2[0] = alpha * inptr[xi + 24] + beta * outptr2[0]; + outptr2[0] = fmaxf(outptr2[0], 0.f); + outptr2++; + + outptr3[0] = alpha * inptr[xi + 36] + beta * outptr3[0]; + outptr3[0] = fmaxf(outptr3[0], 0.f); + outptr3++; + + outptr4[0] = alpha * inptr[xi + 48] + beta * outptr4[0]; + outptr4[0] = fmaxf(outptr4[0], 0.f); + outptr4++; + + outptr5[0] = alpha * inptr[xi + 60] + beta * outptr5[0]; + outptr5[0] = fmaxf(outptr5[0], 0.f); + outptr5++; + + outptr6[0] = alpha * inptr[xi + 72] + beta * outptr6[0]; + outptr6[0] = fmaxf(outptr6[0], 0.f); + outptr6++; + + outptr7[0] = alpha * inptr[xi + 84] + beta * outptr7[0]; + outptr7[0] = fmaxf(outptr7[0], 0.f); + outptr7++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + asm volatile ( + // Rows 0-1 + "LDP q16, q17, [%[outptr0]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr0], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr1]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr1], #32]\n" + "pld [%[inptr], #768]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr]]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #32]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[inptr], #832]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr0]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr0]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[inptr], #896]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + 
"FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr1]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr1]], #16\n" + + // Rows 2-3 + "LDP q16, q17, [%[outptr2]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr2], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr3]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr3], #32]\n" + "pld [%[inptr], #960]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #96]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #128]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[inptr], #1024]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr2]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr2]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[inptr], #1088]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + "FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr3]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr3]], #16\n" + + // Rows 4-5 + "pld [%[outptr0], #80]\n" + "LDP q16, q17, [%[outptr4]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr4], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr5]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr5], #32]\n" + "pld [%[outptr1], #80]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #192]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #224]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[outptr2], #80]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr4]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, 
[%[outptr4]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[outptr3], #80]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + "FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr5]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr5]], #16\n" + + // Rows 6-7 + "pld [%[outptr4], #80]\n" + "LDP q16, q17, [%[outptr6]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr6], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr7]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr7], #32]\n" + "pld [%[outptr5], #80]\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #288]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #320]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #352]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "pld [%[outptr6], #128]\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr6]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr6]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "pld [%[outptr7], #128]\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + "FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr7]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr7]], #16\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), \ + [inptr] "+r" (inptr) + : [av] "w" (av), [bv] "w" (bv), [vzero] "w" (vzero) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", \ + "q18", "q19", "q20", "q21" + ); + } + } + } +#else + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + 
ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + + for (int i=x0; i= ymax) { + switch ((y + 5) - ymax) { + case 4: + outptr1 = dummyres; + case 3: + outptr2 = dummyres; + case 2: + outptr3 = dummyres; + case 1: + outptr4 = dummyres; + case 0: + outptr5 = dummyres; + default: + break; + } + } + + if ((i + 7) >= xmax) { + for (int xi = 0; xi < 8; xi++) { + if ((i + xi) < xmax) { + outptr0[0] = alpha * inptr[xi] + beta * outptr0[0]; + outptr0[0] = fmaxf(outptr0[0], 0.f); + outptr0++; + + outptr1[0] = alpha * inptr[xi + 8] + beta * outptr1[0]; + outptr1[0] = fmaxf(outptr1[0], 0.f); + outptr1++; + + outptr2[0] = alpha * inptr[xi + 16] + beta * outptr2[0]; + outptr2[0] = fmaxf(outptr2[0], 0.f); + outptr2++; + + outptr3[0] = alpha * inptr[xi + 24] + beta * outptr3[0]; + outptr3[0] = fmaxf(outptr3[0], 0.f); + outptr3++; + + outptr4[0] = alpha * inptr[xi + 32] + beta * outptr4[0]; + outptr4[0] = fmaxf(outptr4[0], 0.f); + outptr4++; + + outptr5[0] = alpha * inptr[xi + 40] + beta * outptr5[0]; + outptr5[0] = fmaxf(outptr5[0], 0.f); + outptr5++; + } + } + inptr += 48; + } else { + asm volatile ( + //! 
Rows 0-1 + "VLD1.32 {d8-d11}, [%[outptr0]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr1]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" + "pld [%[inptr], #352]\n" + "VMLA.f32 q5, q1, %q[av]\n" + "VMAX.f32 q4, q4, %q[vzero]\n" + "VMAX.f32 q5, q5, %q[vzero]\n" + "VST1.32 {d8-d11}, [%[outptr0]]!\n" + "pld [%[inptr], #416]\n" + "VMLA.f32 q6, q2, %q[av]\n" + "pld [%[inptr], #480]\n" + "VMLA.f32 q7, q3, %q[av]\n" + "VMAX.f32 q6, q6, %q[vzero]\n" + "VMAX.f32 q7, q7, %q[vzero]\n" + "VST1.32 {d12-d15}, [%[outptr1]]!\n" + + // Rows 2-3 + "VLD1.32 {d8-d11}, [%[outptr2]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr3]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" + "pld [%[outptr0], #96]\n" + "VMLA.f32 q5, q1, %q[av]\n" + "VMAX.f32 q4, q4, %q[vzero]\n" + "VMAX.f32 q5, q5, %q[vzero]\n" + "VST1.32 {d8-d11}, [%[outptr2]]!\n" + "pld [%[outptr1], #96]\n" + "VMLA.f32 q6, q2, %q[av]\n" + "pld [%[outptr2], #96]\n" + "VMLA.f32 q7, q3, %q[av]\n" + "VMAX.f32 q6, q6, %q[vzero]\n" + "VMAX.f32 q7, q7, %q[vzero]\n" + "VST1.32 {d12-d15}, [%[outptr3]]!\n" + + // Rows 4-5 + "VLD1.32 {d8-d11}, [%[outptr4]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr5]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" + "pld [%[outptr3], #96]\n" + "VMLA.f32 q5, q1, %q[av]\n" + "VMAX.f32 q4, q4, %q[vzero]\n" + "VMAX.f32 q5, q5, %q[vzero]\n" + "VST1.32 {d8-d11}, [%[outptr4]]!\n" + "pld [%[outptr4], #96]\n" + "VMLA.f32 q6, q2, %q[av]\n" + "pld [%[outptr5], #128]\n" + "VMLA.f32 q7, q3, %q[av]\n" + "VMAX.f32 q6, q6, 
%q[vzero]\n" + "VMAX.f32 q7, q7, %q[vzero]\n" + "VST1.32 {d12-d15}, [%[outptr5]]!\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [inptr] "+r" (inptr) + : [av] "w" (av), [bv] "w" (bv), [vzero] "w" (vzero) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); + } + } + } +#endif // end of __aarch64__ +} + +void merge_float_alpha1_beta1(float *out, const float *in, const int ldout, const int y0, \ + const int ymax, const int x0, const int xmax) { + const float *inptr = in; + +#ifdef __aarch64__ + + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + float *outptr6 = outptr5 + ldout; + float *outptr7 = outptr6 + ldout; + + for (int i = x0; i < xmax; i += 12) { + float dummyres[12]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. */ + if ((y+7) >= ymax) { + switch ((y + 7) - ymax) { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + default: + break; + } + } + + /* For ragged X, manually copy over the valid results. 
*/ + if ((i + 11) >= xmax) { + for (int xi = 0; xi < 12; xi++) { + if ((i + xi) < xmax) { + *outptr0 = inptr[xi] + *outptr0; + outptr0++; + *outptr1 = inptr[xi + 12] + *outptr1; + outptr1++; + *outptr2 = inptr[xi + 24] + *outptr2; + outptr2++; + *outptr3 = inptr[xi + 36] + *outptr3; + outptr3++; + *outptr4 = inptr[xi + 48] + *outptr4; + outptr4++; + *outptr5 = inptr[xi + 60] + *outptr5; + outptr5++; + *outptr6 = inptr[xi + 72] + *outptr6; + outptr6++; + *outptr7 = inptr[xi + 84] + *outptr7; + outptr7++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + asm volatile ( + // Rows 0-1 + "LDP q16, q17, [%[outptr0]]\n" + "LDR q18, [%[outptr0], #32]\n" + "LDP q19, q20, [%[outptr1]]\n" + "LDR q21, [%[outptr1], #32]\n" + "pld [%[inptr], #768]\n" + "LDP q0, q1, [%[inptr]]\n" + "LDP q2, q3, [%[inptr], #32]\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[inptr], #832]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "STP q16, q17, [%[outptr0]], #32\n" + "FADD v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr0]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[inptr], #896]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "STP q19, q20, [%[outptr1]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr1]], #16\n" + + // Rows 2-3 + "LDP q16, q17, [%[outptr2]]\n" + "LDR q18, [%[outptr2], #32]\n" + "LDP q19, q20, [%[outptr3]]\n" + "LDR q21, [%[outptr3], #32]\n" + "pld [%[inptr], #960]\n" + "LDP q0, q1, [%[inptr], #96]\n" + "LDP q2, q3, [%[inptr], #128]\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[inptr], #1024]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "STP q16, q17, [%[outptr2]], #32\n" + "FADD v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr2]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[inptr], #1088]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "STP q19, q20, [%[outptr3]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr3]], #16\n" + + // Rows 4-5 + "pld [%[outptr0], #80]\n" + "LDP 
q16, q17, [%[outptr4]]\n" + "LDR q18, [%[outptr4], #32]\n" + "LDP q19, q20, [%[outptr5]]\n" + "LDR q21, [%[outptr5], #32]\n" + "pld [%[outptr1], #80]\n" + "LDP q0, q1, [%[inptr], #192]\n" + "LDP q2, q3, [%[inptr], #224]\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[outptr2], #80]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "STP q16, q17, [%[outptr4]], #32\n" + "FADD v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr4]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[outptr3], #80]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "STP q19, q20, [%[outptr5]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr5]], #16\n" + + // Rows 6-7 + "pld [%[outptr4], #80]\n" + "LDP q16, q17, [%[outptr6]]\n" + "LDR q18, [%[outptr6], #32]\n" + "LDP q19, q20, [%[outptr7]]\n" + "LDR q21, [%[outptr7], #32]\n" + "pld [%[outptr5], #80]\n" + "LDP q0, q1, [%[inptr], #288]\n" + "LDP q2, q3, [%[inptr], #320]\n" + "LDP q4, q5, [%[inptr], #352]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[outptr6], #128]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "STP q16, q17, [%[outptr6]], #32\n" + "FADD v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr6]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[outptr7], #128]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "STP q19, q20, [%[outptr7]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr7]], #16\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), \ + [inptr] "+r" (inptr) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", \ + "q18", "q19", "q20", "q21" + ); + } + } + } + +#else + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float 
*outptr5 = outptr4 + ldout; + + for (int i = x0; i < xmax; i += 8) { + float dummyres[8]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. */ + if ((y+5) >= ymax) { + switch ((y + 5) - ymax) { + case 4: + outptr1 = dummyres; + case 3: + outptr2 = dummyres; + case 2: + outptr3 = dummyres; + case 1: + outptr4 = dummyres; + case 0: + outptr5 = dummyres; + default: + break; + } + } + + /* For ragged X, manually copy over the valid results. */ + if ((i+7) >= xmax) { + for (int xi=0; xi<8; xi++) { + if ((i+xi) < xmax) { + *outptr0 = inptr[xi] + *outptr0; + outptr0++; + *outptr1 = inptr[xi + 8] + *outptr1; + outptr1++; + *outptr2 = inptr[xi + 16] + *outptr2; + outptr2++; + *outptr3 = inptr[xi + 24] + *outptr3; + outptr3++; + *outptr4 = inptr[xi + 32] + *outptr4; + outptr4++; + *outptr5 = inptr[xi + 40] + *outptr5; + outptr5++; + } + } + inptr += 48; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + //! Rows 0-1 + "VLD1.32 {d8-d11}, [%[outptr0]]\n" + "VLD1.32 {d12-d15}, [%[outptr1]]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VADD.f32 q4, q0, q4\n" + "pld [%[inptr], #352]\n" + "VADD.f32 q5, q1, q5\n" + "VST1.32 {d8-d11}, [%[outptr0]]!\n" + "pld [%[inptr], #416]\n" + "VADD.f32 q6, q2, q6\n" + "pld [%[inptr], #480]\n" + "VADD.f32 q7, q3, q7\n" + "VST1.32 {d12-d15}, [%[outptr1]]!\n" + + //! 
Rows 2-3 + "VLD1.32 {d8-d11}, [%[outptr2]]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VADD.f32 q4, q0, q4\n" + "pld [%[outptr0], #96]\n" + "VLD1.32 {d12-d15}, [%[outptr3]]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VADD.f32 q5, q1, q5\n" + "VST1.32 {d8-d11}, [%[outptr2]]!\n" + "pld [%[outptr1], #96]\n" + "VADD.f32 q6, q2, q6\n" + "pld [%[outptr2], #96]\n" + "VADD.f32 q7, q3, q7\n" + "VST1.32 {d12-d15}, [%[outptr3]]!\n" + + // Rows 4-5 + "VLD1.32 {d8-d11}, [%[outptr4]]\n" + "VLD1.32 {d12-d15}, [%[outptr5]]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VADD.f32 q4, q0, q4\n" + "pld [%[outptr3], #96]\n" + "VADD.f32 q5, q1, q5\n" + "VST1.32 {d8-d11}, [%[outptr4]]!\n" + "pld [%[outptr4], #96]\n" + "VADD.f32 q6, q2, q6\n" + "pld [%[outptr5], #128]\n" + "VADD.f32 q7, q3, q7\n" + "VST1.32 {d12-d15}, [%[outptr5]]!\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [inptr] "+r" (inptr) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); + } + } + } +#endif // end of __aarch64__ +} + +void merge_float_alpha1_beta1_relu(float *out, const float *in, const int ldout, const int y0, \ + const int ymax, const int x0, const int xmax) { + const float *inptr = in; + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + float *outptr6 = outptr5 + ldout; + float *outptr7 = outptr6 + ldout; + + for (int i = x0; i < xmax; i += 12) { + float dummyres[12]; + if ((y+7) >= ymax) { + switch ((y + 7) - ymax) { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + 
case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + default: + break; + } + } + if ((i + 11) >= xmax) { + for (int xi=0; xi<8; xi++) { + if ((i + xi) < xmax) { + *outptr0 = inptr[xi] + *outptr0; + *outptr0 > 0? *outptr0 : 0.f; + outptr0++; + *outptr1 = inptr[xi + 12] + *outptr1; + *outptr1 > 0? *outptr1 : 0.f; + outptr1++; + *outptr2 = inptr[xi + 24] + *outptr2; + *outptr2 > 0? *outptr2 : 0.f; + outptr2++; + *outptr3 = inptr[xi + 36] + *outptr3; + *outptr3 > 0? *outptr3 : 0.f; + outptr3++; + *outptr4 = inptr[xi + 48] + *outptr4; + *outptr4 > 0? *outptr4 : 0.f; + outptr4++; + *outptr5 = inptr[xi + 60] + *outptr5; + *outptr5 > 0? *outptr5 : 0.f; + outptr5++; + *outptr6 = inptr[xi + 72] + *outptr6; + *outptr6 > 0? *outptr6 : 0.f; + outptr6++; + *outptr7 = inptr[xi + 84] + *outptr7; + *outptr7 > 0? *outptr7 : 0.f; + outptr7++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + asm volatile ( + // Rows 0-1 + "LDP q16, q17, [%[outptr0]]\n" + "LDR q18, [%[outptr0], #32]\n" + "LDP q19, q20, [%[outptr1]]\n" + "LDR q21, [%[outptr1], #32]\n" + "pld [%[inptr], #768]\n" + "LDP q0, q1, [%[inptr]]\n" + "LDP q2, q3, [%[inptr], #32]\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[inptr], #832]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr0]], #32\n" + "FADD v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr0]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[inptr], #896]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + "FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr1]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr1]], #16\n" + + // Rows 2-3 + "LDP q16, q17, [%[outptr2]]\n" + "LDR q18, [%[outptr2], #32]\n" + "LDP q19, q20, [%[outptr3]]\n" + "LDR q21, [%[outptr3], #32]\n" + "pld [%[inptr], #960]\n" + "LDP q0, q1, [%[inptr], #96]\n" + "LDP q2, q3, 
[%[inptr], #128]\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[inptr], #1024]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr2]], #32\n" + "FADD v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr2]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[inptr], #1088]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + "FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr3]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr3]], #16\n" + + // Rows 4-5 + "pld [%[outptr0], #80]\n" + "LDP q16, q17, [%[outptr4]]\n" + "LDR q18, [%[outptr4], #32]\n" + "LDP q19, q20, [%[outptr5]]\n" + "LDR q21, [%[outptr5], #32]\n" + "pld [%[outptr1], #80]\n" + "LDP q0, q1, [%[inptr], #192]\n" + "LDP q2, q3, [%[inptr], #224]\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[outptr2], #80]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr4]], #32\n" + "FADD v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr4]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[outptr3], #80]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + "FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr5]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr5]], #16\n" + + // Rows 6-7 + "pld [%[outptr4], #80]\n" + "LDP q16, q17, [%[outptr6]]\n" + "LDR q18, [%[outptr6], #32]\n" + "LDP q19, q20, [%[outptr7]]\n" + "LDR q21, [%[outptr7], #32]\n" + "pld [%[outptr5], #80]\n" + "LDP q0, q1, [%[inptr], #288]\n" + "LDP q2, q3, [%[inptr], #320]\n" + "LDP q4, q5, [%[inptr], #352]\n" + "FADD v16.4s, v0.4s, v16.4s\n" + "pld [%[outptr6], #128]\n" + "FADD v17.4s, v1.4s, v17.4s\n" + "FMAX v16.4s, v16.4s, %[vzero].4s\n" + "FMAX v17.4s, v17.4s, %[vzero].4s\n" + "STP q16, q17, [%[outptr6]], #32\n" + "FADD 
v18.4s, v2.4s, v18.4s\n" + "STR q18, [%[outptr6]], #16\n" + "FADD v19.4s, v3.4s, v19.4s\n" + "pld [%[outptr7], #128]\n" + "FADD v20.4s, v4.4s, v20.4s\n" + "FMAX v19.4s, v19.4s, %[vzero].4s\n" + "FMAX v20.4s, v20.4s, %[vzero].4s\n" + "STP q19, q20, [%[outptr7]], #32\n" + "FADD v21.4s, v5.4s, v21.4s\n" + "STR q21, [%[outptr7]], #16\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), \ + [inptr] "+r" (inptr) + : [vzero] "w" (vzero) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", \ + "q18", "q19", "q20", "q21" + ); + } + } + } +#else + for (int y = y0; y < ymax; y += 8) { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + + for (int i = x0; i < xmax; i += 8) { + float dummyres[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + if ((y+5) >= ymax) { + switch ((y + 5) - ymax) { + case 4: + outptr1 = dummyres; + case 3: + outptr2 = dummyres; + case 2: + outptr3 = dummyres; + case 1: + outptr4 = dummyres; + case 0: + outptr5 = dummyres; + default: + break; + } + } + if ((i + 7) >= xmax) { + for (int xi = 0; xi < 8; xi++) { + if ((i + xi) < xmax) { + outptr0[0] = inptr[xi] + outptr0[0]; + outptr0[0] = fmaxf(outptr0[0], 0.f); + outptr0++; + *outptr1 = inptr[xi + 8] + *outptr1; + outptr1[0] = fmaxf(outptr1[0], 0.f); + outptr1++; + *outptr2 = inptr[xi + 16] + *outptr2; + outptr2[0] = fmaxf(outptr2[0], 0.f); + outptr2++; + *outptr3 = inptr[xi + 24] + *outptr3; + outptr3[0] = fmaxf(outptr3[0], 0.f); + outptr3++; + *outptr4 = inptr[xi + 32] + *outptr4; + outptr4[0] = fmaxf(outptr4[0], 0.f); + outptr4++; + *outptr5 = inptr[xi + 40] + *outptr5; + outptr5[0] = fmaxf(outptr5[0], 0.f); + outptr5++; 
+ } + } + inptr += 48; + } else { + asm volatile ( + //! Rows 0-1 + "VLD1.32 {d8-d11}, [%[outptr0]]\n" + "VLD1.32 {d12-d15}, [%[outptr1]]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VADD.f32 q4, q0, q4\n" + "pld [%[inptr], #352]\n" + "VADD.f32 q5, q1, q5\n" + "VMAX.f32 q4, q4, %q[vzero]\n" + "VMAX.f32 q5, q5, %q[vzero]\n" + "VST1.32 {d8-d11}, [%[outptr0]]!\n" + "pld [%[inptr], #416]\n" + "VADD.f32 q6, q2, q6\n" + "pld [%[inptr], #480]\n" + "VADD.f32 q7, q3, q7\n" + "VMAX.f32 q6, q6, %q[vzero] \n" + "VMAX.f32 q7, q7, %q[vzero]\n" + "VST1.32 {d12-d15}, [%[outptr1]]!\n" + + //! Rows 2-3 + "VLD1.32 {d8-d11}, [%[outptr2]]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VADD.f32 q4, q0, q4\n" + "pld [%[outptr0], #96]\n" + "VLD1.32 {d12-d15}, [%[outptr3]]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VADD.f32 q5, q1, q5\n" + "VMAX.f32 q4, q4, %q[vzero]\n" + "VMAX.f32 q5, q5, %q[vzero]\n" + "VST1.32 {d8-d11}, [%[outptr2]]!\n" + "pld [%[outptr1], #96]\n" + "VADD.f32 q6, q2, q6\n" + "pld [%[outptr2], #96]\n" + "VADD.f32 q7, q3, q7\n" + "VMAX.f32 q6, q6, %q[vzero]\n" + "VMAX.f32 q7, q7, %q[vzero]\n" + "VST1.32 {d12-d15}, [%[outptr3]]!\n" + + // Rows 4-5 + "VLD1.32 {d8-d11}, [%[outptr4]]\n" + "VLD1.32 {d12-d15}, [%[outptr5]]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VADD.f32 q4, q0, q4\n" + "pld [%[outptr3], #96]\n" + "VADD.f32 q5, q1, q5\n" + "VMAX.f32 q4, q4, %q[vzero]\n" + "VMAX.f32 q5, q5, %q[vzero]\n" + "VST1.32 {d8-d11}, [%[outptr4]]!\n" + "pld [%[outptr4], #96]\n" + "VADD.f32 q6, q2, q6\n" + "pld [%[outptr5], #128]\n" + "VADD.f32 q7, q3, q7\n" + "VMAX.f32 q6, q6, %q[vzero]\n" + "VMAX.f32 q7, q7, %q[vzero]\n" + "VST1.32 {d12-d15}, [%[outptr5]]!\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), \ + [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), \ + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), \ + [inptr] "+r" (inptr) + : [vzero] "w" (vzero) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); + 
} + } + } +#endif // end of __aarch64__ +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/impl/sgemm_arm.h b/saber/lite/funcs/neon/impl/sgemm_arm.h new file mode 100644 index 000000000..25f357b76 --- /dev/null +++ b/saber/lite/funcs/neon/impl/sgemm_arm.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_SGEMM_ARM_H +#define ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_SGEMM_ARM_H + +#include "saber/lite/core/common_lite.h" +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +typedef void (*load_data)(float* out, const float* in, const int ldin, const int m0, \ + const int mmax, const int k0, const int kmax); + +class Sgemm { + +public: + + Sgemm(); + ~Sgemm(); + void init(unsigned int L1_cache, unsigned int L2_cache, unsigned int M, unsigned int N, \ + unsigned int K, bool trA, bool trB, int thread_num = 1); + + //! Actually execute the GEMM. 
+ void operator()(const float *A, const int lda, const float *B, const int ldb, \ + float *C, const int ldc, const float alpha, const float beta, bool flag_relu = false); + +private: + + unsigned int _M; + unsigned int _NN; + unsigned int _K; + + bool _trA; + bool _trB; + + unsigned int _k_block{0}; + unsigned int _x_block{0}; + unsigned int _Mround{0}; + + unsigned int _loop_count{0}; + unsigned int _cblock_size{0}; + int _thread_num{1}; + + void* _work_space_ptr{nullptr}; + void* _align_ptr{nullptr}; + + size_t _work_size{0}; + size_t _a_worksize{0}; + size_t _b_worksize{0}; + load_data _load_a; + load_data _load_b; + + bool _init_flag{false}; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_SGEMM_ARM_H diff --git a/saber/lite/funcs/neon/impl/sgemv_arm.cpp b/saber/lite/funcs/neon/impl/sgemv_arm.cpp new file mode 100644 index 000000000..a283865eb --- /dev/null +++ b/saber/lite/funcs/neon/impl/sgemv_arm.cpp @@ -0,0 +1,815 @@ +#include "saber/lite/funcs/neon/impl/sgemv_arm.h" +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void sgemv(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y) { + float* data_out = y; + const float* data_in = x; + const float* weights_ptr = A; + + int cnt_loop = N >> 3; + int tail = N & 7; + int out_cnt = M >> 2; + + unsigned int imask[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + + uint32x4_t vmask1 = vcgtq_u32(vld1q_u32(imask), vdupq_n_u32(tail)); + uint32x4_t vmask2 = vcgtq_u32(vld1q_u32(imask + 4), vdupq_n_u32(tail)); + +#pragma omp parallel for + for (int j = 0; j < out_cnt; j++) { + + int out_idx = j * 4; + float *ptr_out = data_out + out_idx; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * out_idx); + const float *ptr_w1 = ptr_w0 + N; + const float *ptr_w2 = ptr_w1 + N; + const float *ptr_w3 = ptr_w2 + N; + + int cnt = cnt_loop; + if (cnt > 0) { + 
asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + + "vmov.u32 q0, #0 @ set q0 to 0\n" + "vmov.u32 q1, #0 @ set q1 to 0\n" + "vmov.u32 q2, #0 @ set q2 to 0\n" + "vmov.u32 q3, #0 @ set q3 to 0\n" + + "pld [%[w0], #128] @ preload cache line, weights r0\n" + "pld [%[w1], #128] @ preload cache line, weights r1\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "sgemv_loop: @ main loop\n" + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + //"pld [%[in]] @ preload cache line, in\n" + //"pld [%[in], #128] @ preload cache line, in\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w1], #128] @ preload cache line, weights r1\n" + + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12,q13\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // check loop end + "subs %[cnt], #1 @ sub loop count \n" + "bne sgemv_loop @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_end @ jump to end\n" + + // process tail + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + // deal with right pad + "vmov.u32 q6, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q4, q6, %q[mask1] @ bit select, deal with right pad\n" + "vbif q5, q6, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12, q13\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // pair add to final result + "sgemv_end: @ end processing\n" + "vpadd.f32 d8, d0, d1 @ pair add, first step\n" + "vpadd.f32 d9, d2, d3 @ pair add, first step\n" + "vpadd.f32 d10, d4, d5 @ pair add, first step\n" + "vpadd.f32 d11, d6, d7 @ pair add, first step\n" + + "vpadd.f32 d0, d8, d9 @ pair add, second step\n" + "vpadd.f32 d1, d10, d11 @ pair add, second step\n" + + "vst1.32 {d0-d1}, [%[out]] @ save result\n" + + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), \ + [w2] "+r"(ptr_w2), [w3] "+r"(ptr_w3), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", \ + "q10", "q11", "q12", "q13" + ); + } + } + + //! deal with remains + #pragma omp parallel for + for (int j = out_cnt * 4; j < M; ++j) { + float *ptr_out = data_out + j; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * j); + int cnt = cnt_loop; + asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "vmov.u32 q0, #0 @ set q0 to 0\n" + "pld [%[in], #128] @ preload cache line, input\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "sgemv_loop2: @ main loop\n" + "vld1.32 {d24-d25}, [%[in]]! @ load input, q12,q13\n" + "vld1.32 {d28-d29}, [%[w0]]! @ load weights r0, q14\n" + + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q12, q14 @ mul add\n" + + "vld1.32 {d26-d27}, [%[in]]! @ load input, q12,q13\n" + "vld1.32 {d30-d31}, [%[w0]]! 
@ load weights r0, q14\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q13, q15 @ mul add\n" + "subs %[cnt] , #1 @ sub loop count \n" + "bne sgemv_loop2 @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_end2 @ jump to end\n" + + // process tail + "vld1.32 {d24-d27}, [%[in]]! @ load input, q12,q13\n" + // deal with right pad + "vmov.u32 q1, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q12, q1, %q[mask1] @ bit select, deal with right pad\n" + "vbif q13, q1, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d28-d29}, [%[w0]]! @ load weights r0, q14\n" + "vmla.f32 q0, q12, q14 @ mul add\n" + "vld1.32 {d30-d31}, [%[w0]]! @ load weights r0, q15\n" + "vmla.f32 q0, q13, q15 @ mul add\n" + + // pair add to final result + "sgemv_end2: @ end processing\n" + "vpadd.f32 d2, d0, d1 @ pair add, first step\n" + "vpadd.f32 d3, d2, d2 @ pair add, final step\n" + "vst1.32 {d3[0]}, [%[out]] @ save result\n" + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2) + :"q0", "q1", "q12", "q13", "q14", "q15" + ); + } +} + + +void sgemv_relu(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y) { + float* data_out = y; + const float* data_in = x; + const float* weights_ptr = A; + + int cnt_loop = N >> 3; + int tail = N & 7; + int out_cnt = M >> 2; + + unsigned int imask[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + + uint32x4_t vmask1 = vcgtq_u32(vld1q_u32(imask), vdupq_n_u32(tail)); + uint32x4_t vmask2 = vcgtq_u32(vld1q_u32(imask + 4), vdupq_n_u32(tail)); + +#pragma omp parallel for + for (int j = 0; j < out_cnt; j++) { + + int out_idx = j * 4; + float *ptr_out = data_out + out_idx; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * out_idx); + const float *ptr_w1 = ptr_w0 + N; + const float *ptr_w2 = 
ptr_w1 + N; + const float *ptr_w3 = ptr_w2 + N; + + int cnt = cnt_loop; + if (cnt > 0) { + asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + + "vmov.u32 q0, #0 @ set q0 to 0\n" + "vmov.u32 q1, #0 @ set q1 to 0\n" + "vmov.u32 q2, #0 @ set q2 to 0\n" + "vmov.u32 q3, #0 @ set q3 to 0\n" + + "pld [%[w0], #128] @ preload cache line, weights r0\n" + "pld [%[w1], #128] @ preload cache line, weights r1\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "sgemv_relu_loop: @ main loop\n" + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + //"pld [%[in]] @ preload cache line, in\n" + //"pld [%[in], #128] @ preload cache line, in\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w1], #128] @ preload cache line, weights r1\n" + + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12,q13\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // check loop end + "subs %[cnt], #1 @ sub loop count \n" + "bne sgemv_relu_loop @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_relu_end @ jump to end\n" + + // process tail + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + // deal with right pad + "vmov.u32 q6, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q4, q6, %q[mask1] @ bit select, deal with right pad\n" + "vbif q5, q6, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12, q13\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // pair add to final result + "sgemv_relu_end: @ end processing\n" + "vpadd.f32 d8, d0, d1 @ pair add, first step\n" + "vpadd.f32 d9, d2, d3 @ pair add, first step\n" + "vpadd.f32 d10, d4, d5 @ pair add, first step\n" + "vpadd.f32 d11, d6, d7 @ pair add, first step\n" + + "vpadd.f32 d0, d8, d9 @ pair add, second step\n" + "vpadd.f32 d1, d10, d11 @ pair add, second step\n" + + "vmov.u32 q1, #0 @ set q1 to zero, for relu\n" + "vmax.f32 q2, q0, q1 @ relu\n" + + "vst1.32 {d4-d5}, [%[out]] @ save result\n" + + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), \ + [w2] "+r"(ptr_w2), [w3] "+r"(ptr_w3), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", \ + "q10", "q11", "q12", "q13" + ); + } + } + + //! deal with remains +#pragma omp parallel for + for (int j = out_cnt * 4; j < M; ++j) { + float *ptr_out = data_out + j; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * j); + int cnt = cnt_loop; + asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "vmov.u32 q0, #0 @ set q0 to 0\n" + "pld [%[in], #128] @ preload cache line, input\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "sgemv_relu_loop2: @ main loop\n" + "vld1.32 {d24-d25}, [%[in]]! @ load input, q12,q13\n" + "vld1.32 {d28-d29}, [%[w0]]! @ load weights r0, q14\n" + + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q12, q14 @ mul add\n" + + "vld1.32 {d26-d27}, [%[in]]! 
@ load input, q12,q13\n" + "vld1.32 {d30-d31}, [%[w0]]! @ load weights r0, q14\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q13, q15 @ mul add\n" + "subs %[cnt] , #1 @ sub loop count \n" + "bne sgemv_relu_loop2 @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_relu_end2 @ jump to end\n" + + // process tail + "vld1.32 {d24-d27}, [%[in]]! @ load input, q12,q13\n" + // deal with right pad + "vmov.u32 q1, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q12, q1, %q[mask1] @ bit select, deal with right pad\n" + "vbif q13, q1, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d28-d29}, [%[w0]]! @ load weights r0, q14\n" + "vmla.f32 q0, q12, q14 @ mul add\n" + "vld1.32 {d30-d31}, [%[w0]]! @ load weights r0, q15\n" + "vmla.f32 q0, q13, q15 @ mul add\n" + + // pair add to final result + "sgemv_relu_end2: @ end processing\n" + "vpadd.f32 d2, d0, d1 @ pair add, first step\n" + "vpadd.f32 d3, d2, d2 @ pair add, final step\n" + + "vmov.u32 d0, #0 @ set q1 to zero, for relu\n" + "vmax.f32 d1, d3, d0 @ relu\n" + + "vst1.32 {d1[0]}, [%[out]] @ save result\n" + + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2) + :"q0", "q1", "q12", "q13", "q14", "q15" + ); + } +} + +void sgemv_bias(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y, const float* bias) { + float* data_out = y; + const float* data_in = x; + const float* weights_ptr = A; + + int cnt_loop = N >> 3; + int tail = N & 7; + int out_cnt = M >> 2; + + unsigned int imask[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + + uint32x4_t vmask1 = vcgtq_u32(vld1q_u32(imask), vdupq_n_u32(tail)); + uint32x4_t vmask2 = vcgtq_u32(vld1q_u32(imask + 4), vdupq_n_u32(tail)); + +#pragma omp parallel for + for (int j = 0; j < out_cnt; j++) { + + int out_idx = j * 4; + float 
*ptr_out = data_out + out_idx; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * out_idx); + const float *ptr_w1 = ptr_w0 + N; + const float *ptr_w2 = ptr_w1 + N; + const float *ptr_w3 = ptr_w2 + N; + + const float* ptr_bias = bias + out_idx; + + int cnt = cnt_loop; + if (cnt > 0) { + asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + + "vmov.u32 q0, #0 @ set q0 to 0\n" + "vmov.u32 q1, #0 @ set q1 to 0\n" + "vmov.u32 q2, #0 @ set q2 to 0\n" + "vmov.u32 q3, #0 @ set q3 to 0\n" + + "pld [%[w0], #128] @ preload cache line, weights r0\n" + "pld [%[w1], #128] @ preload cache line, weights r1\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "sgemv_bias_loop: @ main loop\n" + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + //"pld [%[in]] @ preload cache line, in\n" + //"pld [%[in], #128] @ preload cache line, in\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w1], #128] @ preload cache line, weights r1\n" + + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12,q13\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // check loop end + "subs %[cnt], #1 @ sub loop count \n" + "bne sgemv_bias_loop @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_bias_end @ jump to end\n" + + // process tail + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + // deal with right pad + "vmov.u32 q6, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q4, q6, %q[mask1] @ bit select, deal with right pad\n" + "vbif q5, q6, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12, q13\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // pair add to final result + "sgemv_bias_end: @ end processing\n" + "vld1.32 {d12-d13}, [%[bias]] @ load weights r0, q6,q7\n" + "vpadd.f32 d8, d0, d1 @ pair add, first step\n" + "vpadd.f32 d9, d2, d3 @ pair add, first step\n" + "vpadd.f32 d10, d4, d5 @ pair add, first step\n" + "vpadd.f32 d11, d6, d7 @ pair add, first step\n" + + "vpadd.f32 d0, d8, d9 @ pair add, second step\n" + "vpadd.f32 d1, d10, d11 @ pair add, second step\n" + + "vadd.f32 q1, q0, q6 @ add bias\n" + + "vst1.32 {d2-d3}, [%[out]] @ save result\n" + + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), \ + [w2] "+r"(ptr_w2), [w3] "+r"(ptr_w3), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2), \ + [bias] "r" (ptr_bias) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", \ + "q10", "q11", "q12", "q13" + ); + } + } + + //! deal with remains +#pragma omp parallel for + for (int j = out_cnt * 4; j < M; ++j) { + float *ptr_out = data_out + j; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * j); + int cnt = cnt_loop; + float32x2_t vbias = vdup_n_f32(bias[j]); + asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "vmov.u32 q0, #0 @ set q0 to 0\n" + "pld [%[in], #128] @ preload cache line, input\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "sgemv_bias_loop2: @ main loop\n" + "vld1.32 {d24-d25}, [%[in]]! @ load input, q12,q13\n" + "vld1.32 {d28-d29}, [%[w0]]! 
@ load weights r0, q14\n" + + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q12, q14 @ mul add\n" + + "vld1.32 {d26-d27}, [%[in]]! @ load input, q12,q13\n" + "vld1.32 {d30-d31}, [%[w0]]! @ load weights r0, q14\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q13, q15 @ mul add\n" + "subs %[cnt] , #1 @ sub loop count \n" + "bne sgemv_bias_loop2 @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_bias_end2 @ jump to end\n" + + // process tail + "vld1.32 {d24-d27}, [%[in]]! @ load input, q12,q13\n" + // deal with right pad + "vmov.u32 q1, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q12, q1, %q[mask1] @ bit select, deal with right pad\n" + "vbif q13, q1, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d28-d29}, [%[w0]]! @ load weights r0, q14\n" + "vmla.f32 q0, q12, q14 @ mul add\n" + "vld1.32 {d30-d31}, [%[w0]]! 
@ load weights r0, q15\n" + "vmla.f32 q0, q13, q15 @ mul add\n" + + // pair add to final result + "sgemv_bias_end2: @ end processing\n" + "vpadd.f32 d2, d0, d1 @ pair add, first step\n" + "vpadd.f32 d3, d2, d2 @ pair add, final step\n" + + "vadd.f32 d3, %P[bias] @ add bias\n" + + "vst1.32 {d3[0]}, [%[out]] @ save result\n" + + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2), \ + [bias] "w" (vbias) + :"q0", "q1", "q12", "q13", "q14", "q15" + ); + } +} + + +void sgemv_bias_relu(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y, const float* bias) { + float* data_out = y; + const float* data_in = x; + const float* weights_ptr = A; + + int cnt_loop = N >> 3; + int tail = N & 7; + int out_cnt = M >> 2; + + unsigned int imask[8] = {7, 6, 5, 4, 3, 2, 1, 0}; + + uint32x4_t vmask1 = vcgtq_u32(vld1q_u32(imask), vdupq_n_u32(tail)); + uint32x4_t vmask2 = vcgtq_u32(vld1q_u32(imask + 4), vdupq_n_u32(tail)); + +#pragma omp parallel for + for (int j = 0; j < out_cnt; j++) { + + int out_idx = j * 4; + float *ptr_out = data_out + out_idx; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * out_idx); + const float *ptr_w1 = ptr_w0 + N; + const float *ptr_w2 = ptr_w1 + N; + const float *ptr_w3 = ptr_w2 + N; + + const float* ptr_bias = bias + out_idx; + + int cnt = cnt_loop; + if (cnt > 0) { + asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + + "vmov.u32 q0, #0 @ set q0 to 0\n" + "vmov.u32 q1, #0 @ set q1 to 0\n" + "vmov.u32 q2, #0 @ set q2 to 0\n" + "vmov.u32 q3, #0 @ set q3 to 0\n" + + "pld [%[w0], #128] @ preload cache line, weights r0\n" + "pld [%[w1], #128] @ 
preload cache line, weights r1\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "sgemv_bias_relu_loop: @ main loop\n" + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + //"pld [%[in]] @ preload cache line, in\n" + //"pld [%[in], #128] @ preload cache line, in\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "pld [%[w1]] @ preload cache line, weights r1\n" + "pld [%[w1], #128] @ preload cache line, weights r1\n" + + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" + "pld [%[w2]] @ preload cache line, weights r2\n" + "pld [%[w2], #128] @ preload cache line, weights r2\n" + + "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" + "pld [%[w3]] @ preload cache line, weights r3\n" + "pld [%[w3], #128] @ preload cache line, weights r3\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // check loop end + "subs %[cnt], #1 @ sub loop count \n" + "bne sgemv_bias_relu_loop @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_bias_relu_end @ jump to end\n" + + // process tail + "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" + // deal with right pad + "vmov.u32 q6, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q4, q6, %q[mask1] @ bit select, deal with right pad\n" + "vbif q5, q6, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" + "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" + "vld1.32 {d20-d23}, [%[w2]]! 
@ load weights r2, q10,q11\n" + "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12, q13\n" + + "vmla.f32 q0, q4, q6 @ mul add\n" + "vmla.f32 q1, q4, q8 @ mul add\n" + "vmla.f32 q2, q4, q10 @ mul add\n" + "vmla.f32 q3, q4, q12 @ mul add\n" + + "vmla.f32 q0, q5, q7 @ mul add\n" + "vmla.f32 q1, q5, q9 @ mul add\n" + "vmla.f32 q2, q5, q11 @ mul add\n" + "vmla.f32 q3, q5, q13 @ mul add\n" + + // pair add to final result + "sgemv_bias_relu_end: @ end processing\n" + "vld1.32 {d12-d13}, [%[bias]] @ load weights r0, q6,q7\n" + "vpadd.f32 d8, d0, d1 @ pair add, first step\n" + "vpadd.f32 d9, d2, d3 @ pair add, first step\n" + "vpadd.f32 d10, d4, d5 @ pair add, first step\n" + "vpadd.f32 d11, d6, d7 @ pair add, first step\n" + + "vpadd.f32 d0, d8, d9 @ pair add, second step\n" + "vpadd.f32 d1, d10, d11 @ pair add, second step\n" + + "vmov.u32 q2, #0 @ for relu\n" + "vadd.f32 q1, q0, q6 @ add bias\n" + "vmax.f32 q0, q1, q2 @ relu\n" + + "vst1.32 {d0-d1}, [%[out]] @ save result\n" + + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [w1] "+r"(ptr_w1), \ + [w2] "+r"(ptr_w2), [w3] "+r"(ptr_w3), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2), \ + [bias] "r" (ptr_bias) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", \ + "q10", "q11", "q12", "q13" + ); + } + } + + //! deal with remains +#pragma omp parallel for + for (int j = out_cnt * 4; j < M; ++j) { + float *ptr_out = data_out + j; + const float *ptr_in = data_in; + const float *ptr_w0 = weights_ptr + (N * j); + int cnt = cnt_loop; + float32x2_t vbias = vdup_n_f32(bias[j]); + asm volatile( + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + "vmov.u32 q0, #0 @ set q0 to 0\n" + "pld [%[in], #128] @ preload cache line, input\n" + "pld [%[w0], #128] @ preload cache line, weights r0\n" + + "sgemv_bias_relu_loop2: @ main loop\n" + "vld1.32 {d24-d25}, [%[in]]! @ load input, q12,q13\n" + "vld1.32 {d28-d29}, [%[w0]]! 
@ load weights r0, q14\n" + + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q12, q14 @ mul add\n" + + "vld1.32 {d26-d27}, [%[in]]! @ load input, q12,q13\n" + "vld1.32 {d30-d31}, [%[w0]]! @ load weights r0, q14\n" + "pld [%[in]] @ preload cache line, input\n" + "pld [%[w0]] @ preload cache line, weights r0\n" + + "vmla.f32 q0, q13, q15 @ mul add\n" + "subs %[cnt] , #1 @ sub loop count \n" + "bne sgemv_bias_relu_loop2 @ jump to main loop\n" + + // check tails + "cmp %[tail], #1 @ check whether has mid cols\n" + "blt sgemv_bias_relu_end2 @ jump to end\n" + + // process tail + "vld1.32 {d24-d27}, [%[in]]! @ load input, q12,q13\n" + // deal with right pad + "vmov.u32 q1, #0 @ dump q8 to zero, for bit select in tail\n" + "vbif q12, q1, %q[mask1] @ bit select, deal with right pad\n" + "vbif q13, q1, %q[mask2] @ bit select, deal with right pad\n" + + "vld1.32 {d28-d29}, [%[w0]]! @ load weights r0, q14\n" + "vmla.f32 q0, q12, q14 @ mul add\n" + "vld1.32 {d30-d31}, [%[w0]]! 
@ load weights r0, q15\n" + "vmla.f32 q0, q13, q15 @ mul add\n" + + // pair add to final result + "sgemv_bias_relu_end2: @ end processing\n" + "vpadd.f32 d2, d0, d1 @ pair add, first step\n" + "vpadd.f32 d3, d2, d2 @ pair add, final step\n" + + "vadd.f32 d3, %P[bias] @ add bias\n" + + "vmov.u32 d2, #0 @ for relu\n" + "vmax.f32 d1, d2, d3 @ relu\n" + + "vst1.32 {d1[0]}, [%[out]] @ save result\n" + + :[in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [out] "+r"(ptr_out), \ + [cnt] "+r"(cnt) + :[tail] "r" (tail), [mask1] "w" (vmask1), [mask2] "w" (vmask2), \ + [bias] "w" (vbias) + :"q0", "q1", "q12", "q13", "q14", "q15" + ); + } +} + +} //lite + +} //saber + +} //namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/impl/sgemv_arm.h b/saber/lite/funcs/neon/impl/sgemv_arm.h new file mode 100644 index 000000000..706bbb2de --- /dev/null +++ b/saber/lite/funcs/neon/impl/sgemv_arm.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_FUNCS_ARM_IMPL_SGEMV_ARM_H +#define ANAKIN_SABER_FUNCS_ARM_IMPL_SGEMV_ARM_H + +#include "saber/lite/core/common_lite.h" +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +// fixme now only support transA = false +void sgemv(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y); + +void sgemv_relu(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y); + +void sgemv_bias(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y, const float* bias); + +void sgemv_bias_relu(const bool transA, const int M, const int N, \ + const float* A, const float* x, float* y, const float* bias); + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_FUNCS_ARM_IMPL_SGEMV_ARM_H diff --git a/saber/lite/funcs/neon/saber_activation.cpp b/saber/lite/funcs/neon/saber_activation.cpp new file mode 100644 index 000000000..e9c24ff9a --- /dev/null +++ b/saber/lite/funcs/neon/saber_activation.cpp @@ -0,0 +1,100 @@ +#include "saber/lite/funcs/saber_activation.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +SaberActivation::SaberActivation(ActiveType type, float neg_slop) { + _type = type; + _neg_slop = neg_slop; +} + + +SaberStatus SaberActivation::load_param(ActiveType type, float neg_slop) { + _type = type; + _neg_slop = neg_slop; + return SaberSuccess; +} + +SaberStatus SaberActivation::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + outputs[0]->set_shape(inputs[0]->valid_shape()); + return SaberSuccess; +} + +SaberStatus SaberActivation::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + _ctx = ctx; + return SaberSuccess; +} + +SaberStatus SaberActivation::dispatch(const std::vector *> &inputs, + std::vector *> &outputs) { + + float* ptr_out = outputs[0]->mutable_data(); + const 
float* ptr_in = inputs[0]->data(); + + int size = inputs[0]->valid_size(); + int threads = _ctx.get_act_ids().size(); + int nums_per_thread = size / threads; + int remain = size - threads * nums_per_thread; + int neon_loop_cnt = nums_per_thread >> 4; + int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); + float32x4_t vzero = vdupq_n_f32(0.f); + switch (_type){ + case Active_relu: + #pragma omp parallel for + for (int i = 0; i < threads; ++i) { + const float* ptr_in_thread = ptr_in + i * nums_per_thread; + float* ptr_out_thread = ptr_out + i * nums_per_thread; + int cnt = neon_loop_cnt; + asm volatile ( + "relu_loop: @ loop header\n" + "vld1.32 {d0-d1}, [%[din]]! @ load din 0\n" + "vld1.32 {d2-d3}, [%[din]]! @ load din 0\n" + "vld1.32 {d4-d5}, [%[din]]! @ load din 0\n" + "vld1.32 {d6-d7}, [%[din]]! @ load din 0\n" + + "vmax.f32 q8, q0, %q[vzero] @ relu\n" + "vmax.f32 q9, q1, %q[vzero] @ relu\n" + "vmax.f32 q10, q2, %q[vzero] @ relu\n" + "vmax.f32 q11, q3, %q[vzero] @ relu\n" + + "vst1.32 {d16-d17}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din]] @ preload data\n" + "vst1.32 {d18-d19}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #128] @ preload data\n" + "vst1.32 {d20-d21}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #256] @ preload data\n" + "vst1.32 {d22-d23}, [%[dout]]! @ store result, add pointer\n" + "pld [%[din], #384] @ preload data\n" + + "subs %[cnt], #1 @ loop count minus 1\n" + "bne relu_loop @ jump to main loop start point\n" + :[dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread), [cnt] "+r"(cnt) + :[vzero] "w" (vzero) + :"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); + for (int j = 0; j < neon_loop_remain; ++j) { + ptr_out_thread[0] = ptr_in_thread[0] > 0.f? 
ptr_in_thread[0] : 0.f; + ptr_in_thread++; + ptr_out_thread++; + } + } + return SaberSuccess; + case Active_sigmoid: + return SaberUnImplError; + case Active_tanh: + return SaberUnImplError; + default: + return SaberUnKownError; + } +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin \ No newline at end of file diff --git a/saber/lite/funcs/neon/saber_concat.cpp b/saber/lite/funcs/neon/saber_concat.cpp new file mode 100644 index 000000000..1edad1516 --- /dev/null +++ b/saber/lite/funcs/neon/saber_concat.cpp @@ -0,0 +1,94 @@ +#include "saber/lite/funcs/saber_concat.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +SaberConcat::SaberConcat(int axis) { + _axis = axis; +} + +SaberStatus SaberConcat::load_param(int axis) { + _axis = axis; + return SaberSuccess; +} + +SaberStatus SaberConcat::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + unsigned long input_size = inputs.size(); + + Shape shape_out = inputs[0]->valid_shape(); + + //! compute output shape + for (int i = 1; i < input_size; ++i) { + Shape sh = inputs[i]->valid_shape(); + for (int j = 0; j < sh.dims(); ++j) { + if (j == _axis) { continue; } + LCHECK_EQ(shape_out[j], sh[j], "All inputs must have the same shape, except at concat_axis."); + } + shape_out[_axis] += sh[_axis]; + } + return outputs[0]->set_shape(shape_out); +} + +SaberStatus SaberConcat::init(const std::vector *> &inputs, + std::vector *> &outputs, + Context &ctx) { + _ctx = ctx; + _num_concats = inputs[0]->count_valid(0, _axis); + _concat_input_size = inputs[0]->count_valid(_axis + 1, inputs[0]->dims()); + return SaberSuccess; +} + +template +void concat_kernel_arm(const int len, const dtype* src, dtype* dst) { + if (dst != src) { + memcpy(dst, src, sizeof(dtype) * len); + } +} + + +SaberStatus SaberConcat::dispatch(const std::vector *> &inputs, + std::vector *> &outputs) { + + int input_size = inputs.size(); + + //! 
get output data, valid shape and stride shape + int offset_concat_axis = 0; + Shape out_shape = outputs[0]->valid_shape(); + const int out_concat_axis = out_shape[_axis]; + + if (inputs.size() == 1) { + outputs[0]->copy_from(*inputs[0]); + return SaberSuccess; + } + + float* dout = outputs[0]->mutable_data(); + + for (int i = 0; i < input_size; ++i) { + Shape sh_in = inputs[i]->valid_shape(); + const float* din = inputs[i]->data(); + const int in_concat_axis = sh_in[_axis]; + for (int n = 0; n < _num_concats; ++n) { + concat_kernel_arm(in_concat_axis * _concat_input_size, + din + n * in_concat_axis * _concat_input_size, + dout + (n * out_concat_axis + offset_concat_axis) + * _concat_input_size); + } + offset_concat_axis += in_concat_axis; + } + return SaberSuccess; +} + + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif // USE_ARM_PLACE diff --git a/saber/lite/funcs/neon/saber_conv.cpp b/saber/lite/funcs/neon/saber_conv.cpp new file mode 100755 index 000000000..1cea17137 --- /dev/null +++ b/saber/lite/funcs/neon/saber_conv.cpp @@ -0,0 +1,224 @@ +#include "saber/lite/funcs/saber_conv.h" +#ifdef USE_ARM_PLACE +#include "saber/lite/funcs/neon/impl/conv_arm_impl.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +SaberConv2D::SaberConv2D() { + _impl = nullptr; + _workspace_fwd_sizes = 0; + _is_trans_weights = false; + _flag_relu = false; + _bias_term = true; +} + +SaberConv2D::SaberConv2D(int weights_size, int num_output, int group, int kw, int kh, \ + int stride_w, int stride_h, int pad_w, int pad_h, int dila_w, int dila_h, \ + bool flag_bias, const float* weights, const float* bias) { + _num_output = num_output; + _group = group; + _kw = kw; + _kh = kh; + _stride_w = stride_w; + _stride_h = stride_h; + _pad_w = pad_w; + _pad_h = pad_h; + _dila_w = dila_w; + _dila_h = dila_h; + _bias_term = flag_bias; + _weights = weights; + _bias = bias; + _weights_size = weights_size; +} + +SaberStatus SaberConv2D::load_param(int 
weights_size, int num_output, int group, int kw, int kh, \ + int stride_w, int stride_h, int pad_w, int pad_h, int dila_w, int dila_h, \ + bool flag_bias, const float* weights, const float* bias) { + _num_output = num_output; + _group = group; + _kw = kw; + _kh = kh; + _stride_w = stride_w; + _stride_h = stride_h; + _pad_w = pad_w; + _pad_h = pad_h; + _dila_w = dila_w; + _dila_h = dila_h; + _bias_term = flag_bias; + _weights = weights; + _bias = bias; + _weights_size = weights_size; + return SaberSuccess; +} + +SaberStatus SaberConv2D::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + Shape output_shape = inputs[0]->valid_shape(); + LCHECK_EQ(inputs[0]->valid_shape().dims(), 4, "using reshape2d to reshape a 1d conv?"); + + output_shape.set_num(inputs[0]->num()); // N + output_shape.set_channel(_num_output); // K + + int input_dim = inputs[0]->height(); // P + int kernel_exten = _dila_h * (_kh - 1) + 1; + int output_dim = (input_dim + 2 * _pad_h - kernel_exten) / _stride_h + 1; + + output_shape.set_height(output_dim); + + input_dim = inputs[0]->width(); // Q + kernel_exten = _dila_w * (_kw - 1) + 1; + output_dim = (input_dim + 2 * _pad_w - kernel_exten) / _stride_w + 1; + + output_shape.set_width(output_dim); + + return outputs[0]->set_shape(output_shape); +} + +//template <> +SaberStatus SaberConv2D::init(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, Context &ctx) { + + _ctx = ctx; + + int threads = _ctx.get_act_ids().size(); + + Shape shape_in = inputs[0]->valid_shape(); + Shape shape_out = outputs[0]->valid_shape(); + int win = inputs[0]->width(); + int hin = inputs[0]->height(); + int chin = inputs[0]->channel(); + int num = inputs[0]->num(); + int wout = outputs[0]->width(); + int hout = outputs[0]->height(); + int chout = outputs[0]->channel(); + + int l1_cache = Env::cur_env()._L1_cache; + int l2_cache = Env::cur_env()._L2_cache; + //! if L1 cache size is not provided, set to 31K + l1_cache = l1_cache > 0? 
l1_cache : 31000; + //! if L2 cache size is not provided, set to 2M + l2_cache = l2_cache > 0? l2_cache : 2000000; + + LCHECK_EQ(chin % _group, 0, "input channel or group size error"); + LCHECK_EQ(chout % _group, 0, "output channel or group size error"); +#if 0 + //! return basic conv func + if (_dila_h != 1 || _dila_w != 1) { + //! basic conv + _impl = conv_arm_basic; + printf("USE BASIC\n"); + return SaberSuccess; + } +#endif + //! depthwise conv, 3x3s1 or 3x3s2, pad must = 1 + if (_group == chin && chin == chout && _kw == 3 && _pad_w == 1 && _pad_h == 1) { + _impl = conv_depthwise_3x3; + printf("USE DW\n"); + return SaberSuccess; + } + + //! 3x3s1, when channel size or image size is large enough, use winograd + //! otherwise use direct conv + + if (_kw == 3 && _kh == 3 && _stride_h == 1 && \ + _pad_w == 1 && _group == 1) { + + if (chout / (wout * hout) > 1 || chin < 16 || chout < 14) { + //! use direct + _impl = conv_3x3s1_direct; + printf("USE 3x3 direct\n"); + } else { + //! use winograd + _weights_trans.reshape(Shape(8 * 8 * chout * chin * 2)); + //! space for computation + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = chin > chout? chin : chout; + + _workspace_data.reshape(Shape(size_trans_channel * max_ch * 2)); + + void* trans_tmp_ptr =(void*)(_weights_trans.mutable_data() + 8 * 8 * chout * chin); + float* weights_trans = _weights_trans.mutable_data(); + winograd_transform_weights(weights_trans, _weights, chout, chin, trans_tmp_ptr); + + const int m_wino = chout; + const int n_wino = size_tile; + const int k_wino = chin; + + _gemmer.init(l1_cache, l2_cache, m_wino, n_wino, k_wino, false, false, threads); + _impl = conv_arm_winograd3x3; + _is_trans_weights = true; + printf("USE WINO\n"); + } + return SaberSuccess; + } + + //! 
use im2col and gemm conv + const int m = chout / _group; + const int n = hout * wout; + const int k = chin * _kh * _kw / _group; + if (_kw == 1 && _kh == 1 && _stride_w == 1 && _stride_h == 1 && \ + _pad_w == 0 && _pad_h == 0) { + //! 1x1s1p0 + _impl = conv1x1s1_gemm; + _workspace_fwd_sizes = 0; + } else { + //! otherwise + _impl = conv_im2col_gemm; + _workspace_fwd_sizes = k * n; + _workspace_data.reshape(Shape(_workspace_fwd_sizes)); + } + + _gemmer.init(l1_cache, l2_cache, m, n, k, false, false, threads); + printf("USE GEMM\n"); + return SaberSuccess; +} + +SaberStatus SaberConv2D::set_activation(bool flag) { + _flag_relu = flag; + return SaberSuccess; +} + + +//template <> +SaberStatus SaberConv2D::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs) { + + const float* weight = _weights; + if (_is_trans_weights) { + weight = _weights_trans.data(); + } + const float* bias = nullptr; + if (_bias_term) { + bias = _bias; + } + int num = inputs[0]->num(); + int chout = outputs[0]->channel(); + int hout = outputs[0]->height(); + int wout = outputs[0]->width(); + int chin = inputs[0]->channel(); + int hin = inputs[0]->height(); + int win = inputs[0]->width(); + _impl(inputs[0]->data(), outputs[0]->mutable_data(), num, chout, hout, wout, \ + chin, hin, win, weight, bias, _group, _kw, _kh, _stride_w, _stride_h, \ + _dila_w, _dila_h, _pad_w, _pad_h, _bias_term, _flag_relu, _gemmer, \ + (void*)_workspace_data.mutable_data()); + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + + diff --git a/saber/lite/funcs/neon/saber_detection_output.cpp b/saber/lite/funcs/neon/saber_detection_output.cpp new file mode 100644 index 000000000..7d38c953d --- /dev/null +++ b/saber/lite/funcs/neon/saber_detection_output.cpp @@ -0,0 +1,854 @@ +#include "saber/lite/funcs/saber_detection_output.h" +#ifdef USE_ARM_PLACE +#include +#include "saber/lite/funcs/neon/impl/neon_mathfun.h" +#include +#include 
+namespace anakin{ + +namespace saber{ + +namespace lite{ + +void decode_bbox_corner_variance_kernel(const int batch_num, \ + const float* loc_data, const float* prior_data, const float* variance, \ + const int num_priors, const bool share_location, const int num_loc_classes, \ + const int background_label_id, float* bbox_data) { + + LCHECK_EQ(share_location, true, "decode boxes without share_location is unimplemented"); + + int cnt = num_priors / 4; + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + + const float* ptr_loc_batch = loc_data + n * len_batch; + float* ptr_bbox_batch = bbox_data + n * len_batch; +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + int idx = i * 16; + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4_t vloc1 = vld1q_f32(ptr_loc); + float32x4_t vloc2 = vld1q_f32(ptr_loc + 4); + float32x4_t vloc3 = vld1q_f32(ptr_loc + 8); + float32x4_t vloc4 = vld1q_f32(ptr_loc + 12); + + float32x4_t vprior1 = vld1q_f32(ptr_prior); + float32x4_t vprior2 = vld1q_f32(ptr_prior + 4); + float32x4_t vprior3 = vld1q_f32(ptr_prior + 8); + float32x4_t vprior4 = vld1q_f32(ptr_prior + 12); + + vst1q_f32(ptr_bbox, vaddq_f32(vloc1, vprior1)); + vst1q_f32(ptr_bbox + 4, vaddq_f32(vloc2, vprior2)); + vst1q_f32(ptr_bbox + 8, vaddq_f32(vloc3, vprior3)); + vst1q_f32(ptr_bbox + 12, vaddq_f32(vloc4, vprior4)); + } +#pragma omp parallel for + for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float32x4_t vloc = vld1q_f32(ptr_loc_batch + idx); + float32x4_t vprior = vld1q_f32(prior_data + idx); + vst1q_f32(ptr_bbox_batch + idx , vaddq_f32(vloc, vprior)); + } + } +} + +void decode_bbox_corner_no_variance_kernel(const int batch_num, \ + const float* loc_data, const float* prior_data, const float* variance, \ + const int num_priors, const bool share_location, const int num_loc_classes, \ + const int background_label_id, float* 
bbox_data) { + + LCHECK_EQ(share_location, true, "decode boxes without share_location is unimplemented"); + + int cnt = num_priors / 4; + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + + const float *ptr_loc_batch = loc_data + n * len_batch; + float *ptr_bbox_batch = bbox_data + n * len_batch; + +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + int idx = i * 16; + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + const float* ptr_var = variance + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4_t vloc1 = vld1q_f32(ptr_loc); + float32x4_t vprior1 = vld1q_f32(ptr_prior); + float32x4_t vvar1 = vld1q_f32(ptr_var); + float32x4_t vout1 = vmulq_f32(vloc1, vvar1); + + float32x4_t vloc2 = vld1q_f32(ptr_loc + 4); + float32x4_t vprior2 = vld1q_f32(ptr_prior + 4); + float32x4_t vvar2 = vld1q_f32(ptr_var + 4); + float32x4_t vout2 = vmulq_f32(vloc2, vvar2); + + float32x4_t vloc3 = vld1q_f32(ptr_loc + 8); + float32x4_t vprior3 = vld1q_f32(ptr_prior + 8); + float32x4_t vvar3 = vld1q_f32(ptr_var + 8); + float32x4_t vout3 = vmulq_f32(vloc3, vvar3); + + float32x4_t vloc4 = vld1q_f32(ptr_loc + 12); + float32x4_t vprior4 = vld1q_f32(ptr_prior + 12); + float32x4_t vvar4 = vld1q_f32(ptr_var + 12); + float32x4_t vout4 = vmulq_f32(vloc4, vvar4); + + vst1q_f32(ptr_bbox, vaddq_f32(vout1, vprior1)); + vst1q_f32(ptr_bbox + 4, vaddq_f32(vout2, vprior2)); + vst1q_f32(ptr_bbox + 8, vaddq_f32(vout3, vprior3)); + vst1q_f32(ptr_bbox + 12, vaddq_f32(vout4, vprior4)); + } + + for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float32x4_t vloc = vld1q_f32(ptr_loc_batch + idx); + float32x4_t vprior = vld1q_f32(prior_data + idx); + float32x4_t vvar = vld1q_f32(variance + idx); + float32x4_t vout = vmulq_f32(vloc, vvar); + vst1q_f32(ptr_bbox_batch + idx, vaddq_f32(vout, vprior)); + } + } +} + +void decode_bbox_center_variance_kernel(const int batch_num, \ + const float* loc_data, const float* 
prior_data, const float* variance, \ + const int num_priors, const bool share_location, const int num_loc_classes, \ + const int background_label_id, float* bbox_data) { + + LCHECK_EQ(share_location, true, "decode boxes without share_location is unimplemented"); + + int cnt = num_priors / 4; + //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! vvar + float32x4_t vhalf = vdupq_n_f32(0.5f); + + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + + const float *ptr_loc_batch = loc_data + n * len_batch; + float *ptr_bbox_batch = bbox_data + n * len_batch; + +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + int idx = i * 16; + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4x4_t vprior = vld4q_f32(ptr_prior); + float32x4x4_t vloc = vld4q_f32(ptr_loc); + float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); + float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); + float32x4_t vprior_cx = vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); + float32x4_t vprior_cy = vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); + + float32x4_t vdec_bbx_cx = vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); + float32x4_t vdec_bbx_cy = vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); + float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); + float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); + vprior_width = vmulq_f32(vprior_width, vhalf); + vprior_height = vmulq_f32(vprior_height, vhalf); + vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); + vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); + + vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); + vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); + vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); + vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); + + vst4q_f32(ptr_bbox, vloc); + } +#pragma omp parallel for + 
for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float p_xmin = prior_data[idx]; + float p_ymin = prior_data[idx + 1]; + float p_xmax = prior_data[idx + 2]; + float p_ymax = prior_data[idx + 3]; + float prior_width = p_xmax - p_xmin; + float prior_height = p_ymax - p_ymin; + float prior_center_x = (p_xmin + p_xmax) / 2.f; + float prior_center_y = (p_ymin + p_ymax) / 2.f; + + float xmin = ptr_loc_batch[idx]; + float ymin = ptr_loc_batch[idx + 1]; + float xmax = ptr_loc_batch[idx + 2]; + float ymax = ptr_loc_batch[idx + 3]; + + //! variance is encoded in target, we simply need to retore the offset predictions. + float decode_bbox_center_x = xmin * prior_width + prior_center_x; + float decode_bbox_center_y = ymin * prior_height + prior_center_y; + float decode_bbox_width = expf(xmax) * prior_width; + float decode_bbox_height = expf(ymax) * prior_height; + + ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; + ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; + ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; + ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; + } + } +} + +void decode_bbox_center_no_variance_kernel(const int batch_num, \ + const float* loc_data, const float* prior_data, const float* variance, \ + const int num_priors, const bool share_location, const int num_loc_classes, \ + const int background_label_id, float* bbox_data) { + + LCHECK_EQ(share_location, true, "decode boxes without share_location is unimplemented"); + + int cnt = num_priors / 4; + //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! 
vvar + float32x4_t vhalf = vdupq_n_f32(0.5f); + + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + + const float *ptr_loc_batch = loc_data + n * len_batch; + float *ptr_bbox_batch = bbox_data + n * len_batch; + +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + + int idx = i * 16; + + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + const float* ptr_var = variance + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4x4_t vprior = vld4q_f32(ptr_prior); + float32x4x4_t vloc = vld4q_f32(ptr_loc); + float32x4x4_t vvar = vld4q_f32(ptr_var); + float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); + float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); + float32x4_t vprior_cx = vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); + float32x4_t vprior_cy = vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); + + vloc.val[0] = vmulq_f32(vloc.val[0], vvar.val[0]); + vloc.val[1] = vmulq_f32(vloc.val[1], vvar.val[1]); + vloc.val[2] = vmulq_f32(vloc.val[2], vvar.val[2]); + vloc.val[3] = vmulq_f32(vloc.val[3], vvar.val[3]); + + float32x4_t vdec_bbx_cx = vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); + float32x4_t vdec_bbx_cy = vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); + float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); + float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); + vprior_width = vmulq_f32(vprior_width, vhalf); + vprior_height = vmulq_f32(vprior_height, vhalf); + vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); + vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); + + vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); + vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); + vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); + vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); + + vst4q_f32(ptr_bbox, vloc); + } + +#pragma omp parallel for + for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float p_xmin = 
prior_data[idx]; + float p_ymin = prior_data[idx + 1]; + float p_xmax = prior_data[idx + 2]; + float p_ymax = prior_data[idx + 3]; + float prior_width = p_xmax - p_xmin; + float prior_height = p_ymax - p_ymin; + float prior_center_x = (p_xmin + p_xmax) / 2.f; + float prior_center_y = (p_ymin + p_ymax) / 2.f; + + float xmin = ptr_loc_batch[idx]; + float ymin = ptr_loc_batch[idx + 1]; + float xmax = ptr_loc_batch[idx + 2]; + float ymax = ptr_loc_batch[idx + 3]; + + //! variance is encoded in target, we simply need to retore the offset predictions. + float decode_bbox_center_x = variance[idx] * xmin * prior_width + prior_center_x; + float decode_bbox_center_y = variance[idx + 1] * ymin * prior_height + prior_center_y; + float decode_bbox_width = expf(variance[idx + 2] * xmax) * prior_width; + float decode_bbox_height = expf(variance[idx + 3] * ymax) * prior_height; + + ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; + ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; + ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; + ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; + } + } +} + +void decode_bbox_corner_size_variance_kernel(const int batch_num, \ + const float* loc_data, const float* prior_data, const float* variance, \ + const int num_priors, const bool share_location, const int num_loc_classes, \ + const int background_label_id, float* bbox_data) { + + LCHECK_EQ(share_location, true, "decode boxes without share_location is unimplemented"); + + int cnt = num_priors / 4; + //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! 
bbx + + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + + const float *ptr_loc_batch = loc_data + n * len_batch; + float *ptr_bbox_batch = bbox_data + n * len_batch; + +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + + int idx = i * 16; + + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + const float* ptr_var = variance + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4x4_t vprior = vld4q_f32(ptr_prior); + float32x4x4_t vloc = vld4q_f32(ptr_loc); + + float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); + float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); + + float32x4x4_t vbbx; + vbbx.val[0] = vmulq_f32(vloc.val[0], vprior_width); + vbbx.val[1] = vmulq_f32(vloc.val[1], vprior_height); + vbbx.val[2] = vmulq_f32(vloc.val[2], vprior_width); + vbbx.val[3] = vmulq_f32(vloc.val[3], vprior_height); + + vbbx.val[0] = vaddq_f32(vprior.val[0], vbbx.val[0]); + vbbx.val[1] = vaddq_f32(vprior.val[1], vbbx.val[1]); + vbbx.val[2] = vaddq_f32(vprior.val[2], vbbx.val[2]); + vbbx.val[3] = vaddq_f32(vprior.val[3], vbbx.val[3]); + + vst4q_f32(ptr_bbox, vbbx); + } + +#pragma omp parallel for + for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float p_xmin = prior_data[idx]; + float p_ymin = prior_data[idx + 1]; + float p_xmax = prior_data[idx + 2]; + float p_ymax = prior_data[idx + 3]; + float prior_width = p_xmax - p_xmin; + float prior_height = p_ymax - p_ymin; + + ptr_bbox_batch[idx] = p_xmin + ptr_loc_batch[idx] * prior_width; + ptr_bbox_batch[idx + 1] = p_ymin + ptr_loc_batch[idx + 1] * prior_height; + ptr_bbox_batch[idx + 2] = p_xmax + ptr_loc_batch[idx + 2] * prior_width; + ptr_bbox_batch[idx + 3] = p_ymax + ptr_loc_batch[idx + 3] * prior_height; + } + + } +} + +void decode_bbox_corner_size_no_variance_kernel(const int batch_num, \ + const float* loc_data, const float* prior_data, const float* variance, \ + const int num_priors, const 
bool share_location, const int num_loc_classes, \ + const int background_label_id, float* bbox_data) { + + LCHECK_EQ(share_location, true, "decode boxes without share_location is unimplemented"); + + int cnt = num_priors / 4; + //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! bbx + + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + + const float *ptr_loc_batch = loc_data + n * len_batch; + float *ptr_bbox_batch = bbox_data + n * len_batch; + +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + int idx = i * 16; + + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + const float* ptr_var = variance + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4x4_t vprior = vld4q_f32(ptr_prior); + float32x4x4_t vloc = vld4q_f32(ptr_loc); + + float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); + float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); + + float32x4x4_t vbbx; + vbbx.val[0] = vmulq_f32(vloc.val[0], vprior_width); + vbbx.val[1] = vmulq_f32(vloc.val[1], vprior_height); + vbbx.val[2] = vmulq_f32(vloc.val[2], vprior_width); + vbbx.val[3] = vmulq_f32(vloc.val[3], vprior_height); + + vloc = vld4q_f32(ptr_var); + vbbx.val[0] = vmulq_f32(vbbx.val[0], vloc.val[0]); + vbbx.val[1] = vmulq_f32(vbbx.val[1], vloc.val[1]); + vbbx.val[2] = vmulq_f32(vbbx.val[2], vloc.val[2]); + vbbx.val[3] = vmulq_f32(vbbx.val[3], vloc.val[3]); + + vbbx.val[0] = vaddq_f32(vprior.val[0], vbbx.val[0]); + vbbx.val[1] = vaddq_f32(vprior.val[1], vbbx.val[1]); + vbbx.val[2] = vaddq_f32(vprior.val[2], vbbx.val[2]); + vbbx.val[3] = vaddq_f32(vprior.val[3], vbbx.val[3]); + + vst4q_f32(ptr_bbox, vbbx); + } +#pragma omp parallel for + for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float p_xmin = prior_data[idx]; + float p_ymin = prior_data[idx + 1]; + float p_xmax = prior_data[idx + 2]; + float p_ymax = prior_data[idx + 3]; + float prior_width = p_xmax - p_xmin; + float 
prior_height = p_ymax - p_ymin; + + ptr_bbox_batch[idx] = + p_xmin + ptr_loc_batch[idx] * variance[idx] * prior_width; + ptr_bbox_batch[idx + 1] = + p_ymin + ptr_loc_batch[idx + 1] * variance[idx + 1] * prior_height; + ptr_bbox_batch[idx + 2] = + p_xmax + ptr_loc_batch[idx + 2] * variance[idx + 2] * prior_width; + ptr_bbox_batch[idx + 3] = + p_ymax + ptr_loc_batch[idx + 3] * variance[idx + 3] * prior_height; + } + } +} + +void decode_bboxes(const int batch_num, const float* loc_data, const float* prior_data, \ + const CodeType code_type, const bool variance_encoded_in_target,\ + const int num_priors, const bool share_location, \ + const int num_loc_classes, const int background_label_id, \ + float* bbox_data) { + const float* variance_data = prior_data + 4 * num_priors; + if (code_type == CORNER) { + if (variance_encoded_in_target) { + decode_bbox_corner_variance_kernel(batch_num, \ + loc_data, prior_data, variance_data, \ + num_priors, share_location, num_loc_classes, \ + background_label_id, bbox_data); + } else { + decode_bbox_corner_no_variance_kernel(batch_num, \ + loc_data, prior_data, variance_data, \ + num_priors, share_location, num_loc_classes, \ + background_label_id, bbox_data); + } + } else if (code_type == CENTER_SIZE) { + if (variance_encoded_in_target) { + decode_bbox_center_variance_kernel(batch_num, \ + loc_data, prior_data, variance_data, \ + num_priors, share_location, num_loc_classes, \ + background_label_id, bbox_data); + } else { + decode_bbox_center_no_variance_kernel(batch_num, \ + loc_data, prior_data, variance_data, \ + num_priors, share_location, num_loc_classes, \ + background_label_id, bbox_data); + } + } else if (code_type == CORNER_SIZE) { + if (variance_encoded_in_target) { + decode_bbox_corner_size_variance_kernel(batch_num, \ + loc_data, prior_data, variance_data, \ + num_priors, share_location, num_loc_classes, \ + background_label_id, bbox_data); + } else { + decode_bbox_corner_size_no_variance_kernel(batch_num, \ + loc_data, 
prior_data, variance_data, \ + num_priors, share_location, num_loc_classes, \ + background_label_id, bbox_data); + } + } +} + +template +static bool sort_score_pair_descend(const std::pair& pair1, \ + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +void get_max_score_index(const float* scores, int num, float threshold, \ + int top_k, std::vector >* score_index_vec) { + //! Generate index score pairs. + for (int i = 0; i < num; ++i) { + if (scores[i] > threshold) { + score_index_vec->push_back(std::make_pair(scores[i], i)); + } + } + + //! Sort the score pair according to the scores in descending order + std::stable_sort(score_index_vec->begin(), score_index_vec->end(), \ + sort_score_pair_descend); + + //! Keep top_k scores if needed. + if (top_k > -1 && top_k < score_index_vec->size()) { + score_index_vec->resize(top_k); + } +} + +float bbox_size(const float* bbox, bool normalized = true) { + if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) { + // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. + return 0.f; + } else { + const float width = bbox[2] - bbox[0]; + const float height = bbox[3] - bbox[1]; + + if (normalized) { + return width * height; + } else { + // If bbox is not within range [0, 1]. 
+ return (width + 1) * (height + 1); + } + } +} + +float jaccard_overlap(const float* bbox1, const float* bbox2) { + if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || + bbox2[1] > bbox1[3] || bbox2[3] < bbox1[1]) { + return 0.f; + } else { + const float inter_xmin = std::max(bbox1[0], bbox2[0]); + const float inter_ymin = std::max(bbox1[1], bbox2[1]); + const float inter_xmax = std::min(bbox1[2], bbox2[2]); + const float inter_ymax = std::min(bbox1[3], bbox2[3]); + + const float inter_width = inter_xmax - inter_xmin; + const float inter_height = inter_ymax - inter_ymin; + const float inter_size = inter_width * inter_height; + + const float bbox1_size = bbox_size(bbox1); + const float bbox2_size = bbox_size(bbox2); + + return inter_size / (bbox1_size + bbox2_size - inter_size); + } +} + +void apply_nms_fast(const float* bboxes, const float* scores, int num, + float score_threshold, float nms_threshold, + float eta, int top_k, std::vector* indices) { + // Get top_k scores (with corresponding indices). + std::vector> score_index_vec; + get_max_score_index(scores, num, score_threshold, top_k, &score_index_vec); + + // Do nms. 
+ float adaptive_threshold = nms_threshold; + indices->clear(); + + while (score_index_vec.size() != 0) { + const int idx = score_index_vec.front().second; + bool keep = true; + + for (int k = 0; k < indices->size(); ++k) { + if (keep) { + const int kept_idx = (*indices)[k]; + float overlap = jaccard_overlap(bboxes + idx * 4, bboxes + kept_idx * 4); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + + if (keep) { + indices->push_back(idx); + } + + score_index_vec.erase(score_index_vec.begin()); + + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +void nms_detect(const float* bbox_cpu_data, const float* conf_cpu_data, std::vector& result, \ + int batch_num, int class_num, int num_priors, int background_id, \ + int keep_topk, int nms_topk, float conf_thresh, float nms_thresh, \ + float nms_eta, bool share_location) { + + int num_kept = 0; + std::vector>> all_indices; + + for (int i = 0; i < batch_num; ++i) { + std::map> indices; + int num_det = 0; + const int conf_idx = i * class_num * num_priors; + int bbox_idx; + + if (share_location) { + bbox_idx = i * num_priors * 4; + } else { + bbox_idx = conf_idx * 4; + } + + for (int c = 0; c < class_num; ++c) { + if (c == background_id) { + // Ignore background class. 
+ continue; + } + + const float* cur_conf_data = conf_cpu_data + conf_idx + c * num_priors; + const float* cur_bbox_data = bbox_cpu_data + bbox_idx; + + if (!share_location) { + cur_bbox_data += c * num_priors * 4; + } + + apply_nms_fast(cur_bbox_data, cur_conf_data, num_priors, \ + conf_thresh, nms_thresh, nms_eta, nms_topk, &(indices[c])); + num_det += indices[c].size(); + } + + if (keep_topk > -1 && num_det > keep_topk) { + std::vector>> score_index_pairs; + + for (auto it = indices.begin(); it != indices.end(); ++it) { + int label = it->first; + const std::vector& label_indices = it->second; + + for (int j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + float score = conf_cpu_data[conf_idx + label * num_priors + idx]; + score_index_pairs.push_back(std::make_pair(score, std::make_pair(label, idx))); + } + } + + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + sort_score_pair_descend>); + score_index_pairs.resize(keep_topk); + // Store the new indices. 
+ std::map> new_indices; + + for (int j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + + all_indices.push_back(new_indices); + num_kept += keep_topk; + } else { + all_indices.push_back(indices); + num_kept += num_det; + } + } + + if (num_kept == 0) { + result.clear(); + return; + } else { + result.resize(num_kept * 7); + } + + int count = 0; + + for (int i = 0; i < batch_num; ++i) { + const int conf_idx = i * class_num * num_priors; + int bbox_idx; + + if (share_location) { + bbox_idx = i * num_priors * 4; + } else { + bbox_idx = conf_idx * 4; + } + + for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) { + int label = it->first; + std::vector& indices = it->second; + const float* cur_conf_data = + conf_cpu_data + conf_idx + label * num_priors; + const float* cur_bbox_data = bbox_cpu_data + bbox_idx; + + if (!share_location) { + cur_bbox_data += label * num_priors * 4; + } + + for (int j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + result[count * 7] = i; + result[count * 7 + 1] = label; + result[count * 7 + 2] = cur_conf_data[idx]; + + for (int k = 0; k < 4; ++k) { + result[count * 7 + 3 + k] = cur_bbox_data[idx * 4 + k]; + } + + ++count; + } + } + } +} + +void permute_conf(const float* conf_data, const int num, + const int num_priors, const int num_classes, + float* conf_preds) { + for (int i = 0; i < num; ++i) { + const float* batch_conf = conf_data + i * num_classes * num_priors; + float* batch_data_permute = conf_preds + i * num_classes * num_priors; + for (int p = 0; p < num_priors; ++p) { + int start_idx = p * num_classes; + for (int c = 0; c < num_classes; ++c) { + batch_data_permute[c * num_priors + p] = batch_conf[start_idx + c]; + } + } + } +} + +SaberDetectionOutput::SaberDetectionOutput(bool share_loc, + bool variance_encode, + int class_num, + int background_id, + int keep_topk, + 
CodeType type, + float conf_thresh, + int nms_topk, + float nms_thresh, + float nms_eta) { + LITE_CHECK(load_param(share_loc, variance_encode, class_num, background_id, \ + keep_topk, type, conf_thresh, nms_topk, nms_thresh, nms_eta)); +} + +SaberStatus SaberDetectionOutput::load_param(bool share_loc, + bool variance_encode, + int class_num, + int background_id, + int keep_topk, + CodeType type, + float conf_thresh, + int nms_topk, + float nms_thresh, + float nms_eta) { + _share_loacation = share_loc; + _variance_encode_in_target = variance_encode; + _class_num = class_num; + _background_id = background_id; + _keep_top_k = keep_topk; + _type = type; + _conf_thresh = conf_thresh; + _nms_top_k = nms_topk; + _nms_thresh = nms_thresh; + _nms_eta = nms_eta; + return SaberSuccess; +} + +SaberStatus SaberDetectionOutput::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + //! output tensor's dims = 2 + Shape shape_out; + shape_out.resize(2); + //CHECK_EQ(shape_out.dims(), 4) << "only support 4d layout"; + shape_out[0] = inputs[0]->num() * _keep_top_k; + shape_out[1] = 7; + + return outputs[0]->set_shape(shape_out); +} + +SaberStatus SaberDetectionOutput::init(const std::vector*>& inputs, \ + std::vector*>& outputs, Context &ctx) { + _ctx = ctx; + +//! inputs[0]: location map, dims = 2 {N, boxes * 4} +//! inputs[1]: confidence map, dims = 2 {N, boxes * classes} +//! inputs[2]: prior boxes, dims = 3 {1, 2, boxes * 4(xmin, ymin, xmax, ymax)} + int size_loc = inputs[0]->valid_size(); + int size_conf = inputs[1]->valid_size(); + int size_prior = inputs[2]->valid_size(); + + Shape sh_loc = inputs[0]->valid_shape(); + Shape sh_conf = inputs[1]->valid_shape(); +//! shape {1, 2, boxes * 4(xmin, ymin, xmax, ymax)}, boxes = size / 2 / 4 +//! 
the priors is in the last dim + + int num = inputs[0]->num(); + _num_priors = size_prior / 8; + + if (_class_num == 0) { + _class_num = size_conf / (num * _num_priors); + } + if (_share_loacation) { + _num_loc_classes = 1; + } else { + _num_loc_classes = _class_num; + _bbox_permute.reshape(sh_loc); + } + + _bbox_preds.reshape(sh_loc); + _conf_permute.reshape(sh_conf); + + return SaberSuccess; +} + + +//template <> +SaberStatus SaberDetectionOutput::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs) { + + Tensor* t_loc = inputs[0]; + Tensor* t_conf = inputs[1]; + Tensor* t_prior = inputs[2]; + + const int num = t_loc->num(); + + const float* loc_data = t_loc->data(); + const float* prior_data = t_prior->data(); + const float* conf_data = t_conf->data(); + + float* bbox_data = _bbox_preds.mutable_data(); + + if (!_share_loacation) { + return SaberUnImplError; + } + + //! Decode predictions. + //! Retrieve all decoded location predictions. + decode_bboxes(num, loc_data, prior_data, _type, _variance_encode_in_target, \ + _num_priors, _share_loacation, _num_loc_classes, \ + _background_id, bbox_data); + + //! 
Retrieve all confidences, permute to classes * boxes_size + float* conf_permute_data = _conf_permute.mutable_data(); + permute_conf(conf_data, num, _num_priors, _class_num, conf_permute_data); + + std::vector result; + + nms_detect(bbox_data, conf_permute_data, result, num, _class_num, _num_priors, _background_id, \ + _keep_top_k, _nms_top_k, _conf_thresh, _nms_thresh, _nms_eta, _share_loacation); + + if(result.size() == 0) { + result.resize(7); + for (int i = 0; i < 7; ++i) { + result[i] = -1.f; + } + outputs[0]->reshape({1, 1, 1, 7}); + } else { + outputs[0]->reshape({1, 1, result.size() / 7, 7}); + } + + memcpy(outputs[0]->mutable_data(), result.data(), \ + result.size() * sizeof(float)); + + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif diff --git a/saber/lite/funcs/neon/saber_eltwise.cpp b/saber/lite/funcs/neon/saber_eltwise.cpp new file mode 100644 index 000000000..bfa1324c1 --- /dev/null +++ b/saber/lite/funcs/neon/saber_eltwise.cpp @@ -0,0 +1,257 @@ +#include "saber/lite/funcs/saber_eltwise.h" + +#ifdef USE_ARM_PLACE +namespace anakin{ + +namespace saber{ + +namespace lite{ + +template +void eltwise_prod(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int size, \ + std::vector coef); + +template +void eltwise_sum(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int size, \ + std::vector coef); + +template +void eltwise_max(const Dtype* din_a, const Dtype* din_b, Dtype* dout, const int size, \ + std::vector coef); + +template <> +void eltwise_prod(const float* din_a, const float* din_b, float* dout, const int size, \ + std::vector coef) { + + float* out_ptr = dout; + const float* a_ptr = din_a; + const float* b_ptr = din_b; + + int cnt = size >> 3; + int remain = size & 7; +#ifdef __aarch64__ + for (int i = 0; i < cnt; ++i) { + float32x4_t va0 = vld1q_f32(a_ptr); + float32x4_t vb0 = vld1q_f32(b_ptr); + float32x4_t va1 = vld1q_f32(a_ptr + 4); + float32x4_t vb1 = vld1q_f32(b_ptr + 
4); + float32x4_t vout1 = vmulq_f32(va0, vb0); + vst1q_f32(out_ptr, vout1); + float32x4_t vout2 = vmulq_f32(va1, vb1); + vst1q_f32(out_ptr + 4, vout2); + a_ptr += 8; + b_ptr += 8; + out_ptr += 8; + } +#else + int loop_cnt = cnt; + if (loop_cnt > 0) { + asm volatile( + "prod_loop: @ main loop start point\n" + "vld1.f32 {d0-d1}, [%[a_ptr]]! @ load din r0\n" + "vld1.f32 {d2-d3}, [%[b_ptr]]! @ load din r1n\n" + "vld1.f32 {d4-d5}, [%[a_ptr]]! @ load din r0\n" + "vld1.f32 {d6-d7}, [%[b_ptr]]! @ load din r1n\n" + "vmul.f32 q8, q0, q1 @ q8 = q0 * q1\n" + "vmul.f32 q9, q2, q3 @ q9 = q2 * q3\n" + "subs %[loop_cnt], #1 @ loop --\n" + "vst1.f32 {d16-d17}, [%[out_ptr]]! @ store data\n" + "vst1.f32 {d18-d19}, [%[out_ptr]]! @ store data\n" + "bne prod_loop @ top_loop \n" + :[loop_cnt] "+r" (loop_cnt), [a_ptr] "+r" (a_ptr), \ + [b_ptr] "+r" (b_ptr), [out_ptr] "+r" (out_ptr) + : + :"q0", "q1", "q2", "q3", "q8", "q9" + ); + } +#endif //__aarch64__ + + for (; remain > 0; remain--) { + *(out_ptr++) = *(a_ptr++) * (*(b_ptr++)); + } +} + +void eltwise_sum(const float* din_a, const float* din_b, float* dout, const int size, \ + std::vector coef) { + + float* out_ptr = dout; + const float* a_ptr = din_a; + const float* b_ptr = din_b; + + int cnt = size >> 3; + int remain = size & 7; +#ifdef __aarch64__ + for (int i = 0; i < cnt; ++i) { + float32x4_t va0 = vld1q_f32(a_ptr); + float32x4_t vb0 = vld1q_f32(b_ptr); + float32x4_t va1 = vld1q_f32(a_ptr + 4); + float32x4_t vb1 = vld1q_f32(b_ptr + 4); + float32x4_t vout1 = vaddq_f32(va0, vb0); + vst1q_f32(out_ptr, vout1); + float32x4_t vout2 = vaddq_f32(va1, vb1); + vst1q_f32(out_ptr + 4, vout2); + a_ptr += 8; + b_ptr += 8; + out_ptr += 8; + } +#else + int loop_cnt = cnt; + if (loop_cnt > 0) { + asm volatile( + "sum_loop: @ main loop start point\n" + "vld1.f32 {d0-d1}, [%[a_ptr]]! @ load din r0\n" + "vld1.f32 {d2-d3}, [%[b_ptr]]! @ load din r1n\n" + "vld1.f32 {d4-d5}, [%[a_ptr]]! @ load din r0\n" + "vld1.f32 {d6-d7}, [%[b_ptr]]! 
@ load din r1n\n" + "vadd.f32 q8, q0, q1 @ q8 = q0 * q1\n" + "vadd.f32 q9, q2, q3 @ q9 = q2 * q3\n" + "subs %[loop_cnt], #1 @ loop --\n" + "vst1.f32 {d16-d17}, [%[out_ptr]]! @ store data\n" + "vst1.f32 {d18-d19}, [%[out_ptr]]! @ store data\n" + "bne sum_loop @ top_loop \n" + :[loop_cnt] "+r" (loop_cnt), [a_ptr] "+r" (a_ptr), \ + [b_ptr] "+r" (b_ptr), [out_ptr] "+r" (out_ptr) + : + :"q0", "q1", "q2", "q3", "q8", "q9" + ); + } +#endif //__aarch64__ + + for (; remain > 0; remain--) { + *(out_ptr++) = *(a_ptr++) + (*(b_ptr++)); + } +} + +void eltwise_max(const float* din_a, const float* din_b, float* dout, const int size, \ + std::vector coef) { + + float* out_ptr = dout; + const float* a_ptr = din_a; + const float* b_ptr = din_b; + + int cnt = size >> 3; + int remain = size & 7; +#ifdef __aarch64__ + for (int i = 0; i < cnt; ++i) { + float32x4_t va0 = vld1q_f32(a_ptr); + float32x4_t vb0 = vld1q_f32(b_ptr); + float32x4_t va1 = vld1q_f32(a_ptr + 4); + float32x4_t vb1 = vld1q_f32(b_ptr + 4); + float32x4_t vout1 = vmaxq_f32(va0, vb0); + vst1q_f32(out_ptr, vout1); + float32x4_t vout2 = vmaxq_f32(va1, vb1); + vst1q_f32(out_ptr + 4, vout2); + a_ptr += 8; + b_ptr += 8; + out_ptr += 8; + } +#else + int loop_cnt = cnt; + if (loop_cnt > 0) { + asm volatile( + "max_loop: @ main loop start point\n" + "vld1.f32 {d0-d1}, [%[a_ptr]]! @ load din r0\n" + "vld1.f32 {d2-d3}, [%[b_ptr]]! @ load din r1n\n" + "vld1.f32 {d4-d5}, [%[a_ptr]]! @ load din r0\n" + "vld1.f32 {d6-d7}, [%[b_ptr]]! @ load din r1n\n" + "vmax.f32 q8, q0, q1 @ q8 = q0 * q1\n" + "vmax.f32 q9, q2, q3 @ q9 = q2 * q3\n" + "subs %[loop_cnt], #1 @ loop --\n" + "vst1.f32 {d16-d17}, [%[out_ptr]]! @ store data\n" + "vst1.f32 {d18-d19}, [%[out_ptr]]! 
@ store data\n" + "bne max_loop @ top_loop \n" + :[loop_cnt] "+r" (loop_cnt), [a_ptr] "+r" (a_ptr), \ + [b_ptr] "+r" (b_ptr), [out_ptr] "+r" (out_ptr) + : + :"q0", "q1", "q2", "q3", "q8", "q9" + ); + } +#endif //__aarch64__ + + for (; remain > 0; remain--) { + *(out_ptr++) = std::max(*(a_ptr++), *(b_ptr++)); + } +} + +SaberEltwise::SaberEltwise(EltwiseType type, std::vector coef) { + _type = type; + _coef = coef; +} + +SaberStatus SaberEltwise::load_param(EltwiseType type, std::vector coef) { + _type = type; + _coef = coef; + return SaberSuccess; +} + +SaberStatus SaberEltwise::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + for (int i = 1; i < inputs.size(); ++i) { + LCHECK_EQ(inputs[0]->num(), inputs[i]->num(), "input size must be the same"); + LCHECK_EQ(inputs[0]->channel(), inputs[i]->channel(), "input size must be the same"); + LCHECK_EQ(inputs[0]->height(), inputs[i]->height(), "input size must be the same"); + LCHECK_EQ(inputs[0]->width(), inputs[i]->width(), "input size must be the same"); + } + + Shape output_shape = inputs[0]->valid_shape(); + return outputs[0]->set_shape(output_shape); +} + +//template +SaberStatus SaberEltwise::init(\ + const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx) { + + _ctx = ctx; + Shape sh_out_saber = outputs[0]->valid_shape(); + for (int i = 0; i < inputs.size(); i ++){ + Shape sh_in_saber = inputs[i]->valid_shape(); + if (sh_out_saber != sh_in_saber){ + printf("input shape is not same with output shape\n"); + return SaberInvalidValue; + } + } + switch (_type) { + case Eltwise_prod: + _impl = eltwise_prod; + break; + case Eltwise_sum: + _impl = eltwise_sum; + break; + case Eltwise_max: + _impl = eltwise_max; + break; + default: + printf("unknown eltwise type!!\n"); + return SaberUnKownError; + } + return SaberSuccess; +} + +//template +SaberStatus SaberEltwise::dispatch(\ + const std::vector*>& inputs, \ + std::vector*>& outputs) { + + const float* din_a = inputs[0]->data(); 
+ const float* din_b = inputs[1]->data(); + float* dout = outputs[0]->mutable_data(); + + int size = outputs[0]->valid_size(); + + _impl(din_a, din_b, dout, size, _coef); + for (int i = 2; i < inputs.size(); ++i) { + din_a = inputs[i]->data(); + _impl(din_a, dout, dout, size, _coef); + } + + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} // namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/saber_fc.cpp b/saber/lite/funcs/neon/saber_fc.cpp new file mode 100644 index 000000000..814c07b1e --- /dev/null +++ b/saber/lite/funcs/neon/saber_fc.cpp @@ -0,0 +1,143 @@ +#include "saber/lite/funcs/saber_fc.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/neon/impl/sgemv_arm.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +template +void fill_bias_fc(Dtype* tensor, const Dtype* bias, const int num, const int channel); +template <> +void fill_bias_fc(float* tensor, const float* bias, const int num, const int channel) { + + int cnt = channel >> 2; + int remain = channel & 3; + + for (int j = 0; j < num; ++j) { + + const float* ptr_bias = bias; + float* ptr_out = tensor + j * channel; + + if (cnt > 0) { + asm( + ".fill_bias_fc: \n" + "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" + "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" + "vadd.f32 q2, q0, q1 @ add bias\n" + "vst1.32 {d4-d5}, [%[ptr_out]]! 
@ store result\n" + "subs %[cnt], #1 @ loop count -1\n" + "bne .fill_bias_fc @ jump to main loop\n" + :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ + [cnt] "+r"(cnt) + : + :"q0", "q1", "q2" + ); + } + + for (; remain > 0; remain--) { + *(ptr_out++) += *(ptr_bias++); + } + } +} + + +SaberFc::SaberFc(int axis, int num_output, bool flag_trans, bool flag_bias, \ + const float *weights, const float *bias) { + + _axis = axis; + _num_output = num_output; + _flag_trans = flag_trans; + _bias_term = flag_bias; + _weights = weights; + _bias = bias; +} + +SaberStatus SaberFc::load_param(int axis, int num_output, bool flag_trans, bool flag_bias, \ + const float *weights, const float *bias) { + + _axis = axis; + _num_output = num_output; + _flag_trans = flag_trans; + _bias_term = flag_bias; + _weights = weights; + _bias = bias; + return SaberSuccess; +} + +SaberStatus SaberFc::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + Shape shape_out = inputs[0]->valid_shape(); + int m = inputs[0]->count_valid(0, _axis); + int k = inputs[0]->count_valid(_axis, inputs[0]->dims()); + int n = _num_output; + + shape_out.resize(_axis + 1); + shape_out[_axis] = n; + return outputs[0]->set_shape(shape_out); +} + +SaberStatus SaberFc::init(const std::vector *> &inputs, \ + std::vector *> &outputs, Context &ctx) { + + _ctx = ctx; + int threads = _ctx.get_act_ids().size(); + + _m = inputs[0]->count_valid(0, _axis); + _k = inputs[0]->count_valid(_axis, inputs[0]->dims()); + _n = _num_output; + + int l1_cache = Env::cur_env()._L1_cache; + int l2_cache = Env::cur_env()._L2_cache; + //! if L1 cache size is not provided, set to 31K + l1_cache = l1_cache > 0? l1_cache : 31000; + //! if L2 cache size is not provided, set to 2M + l2_cache = l2_cache > 0? l2_cache : 2000000; + + printf("fc weights transpose: %s\n", _flag_trans? 
"true" : "false"); + if (_m > 1 || _flag_trans) { + _gemmer.init(l1_cache, l2_cache, _m, _n, _k, false, !_flag_trans, threads); + } + return SaberSuccess; +} + +//template +SaberStatus SaberFc::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs) { + + const float* din = inputs[0]->data(); + float* dout = outputs[0]->mutable_data(); + const float* weights = _weights; + const float* bias = nullptr; + if (_bias_term) { + bias = _bias; + } + + if (_m > 1 || _flag_trans) { + _gemmer(din, _k, weights, (_flag_trans? _n : _k), dout, _n, 1.f, 0.f, false); + if (_bias_term) { + fill_bias_fc(dout, bias, _m, _n); + } + } else { + if (_bias_term) { + sgemv_bias(false, _n, _k, weights, din, dout, bias); + } else { + sgemv(false, _n, _k, weights, din, dout); + } + } + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif + diff --git a/saber/lite/funcs/neon/saber_permute.cpp b/saber/lite/funcs/neon/saber_permute.cpp new file mode 100644 index 000000000..2d0e9ed26 --- /dev/null +++ b/saber/lite/funcs/neon/saber_permute.cpp @@ -0,0 +1,200 @@ +#include "saber/lite/funcs/saber_permute.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +template +void permute_basic(const int count, const Dtype* din, const int* permute_order, \ + const int* old_steps, const int* new_steps, const int num_axes, Dtype* dout) { + for (int i = 0; i < count; ++i) { + int old_idx = 0; + int idx = i; + for (int j = 0; j < num_axes; ++j) { + int order = permute_order[j]; + old_idx += (idx / new_steps[j]) * old_steps[order]; + idx %= new_steps[j]; + } + dout[i] = din[old_idx]; + } +} + +template +void transpose_mat(const Dtype* din, Dtype* dout, \ + const int num, const int width, const int height); +void transpose_mat(const float* din, float* dout, \ + const int num, const int width, const int height) { + int nw = width >> 2; + int nh = height >> 2; + int size_in = width * height; + + for (int i = 0; i < 
num; ++i) { + float* ptr_out = dout + i * size_in; + const float* ptr_in = din + i * size_in; +#pragma omp parallel for + for (int h = 0; h < nh; h++) { + const float* ptr_din_row = ptr_in + h * 4 * width; + for (int w = 0; w < nw; w++) { + float* data_out_ptr = ptr_out + w * 4 * height + h * 4; + const float* din0 = ptr_din_row; + const float* din1 = din0 + width; + const float* din2 = din1 + width; + const float* din3 = din2 + width; + + float* dout0 = data_out_ptr; + float* dout1 = dout0 + height; + float* dout2 = dout1 + height; + float* dout3 = dout2 + height; + +#ifdef __aarch64__ + float32x4_t vr0 = vld1q_f32(din0); + float32x4_t vr1 = vld1q_f32(din1); + float32x4_t vr2 = vld1q_f32(din2); + float32x4_t vr3 = vld1q_f32(din3); + vtrnq_f32(vr0, vr1); + vtrnq_f32(vr2, vr3); + vswp_f32(d1, d4); + vswp_f32(d3, d6); + vst1q_f32(dout0, vr0); + vst1q_f32(dout1, vr1); + vst1q_f32(dout2, vr2); + vst1q_f32(dout3, vr3); +#else + asm( + "vld1.32 {d0, d1}, [%[in0]] \n" + "vld1.32 {d2, d3}, [%[in1]] \n" + "vld1.32 {d4, d5}, [%[in2]] \n" + "vld1.32 {d6, d7}, [%[in3]] \n" + "vtrn.32 q0, q1 \n" + "vtrn.32 q2, q3 \n" + "vswp d1, d4 \n" + "vswp d3, d6 \n" + "vst1.32 {d0, d1}, [%[out0]] \n" + "vst1.32 {d2, d3}, [%[out1]] \n" + "vst1.32 {d4, d5}, [%[out2]] \n" + "vst1.32 {d6, d7}, [%[out3]] \n" + : + : [out0] "r" (dout0), [out1] "r" (dout1), [out2] "r" (dout2), \ + [out3] "r" (dout3), [in0] "r" (din0), [in1] "r" (din1), \ + [in2] "r" (din2), [in3] "r" (din3) + : "q0", "q1", "q2", "q3" + ); +#endif + ptr_din_row += 4; + } + } + //remian + for (int h = 0; h < height; h++){ + for (int w = nw * 4; w < width; w++){ + const float* data_in_ptr = ptr_in + h * width + w; + float* data_out_ptr = ptr_out + w * height + h; + *data_out_ptr = *data_in_ptr; + } + } + for (int w = 0; w < width; w++){ + for (int h = nh * 4; h < height; h++){ + const float* data_in_ptr = ptr_in + h * width + w; + float* data_out_ptr = ptr_out + w * height + h; + *data_out_ptr = *data_in_ptr; + } + } + } + +} + 
+SaberPermute::SaberPermute() { + _need_permute = false; + _transpose = false; +} + +SaberPermute::SaberPermute(std::vector orders) { + _order_dims = orders; +} + +SaberStatus SaberPermute::load_param(std::vector orders) { + _order_dims = orders; + return SaberSuccess; +} + +//template +SaberStatus SaberPermute::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + _ctx = ctx; + _num_axes = inputs[0]->dims(); + _count = outputs[0]->valid_size(); + + LCHECK_EQ(inputs[0]->dims(), _order_dims.size(), "permute order size is not match to input dims"); + // set _need_permute + _need_permute = false; + for (int i = 0; i < _num_axes; ++i) { + if (_order_dims[i] != i) { + _need_permute = true; + break; + } + } + if (!_need_permute) { + return SaberSuccess; + } + + //! for basic permute + std::vector axis_diff; + int j = 0; + for (int i = 0; i < _num_axes; ++i) { + if (_order_dims[j] != i) { + axis_diff.push_back(j); + //LOG(INFO) << "diff axis: " << _order_dims[j]; + } else { + j++; + } + } + if (axis_diff.size() == 1) { + _transpose = true; + _trans_num = inputs[0]->count_valid(0, std::max(axis_diff[0] - 1, 0)); + _trans_w = inputs[0]->count_valid(axis_diff[0] + 1, _num_axes); + _trans_h = inputs[0]->valid_shape()[axis_diff[0]]; + printf("permute: transpose=true, num= %d, h=%d, w=%d\n", _trans_num , _trans_h, _trans_w); + } else { + _transpose = false; + _new_steps = outputs[0]->get_stride(); + _old_steps = inputs[0]->get_stride(); + printf("permute: transpose=false\n"); + } + + return SaberSuccess; +} + +//template +SaberStatus SaberPermute::dispatch(\ + const std::vector*>& inputs, \ + std::vector*>& outputs) { + + //! only copy the data + if (!_need_permute) { + outputs[0]->copy_from(*inputs[0]); + return SaberSuccess; + } + + const float* din = inputs[0]->data(); + float* dout = outputs[0]->mutable_data(); + //! 
transpose the data + if (_transpose) { + transpose_mat(din, dout, _trans_num, _trans_w, _trans_h); + } else { + permute_basic(_count, din, _order_dims.data(), \ + _old_steps.data(), _new_steps.data(), _num_axes, dout); + } + + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} // namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/saber_pooling.cpp b/saber/lite/funcs/neon/saber_pooling.cpp new file mode 100644 index 000000000..415303466 --- /dev/null +++ b/saber/lite/funcs/neon/saber_pooling.cpp @@ -0,0 +1,131 @@ +#include "saber/lite/funcs/saber_pooling.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/neon/impl/pooling_arm_impl.h" + +namespace anakin { + +namespace saber { + +namespace lite{ + + + +SaberPooling::SaberPooling(PoolingType type, bool flag_global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + + _type = type; + _is_global = flag_global; + _kw = kernel_w; + _kh = kernel_h; + _stride_w = stride_w; + _stride_h = stride_h; + _pad_w = pad_w; + _pad_h = pad_h; +} + +SaberStatus SaberPooling::load_param(PoolingType type, bool flag_global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h) { + + _type = type; + _is_global = flag_global; + _kw = kernel_w; + _kh = kernel_h; + _stride_w = stride_w; + _stride_h = stride_h; + _pad_w = pad_w; + _pad_h = pad_h; + return SaberSuccess; +} + +SaberStatus SaberPooling::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + Shape output_shape = inputs[0]->valid_shape(); + + int in_height = inputs[0]->height(); + int in_width = inputs[0]->width(); + + int out_height; + int out_width; + if (_is_global) { + out_height = 1; + out_width = 1; + } else { + out_height = (in_height + 2 * _pad_h - _kh + _stride_h - 1) / _stride_h + 1; + out_width = (in_width + 2 * _pad_w - _kw + _stride_w - 1) / _stride_w + 1; + } + + 
output_shape.set_height(out_height); + output_shape.set_width(out_width); + + return outputs[0]->set_shape(output_shape); +} + +//template <> +SaberStatus SaberPooling::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + _ctx = ctx; + + if (_is_global) { + _impl = pooling_global; + return SaberSuccess; + } + + if (_kw != _kh || _stride_w != _stride_h \ + || _stride_w != 2 || _pad_w != _pad_h || _pad_w > 1) { + _impl = pooling_basic; + return SaberSuccess; + } + + if (_kw == 2) { + if (_type == Pooling_max) { + _impl = pooling2x2s2_max; + } else { + _impl = pooling2x2s2_ave; + } + return SaberSuccess; + } + + if (_kw == 3) { + if (_type == Pooling_max) { + _impl = pooling3x3s2_max; + } else { + _impl = pooling3x3s2_ave; + } + return SaberSuccess; + } + + _impl = pooling_basic; + return SaberSuccess; +} + +//template <> +SaberStatus SaberPooling::dispatch(const std::vector *> &inputs, + std::vector *> &outputs) { + + const float* din = inputs[0]->data(); + float* dout = outputs[0]->mutable_data(); + int num = inputs[0]->num(); + int chout = outputs[0]->channel(); + int hout = outputs[0]->height(); + int wout = outputs[0]->width(); + + int chin = inputs[0]->channel(); + int hin = inputs[0]->height(); + int win = inputs[0]->width(); + + _impl(din, dout, num, chout, hout, wout, chin, hin, win, \ + _type, _is_global, _kw, _kh, \ + _stride_w, _stride_h, \ + _pad_w, _pad_h); + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} // namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/saber_prelu.cpp b/saber/lite/funcs/neon/saber_prelu.cpp new file mode 100644 index 000000000..0eb06fce2 --- /dev/null +++ b/saber/lite/funcs/neon/saber_prelu.cpp @@ -0,0 +1,118 @@ +#include "saber/lite/funcs/saber_prelu.h" + +#ifdef USE_ARM_PLACE +namespace anakin{ + +namespace saber{ + +namespace lite{ + +template +void prelu_kernel(const Dtype* din, const Dtype* slopes, \ + Dtype* dout, int 
num, int cin, int csize, bool is_channel_shared); + +template <> +void prelu_kernel(const float* din, const float* slopes, \ + float* dout, int num, int cin, int csize, bool is_channel_shared) { + + for (int n = 0; n < num; n++){ + const float* data_in_batch = din + n * cin * csize; + float* data_out_batch = dout + n * cin * csize; +#pragma omp parallel for + for (int c = 0; c < cin; c++){ + const float* data_in_channel = data_in_batch + c * csize; + float* data_out_channel = data_out_batch + c * csize; + float slope = is_channel_shared ? slopes[0] : slopes[c]; + + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vslope = vdupq_n_f32(slope); + int cnt = csize >> 2; + int remain = csize - (cnt * 4); +#ifdef __arrch64__ + for (; cnt > 0; cnt--){ + float32x4_t vr0 = vld1q_f32(data_in_channel); + uint32x4_t vmask = vcltq_f32(vr0, vzero);//vr0 <= vzero + float32x4_t vout = vmulq_f32(vr0, vslope);//vr0 * vslope + float32x4_t vout_sel = vbslq_f32(vmask, vout, vr0); + vst1q_f32(data_out_channel, vout_sel); + data_in_channel += 4; + data_out_channel += 4; + } +#else + if (cnt > 0){ + asm volatile( + "prelu_loop: @main loop\n" + "vld1.f32 {d0-d1}, [%[ptr_in]]! @load q1\n" + "vclt.f32 q1, q0, %q[vzero] @vcle q0 <= vzero\n" + "vmul.f32 q2, q0, %q[vslope] @vmul q0 * vslope\n" + "vbit.32 q0, q2, q1 @vbit q0, q2, q1\n" + "subs %[cnt], #1 @subs nn, 1\n" + "vst1.f32 {d0-d1}, [%[dout]]! 
@store data\n" + "bne prelu_loop @bne nn\n" + :[ptr_in] "+r" (data_in_channel), [cnt] "+r" (cnt), \ + [dout] "+r" (data_out_channel) + :[vzero] "w" (vzero), [vslope] "w" (vslope) + :"q0", "q1", "q2" + ); + } +#endif //__aarch64__ + for (; remain > 0; remain--) { + if (*data_in_channel < 0){ + *data_out_channel = *data_in_channel * slope; + } else { + *data_out_channel = *data_in_channel; + } + data_in_channel++; + data_out_channel++; + } + } + } +} + +SaberPrelu::SaberPrelu(bool flag_shared, const float *weights) { + _flag_shared = flag_shared; + _weights = weights; +} + +SaberStatus SaberPrelu::load_param(bool flag_shared, const float *weights) { + _flag_shared = flag_shared; + _weights = weights; + return SaberSuccess; +} + +SaberStatus SaberPrelu::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + return outputs[0]->set_shape(inputs[0]->valid_shape()); +} + +SaberStatus SaberPrelu::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + _ctx = ctx; + return SaberSuccess; +} + +//template +SaberStatus SaberPrelu::dispatch(\ + const std::vector*>& inputs, \ + std::vector*>& outputs) { + + int num = inputs[0]->num(); + int channel = inputs[0]->channel(); + int width = inputs[0]->width(); + int height = inputs[0]->height(); + const float* din = inputs[0]->data(); + float* dout = outputs[0]->mutable_data(); + const float* ptr_slope = _weights; + + prelu_kernel(din, ptr_slope, dout, num, channel, width * height, _flag_shared); + + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} // namespace anakin + +#endif //USE_ARM_PLACE \ No newline at end of file diff --git a/saber/lite/funcs/neon/saber_priorbox.cpp b/saber/lite/funcs/neon/saber_priorbox.cpp new file mode 100644 index 000000000..33a61baff --- /dev/null +++ b/saber/lite/funcs/neon/saber_priorbox.cpp @@ -0,0 +1,221 @@ +#include "saber/lite/funcs/saber_priorbox.h" +#include +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + 
+namespace lite{ + +SaberPriorBox::SaberPriorBox(bool is_flip, bool is_clip, std::vector min_size, std::vector max_size, + std::vector aspect_ratio, std::vector variance, int img_width, + int img_height, float step_w, float step_h, float offset) { + _is_flip = is_flip; + _is_clip = is_clip; + _min_size = min_size; + _max_size = max_size; + _aspect_ratio = aspect_ratio; + _variance = variance; + _img_width = img_width; + _img_height = img_height; + _step_w = step_w; + _step_h = step_h; + _offset = offset; +} + +SaberStatus SaberPriorBox::load_param(bool is_flip, bool is_clip, std::vector min_size, + std::vector max_size, std::vector aspect_ratio, + std::vector variance, int img_width, int img_height, float step_w, + float step_h, float offset) { + _is_flip = is_flip; + _is_clip = is_clip; + _min_size = min_size; + _max_size = max_size; + _img_width = img_width; + _img_height = img_height; + _step_w = step_w; + _step_h = step_h; + _offset = offset; + + _aspect_ratio.clear(); + _aspect_ratio.push_back(1.f); + + _variance.clear(); + if (variance.size() == 1) { + _variance.push_back(variance[0]); + _variance.push_back(variance[0]); + _variance.push_back(variance[0]); + _variance.push_back(variance[0]); + } else { + LCHECK_EQ(variance.size(), 4, "variance size must = 1 or = 4"); + _variance.push_back(variance[0]); + _variance.push_back(variance[1]); + _variance.push_back(variance[2]); + _variance.push_back(variance[3]); + } + + for (int i = 0; i < aspect_ratio.size(); ++i) { + float ar = aspect_ratio[i]; + bool already_exist = false; + for (int j = 0; j < aspect_ratio.size(); ++j) { + if (fabsf(ar - aspect_ratio[j]) < 1e-6f) { + already_exist = true; + break; + } + } + if (!already_exist) { + _aspect_ratio.push_back(ar); + if (_is_flip) { + _aspect_ratio.push_back(1.f / ar); + } + } + } + _num_priors = min_size.size() * aspect_ratio.size(); + _max_size.clear(); + if (max_size.size() > 0) { + LCHECK_EQ(max_size.size(), min_size.size(), "max_size num must = min_size num"); 
+ for (int i = 0; i < max_size.size(); ++i) { + LCHECK_GT(max_size[i], min_size[i], "max_size val must > min_size val"); + _max_size.push_back(max_size[i]); + _num_priors++; + } + } + + return SaberSuccess; +} + +SaberStatus SaberPriorBox::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + //! output tensor's dims = 3 (1, 2, 4 * num_priors) + Shape shape_out = outputs[0]->valid_shape(); + shape_out[0] = 1; + shape_out[1] = 2; + + int win1 = inputs[0]->width(); + int hin1 = inputs[0]->height(); + + int wout = win1 * hin1 * _num_priors * 4; + shape_out[2] = wout; + + return outputs[0]->set_shape(shape_out); +} + +SaberStatus SaberPriorBox::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + _ctx = ctx; + + LITE_CHECK(_output_arm.reshape(outputs[0]->valid_shape())); + float* output_host = _output_arm.mutable_data(); + + const int width = inputs[0]->width(); + const int height = inputs[0]->height(); + int img_width = _img_width; + int img_height = _img_height; + if (img_width == 0 || img_height == 0) { + img_width = inputs[1]->width(); + img_height = inputs[1]->height(); + } + + float step_w = _step_w; + float step_h = _step_h; + if (step_w == 0 || step_h == 0) { + step_w = static_cast(img_width) / width; + step_h = static_cast(img_height) / height; + } + float offset = _offset; + + int channel_size = height * width * _num_priors * 4; + int idx = 0; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + float center_x = (w + offset) * step_w; + float center_y = (h + offset) * step_h; + float box_width; + float box_height; + for (int s = 0; s < _min_size.size(); ++s) { + float min_size = _min_size[s]; + //! first prior: aspect_ratio = 1, size = min_size + box_width = box_height = min_size; + //! xmin + output_host[idx++] = (center_x - box_width / 2.f) / img_width; + //! ymin + output_host[idx++] = (center_y - box_height / 2.f) / img_height; + //! 
xmax + output_host[idx++] = (center_x + box_width / 2.f) / img_width; + //! ymax + output_host[idx++] = (center_y + box_height / 2.f) / img_height; + + if (_max_size.size() > 0) { + + int max_size = _max_size[s]; + //! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) + box_width = box_height = sqrtf(min_size * max_size); + //! xmin + output_host[idx++] = (center_x - box_width / 2.f) / img_width; + //! ymin + output_host[idx++] = (center_y - box_height / 2.f) / img_height; + //! xmax + output_host[idx++] = (center_x + box_width / 2.f) / img_width; + //! ymax + output_host[idx++] = (center_y + box_height / 2.f) / img_height; + } + + //! rest of priors + for (int r = 0; r < _aspect_ratio.size(); ++r) { + float ar = _aspect_ratio[r]; + if (fabs(ar - 1.f) < 1e-6f) { + continue; + } + box_width = min_size * sqrtf(ar); + box_height = min_size / sqrtf(ar); + //! xmin + output_host[idx++] = (center_x - box_width / 2.f) / img_width; + //! ymin + output_host[idx++] = (center_y - box_height / 2.f) / img_height; + //! xmax + output_host[idx++] = (center_x + box_width / 2.f) / img_width; + //! ymax + output_host[idx++] = (center_y + box_height / 2.f) / img_height; + } + } + } + } + //! clip the prior's coordidate such that it is within [0, 1] + if (_is_clip) { + for (int d = 0; d < channel_size; ++d) { + output_host[d] = std::min(std::max(output_host[d], 0.f), 1.f); + } + } + //! set the variance. 
+ + float* ptr = output_host + channel_size; + int count = 0; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int i = 0; i < _num_priors; ++i) { + for (int j = 0; j < 4; ++j) { + ptr[count] = _variance[j]; + ++count; + } + } + } + } + return SaberSuccess; +} + +SaberStatus SaberPriorBox::dispatch(const std::vector *> &inputs, + std::vector *> &outputs) { + memcpy(outputs[0]->mutable_data(), _output_arm.data(), \ + outputs[0]->valid_size() * sizeof(float)); + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif + diff --git a/saber/lite/funcs/neon/saber_slice.cpp b/saber/lite/funcs/neon/saber_slice.cpp new file mode 100644 index 000000000..7a745c9b8 --- /dev/null +++ b/saber/lite/funcs/neon/saber_slice.cpp @@ -0,0 +1,104 @@ +#include "saber/lite/funcs/saber_slice.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +SaberSlice::SaberSlice(int axis, std::vector slice_points) { + _axis = axis; + _slice_points = slice_points; +} + +SaberStatus SaberSlice::load_param(int axis, std::vector slice_points) { + _axis = axis; + _slice_points = slice_points; + return SaberSuccess; +} + +SaberStatus SaberSlice::compute_output_shape(const std::vector *> &inputs, + std::vector *> &outputs) { + SaberStatus status; + //! 
input size is equal to 1 + Shape shape_in = inputs[0]->valid_shape(); + int top_size = outputs.size(); + int slice_points_size = _slice_points.size(); + int axis_size = shape_in[_axis]; + + LCHECK_EQ(top_size > 0 || slice_points_size > 0, true, "output shapes number is 0 and slice points size is 0"); + + if (slice_points_size > 0) { + LCHECK_EQ(slice_points_size + 1, top_size, "error params or ouput size"); + int prev = 0; + Shape sh = shape_in; + for (int i = 0; i < slice_points_size; ++i) { + LCHECK_GT(_slice_points[i], prev, " later should > prev"); + LCHECK_LT(_slice_points[i], axis_size, "slice point exceed"); + sh[_axis] = _slice_points[i] - prev; + outputs[i]->set_shape(sh); + prev = _slice_points[i]; + sh = shape_in; + } + LCHECK_GT(axis_size - prev, 0, "slice point exceed"); + sh[_axis] = axis_size - prev; + return outputs[slice_points_size]->set_shape(sh); + } else { + + LCHECK_EQ(axis_size % top_size, 0, "size in slice axis should divide exactly by top size"); + int step = axis_size / top_size; + Shape sh = shape_in; + sh[_axis] = step; + outputs[0]->set_shape(sh); + for (int i = 1; i < top_size; ++i) { + _slice_points[i - 1] = i * step; + status = outputs[i]->set_shape(sh); + if (status != SaberSuccess) { + return status; + } + } + } + return SaberSuccess; +} + +SaberStatus SaberSlice::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + // get context + _ctx = ctx; + _slice_num = inputs[0]->count_valid(0, _axis); + _slice_size = inputs[0]->count_valid(_axis + 1, inputs[0]->dims()); + return SaberSuccess; +} + + +//template +SaberStatus SaberSlice::dispatch(const std::vector *> &inputs, + std::vector *> &outputs) { + int offset_slice_axis = 0; + const float* din = inputs[0]->data(); + const int in_slice_axis = inputs[0]->valid_shape()[_axis]; + for (int i = 0; i < outputs.size(); ++i) { + float* dout = outputs[i]->mutable_data(); + const int out_slice_axis = outputs[i]->valid_shape()[_axis]; + for (int n = 0; n < _slice_num; 
++n) { + const int out_offset = n * out_slice_axis * _slice_size; + const int in_offset = (n * in_slice_axis + offset_slice_axis) * _slice_size; + memcpy((void*)(dout + out_offset), (void*)(din + in_offset), \ + sizeof(float) * out_slice_axis * _slice_size); + } + offset_slice_axis += out_slice_axis; + } + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM + + diff --git a/saber/lite/funcs/neon/saber_softmax.cpp b/saber/lite/funcs/neon/saber_softmax.cpp new file mode 100644 index 000000000..3ba6a2d47 --- /dev/null +++ b/saber/lite/funcs/neon/saber_softmax.cpp @@ -0,0 +1,171 @@ +#include "saber/lite/funcs/saber_softmax.h" + +#ifdef USE_ARM_PLACE + +#include +#include "saber/lite/funcs/neon/impl/neon_mathfun.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void softmax_basic(const float* din, float* dout, \ + const int axis_size, const int inner_num, \ + const int outer_num, const int compute_size) { + +#pragma omp parallel for + for (int i = 0; i < compute_size; ++i) { + int idx_inner = i % inner_num; + int idx_outer = (i / inner_num) * axis_size; + int real_index = idx_outer * inner_num + idx_inner; + + float max_data = din[real_index]; + //! get max + for (int j = 1; j < axis_size; ++j) { + real_index += inner_num; + max_data = din[real_index] > max_data? din[real_index] : max_data; + } + + real_index = idx_outer * inner_num + idx_inner; + //! sub, exp and sum + dout[real_index] = expf(din[real_index] - max_data); + float sum_data = dout[real_index]; + for (int j = 1; j < axis_size; ++j) { + real_index += inner_num; + dout[real_index] = expf(din[real_index] - max_data); + sum_data += dout[real_index]; + } + + float sum_inv = 1.f / sum_data; + real_index = idx_outer * inner_num + idx_inner; + //! get softmax result + for (int j = 0; j < axis_size; ++j) { + dout[real_index] *= sum_inv; + real_index += inner_num; + } + } +} + +//! 
for inner size == 1 +void softmax_inner1(const float* din, float* dout, \ + const int outer_size, const int axis_size) { +#pragma omp parallel for + for (int i = 0; i < outer_size; ++i) { + const float* din_ptr = din + i * axis_size; + float* dout_ptr = dout + i * axis_size; + + const float* din_max_ptr = din_ptr; + int nn = axis_size >> 2; + + //! get max + float32x4_t vmax = vld1q_f32(din_max_ptr); + din_max_ptr += 4; + int j = 1; + for (; j < nn; ++j) { + vmax = vmaxq_f32(vmax, vld1q_f32(din_max_ptr)); + din_max_ptr += 4; + } + float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); + float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); + for (j = 4 * j; j < axis_size; ++j) { + max_data = std::max(max_data, din_max_ptr[0]); + din_max_ptr++; + } + //printf("max data: %.2f\n", max_data); + + //! sub, exp and sum + const float* din_sum_ptr = din_ptr; + float* dout_sum_ptr = dout_ptr; + vmax = vdupq_n_f32(max_data); + float32x4_t vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); + float32x4_t vsum = vsub_exp; + vst1q_f32(dout_sum_ptr, vsub_exp); + din_sum_ptr += 4; + dout_sum_ptr += 4; + + j = 1; + for (; j < nn; ++j) { + vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); + vst1q_f32(dout_sum_ptr, vsub_exp); + vsum = vaddq_f32(vsum, vsub_exp); + din_sum_ptr += 4; + dout_sum_ptr += 4; + } + float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); + float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); + + for (j = 4 * j; j < axis_size; ++j) { + dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); + sum_data += dout_sum_ptr[0]; + din_sum_ptr++; + dout_sum_ptr++; + } + //printf("sum data: %.2f\n", sum_data); + + float sum_inv = 1.f / sum_data; + float* dout_res_ptr = dout_ptr; + float32x4_t vinv = vdupq_n_f32(sum_inv); + //! 
get softmax result + j = 0; + for (; j < nn; ++j) { + float32x4_t vout = vld1q_f32(dout_res_ptr); + float32x4_t vres= vmulq_f32(vout, vinv); + vst1q_f32(dout_res_ptr, vres); + dout_res_ptr += 4; + } + for (j = nn * 4; j < axis_size; ++j) { + dout_ptr[j] *= sum_inv; + } + } +} + +SaberSoftmax::SaberSoftmax(int axis) { + _axis = axis; +} + +SaberStatus SaberSoftmax::load_param(int axis) { + _axis = axis; + return SaberSuccess; +} + +SaberStatus SaberSoftmax::init(const std::vector *> &inputs, + std::vector *> &outputs, Context &ctx) { + _ctx = ctx; + Shape shape_in = inputs[0]->valid_shape(); + Shape shape_out = outputs[0]->valid_shape(); + _outer_num = inputs[0]->count_valid(0, _axis); + _inner_num = inputs[0]->count_valid(_axis + 1, inputs[0]->dims()); + _axis_size = shape_in[_axis]; + + int buffer_size = this->_inner_num * this->_outer_num; + return SaberSuccess; +} + +//template +SaberStatus SaberSoftmax::dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs) { + + float* dout = (float*)outputs[0]->mutable_data(); + const float* din = (float*)inputs[0]->data(); + + if (this->_inner_num == 1) { + softmax_inner1(din, dout, _outer_num, _axis_size); + } else { + int compute_size = inputs[0]->valid_size() / _axis_size; + softmax_basic(din, dout, _axis_size, _inner_num, _outer_num, compute_size); + } + + return SaberSuccess; +} + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM + + diff --git a/saber/lite/funcs/op_base.h b/saber/lite/funcs/op_base.h new file mode 100644 index 000000000..73e9a2c05 --- /dev/null +++ b/saber/lite/funcs/op_base.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_LITE_FUNCS_OP_BASE_H +#define ANAKIN_SABER_LITE_FUNCS_OP_BASE_H + +#include "saber/lite/core/common_lite.h" +#include "saber/lite/core/tensor_lite.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +class OpBase { +public: + OpBase(){} + virtual SaberStatus load_param() = 0; + virtual compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs) { + return SaberUnImplError; + } + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context& ctx) { + return SaberUnImplError; + } + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs) { + return SaberUnImplError; + } + +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_LITE_FUNCS_OP_BASE_H diff --git a/saber/lite/funcs/saber_activation.h b/saber/lite/funcs/saber_activation.h new file mode 100644 index 000000000..15ad2e74f --- /dev/null +++ b/saber/lite/funcs/saber_activation.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_ACTIVATION_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_ACTIVATION_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberActivation { +public: + SaberActivation() {} + + SaberActivation(ActiveType type, float neg_slop = 0.f); + SaberStatus load_param(ActiveType type, float neg_slop = 0.f); + + ~SaberActivation() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs); +private: + Context _ctx; + ActiveType _type; + float _neg_slop; +}; + + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_ACTIVATION_H diff --git a/saber/lite/funcs/saber_concat.h b/saber/lite/funcs/saber_concat.h new file mode 100644 index 000000000..6d92d4fda --- /dev/null +++ b/saber/lite/funcs/saber_concat.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_CONCAT_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_CONCAT_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberConcat { +public: + SaberConcat() = default; + SaberConcat(int axis); + ~SaberConcat() {} + + SaberStatus load_param(int axis); + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs); + +private: + Context _ctx; + int _axis; + int _num_concats; + int _concat_input_size; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_CONCAT_H diff --git a/saber/lite/funcs/saber_conv.h b/saber/lite/funcs/saber_conv.h new file mode 100755 index 000000000..fb94ade4a --- /dev/null +++ b/saber/lite/funcs/saber_conv.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_CONV_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_CONV_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/neon/impl/sgemm_arm.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +typedef void (*conv_func)(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, Sgemm& gemmer, void* work_space); + + +//template +class SaberConv2D { +public: + SaberConv2D(); + + SaberConv2D(int weights_size, int num_output, int group, int kw, int kh, \ + int stride_w, int stride_h, int pad_w, int pad_h, int dila_w, int dila_h, \ + bool flag_bias, const float* weights, const float* bias); + + SaberStatus load_param(int weights_size, int num_output, int group, int kw, int kh, \ + int stride_w, int stride_h, int pad_w, int pad_h, int dila_w, int dila_h, \ + bool flag_bias, const float* weights, const float* bias); + + ~SaberConv2D() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus set_activation(bool flag); + +private: + Context _ctx; + conv_func _impl{nullptr}; + Sgemm _gemmer; + bool _flag_relu{false}; + bool _is_trans_weights{false}; + bool _bias_term{true}; + int _num_output; + int _group; + int _kw; + int _kh; + int _stride_w; + int _stride_h; + int _pad_w; + int _pad_h; + int _dila_w; + int _dila_h; + const float* _weights{nullptr}; + const float* _bias{nullptr}; + int _weights_size; + size_t _workspace_fwd_sizes{0}; + Tensor _workspace_data; + Tensor 
_weights_trans; +}; + + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_CONV_H diff --git a/saber/lite/funcs/saber_conv_act.h b/saber/lite/funcs/saber_conv_act.h new file mode 100755 index 000000000..7c0d6b528 --- /dev/null +++ b/saber/lite/funcs/saber_conv_act.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_CONV_ACT_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_CONV_ACT_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/saber_conv.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberConvAct2D { +public: + SaberConvAct2D() { + _conv_op = new SaberConv2D; + } + + SaberConvAct2D(int weights_size, int num_output, int group, int kw, int kh, int stride_w, int stride_h, \ + int pad_w, int pad_h, int dila_w, int dila_h, bool flag_bias, ActiveType type, \ + const float* weights, const float* bias) { + + LCHECK_EQ(type, Active_relu, "active type must be relu"); + _conv_op = new SaberConv2D(weights_size, num_output, group, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, flag_bias, weights, bias); + } + + SaberStatus load_param(int weights_size, int num_output, int group, int kw, int kh, int stride_w, int stride_h, \ + int pad_w, int pad_h, int dila_w, int 
dila_h, bool flag_bias, ActiveType type, \ + const float* weights, const float* bias) { + + LCHECK_EQ(type, Active_relu, "active type must be relu"); + _conv_op->set_activation(true); + return _conv_op->load_param(weights_size, num_output, group, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, flag_bias, weights, bias); + + } + + ~SaberConvAct2D() { + delete _conv_op; + } + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs) { + return _conv_op->compute_output_shape(inputs, outputs); + } + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx) { + _conv_op->set_activation(true); + return _conv_op->init(inputs, outputs, ctx); + } + + SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs) { + return _conv_op->dispatch(inputs, outputs); + } + +private: + SaberConv2D* _conv_op; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_CONV_ACT_H diff --git a/saber/lite/funcs/saber_conv_batchnorm_scale.h b/saber/lite/funcs/saber_conv_batchnorm_scale.h new file mode 100755 index 000000000..69cab8264 --- /dev/null +++ b/saber/lite/funcs/saber_conv_batchnorm_scale.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/saber_conv.h" +#include "saber/lite/funcs/utils_arm.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberConvBatchnormScale { +public: + SaberConvBatchnormScale() { + _conv_op = new SaberConv2D; + } + + SaberConvBatchnormScale(int weights_size, int num_output, int group, int kw, int kh, int stride_w, int stride_h, \ + int pad_w, int pad_h, int dila_w, int dila_h, bool flag_bias, \ + float bn_scale, float bn_eps, std::vector bn_mean, std::vector bn_variance, \ + std::vector scale_w, std::vector scale_b, bool scale_bias_term, \ + const float* weights, const float* bias) { + + int ch = weights_size / (num_output * kw * kh); + update_weights(_new_weights, _new_bias, weights, bias, \ + num_output, ch, kh, kw, flag_bias, \ + bn_scale, bn_eps, bn_mean, bn_variance, \ + scale_w, scale_b, scale_bias_term); + + _conv_op = new SaberConv2D(weights_size, num_output, group, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, true, _new_weights.data(), _new_bias.data()); + } + + SaberStatus load_param(int weights_size, int num_output, int group, int kw, int kh, int stride_w, int stride_h, \ + int pad_w, int pad_h, int dila_w, int dila_h, bool flag_bias, \ + float bn_scale, float bn_eps, std::vector bn_mean, std::vector bn_variance, \ + std::vector scale_w, std::vector scale_b, bool scale_bias_term, \ + const float* weights, const float* bias) { + + int ch = weights_size / (num_output * kw * kh); + update_weights(_new_weights, _new_bias, weights, bias, \ + num_output, ch, kh, kw, flag_bias, \ + bn_scale, bn_eps, bn_mean, bn_variance, \ + scale_w, scale_b, scale_bias_term); + + return _conv_op->load_param(weights_size, num_output, group, kw, kh, stride_w, stride_h, \ + 
pad_w, pad_h, dila_w, dila_h, true, _new_weights.data(), _new_bias.data()); + + } + + ~SaberConvBatchnormScale() { + delete _conv_op; + } + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs) { + return _conv_op->compute_output_shape(inputs, outputs); + } + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx) { + _conv_op->set_activation(false); + return _conv_op->init(inputs, outputs, ctx); + } + + SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs) { + return _conv_op->dispatch(inputs, outputs); + } + +private: + SaberConv2D* _conv_op; + Tensor _new_weights; + Tensor _new_bias; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_H diff --git a/saber/lite/funcs/saber_conv_batchnorm_scale_relu.h b/saber/lite/funcs/saber_conv_batchnorm_scale_relu.h new file mode 100755 index 000000000..4aade12bc --- /dev/null +++ b/saber/lite/funcs/saber_conv_batchnorm_scale_relu.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_RELU_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_RELU_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/saber_conv.h" +#include "saber/lite/funcs/utils_arm.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberConvBatchnormScaleRelu { +public: + SaberConvBatchnormScaleRelu() { + _conv_op = new SaberConv2D; + } + + SaberConvBatchnormScaleRelu(int weights_size, int num_output, int group, int kw, int kh, int stride_w, int stride_h, \ + int pad_w, int pad_h, int dila_w, int dila_h, bool flag_bias, \ + float bn_scale, float bn_eps, std::vector bn_mean, std::vector bn_variance, \ + std::vector scale_w, std::vector scale_b, bool scale_bias_term, ActiveType type, \ + const float* weights, const float* bias) { + + LCHECK_EQ(type, Active_relu, "active type must be relu"); + int ch = weights_size / (num_output * kw * kh); + update_weights(_new_weights, _new_bias, weights, bias, \ + num_output, ch, kh, kw, flag_bias, \ + bn_scale, bn_eps, bn_mean, bn_variance, \ + scale_w, scale_b, scale_bias_term); + + _conv_op = new SaberConv2D(weights_size, num_output, group, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, true, _new_weights.data(), _new_bias.data()); + } + + SaberStatus load_param(int weights_size, int num_output, int group, int kw, int kh, int stride_w, int stride_h, \ + int pad_w, int pad_h, int dila_w, int dila_h, bool flag_bias, \ + float bn_scale, float bn_eps, std::vector bn_mean, std::vector bn_variance, \ + std::vector scale_w, std::vector scale_b, bool scale_bias_term, ActiveType type, \ + const float* weights, const float* bias) { + + LCHECK_EQ(type, Active_relu, "active type must be relu"); + int ch = weights_size / (num_output * kw * kh); + update_weights(_new_weights, _new_bias, weights, bias, \ + num_output, ch, kh, kw, flag_bias, \ + 
bn_scale, bn_eps, bn_mean, bn_variance, \ + scale_w, scale_b, scale_bias_term);*/ + + return _conv_op->load_param(weights_size, num_output, group, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, true, _new_weights.data(), _new_bias.data()); + + } + + ~SaberConvBatchnormScaleRelu() { + delete _conv_op; + } + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs) { + return _conv_op->compute_output_shape(inputs, outputs); + } + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx) { + _conv_op->set_activation(true); + return _conv_op->init(inputs, outputs, ctx); + } + + SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs) { + return _conv_op->dispatch(inputs, outputs); + } + +private: + SaberConv2D* _conv_op; + Tensor _new_weights; + Tensor _new_bias; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_RELU_H diff --git a/saber/lite/funcs/saber_detection_output.h b/saber/lite/funcs/saber_detection_output.h new file mode 100644 index 000000000..089f8c166 --- /dev/null +++ b/saber/lite/funcs/saber_detection_output.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_DETECTION_OUTPUT_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_DETECTION_OUTPUT_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberDetectionOutput { +public: + SaberDetectionOutput(){} + SaberDetectionOutput(bool share_loc, + bool variance_encode, + int class_num, + int background_id, + int keep_topk, + CodeType type, + float conf_thresh, + int nms_topk, + float nms_thresh = 0.3f, + float nms_eta = 1.f); + ~SaberDetectionOutput() {} + + SaberStatus load_param(bool share_loc, + bool variance_encode, + int class_num, + int background_id, + int keep_topk, + CodeType type, + float conf_thresh, + int nms_topk, + float nms_thresh = 0.3f, + float nms_eta = 1.f); + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, \ + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs); + + +private: + Context _ctx; + bool _share_loacation{true}; + bool _variance_encode_in_target{false}; + int _class_num; + int _background_id{0}; + int _keep_top_k{-1}; + CodeType _type{CENTER_SIZE}; + float _conf_thresh; + int _nms_top_k; + float _nms_thresh{0.3f}; + float _nms_eta{1.f}; + int _num_loc_classes; + int _num_priors; + Tensor _bbox_preds; + Tensor _bbox_permute; + Tensor _conf_permute; +}; + +} //namepace lite + +} //namespace saber + +} //namespace anakin + +#endif + +#endif //ANAKIN_SABER_FUNCS_CUDA_SABER_DETECTION_OUTPUT_H diff --git a/saber/lite/funcs/saber_eltwise.h b/saber/lite/funcs/saber_eltwise.h new file mode 100644 index 000000000..f2511b298 --- /dev/null +++ b/saber/lite/funcs/saber_eltwise.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_ELTWISE_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_ELTWISE_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE +namespace anakin{ + +namespace saber{ + +namespace lite{ + +typedef void (*eltwise_func)(const float* din_a, \ + const float* din_b, float* dout, const int size, std::vector coef); + +//template +class SaberEltwise { +public: + SaberEltwise() {} + SaberEltwise(EltwiseType type, std::vector coef); + + SaberStatus load_param(EltwiseType type, std::vector coef); + + ~SaberEltwise() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs); + +private: + Context _ctx; + EltwiseType _type; + std::vector _coef; + eltwise_func _impl{nullptr}; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_ELTWISE_H diff --git a/saber/lite/funcs/saber_fc.h b/saber/lite/funcs/saber_fc.h new file mode 100755 index 000000000..3e90ac197 --- /dev/null +++ b/saber/lite/funcs/saber_fc.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_FC_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_FC_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +#include "saber/lite/funcs/neon/impl/sgemm_arm.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//! input size: 1xk +//! output size: 1xn +//! weights size: nxk +//! bias size: 1xn +//template +class SaberFc { +public: + SaberFc() {} + + SaberFc(int axis, int num_output, bool flag_trans, bool flag_bias, \ + const float* weights, const float* bias); + + SaberStatus load_param(int axis, int num_output, bool flag_trans, bool flag_bias, \ + const float* weights, const float* bias); + + ~SaberFc() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, \ + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs); + + +private: + Context _ctx; + Sgemm _gemmer; + int _m; + int _k; + int _n; + + int _axis; + int _num_output; + bool _bias_term{true}; + bool _flag_trans{false}; + const float* _weights{nullptr}; + const float* _bias{nullptr}; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_FC_H diff --git a/saber/lite/funcs/saber_permute.h b/saber/lite/funcs/saber_permute.h new file 
mode 100644 index 000000000..6d344021f --- /dev/null +++ b/saber/lite/funcs/saber_permute.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_PERMUTE_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_PERMUTE_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberPermute { +public: + SaberPermute(); + + SaberPermute(std::vector orders); + + ~SaberPermute() {} + + SaberStatus load_param(std::vector orders); + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, \ + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs); + + +private: + Context _ctx; + int _num_axes; + int _count; + bool _need_permute{false}; + bool _transpose{false}; + int _trans_num; + int _trans_w; + int _trans_h; + std::vector _order_dims; + std::vector _new_steps; + std::vector _old_steps; + +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_PERMUTE_H diff --git a/saber/lite/funcs/saber_pooling.h b/saber/lite/funcs/saber_pooling.h new file mode 100755 index 000000000..7dedfcee5 --- /dev/null +++ 
b/saber/lite/funcs/saber_pooling.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_POOLING_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_POOLING_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE +namespace anakin{ + +namespace saber{ + +namespace lite{ + +typedef void (*pool_func)(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + PoolingType type, bool global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + +//template +class SaberPooling { + +public: + SaberPooling() {} + + SaberPooling(PoolingType type, bool flag_global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + + SaberStatus load_param(PoolingType type, bool flag_global, int kernel_w, int kernel_h, \ + int stride_w, int stride_h, int pad_w, int pad_h); + + ~SaberPooling() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, \ + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs); + +private: + pool_func _impl{nullptr}; + Context _ctx; + + PoolingType _type; + bool _is_global{false}; + int _kw; + int _kh; + int _stride_w; + int _stride_h; + int _pad_w; + 
int _pad_h; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_POOLING_H diff --git a/saber/lite/funcs/saber_prelu.h b/saber/lite/funcs/saber_prelu.h new file mode 100644 index 000000000..bde1f67f0 --- /dev/null +++ b/saber/lite/funcs/saber_prelu.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H +#define ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberPrelu { + +public: + + SaberPrelu() {} + + SaberPrelu(bool flag_shared, const float* weights); + + SaberStatus load_param(bool flag_shared, const float* weights); + + ~SaberPrelu() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, \ + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, \ + std::vector*>& outputs); + +private: + Context _ctx; + + bool _flag_shared; + const float* _weights{nullptr}; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H diff --git 
a/saber/lite/funcs/saber_priorbox.h b/saber/lite/funcs/saber_priorbox.h new file mode 100644 index 000000000..9950ca8d0 --- /dev/null +++ b/saber/lite/funcs/saber_priorbox.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_PRIORBOX_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_PRIORBOX_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberPriorBox { +public: + + SaberPriorBox() = default; + + SaberPriorBox(bool is_flip, bool is_clip, std::vector min_size, std::vector max_size, \ + std::vector aspect_ratio, std::vector variance, \ + int img_width, int img_height, float step_w, float step_h, float offset); + + SaberStatus load_param(bool is_flip, bool is_clip, std::vector min_size, std::vector max_size, \ + std::vector aspect_ratio, std::vector variance, \ + int img_width, int img_height, float step_w, float step_h, float offset); + + ~SaberPriorBox() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs); + +private: + Context _ctx; + Tensor _output_arm; + + int _num_priors; + + bool _is_flip; + bool _is_clip; + std::vector _min_size; + 
std::vector _max_size; + std::vector _aspect_ratio; + std::vector _variance; + int _img_width; + int _img_height; + float _step_w; + float _step_h; + float _offset; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_PRIORBOX_H diff --git a/saber/lite/funcs/saber_slice.h b/saber/lite/funcs/saber_slice.h new file mode 100644 index 000000000..351f4af9e --- /dev/null +++ b/saber/lite/funcs/saber_slice.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_SLICE_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_SLICE_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE +namespace anakin{ + +namespace saber{ + +namespace lite{ +//template +class SaberSlice { +public: + + SaberSlice() { + _slice_num = 4; + _slice_size = 0; + } + + SaberSlice(int axis, std::vector slice_points); + + SaberStatus load_param(int axis, std::vector slice_points); + + ~SaberSlice() {} + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs); + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs); + +private: + Context _ctx; + int _slice_num; + int _slice_size; + + int _axis; + std::vector _slice_points; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_SLICE_H diff --git a/saber/lite/funcs/saber_softmax.h b/saber/lite/funcs/saber_softmax.h new file mode 100755 index 000000000..6e05cca0b --- /dev/null +++ b/saber/lite/funcs/saber_softmax.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_LITE_FUNCS_SABER_SOFTMAX_H +#define ANAKIN_SABER_LITE_FUNCS_SABER_SOFTMAX_H + +#include "saber/lite/core/tensor_lite.h" +#include "saber/lite/core/context_lite.h" + +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberSoftmax{ +public: + + SaberSoftmax() = default; + + SaberSoftmax(int axis); + + SaberStatus load_param(int axis); + + ~SaberSoftmax() {} + + + SaberStatus compute_output_shape(const std::vector*>& inputs, + std::vector*>& outputs) { + return outputs[0]->set_shape(inputs[0]->valid_shape()); + } + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, Context &ctx); + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs); + +private: + Context _ctx; + int _axis_size{0}; + int _inner_num{0}; + int _outer_num{0}; + + int _axis; + +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin +#endif // USE_ARM_PLACE + +#endif //ANAKIN_SABER_LITE_FUNCS_SABER_SOFTMAX_H diff --git a/saber/lite/funcs/timer_lite.h b/saber/lite/funcs/timer_lite.h new file mode 100644 index 000000000..8bfbc675b --- /dev/null +++ b/saber/lite/funcs/timer_lite.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_LITE_FUNCS_TIMER_LITE_H +#define ANAKIN_SABER_LITE_FUNCS_TIMER_LITE_H + +#include "saber/lite/core/common_lite.h" +#include +#include + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +//template +class SaberTimer final { + +public: + SaberTimer() {} + + ~SaberTimer() {} + + void clear() { + ms_time.clear(); + } + + void start() { + tstart = std::chrono::system_clock::now(); + } + + void end() { + tend = std::chrono::system_clock::now(); + auto ts = std::chrono::duration_cast(tend - tstart); + float elapse_ms = 1000.f * float(ts.count()) * std::chrono::microseconds::period::num / \ + std::chrono::microseconds::period::den; + ms_time.push_back(elapse_ms); + } + + float get_average_ms() { + if (ms_time.size() == 0) { + return 0.f; + } + float sum = 0.f; + for (auto i : ms_time){ + sum += i; + } + return sum / ms_time.size(); + } + + // return tile (0-99) time. + float get_tile_time(float tile) { + + if (tile <0 || tile > 100) { + return -1.f; + } + int total_items = (int)ms_time.size(); + if (total_items <= 0) { + return -2.f; + } + ms_time.sort(); + int pos = (int)(tile * total_items / 100); + auto it = ms_time.begin(); + for (int i = 0; i < pos; ++i) { + ++it; + } + return *it; + } + + const std::list get_time_stat() { + return ms_time; + } + +private: + std::chrono::time_point tstart; + std::chrono::time_point tend; + std::list ms_time; +}; + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_LITE_FUNCS_TIMER_LITE_H diff --git a/saber/lite/funcs/utils_arm.cpp b/saber/lite/funcs/utils_arm.cpp new file mode 100644 index 000000000..9367da375 --- /dev/null +++ b/saber/lite/funcs/utils_arm.cpp @@ -0,0 +1,82 @@ +#include "saber/lite/funcs/utils_arm.h" +#include +#ifdef USE_ARM_PLACE + +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void update_weights(Tensor& new_weight, Tensor& new_bias, \ + const float* weights, const float* bias, int num, int ch, int kh, int kw, bool 
conv_bias_term, \ + float batchnorm_scale, float batchnorm_eps, \ + std::vector batchnorm_mean, std::vector batchnorm_variance, \ + std::vector scale_w, std::vector scale_b, bool scale_bias_term){ + + Shape weight_shape = {num, ch, kh, kw}; + int weight_size = num * ch * kh * kw; + new_weight.reshape(weight_shape); + memcpy(new_weight.mutable_data(), weights, sizeof(float) * weight_size); + + Shape bias_shape = {num}; + new_bias.reshape(bias_shape); + + if (conv_bias_term) { + memcpy(new_bias.mutable_data(), bias, sizeof(float) * num); + } else { + memset(new_bias.mutable_data(), 0, sizeof(float) * num); + } + + int filter_num = new_weight.num(); + int chw = new_weight.channel(); + + float* weight_data = new_weight.mutable_data(); + float* bias_data = new_bias.mutable_data(); + + chw *= new_weight.height(); + chw *= new_weight.width(); + + for (int i = 0; i < filter_num; ++i) { + float alpha = 1.f; + float beta = 0.f; + + //! process batchnorm + float scale_factor = 1.f; + scale_factor = (batchnorm_scale == 0) ? 1 : 1.f / batchnorm_scale; + float eps = batchnorm_eps; + float variance; + float mean; + alpha = batchnorm_variance[i] * scale_factor + eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * scale_factor); + beta *= alpha; + + //! process scale + alpha *= scale_w[i]; + + if (scale_bias_term) { + beta = beta * scale_w[i] + scale_b[i]; + } else { + beta *= scale_w[i]; + } + + for (int j = 0; j < chw; ++j) { + weight_data[i * chw + j] *= alpha; + } + + bias_data[i] *= alpha; + bias_data[i] += beta; + } +} + + +} //namespace lite + +} //namespace saber + +} //namespace anakin + + +#endif + diff --git a/saber/lite/funcs/utils_arm.h b/saber/lite/funcs/utils_arm.h new file mode 100644 index 000000000..e6d8b5dee --- /dev/null +++ b/saber/lite/funcs/utils_arm.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H +#define ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H + +#include "saber/lite/core/common_lite.h" +#include "saber/lite/core/tensor_lite.h" +namespace anakin{ + +namespace saber{ + +namespace lite{ + +void update_weights(Tensor& new_weight, Tensor& new_bias, \ + const float* weights, const float* bias, int num, int ch, int kh, int kw, bool conv_bias_term, \ + float batchnorm_scale, float batchnorm_eps, \ + std::vector batchnorm_mean, std::vector batchnorm_variance, \ + std::vector scale_w, std::vector scale_b, bool scale_bias_term); + +} //namespace lite + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H diff --git a/saber/saber.h b/saber/saber.h index 62bc21660..f50161ea6 100644 --- a/saber/saber.h +++ b/saber/saber.h @@ -1,3 +1,18 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + #ifndef ANAKIN_SABER_SABER_H #define ANAKIN_SABER_SABER_H diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index c6a88cbe4..3f5f8bc40 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -1,16 +1,13 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_PARAM_H @@ -18,307 +15,276 @@ #include "anakin_config.h" #include #include -#include #include "saber/core/shape.h" #include "saber/core/tensor.h" #include "saber/saber_types.h" -namespace anakin{ +namespace anakin { namespace saber { -template -struct MatMulParam { - MatMulParam():_is_transpose_X(false),_is_transpose_Y(false){} - MatMulParam(bool x, bool y):_is_transpose_X(x),_is_transpose_Y(y){} - MatMulParam &operator=(const MatMulParam &right) - { - _is_transpose_X = right._is_transpose_X; - _is_transpose_Y = right._is_transpose_Y; +template +struct PreluParam; +template +struct PowerParam; + +template +struct ActivationParam { + ActivationParam() + : active(Active_unknow) + , negative_slope(float(-1)) + , coef(float(-1)) + , prelu_param(PreluParam(false, nullptr)) + , has_active(false) + {} + + ActivationParam(ActiveType act, float n_slope = float(0), + float co = float(1), + PreluParam prelu = PreluParam(false, nullptr)) + : active(act) + , negative_slope(n_slope) + , coef(co) + , prelu_param(prelu) + , has_active(true) + 
{} + + ActivationParam(ActiveType act, float n_slope, + float co, + PreluParam prelu, + bool has) + : active(act) + , negative_slope(n_slope) + , coef(co) + , prelu_param(prelu) + , has_active(has) + {} + + ActivationParam(const ActivationParam& right) + : active(right.active) + , negative_slope(right.negative_slope) + , coef(right.coef) + , prelu_param(right.prelu_param) + , has_active(right.has_active) + {} + ActivationParam& operator=(const ActivationParam& right) { + active = right.active; + negative_slope = right.negative_slope; + coef = right.coef; + prelu_param = right.prelu_param; + has_active = right.has_active; + return *this; } - bool operator==(const MatMulParam &right) { + bool operator==(const ActivationParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (_is_transpose_X == right._is_transpose_X); - comp_eq = comp_eq && (_is_transpose_Y == right._is_transpose_Y); + comp_eq = comp_eq && (active == right.active); + comp_eq = comp_eq && (negative_slope == right.negative_slope); + comp_eq = comp_eq && (coef == right.coef); + comp_eq = comp_eq && (prelu_param == right.prelu_param); + comp_eq = comp_eq && (has_active == right.has_active); + return comp_eq; + } + bool has_negative_slope() { + return (active == Active_relu) && (negative_slope != float (0)); } + ActiveType active; + float negative_slope; + float coef; + bool has_active; + PreluParam prelu_param; +}; - bool _is_transpose_X{false}; - bool _is_transpose_Y{false}; - int _M = 0; - int _N = 0; - int _K = 0; - int _B = 0;//batch_size +template +struct ArgmaxParam { -}; + ArgmaxParam() = default; - -//should design this one for pick_best_specify() -enum ImplEnum{ - VENDER_IMPL = 0, - SABER_IMPL -}; + ArgmaxParam(bool out_max_val_in, int top_k_in, bool has_axis_in, int axis_in) { + out_max_val = out_max_val_in; + top_k = top_k_in; + has_axis = has_axis_in; + axis = axis_in; + } + + ArgmaxParam(bool out_max_val_in, int top_k_in, int axis_in) { + out_max_val = out_max_val_in; + has_axis = true; + 
top_k = top_k_in; + axis = axis_in; + } + ArgmaxParam(bool out_max_val_in, int top_k_in) { + out_max_val = out_max_val_in; + top_k = top_k_in; + has_axis = false; + axis = 3; + } + ArgmaxParam(const ArgmaxParam& right) { + out_max_val = right.out_max_val; + top_k = right.top_k; + has_axis = right.has_axis; + axis = right.axis; + } + ArgmaxParam& operator=(const ArgmaxParam& right) { + this->out_max_val = right.out_max_val; + this->top_k = right.top_k; + this->axis = right.axis; + this->has_axis = right.has_axis; + return *this; + } -enum SequencePoolType{ - Sequence_pool_unknow = 0, - Sequence_pool_average, - Sequence_pool_sum, - Sequence_pool_sqrt, - Sequence_pool_last, - Sequence_pool_first, - Sequence_pool_max + bool operator==(const ArgmaxParam& right) { + bool flag = this->out_max_val == right.out_max_val; + flag = flag && this->top_k == right.top_k; + flag = flag && this->has_axis == right.has_axis; + return flag && (this->axis == right.axis); + } + bool out_max_val{false}; + bool has_axis{true}; + int top_k{1}; + int axis{3}; }; -template -struct TransposeParam { - TransposeParam() = default; - TransposeParam(const TransposeParam& right){} - TransposeParam& operator=(const TransposeParam& right){} - bool operator==(const TransposeParam& right){ +template +struct AxpyParam { + AxpyParam() = default; + AxpyParam(const AxpyParam& right) { } + AxpyParam& operator=(const AxpyParam& right) { + return *this; + } + bool operator==(const AxpyParam& right) { return true; } }; -/** - * GRU_Formula,origin for paddle,Cudnn for cudnn,difference is w_h_r and weighted mean - * weight for origin is [W_h_o][W_h_r,W_h_z] - * weight for cudnn is [W_h_o,W_h_r,W_h_z] - */ -enum GruFormula { - GRU_ORIGIN = 0, - GRU_CUDNN -}; - -template -struct GruParam { - - - - GruParam() : - weight_tensor(nullptr) - ,bias_tensor(nullptr) - ,init_hidden_tensor(nullptr) - ,dropout_param(1.0f) - ,num_direction(1) - ,num_layers(1) - ,is_reverse(false) - ,gate_activity(Active_sigmoid) - 
,h_activity(Active_tanh) - ,formula(GRU_ORIGIN) +template +struct BatchnormParam { + BatchnormParam() + : scale(float(0)) + , use_global_stats(true) + , moving_average_fraction(float(0.999)) + , eps(float(1e-5)) + , mean(), variance() {} - /** - * - * @param weight i2h,i2h_r,i2h_z,h2h,h2h_r,h2h_z (different from paddlepaddle h2h_z,h2h_r,h2h and i2h* is the fc weights before gru) - * @param bias if bias is NULL bias will be zero - * @param dropout_param_in default 1.0f - * @param num_direction_in 1 or 2 ,output will be channged - * @param numLayers_in - * @param mode_in - */ - GruParam(opTensor* weight_in, opTensor* bias_in,GruFormula formula_in, - ActiveType gate_activity_in=Active_sigmoid, ActiveType h_activity_in=Active_tanh, - bool is_reverse_in=false,opTensor* hidden_init_in=nullptr, - float dropout_param_in=1.f - ,int num_direction_in=1,int numLayers_in=1) - : - weight_tensor(weight_in) - ,bias_tensor(bias_in) - ,dropout_param(dropout_param_in) - ,num_direction(num_direction_in) - ,num_layers(numLayers_in) - ,is_reverse(is_reverse_in) - ,gate_activity(gate_activity_in) - ,h_activity(h_activity_in) - ,formula(formula_in) - ,init_hidden_tensor(hidden_init_in) + //scale_factor = 1 / scale; + BatchnormParam(std::vector mean_in, std::vector variance_in, + float scale_in, float moving_average_fraction_in = float(0.999), + float eps_in = float(1e-5), bool use_global_stats_in = true) + : mean(mean_in), variance(variance_in), scale(scale_in) + , moving_average_fraction(moving_average_fraction_in) + , eps(eps_in), use_global_stats(use_global_stats_in) {} - - - GruParam &operator=(const GruParam &right) { - weight_tensor = right.weight_tensor; - dropout_param=right.dropout_param; - num_direction=right.num_direction; - num_layers=right.num_layers; - bias_tensor = right.bias_tensor; - gate_activity=right.gate_activity; - h_activity=right.h_activity; - is_reverse=right.is_reverse; - formula=right.formula; - init_hidden_tensor=right.init_hidden_tensor; + BatchnormParam& 
operator=(const BatchnormParam& right) { + scale = right.scale; + moving_average_fraction = right.moving_average_fraction; + eps = right.eps; + use_global_stats = right.use_global_stats; + mean = right.mean; + variance = right.variance; return *this; } - - bool operator==(const GruParam &right) { + bool operator==(const BatchnormParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (weight_tensor == right.weight_tensor); - comp_eq = comp_eq && (dropout_param == right.dropout_param); - comp_eq = comp_eq && (num_direction == right.num_direction); - comp_eq = comp_eq && (num_layers == right.num_layers); - comp_eq = comp_eq && (bias_tensor == right.bias_tensor); - comp_eq = comp_eq && (gate_activity=right.gate_activity); - comp_eq = comp_eq && (h_activity=right.h_activity); - comp_eq = comp_eq && (is_reverse=right.is_reverse); - comp_eq = comp_eq && (formula=right.formula); - comp_eq = comp_eq && (init_hidden_tensor==right.init_hidden_tensor); + comp_eq = comp_eq && (scale == right.scale); + comp_eq = comp_eq && (moving_average_fraction == right.moving_average_fraction); + comp_eq = comp_eq && (eps == right.eps); + comp_eq = comp_eq && (use_global_stats == right.use_global_stats); + comp_eq = comp_eq && (mean == right.mean); + comp_eq = comp_eq && (variance == right.variance); return comp_eq; } - - inline const opTensor* weight() { - return weight_tensor; - } - - inline const opTensor* bias() { - return bias_tensor; - } - - inline const opTensor* init_hidden() { - return init_hidden_tensor; - } - - int num_direction; - float dropout_param; - int num_layers; - ActiveType gate_activity; - ActiveType h_activity; - GruFormula formula; - bool is_reverse; -private: - opTensor* weight_tensor; - opTensor* bias_tensor; - opTensor* init_hidden_tensor; + float scale; + float moving_average_fraction; + float eps; + bool use_global_stats; + std::vector mean; + std::vector variance; }; -template -struct LSTMParam{ - - - - LSTMParam() : - weight_tensor(nullptr) - 
,bias_tensor(nullptr) - ,init_hidden_tensor(nullptr) - ,dropout_param(1.0f) - ,num_direction(1) - ,num_layers(1) - ,is_reverse(false) - ,gate_activity(Active_sigmoid) - ,cell_activity(Active_tanh) - ,candidate_activity(Active_tanh) - ,with_peephole(true) - +template +struct CastParam { + CastParam() = default; + CastParam(int in_type_in, int out_type_in) + : in_type(in_type_in) + , out_type(out_type_in) {} - - LSTMParam(opTensor* weight_in, opTensor* bias_in, - ActiveType gate_activity_in=Active_sigmoid, ActiveType cell_activity_in=Active_tanh, - ActiveType candidate_activity_in=Active_tanh,bool with_peephole_in=true, - bool is_reverse_in=false,opTensor* hidden_init_in=nullptr, - float dropout_param_in=1.f - ,int num_direction_in=1,int numLayers_in=1) - : - weight_tensor(weight_in) - ,bias_tensor(bias_in) - ,dropout_param(dropout_param_in) - ,num_direction(num_direction_in) - ,num_layers(numLayers_in) - ,is_reverse(is_reverse_in) - ,gate_activity(gate_activity_in) - ,candidate_activity(candidate_activity_in) - ,cell_activity(cell_activity_in) - ,init_hidden_tensor(hidden_init_in) - ,with_peephole(with_peephole_in) + CastParam(const CastParam& right) + : in_type(right.in_type) + , out_type(right.out_type) {} - - - LSTMParam &operator=(const LSTMParam &right) { - weight_tensor = right.weight_tensor; - dropout_param=right.dropout_param; - num_direction=right.num_direction; - num_layers=right.num_layers; - bias_tensor = right.bias_tensor; - gate_activity=right.gate_activity; - cell_activity=right.cell_activity; - candidate_activity=right.candidate_activity; - with_peephole=right.with_peephole; - is_reverse=right.is_reverse; - init_hidden_tensor=right.init_hidden_tensor; + CastParam& operator=(const CastParam& right) { + in_type = right.in_type; + out_type = right.out_type; return *this; } - - bool operator==(const LSTMParam &right) { + bool operator==(const CastParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (weight_tensor == right.weight_tensor); - comp_eq 
= comp_eq && (dropout_param == right.dropout_param); - comp_eq = comp_eq && (num_direction == right.num_direction); - comp_eq = comp_eq && (num_layers == right.num_layers); - comp_eq = comp_eq && (bias_tensor == right.bias_tensor); - comp_eq = comp_eq && (gate_activity==right.gate_activity); - comp_eq = comp_eq && (cell_activity==right.cell_activity); - comp_eq = comp_eq && (with_peephole==right.with_peephole); - comp_eq = comp_eq && (candidate_activity==right.candidate_activity); - comp_eq = comp_eq && (is_reverse=right.is_reverse); - comp_eq = comp_eq && (init_hidden_tensor==right.init_hidden_tensor); + comp_eq = comp_eq && (in_type == right.in_type); + comp_eq = comp_eq && (out_type == right.out_type); return comp_eq; } + int in_type; + int out_type; +}; - inline const opTensor* weight() { - return weight_tensor; +template +struct ConcatParam { + ConcatParam() = default; + explicit ConcatParam(int axis_in) { + CHECK_GE(axis_in, 0) << "concat parameter should >= 0, current is " << axis_in; + axis = axis_in; } - - inline const opTensor* bias() { - return bias_tensor; + ConcatParam(const ConcatParam& right) { + axis = right.axis; } - - inline const opTensor* init_hidden() { - return init_hidden_tensor; + ConcatParam& operator=(const ConcatParam& right) { + axis = right.axis; + return *this; } - - int num_direction; - float dropout_param; - int num_layers; - ActiveType gate_activity; - ActiveType cell_activity; - ActiveType candidate_activity; - bool is_reverse; - bool with_peephole; -private: - opTensor* weight_tensor; - opTensor* bias_tensor; - opTensor* init_hidden_tensor; + bool operator==(const ConcatParam& right) { + return axis == right.axis; + } + int axis; }; - -template +template struct ConvParam { - ConvParam() : group(-1), pad_h(-1), pad_w(-1), - stride_h(-1), stride_w(-1), - dilation_h(-1), dilation_w(-1), - weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0) {} + ConvParam() + : group(-1), pad_h(-1), pad_w(-1) + , stride_h(-1), stride_w(-1) + 
, dilation_h(-1), dilation_w(-1) + , weight_tensor(NULL), bias_tensor(NULL) + , alpha(1.0), beta(0.0) + , activation_param(ActivationParam()) {} ConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - opTensor* weight, opTensor* bias, float alpha_in = 1.0, float beta_in = 0.0) - : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) - , stride_h(stride_h_in), stride_w(stride_w_in) - , dilation_h(dilation_h_), dilation_w(dilation_w_) - , weight_tensor(weight), bias_tensor(bias) - , alpha(alpha_in), beta(beta_in) + Tensor* weight, Tensor* bias, + ActivationParam activation_param_in = ActivationParam(), + float alpha_in = 1.0, float beta_in = 0.0) + : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) + , stride_h(stride_h_in), stride_w(stride_w_in) + , dilation_h(dilation_h_), dilation_w(dilation_w_) + , weight_tensor(weight), bias_tensor(bias) + , activation_param(activation_param_in) + , alpha(alpha_in), beta(beta_in) {} - ConvParam(const ConvParam &right) - : group(right.group), pad_h(right.pad_h) - , pad_w(right.pad_w), stride_h(right.stride_h) - , stride_w(right.stride_w), dilation_h(right.dilation_h) - , dilation_w(right.dilation_w) - , weight_tensor(right.weight_tensor) - , bias_tensor(right.bias_tensor) - , alpha(right.alpha) - , beta(right.beta) + ConvParam(const ConvParam& right) + : group(right.group), pad_h(right.pad_h) + , pad_w(right.pad_w), stride_h(right.stride_h) + , stride_w(right.stride_w), dilation_h(right.dilation_h) + , dilation_w(right.dilation_w) + , weight_tensor(right.weight_tensor) + , bias_tensor(right.bias_tensor) + , alpha(right.alpha) + , beta(right.beta) + , activation_param(right.activation_param) {} - ConvParam &operator=(const ConvParam &right) { + ConvParam& operator=(const ConvParam& right) { group = right.group; pad_h = right.pad_h; pad_w = right.pad_w; @@ -330,10 +296,11 @@ struct ConvParam { bias_tensor = right.bias_tensor; alpha = right.alpha; beta = right.beta; + 
activation_param = right.activation_param; return *this; } - bool operator==(const ConvParam &right) { + bool operator==(const ConvParam& right) { bool comp_eq = true; comp_eq = comp_eq && (group == right.group); comp_eq = comp_eq && (pad_h == right.pad_h); @@ -346,22 +313,23 @@ struct ConvParam { comp_eq = comp_eq && (bias_tensor == right.bias_tensor); comp_eq = comp_eq && (alpha == right.alpha); comp_eq = comp_eq && (beta == right.beta); + comp_eq = comp_eq && (activation_param == right.activation_param); return comp_eq; } - inline const opTensor* weight() { + inline const Tensor* weight() { return weight_tensor; } - inline const opTensor* bias() { + inline const Tensor* bias() { return bias_tensor; } - inline opTensor* mutable_weight() { + inline Tensor* mutable_weight() { return weight_tensor; } - inline opTensor* mutable_bias() { + inline Tensor* mutable_bias() { return bias_tensor; } @@ -374,197 +342,245 @@ struct ConvParam { int dilation_w; float alpha; float beta; - + ActivationParam activation_param; private: - opTensor* weight_tensor; - opTensor* bias_tensor; + Tensor* weight_tensor; + Tensor* bias_tensor; }; -// specify for int8 -#ifdef USE_CUDA -template <> -struct ConvParam > { - ConvParam() : group(-1), pad_h(-1), pad_w(-1), - stride_h(-1), stride_w(-1), - dilation_h(-1), dilation_w(-1), - weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){} - ConvParam(int group_in, int pad_h_in, int pad_w_in, - int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - Tensor* weight, Tensor* bias, - float alpha_in = 1.0, float beta_in = 0.0) - : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) - , stride_h(stride_h_in), stride_w(stride_w_in) - , dilation_h(dilation_h_), dilation_w(dilation_w_) - , weight_tensor(weight), bias_tensor(bias) - , alpha(alpha_in), beta(beta_in) + +template +struct EltwiseParam; + +template +struct ConvEltwiseParam { + ConvEltwiseParam() + : conv_param() + , eltwise_param() {} - ConvParam(const ConvParam &right) - : 
group(right.group), pad_h(right.pad_h) - , pad_w(right.pad_w), stride_h(right.stride_h) - , stride_w(right.stride_w), dilation_h(right.dilation_h) - , dilation_w(right.dilation_w) - , weight_tensor(right.weight_tensor) - , bias_tensor(right.bias_tensor) - , alpha(right.alpha) - , beta(right.beta) {} - ConvParam &operator=(const ConvParam &right) { - group = right.group; - pad_h = right.pad_h; - pad_w = right.pad_w; - stride_h = right.stride_h; - stride_w = right.stride_w; - dilation_h = right.dilation_h; - dilation_w = right.dilation_w; - weight_tensor = right.weight_tensor; - bias_tensor = right.bias_tensor; - alpha = right.alpha; - beta = right.beta; + ConvEltwiseParam(ConvParam conv_param_in, + EltwiseParam eltwise_param_in) + : conv_param(conv_param_in) + , eltwise_param(eltwise_param_in) + {} + + ConvEltwiseParam(const ConvEltwiseParam& right) + : conv_param(right.conv_param) + , eltwise_param(right.eltwise_param) + {} + ConvEltwiseParam& operator=(const ConvEltwiseParam& right) { + conv_param = right.conv_param; + eltwise_param = right.eltwise_param; return *this; } - bool operator==(const ConvParam &right) { + bool operator==(const ConvEltwiseParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (group == right.group); - comp_eq = comp_eq && (pad_h == right.pad_h); - comp_eq = comp_eq && (pad_w == right.pad_w); - comp_eq = comp_eq && (stride_h == right.stride_h); - comp_eq = comp_eq && (stride_w == right.stride_w); - comp_eq = comp_eq && (dilation_h == right.dilation_h); - comp_eq = comp_eq && (dilation_w == right.dilation_w); - comp_eq = comp_eq && (weight_tensor == right.weight_tensor); - comp_eq = comp_eq && (bias_tensor == right.bias_tensor); - comp_eq = comp_eq && (alpha == right.alpha); - comp_eq = comp_eq && (beta == right.beta); + comp_eq &= (conv_param == right.conv_param); + comp_eq &= (eltwise_param == right.eltwise_param); return comp_eq; } - inline const Tensor* weight() { - return weight_tensor; - } - inline const Tensor* bias() { - 
return bias_tensor; - } - inline Tensor* mutable_weight() { - return weight_tensor; - } - inline Tensor* mutable_bias() { - return bias_tensor; - } - int group; - int pad_h; - int pad_w; - int stride_h; - int stride_w; - int dilation_h; - int dilation_w; - float alpha; - float beta; -private: - Tensor* weight_tensor; - Tensor* bias_tensor; + + ConvParam conv_param; + EltwiseParam eltwise_param; }; -// specify for int8 NCHW_VECT_C -template <> -struct ConvParam > { +template +struct PoolingParam; + +template +struct ConvPoolingParam { + ConvPoolingParam() + : conv_param() + , pooling_param() + {} - ConvParam() : group(-1), pad_h(-1), pad_w(-1), - stride_h(-1), stride_w(-1), - dilation_h(-1), dilation_w(-1), - weight_tensor(NULL), bias_tensor(NULL) {} + ConvPoolingParam(ConvParam conv_param_in, + PoolingParam pooling_param_in) + : conv_param(conv_param_in) + , pooling_param(pooling_param_in) + {} - ConvParam(int group_in, int pad_h_in, int pad_w_in, - int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - Tensor* weight, Tensor* bias) - : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) - , stride_h(stride_h_in), stride_w(stride_w_in) - , dilation_h(dilation_h_), dilation_w(dilation_w_) - , weight_tensor(weight), bias_tensor(bias) + ConvPoolingParam(const ConvPoolingParam& right) + : conv_param(right.conv_param) + , pooling_param(right.pooling_param) + {} + ConvPoolingParam& operator=(const ConvPoolingParam& right) { + conv_param = right.conv_param; + pooling_param = right.pooling_param; + return *this; + } + bool operator==(const ConvPoolingParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (conv_param == right.conv_param); + comp_eq = comp_eq && (pooling_param == right.pooling_param); + return comp_eq; + } + + ConvParam conv_param; + PoolingParam pooling_param; +}; + +template +struct ConvUnpaddingPaddingParam { + ConvUnpaddingPaddingParam():stride_h(1),stride_w(1) {} - ConvParam(const ConvParam &right) - : group(right.group), 
pad_h(right.pad_h) - , pad_w(right.pad_w), stride_h(right.stride_h) - , stride_w(right.stride_w), dilation_h(right.dilation_h) - , dilation_w(right.dilation_w) - , weight_tensor(right.weight_tensor) - , bias_tensor(right.bias_tensor) + ConvUnpaddingPaddingParam(int stride_h_in, + int stride_w_in) + : stride_h(stride_h_in) + , stride_w(stride_w_in) {} - ConvParam &operator=(const ConvParam &right) { - group = right.group; - pad_h = right.pad_h; - pad_w = right.pad_w; + ConvUnpaddingPaddingParam(const ConvUnpaddingPaddingParam& right) + : stride_h(right.stride_h) + , stride_w(right.stride_w) + {} + ConvUnpaddingPaddingParam& operator=(const ConvUnpaddingPaddingParam& right) { stride_h = right.stride_h; stride_w = right.stride_w; - dilation_h = right.dilation_h; - dilation_w = right.dilation_w; - weight_tensor = right.weight_tensor; - bias_tensor = right.bias_tensor; return *this; } - bool operator==(const ConvParam &right) { + bool operator==(const ConvUnpaddingPaddingParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (group == right.group); - comp_eq = comp_eq && (pad_h == right.pad_h); - comp_eq = comp_eq && (pad_w == right.pad_w); comp_eq = comp_eq && (stride_h == right.stride_h); comp_eq = comp_eq && (stride_w == right.stride_w); - comp_eq = comp_eq && (dilation_h == right.dilation_h); - comp_eq = comp_eq && (dilation_w == right.dilation_w); - comp_eq = comp_eq && (weight_tensor == right.weight_tensor); - comp_eq = comp_eq && (bias_tensor == right.bias_tensor); return comp_eq; } - inline const Tensor* weight() { - return weight_tensor; + + int stride_h; + int stride_w; +}; + + +template +struct CrfDecodingParam { + CrfDecodingParam() + : weight_tensor(NULL) + , tag_num(0) + {} + CrfDecodingParam(Tensor* weight_tensor_in, int tag_num_in = 0) + : weight_tensor(weight_tensor_in) { + if (tag_num_in == 0) { + tag_num = weight_tensor->channel(); + } else { + tag_num = tag_num_in; + } } - inline const Tensor* bias() { - return bias_tensor; + 
CrfDecodingParam(const CrfDecodingParam& right) + : weight_tensor(right.weight_tensor) + , tag_num(right.tag_num) + {} + CrfDecodingParam& operator=(const CrfDecodingParam& right) { + weight_tensor = right.weight_tensor; + tag_num = right.tag_num; + return *this; + } + bool operator==(const CrfDecodingParam& right) { + bool comp_eq = true; + comp_eq &= (weight_tensor == right.weight_tensor); + comp_eq &= (tag_num == right.tag_num); + return comp_eq; } - inline Tensor* mutable_weight() { + inline const Tensor* transition_weight() { return weight_tensor; } - inline Tensor* mutable_bias() { - return bias_tensor; + inline Tensor* mutable_transition_weight() { + return weight_tensor; } - int group; - int pad_h; - int pad_w; - int stride_h; - int stride_w; - int dilation_h; - int dilation_w; + int tag_num; private: - Tensor* weight_tensor; - Tensor* bias_tensor; + Tensor* weight_tensor; }; -#endif //USE_CUDA - -#ifdef USE_BM -template <> -struct ConvParam > { - ConvParam() : group(-1), pad_h(-1), pad_w(-1), - stride_h(-1), stride_w(-1), - dilation_h(-1), dilation_w(-1), - weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0){} - ConvParam(int group_in, int pad_h_in, int pad_w_in, - int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, - Tensor* weight, Tensor* bias, - float alpha_in = 1.0, float beta_in = 0.0) - : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) - , stride_h(stride_h_in), stride_w(stride_w_in) - , dilation_h(dilation_h_), dilation_w(dilation_w_) - , weight_tensor(weight), bias_tensor(bias) - , alpha(alpha_in), beta(beta_in) + +template +struct CropParam { + CropParam() = default; + CropParam(int axis_in, std::vector offset_in, std::vector shape_in) + : axis(axis_in) + , offset(offset_in) + , shape(shape_in) + {} + CropParam(const CropParam& right) + : axis(right.axis) + , offset(right.offset) + , shape(right.shape) + {} + CropParam& operator=(const CropParam& right) { + axis = right.axis; + offset = right.offset; + shape = 
right.shape; + return *this; + } + bool operator==(const CropParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (axis == right.axis); + comp_eq = comp_eq && (offset == right.offset); + comp_eq = comp_eq && (shape == right.shape); + return comp_eq; + } + int axis = 1; + std::vector offset; + std::vector shape; +}; + + +template +struct CtcAlignParam { + CtcAlignParam() = default; + CtcAlignParam(int blank_in, bool merge_repeated_in) + : blank(blank_in) + , merge_repeated(merge_repeated_in) + {} + CtcAlignParam(const CtcAlignParam& right) + : blank(right.blank) + , merge_repeated(right.merge_repeated) {} - ConvParam(const ConvParam &right) - : group(right.group), pad_h(right.pad_h) - , pad_w(right.pad_w), stride_h(right.stride_h) - , stride_w(right.stride_w), dilation_h(right.dilation_h) - , dilation_w(right.dilation_w) - , weight_tensor(right.weight_tensor) - , bias_tensor(right.bias_tensor) - , alpha(right.alpha) - , beta(right.beta) {} - ConvParam &operator=(const ConvParam &right) { + CtcAlignParam& operator=(const CtcAlignParam& right) { + blank = right.blank; + merge_repeated = right.merge_repeated; + return *this; + } + bool operator==(const CtcAlignParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (blank == right.blank); + comp_eq = comp_eq && (merge_repeated == right.merge_repeated); + return comp_eq; + } + int blank; + bool merge_repeated; +}; + +template +struct DeformableConvParam { + + DeformableConvParam() : group(-1), pad_h(-1), pad_w(-1), + stride_h(-1), stride_w(-1), + dilation_h(-1), dilation_w(-1), axis(-1), + weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0) {} + + DeformableConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, + int stride_w_in, int dilation_h_, int dilation_w_, Tensor* weight, + Tensor* bias, int axis_in = 1, float alpha_in = 1.0, float beta_in = 0.0) + : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) + , stride_h(stride_h_in), stride_w(stride_w_in) + , 
dilation_h(dilation_h_), dilation_w(dilation_w_) + , axis(axis_in) + , weight_tensor(weight), bias_tensor(bias) + , alpha(alpha_in), beta(beta_in) + {} + + DeformableConvParam(const DeformableConvParam& right) + : group(right.group), pad_h(right.pad_h) + , pad_w(right.pad_w), stride_h(right.stride_h) + , stride_w(right.stride_w), dilation_h(right.dilation_h) + , dilation_w(right.dilation_w) + , axis(right.axis) + , weight_tensor(right.weight_tensor) + , bias_tensor(right.bias_tensor) + , alpha(right.alpha) + , beta(right.beta) + {} + + DeformableConvParam& operator=(const DeformableConvParam& right) { group = right.group; pad_h = right.pad_h; pad_w = right.pad_w; @@ -572,13 +588,15 @@ struct ConvParam > { stride_w = right.stride_w; dilation_h = right.dilation_h; dilation_w = right.dilation_w; + axis = right.axis; weight_tensor = right.weight_tensor; bias_tensor = right.bias_tensor; alpha = right.alpha; beta = right.beta; return *this; } - bool operator==(const ConvParam &right) { + + bool operator==(const DeformableConvParam& right) { bool comp_eq = true; comp_eq = comp_eq && (group == right.group); comp_eq = comp_eq && (pad_h == right.pad_h); @@ -587,24 +605,30 @@ struct ConvParam > { comp_eq = comp_eq && (stride_w == right.stride_w); comp_eq = comp_eq && (dilation_h == right.dilation_h); comp_eq = comp_eq && (dilation_w == right.dilation_w); + comp_eq = comp_eq && (axis == right.axis); comp_eq = comp_eq && (weight_tensor == right.weight_tensor); comp_eq = comp_eq && (bias_tensor == right.bias_tensor); comp_eq = comp_eq && (alpha == right.alpha); comp_eq = comp_eq && (beta == right.beta); return comp_eq; } - inline const Tensor* weight() { + + inline const Tensor* weight() { return weight_tensor; } - inline const Tensor* bias() { + + inline const Tensor* bias() { return bias_tensor; } - inline Tensor* mutable_weight() { + + inline Tensor* mutable_weight() { return weight_tensor; } - inline Tensor* mutable_bias() { + + inline Tensor* mutable_bias() { return 
bias_tensor; } + int group; int pad_h; int pad_w; @@ -612,924 +636,721 @@ struct ConvParam > { int stride_w; int dilation_h; int dilation_w; + int axis; float alpha; float beta; + private: - Tensor* weight_tensor; - Tensor* bias_tensor; + Tensor* weight_tensor; + Tensor* bias_tensor; }; -#endif //USE_BM -template -struct PermuteParam { - PermuteParam() {} - PermuteParam(std::vector order):order(order) {} - PermuteParam(const PermuteParam &right): order(right.order) {} - PermuteParam &operator=(const PermuteParam &right) { - order = right.order; - return *this; - } - bool operator==(const PermuteParam &right) { - bool comp_eq = true; - comp_eq = order.size() == right.order.size(); - for (int i = 0; i < order.size(); ++i) { - comp_eq = comp_eq && (order[i] == right.order[i]); - } - return comp_eq; - } - std::vector order; -}; -template -struct PowerParam { - PowerParam() {} - PowerParam(float power, float scale, float shift) - : power(power), scale(scale), shift(shift) {} - PowerParam(const PowerParam &right): - power(right.power), scale(right.scale), shift(right.shift) {} - bool operator==(const PowerParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (power == right.power); - comp_eq = comp_eq && (scale == right.scale); - comp_eq = comp_eq && (shift == right.shift); - return comp_eq; - } - float power; - float scale; - float shift; -}; -template -struct PermutePowerParam { - PermutePowerParam() {} - PermutePowerParam(PermuteParam permute_param): - power_param(power_param), has_power_param(false) {} - PermutePowerParam(PermuteParam permute_param, PowerParam power_param): - power_param(power_param), permute_param(permute_param), has_power_param(true) {} - PermutePowerParam(const PermutePowerParam & right): - power_param(right.power_param), permute_param(right.permute_param) {} - bool operator==(const PermutePowerParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (power_param == right.power_param); - comp_eq = comp_eq && (permute_param == 
right.permute_param); - return comp_eq; +template +struct DetectionOutputParam { + + DetectionOutputParam() = default; + + DetectionOutputParam(int classes, int bg_id, int keep_topk, int nms_topk, float nms_threshold, \ + float confidence_threshold, bool share_loc = true, bool variance_in_target = false, \ + int codetype = 1, float eta = 1.f) { + class_num = classes; + background_id = bg_id; + keep_top_k = keep_topk; + nms_top_k = nms_topk; + nms_thresh = nms_threshold; + conf_thresh = confidence_threshold; + share_location = share_loc; + variance_encode_in_target = variance_in_target; + type = (CodeType) codetype; + nms_eta = eta; } - PowerParam power_param; - PermuteParam permute_param; - bool has_power_param; -}; -template -struct ConcatParam { - ConcatParam() = default; - explicit ConcatParam(int axis_in){ - CHECK_GE(axis_in, 0) << "concat parameter should >= 0, current is " << axis_in; - axis = axis_in; + + void init(int classes, int bg_id, int keep_topk, int nms_topk, float nms_threshold, \ + float confidence_threshold, bool share_loc = true, bool variance_in_target = false, \ + int codetype = 1, float eta = 1.f) { + class_num = classes; + background_id = bg_id; + keep_top_k = keep_topk; + nms_top_k = nms_topk; + nms_thresh = nms_threshold; + conf_thresh = confidence_threshold; + share_location = share_loc; + variance_encode_in_target = variance_in_target; + type = (CodeType) codetype; + nms_eta = eta; } - ConcatParam(const ConcatParam &right) { - axis = right.axis; + + DetectionOutputParam(const DetectionOutputParam& right) { + class_num = right.class_num; + background_id = right.background_id; + keep_top_k = right.keep_top_k; + nms_top_k = right.nms_top_k; + nms_thresh = right.nms_thresh; + conf_thresh = right.conf_thresh; + share_location = right.share_location; + variance_encode_in_target = right.variance_encode_in_target; + type = right.type; + nms_eta = right.nms_eta; } - ConcatParam &operator=(const ConcatParam &right) { - axis = right.axis; + + 
DetectionOutputParam& operator=(const DetectionOutputParam& right) { + this->class_num = right.class_num; + this->background_id = right.background_id; + this->keep_top_k = right.keep_top_k; + this->nms_top_k = right.nms_top_k; + this->nms_thresh = right.nms_thresh; + this->conf_thresh = right.conf_thresh; + this->share_location = right.share_location; + this->variance_encode_in_target = right.variance_encode_in_target; + this->type = right.type; + this->nms_eta = right.nms_eta; return *this; } - bool operator==(const ConcatParam &right) { - return axis == right.axis; + + bool operator==(const DetectionOutputParam& right) { + bool flag = class_num == right.class_num; + flag = flag && (background_id == right.background_id); + flag = flag && (keep_top_k == right.keep_top_k); + flag = flag && (nms_top_k == right.nms_top_k); + flag = flag && (nms_thresh == right.nms_thresh); + flag = flag && (conf_thresh == right.conf_thresh); + flag = flag && (share_location == right.share_location); + flag = flag && (variance_encode_in_target == right.variance_encode_in_target); + flag = flag && (type == right.type); + flag = flag && (nms_eta == right.nms_eta); + return flag; } - int axis; + + bool share_location{true}; + bool variance_encode_in_target{false}; + int class_num; + int background_id{0}; + int keep_top_k{-1}; + CodeType type{CORNER}; + float conf_thresh; + int nms_top_k; + float nms_thresh{0.3f}; + float nms_eta{1.f}; + }; -template -struct ReshapeParam { - ReshapeParam() = default; - explicit ReshapeParam(std::vector shape_param_in){ - int count = 0; - for (int i = 0; i < shape_param_in.size(); ++i) { - if (shape_param_in[i] == -1){ - count ++; - } + +template +struct EltwiseParam { + EltwiseParam() + : operation(Eltwise_unknow) + , coeff() + , activation_param(ActivationParam()) + , has_eltwise(false) {} + EltwiseParam(EltwiseType operation_in + , std::vector coeff_in = std::vector({1, 1}) + , ActivationParam activation_param_in = ActivationParam()) + : 
operation(operation_in) + , coeff(coeff_in) + , activation_param(activation_param_in) + , has_eltwise(true) { + if ((operation == Eltwise_sum) && (coeff.size() == 0)) { + coeff.push_back(1); + coeff.push_back(1); } - CHECK_LE(count, 1) << "shape parameter contains multiple -1 dims"; - shape_params = shape_param_in; - } - ReshapeParam(const ReshapeParam &right) { - shape_params = right.shape_params; } - ReshapeParam &operator=(const ReshapeParam &right) { - shape_params = right.shape_params; + EltwiseParam(const EltwiseParam& right) + : operation(right.operation) + , coeff(right.coeff) + , activation_param(right.activation_param) + , has_eltwise(right.has_eltwise) + {} + EltwiseParam& operator=(const EltwiseParam& right) { + operation = right.operation; + coeff.resize(right.coeff.size()); + + for (int i = 0; i < coeff.size(); ++i) { + coeff[i] = right.coeff[i]; + } + + activation_param = right.activation_param; + has_eltwise = right.has_eltwise; return *this; } - bool operator==(const ReshapeParam &right) { - bool comp_eq = shape_params.size() == right.shape_params.size(); - for (int i = 0; i < shape_params.size(); ++i) { - if (!comp_eq){ - return false; - } - comp_eq = shape_params[i] == right.shape_params[i]; + bool operator==(const EltwiseParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (operation == right.operation); + comp_eq = comp_eq && (coeff.size() == right.coeff.size()); + comp_eq = comp_eq && (activation_param == right.activation_param); + comp_eq = comp_eq && (has_eltwise == right.has_eltwise); + + if (!comp_eq) { + return comp_eq; } - return true; + + for (int i = 0; i < coeff.size(); ++i) { + comp_eq = comp_eq && (coeff[i] == right.coeff[i]); + } + + return comp_eq; } - std::vector shape_params; + ActivationParam activation_param; + EltwiseType operation; + bool has_eltwise{false}; + std::vector coeff; }; -template -struct SliceParam { - SliceParam() = default; - explicit SliceParam(int axis_in, std::vector slice_points_in){ - 
CHECK_GE(axis_in, 0) << "slice axis should >=0, current is " << axis_in; - axis = axis_in; - slice_points = slice_points_in; + + +template +struct EmbeddingParam { + EmbeddingParam() = default; + EmbeddingParam(int word_num_in, int emb_dim_in, int padding_idx_in, + Tensor* weight_tensor_in) + : word_num(word_num_in) + , emb_dim(emb_dim_in) + , padding_idx(padding_idx_in) + , weight_tensor(weight_tensor_in) + {} + EmbeddingParam(const EmbeddingParam& right) + : word_num(right.word_num) + , emb_dim(right.emb_dim) + , padding_idx(right.padding_idx) + , weight_tensor(right.weight_tensor) + {} + EmbeddingParam& operator=(const EmbeddingParam& right) { + word_num = right.word_num; + emb_dim = right.emb_dim; + padding_idx = right.padding_idx; + weight_tensor = right.weight_tensor; + return *this; } - SliceParam(const SliceParam &right) { - axis = right.axis; - slice_points = right.slice_points; + bool operator==(const EmbeddingParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (word_num == right.word_num); + comp_eq = comp_eq && (emb_dim == right.emb_dim); + comp_eq = comp_eq && (padding_idx == right.padding_idx); + comp_eq = comp_eq && (weight_tensor == right.weight_tensor); + return comp_eq; } - SliceParam &operator=(const SliceParam &right) { - axis = right.axis; - slice_points = right.slice_points; + inline const Tensor* weight() { + return weight_tensor; + } + inline Tensor* mutable_weight() { + return weight_tensor; + } + int emb_dim; + int word_num; + int padding_idx; +private: + Tensor* weight_tensor; +}; + +template +struct EmptyParam{ + EmptyParam() = default; + EmptyParam(const EmptyParam& right) {} + EmptyParam& operator=(const EmptyParam& right) { return *this; } - bool operator==(const SliceParam &right) { - bool comp_eq = slice_points.size() == right.slice_points.size(); - for (int i = 0; i < slice_points.size(); ++i) { - if (!comp_eq){ - return false; - } - comp_eq = slice_points[i] == right.slice_points[i]; - } - return axis == right.axis; + 
bool operator==(const EmptyParam& right) { + return true; } - int axis; - std::vector slice_points; }; -template -struct SoftmaxParam { - SoftmaxParam() = default; - explicit SoftmaxParam(int axis_in){ - CHECK_GE(axis_in, 0) << "input axis index should >= 0, current is " << axis_in; - axis = axis_in; + +template +struct FcParam { + FcParam() = default; + FcParam(Tensor* input_weight, int output_num, int in_axis = 1, + bool trans = false) { + num_output = output_num; + weights = input_weight; + bias = nullptr; + axis = in_axis; + is_transpose_weights = trans; } - SoftmaxParam(const SoftmaxParam& right){ + FcParam(Tensor* input_weight, Tensor* input_bias, int output_num, + int in_axis = 1, bool trans = false) { + num_output = output_num; + weights = input_weight; + bias = input_bias; + axis = in_axis; + is_transpose_weights = trans; + } + FcParam(const FcParam& right) { + weights = right.weights; + bias = right.bias; + num_output = right.num_output; axis = right.axis; + is_transpose_weights = right.is_transpose_weights; } - SoftmaxParam& operator=(const SoftmaxParam& right){ + FcParam& operator=(const FcParam& right) { + this->weights = right.weights; + this->bias = right.bias; + this->num_output = right.num_output; this->axis = right.axis; + this->is_transpose_weights = right.is_transpose_weights; return *this; } - bool operator==(const SoftmaxParam& right){ - return axis == right.axis; + bool operator==(const FcParam& right) { + bool flag = this->is_transpose_weights == right.is_transpose_weights; + flag = flag && (this->num_output == right.num_output) && (this->axis == right.axis); + return flag && (this->weights == right.weights) && (this->bias == right.bias); } - int axis; + bool is_transpose_weights{false}; + int num_output; + int axis{1}; + Tensor* weights{nullptr}; + Tensor* bias{nullptr}; }; -template -struct BatchnormParam { - typedef typename opTensor::Dtype DataDtype; - BatchnormParam() - : scale(DataDtype(0)) - , use_global_stats(true) - , 
moving_average_fraction(DataDtype(0.999)) - , eps(DataDtype(1e-5)) - , mean(), variance() - {} - //scale_factor = 1 / scale; - BatchnormParam(std::vector mean_in, std::vector variance_in, - DataDtype scale_in, DataDtype moving_average_fraction_in = DataDtype(0.999), - DataDtype eps_in = DataDtype(1e-5), bool use_global_stats_in = true) - : mean(mean_in), variance(variance_in), scale(scale_in) - , moving_average_fraction(moving_average_fraction_in) - , eps(eps_in), use_global_stats(use_global_stats_in) - {} - BatchnormParam &operator=(const BatchnormParam &right) { - scale = right.scale; - moving_average_fraction = right.moving_average_fraction; - eps = right.eps; - use_global_stats = right.use_global_stats; - mean = right.mean; - variance = right.variance; - return *this; - } - bool operator==(const BatchnormParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (scale == right.scale); - comp_eq = comp_eq && (moving_average_fraction == right.moving_average_fraction); - comp_eq = comp_eq && (eps == right.eps); - comp_eq = comp_eq && (use_global_stats == right.use_global_stats); - comp_eq = comp_eq && (mean == right.mean); - comp_eq = comp_eq && (variance == right.variance); - return comp_eq; - } - DataDtype scale; - DataDtype moving_average_fraction; - DataDtype eps; - bool use_global_stats; - std::vector mean; - std::vector variance; -}; -#ifdef USE_BM -template <> -struct BatchnormParam> { - BatchnormParam() - : scale(float(0)) - , use_global_stats(true) - , moving_average_fraction(float(0.999)) - , eps(float(1e-5)) - , mean(), variance() - {} - //scale_factor = 1 / scale; - BatchnormParam(std::vector mean_in, std::vector variance_in, - float scale_in, float moving_average_fraction_in = float(0.999), - float eps_in = float(1e-5), bool use_global_stats_in = true) - : mean(mean_in), variance(variance_in), scale(scale_in) - , moving_average_fraction(moving_average_fraction_in) - , eps(eps_in), use_global_stats(use_global_stats_in) - {} - BatchnormParam 
&operator=(const BatchnormParam &right) { - scale = right.scale; - moving_average_fraction = right.moving_average_fraction; - eps = right.eps; - use_global_stats = right.use_global_stats; - mean = right.mean; - variance = right.variance; +template +struct FlattenParam { + FlattenParam() = default; + FlattenParam(const FlattenParam& right) {} + FlattenParam& operator=(const FlattenParam& right) { return *this; } - bool operator==(const BatchnormParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (scale == right.scale); - comp_eq = comp_eq && (moving_average_fraction == right.moving_average_fraction); - comp_eq = comp_eq && (eps == right.eps); - comp_eq = comp_eq && (use_global_stats == right.use_global_stats); - comp_eq = comp_eq && (mean == right.mean); - comp_eq = comp_eq && (variance == right.variance); - return comp_eq; + bool operator==(const FlattenParam& right) { + return true; } - float scale; - float moving_average_fraction; - float eps; - bool use_global_stats; - std::vector mean; - std::vector variance; }; -#endif -template -struct ActivationParam { - typedef typename opTensor::Dtype DataDtype; - ActivationParam() - : active(Active_unknow) - , negative_slope(DataDtype(-1)) - , coef(DataDtype(-1)) {} - ActivationParam(ActiveType act, DataDtype n_slope = DataDtype(0), - DataDtype co = DataDtype(1)) - : active(act) - , negative_slope(n_slope) - , coef(co) - {} - ActivationParam(const ActivationParam &right) - : active(right.active) - , negative_slope(right.negative_slope) - , coef(right.coef) +/** + * GRU_Formula,origin for paddle,Cudnn for cudnn,difference is w_h_r and weighted mean + * weight for origin is [W_h_o][W_h_r,W_h_z] + * weight for cudnn is [W_h_o,W_h_r,W_h_z] + */ + +template +struct GruParam { + + typedef Tensor opTensor; + + GruParam() : + weight_tensor(nullptr) + , bias_tensor(nullptr) + , init_hidden_tensor(nullptr) + , dropout_param(1.0f) + , num_direction(1) + , num_layers(1) + , is_reverse(false) + , 
gate_activity(Active_sigmoid) + , h_activity(Active_tanh) + , formula(GRU_ORIGIN) {} - ActivationParam &operator=(const ActivationParam &right) { - active = right.active; - negative_slope = right.negative_slope; - coef = right.coef; + /** + * + * @param weight i2h,i2h_r,i2h_z,h2h,h2h_r,h2h_z (different from paddlepaddle h2h_z,h2h_r,h2h and i2h* is the fc weights before gru) + * @param bias if bias is NULL bias will be zero + * @param dropout_param_in default 1.0f + * @param num_direction_in 1 or 2 ,output will be channged + * @param numLayers_in + * @param mode_in + */ + GruParam(opTensor* weight_in, opTensor* bias_in, GruFormula formula_in, + ActiveType gate_activity_in = Active_sigmoid, ActiveType h_activity_in = Active_tanh, + bool is_reverse_in = false, opTensor* hidden_init_in = nullptr, + float dropout_param_in = 1.f + , int num_direction_in = 1, int numLayers_in = 1) + : + weight_tensor(weight_in) + , bias_tensor(bias_in) + , dropout_param(dropout_param_in) + , num_direction(num_direction_in) + , num_layers(numLayers_in) + , is_reverse(is_reverse_in) + , gate_activity(gate_activity_in) + , h_activity(h_activity_in) + , formula(formula_in) + , init_hidden_tensor(hidden_init_in) + {} + + + GruParam& operator=(const GruParam& right) { + weight_tensor = right.weight_tensor; + dropout_param = right.dropout_param; + num_direction = right.num_direction; + num_layers = right.num_layers; + bias_tensor = right.bias_tensor; + gate_activity = right.gate_activity; + h_activity = right.h_activity; + is_reverse = right.is_reverse; + formula = right.formula; + init_hidden_tensor = right.init_hidden_tensor; return *this; } - bool operator==(const ActivationParam &right) { + + bool operator==(const GruParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (active == right.active); - comp_eq = comp_eq && (negative_slope == right.negative_slope); - comp_eq = comp_eq && (coef == right.coef); + comp_eq = comp_eq && (weight_tensor == right.weight_tensor); + comp_eq = comp_eq 
&& (dropout_param == right.dropout_param); + comp_eq = comp_eq && (num_direction == right.num_direction); + comp_eq = comp_eq && (num_layers == right.num_layers); + comp_eq = comp_eq && (bias_tensor == right.bias_tensor); + comp_eq = comp_eq && (gate_activity = right.gate_activity); + comp_eq = comp_eq && (h_activity = right.h_activity); + comp_eq = comp_eq && (is_reverse = right.is_reverse); + comp_eq = comp_eq && (formula = right.formula); + comp_eq = comp_eq && (init_hidden_tensor == right.init_hidden_tensor); return comp_eq; } - bool has_negative_slope(){ - return (active == Active_relu) && (negative_slope != DataDtype (0)); - } - ActiveType active; - DataDtype negative_slope; - DataDtype coef; -}; -#ifdef USE_BM -template <> -struct ActivationParam > { - ActivationParam(): active(Active_unknow) {} - ActivationParam(ActiveType act): active(act) {} - ActivationParam(const ActivationParam &right): active(right.active) {} - ActivationParam &operator=(const ActivationParam &right) { - active = right.active; - return *this; - } - bool operator==(const ActivationParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (active == right.active); - return comp_eq; - } - bool has_negative_slope(){ - return (active == Active_relu); + inline const opTensor* weight() { + return weight_tensor; } - ActiveType active; -}; -#endif -template -struct ScaleParam { - typedef typename opTensor::Dtype DataDtype; - ScaleParam() - : axis(1), num_axes(1) - , bias_term(false) - {} - ScaleParam(std::vector scale_w_in, std::vector scale_b_in, - bool bias_term_in = true, int axis_in = 1, int num_axes_in = 1) - : scale_w(scale_w_in), scale_b(scale_b_in) - , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) - {} - ScaleParam(std::vector scale_w_in, - bool bias_term_in = false, int axis_in = 1, int num_axes_in = 1) - : scale_w(scale_w_in) - , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) - {} - ScaleParam(const ScaleParam &right) - : scale_w(right.scale_w), 
scale_b(right.scale_b) - , bias_term(right.bias_term), axis(right.axis), num_axes(right.num_axes) - {} - ScaleParam &operator=(const ScaleParam &right) { - scale_w = right.scale_w; - scale_b = right.scale_b; - bias_term = right.bias_term; - axis = right.axis; - num_axes = right.num_axes; - return *this; - } - bool operator==(const ScaleParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (scale_w == right.scale_w); - comp_eq = comp_eq && (scale_b == right.scale_b); - comp_eq = comp_eq && (bias_term == right.bias_term); - comp_eq = comp_eq && (axis == right.axis); - comp_eq = comp_eq && (num_axes == right.num_axes); - return comp_eq; - } - int axis; // default is 1 - int num_axes; // default is 1 - bool bias_term; // default false - std::vector scale_w; - std::vector scale_b; -}; -#ifdef USE_BM -template <> -struct ScaleParam> { - ScaleParam(): axis(1), num_axes(1), bias_term(false) {} - ScaleParam(std::vector scale_w_in, std::vector scale_b_in, - bool bias_term_in = true, int axis_in = 1, int num_axes_in = 1) - : scale_w(scale_w_in), scale_b(scale_b_in) - , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) - {} - ScaleParam(std::vector scale_w_in, - bool bias_term_in = false, int axis_in = 1, int num_axes_in = 1) - : scale_w(scale_w_in) - , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) - {} - ScaleParam(const ScaleParam &right) - : scale_w(right.scale_w), scale_b(right.scale_b) - , bias_term(right.bias_term), axis(right.axis), num_axes(right.num_axes) - {} - ScaleParam &operator=(const ScaleParam &right) { - scale_w = right.scale_w; - scale_b = right.scale_b; - bias_term = right.bias_term; - axis = right.axis; - num_axes = right.num_axes; - return *this; + inline const opTensor* bias() { + return bias_tensor; } - bool operator==(const ScaleParam &right) { - bool comp_eq = true; - /* comp_eq = comp_eq && (scale_w == right.scale_w); */ - /* comp_eq = comp_eq && (scale_b == right.scale_b); */ - comp_eq = comp_eq && (bias_term == 
right.bias_term); - comp_eq = comp_eq && (axis == right.axis); - comp_eq = comp_eq && (num_axes == right.num_axes); - return comp_eq; + + inline const opTensor* init_hidden() { + return init_hidden_tensor; } - int axis; // default is 1 - int num_axes; // default is 1 - bool bias_term; // default false - std::vector scale_w; - std::vector scale_b; + + int num_direction; + float dropout_param; + int num_layers; + ActiveType gate_activity; + ActiveType h_activity; + GruFormula formula; + bool is_reverse; +private: + opTensor* weight_tensor; + opTensor* bias_tensor; + opTensor* init_hidden_tensor; }; -#endif -template -struct PoolingParam { - PoolingParam() : window_h(-1), window_w(-1) - , pad_h(-1), pad_w(-1) - , stride_h(-1), stride_w(-1) - , pooling_type(Pooling_unknow) - , global_pooling(false) - , cmp_out_shape_floor_as_conv(false) - {} - PoolingParam(int window_h_in, int window_w_in, int pad_h_in - , int pad_w_in, int stride_h_in, int stride_w_in, PoolingType type - , bool global_pooling_in = false, bool cmp_out_shape_floor_as_conv_in = false) - : window_h(window_h_in), window_w(window_w_in) - , pad_h(pad_h_in), pad_w(pad_w_in) - , stride_h(stride_h_in), stride_w(stride_w_in) - , pooling_type(type) - , global_pooling(global_pooling_in) - , cmp_out_shape_floor_as_conv(cmp_out_shape_floor_as_conv_in) - {} - PoolingParam(const PoolingParam &right) - : window_h(right.window_h) - , window_w(right.window_w) - , pad_h(right.pad_h) - , pad_w(right.pad_w) - , stride_h(right.stride_h) - , stride_w(right.stride_w) - , pooling_type(right.pooling_type) - , global_pooling(right.global_pooling) - , cmp_out_shape_floor_as_conv(right.cmp_out_shape_floor_as_conv) - {} - PoolingParam &operator=(const PoolingParam &right) { + + +template +struct Im2SequenceParam { + Im2SequenceParam() = default; + Im2SequenceParam(int window_h_in, + int window_w_in, + int pad_up_in, + int pad_down_in, + int pad_left_in, + int pad_right_in, + int stride_h_in, + int stride_w_in, + int dilation_h_in, + 
int dilation_w_in) + : window_h(window_h_in) + , window_w(window_w_in) + , pad_up(pad_up_in) + , pad_down(pad_down_in) + , pad_left(pad_left_in) + , pad_right(pad_right_in) + , stride_h(stride_h_in) + , stride_w(stride_w_in) + , dilation_h(dilation_h_in) + , dilation_w(dilation_w_in) + {} + Im2SequenceParam(const Im2SequenceParam& right) + : window_h(right.window_h) + , window_w(right.window_w) + , pad_up(right.pad_up) + , pad_down(right.pad_down) + , pad_left(right.pad_left) + , pad_right(right.pad_right) + , stride_h(right.stride_h) + , stride_w(right.stride_w) + , dilation_h(right.dilation_h) + , dilation_w(right.dilation_w) + {} + Im2SequenceParam& operator=(const Im2SequenceParam& right) { window_h = right.window_h; window_w = right.window_w; - pad_h = right.pad_h; - pad_w = right.pad_w; + pad_up = right.pad_up; + pad_down = right.pad_down; + pad_left = right.pad_left; + pad_right = right.pad_right; stride_h = right.stride_h; stride_w = right.stride_w; - pooling_type = right.pooling_type; - global_pooling = right.global_pooling; - cmp_out_shape_floor_as_conv = right.cmp_out_shape_floor_as_conv; + dilation_h = right.dilation_h; + dilation_w = right.dilation_w; return *this; } - bool operator==(const PoolingParam &right) { + bool operator==(const Im2SequenceParam& right) { bool comp_eq = true; comp_eq = comp_eq && (window_h == right.window_h); comp_eq = comp_eq && (window_w == right.window_w); - comp_eq = comp_eq && (pad_h == right.pad_h); - comp_eq = comp_eq && (pad_w == right.pad_w); + comp_eq = comp_eq && (pad_up == right.pad_up); + comp_eq = comp_eq && (pad_down == right.pad_down); + comp_eq = comp_eq && (pad_left == right.pad_left); + comp_eq = comp_eq && (pad_right == right.pad_right); comp_eq = comp_eq && (stride_h == right.stride_h); comp_eq = comp_eq && (stride_w == right.stride_w); - comp_eq = comp_eq && (pooling_type == right.pooling_type); - comp_eq = comp_eq && (global_pooling == right.global_pooling); - comp_eq = comp_eq && 
(cmp_out_shape_floor_as_conv == right.cmp_out_shape_floor_as_conv); + comp_eq = comp_eq && (dilation_h == right.dilation_h); + comp_eq = comp_eq && (dilation_w == right.dilation_w); return comp_eq; } - inline bool pooling_padded() { - return (pad_h || pad_w); - } int window_h; int window_w; - int pad_h; - int pad_w; + int pad_up; + int pad_down; + int pad_left; + int pad_right; int stride_h; int stride_w; - PoolingType pooling_type; - bool global_pooling; - bool cmp_out_shape_floor_as_conv; + int dilation_h; + int dilation_w; }; -template -struct SequencePoolParam { - SequencePoolParam() - : sequence_pool_type(Sequence_pool_unknow) - {} - SequencePoolParam(SequencePoolType sequence_pool_type_in) - : sequence_pool_type(sequence_pool_type_in) - {} - SequencePoolParam(const SequencePoolParam &right) - : sequence_pool_type(right.sequence_pool_type) - {} - SequencePoolParam &operator=(const SequencePoolParam &right) { - sequence_pool_type = right.sequence_pool_type; - return *this; - } - bool operator==(const SequencePoolParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (sequence_pool_type == right.sequence_pool_type); - return comp_eq; +template +struct LayerNormParam { + LayerNormParam() = default; + LayerNormParam(int axis_in, float eps_in, Tensor* weights_scale, + Tensor* weights_bias) { + axis = axis_in; + eps = eps_in; + scale = weights_scale; + bias = weights_bias; } - SequencePoolType sequence_pool_type; -}; -template -struct CrfDecodingParam { - CrfDecodingParam() - : weight_tensor(NULL) - , tag_num(0) - {} - CrfDecodingParam(opTensor* weight_tensor_in, int tag_num_in = 0) - : weight_tensor(weight_tensor_in) { - if (tag_num_in == 0) { - tag_num = weight_tensor->channel(); - } else { - tag_num = tag_num_in; - } + LayerNormParam(const LayerNormParam& right) { + axis = right.axis; + eps = right.eps; + scale = right.scale; + bias = right.bias; } - CrfDecodingParam(const CrfDecodingParam &right) - : weight_tensor(right.weight_tensor) - , 
tag_num(right.tag_num) - {} - CrfDecodingParam &operator=(const CrfDecodingParam &right) { - weight_tensor = right.weight_tensor; - tag_num = right.tag_num; + LayerNormParam& operator=(const LayerNormParam& right) { + this->axis = right.axis; + this->eps = right.eps; + this->scale = right.scale; + this->bias = right.bias; return *this; } - bool operator==(const CrfDecodingParam &right) { + bool operator==(const LayerNormParam& right) { bool comp_eq = true; - comp_eq &= (weight_tensor == right.weight_tensor); - comp_eq &= (tag_num == right.tag_num); + comp_eq = comp_eq && (axis == right.axis); + comp_eq = comp_eq && (fabsf(eps - right.eps) < 1e-7f); + comp_eq = comp_eq && (scale == scale); + comp_eq = comp_eq && (bias == bias); return comp_eq; } - inline const opTensor* transition_weight() { - return weight_tensor; + inline const Tensor* scale_weights() { + return scale; } - inline opTensor* mutable_transition_weight() { - return weight_tensor; + inline Tensor* mutable_scale_weights() { + return scale; } - int tag_num; + inline const Tensor* bias_weights() { + return bias; + } + inline Tensor* mutable_bias_weights() { + return bias; + } + int axis; + float eps{1e-5f}; private: - opTensor *weight_tensor; + Tensor* scale; + Tensor* bias; }; -template -struct EltwiseParam; -// Fusion conv with batchnorm, scale, activation, eltwise(sigmoid, relu, tanh, clipped_relu, elu) -template -struct ConvActiveParam { - ConvActiveParam() : has_batchnorm(false), has_scale(false), has_active(false), has_eltwise(false){} - - ConvActiveParam(ConvParam &conv_param_in) - : conv_param(conv_param_in), has_active(false) - , has_batchnorm(false), has_scale(false), has_eltwise(false) - {} - ConvActiveParam(ConvParam &conv_param_in, - ActivationParam &activation_param_in) - : conv_param(conv_param_in), activation_param(activation_param_in) - , has_batchnorm(false) - , has_scale(false) - , has_active(true) - , has_eltwise(false) - {} - ConvActiveParam(ConvParam &conv_param_in - , 
ActivationParam &activation_param_in - , EltwiseParam &eltwise_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , eltwise_param(eltwise_param_in) - , has_batchnorm(false) - , has_scale(false) - , has_active(true) - , has_eltwise(true) - {} - ConvActiveParam(ConvParam &conv_param_in - , ActivationParam &activation_param_in - , BatchnormParam &batchnorm_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , batchnorm_param(batchnorm_param_in) - , has_batchnorm(true) - , has_scale(false) - , has_active(true) - , has_eltwise(false) - {} - ConvActiveParam(ConvParam &conv_param_in - , ActivationParam &activation_param_in - , BatchnormParam &batchnorm_param_in - , EltwiseParam &eltwise_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , batchnorm_param(batchnorm_param_in) - , eltwise_param(eltwise_param_in) - , has_batchnorm(true) - , has_scale(false) - , has_active(true) - , has_eltwise(true) - {} - ConvActiveParam(ConvParam &conv_param_in - , ActivationParam &activation_param_in - , ScaleParam &scale_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , scale_param(scale_param_in) - , has_batchnorm(false) - , has_scale(true) - , has_active(true) - , has_eltwise(false) - {} - ConvActiveParam(ConvParam &conv_param_in - , ActivationParam &activation_param_in - , ScaleParam &scale_param_in - , EltwiseParam &eltwise_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , scale_param(scale_param_in) - , eltwise_param(eltwise_param_in) - , has_batchnorm(false) - , has_scale(true) - , has_active(true) - , has_eltwise(true) - {} - ConvActiveParam(ConvParam &conv_param_in - , ActivationParam &activation_param_in - , BatchnormParam &batchnorm_param_in - , ScaleParam &scale_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , batchnorm_param(batchnorm_param_in) - , 
scale_param(scale_param_in) - , has_batchnorm(true) - , has_scale(true) - , has_active(true) - , has_eltwise(false) - {} - ConvActiveParam(ConvParam &conv_param_in - , BatchnormParam &batchnorm_param_in - , ScaleParam &scale_param_in) - : conv_param(conv_param_in) - , batchnorm_param(batchnorm_param_in) - , scale_param(scale_param_in) - , has_batchnorm(true) - , has_scale(true) - , has_active(false) - , has_eltwise(false) - {} - ConvActiveParam(ConvParam &conv_param_in - , ActivationParam &activation_param_in - , BatchnormParam &batchnorm_param_in - , ScaleParam &scale_param_in - , EltwiseParam &eltwise_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , batchnorm_param(batchnorm_param_in) - , scale_param(scale_param_in) - , eltwise_param(eltwise_param_in) - , has_batchnorm(true) - , has_scale(true) - , has_active(true) - , has_eltwise(true) - {} - ConvActiveParam(const ConvActiveParam &right) - : conv_param(right.conv_param) - , activation_param(right.activation_param) - , batchnorm_param(right.batchnorm_param) - , scale_param(right.scale_param) - , has_batchnorm(right.has_batchnorm) - , has_scale(right.has_scale) - , has_active(right.has_active) - {} - ConvActiveParam &operator=(const ConvActiveParam &right) { - conv_param = right.conv_param; - activation_param = right.activation_param; - batchnorm_param = right.batchnorm_param; - scale_param = right.scale_param; - has_batchnorm = right.has_batchnorm; - has_scale = right.has_scale; - has_active = right.has_active; +template +struct LrnParam { + LrnParam() = default; + LrnParam(int local_size_in, float alpha_in, float beta_in, float k_in, NormRegion norm_region_in) + : local_size(local_size_in) + , alpha(alpha_in) + , beta(beta_in) + , k(k_in) + , norm_region(norm_region_in) + {} + LrnParam(const LrnParam& right) + : local_size(right.local_size) + , alpha(right.alpha) + , beta(right.beta) + , k(right.k) + , norm_region(right.norm_region) + {} + LrnParam& operator=(const LrnParam& 
right) { + local_size = right.local_size; + alpha = right.alpha; + beta = right.beta; + k = right.k; + norm_region = right.norm_region; return *this; } - bool operator==(const ConvActiveParam &right) { + bool operator==(const LrnParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (conv_param == right.conv_param); - comp_eq = comp_eq && (activation_param == right.activation_param); - comp_eq = comp_eq && (batchnorm_param == right.batchnorm_param); - comp_eq = comp_eq && (scale_param == right.scale_param); - comp_eq = comp_eq && (has_batchnorm == right.has_batchnorm); - comp_eq = comp_eq && (has_scale == right.has_scale); + comp_eq = comp_eq && (local_size == right.local_size); + comp_eq = comp_eq && (alpha == right.alpha); + comp_eq = comp_eq && (beta == right.beta); + comp_eq = comp_eq && (k == right.k); + comp_eq = comp_eq && (norm_region == right.norm_region); return comp_eq; } - ConvParam conv_param; - ActivationParam activation_param; - BatchnormParam batchnorm_param; - ScaleParam scale_param; - EltwiseParam eltwise_param; - bool has_batchnorm; - bool has_scale; - bool has_active; - bool has_eltwise; + int local_size{5}; + float alpha{1.}; + float beta{0.75}; + float k{1.}; + NormRegion norm_region{ACROSS_CHANNELS}; }; -// Fusion conv with batchnorm, scale, activation(sigmoid, relu, tanh, clipped_relu, elu) -template -struct ConvActivePoolingParam { - ConvActivePoolingParam() : has_batchnorm(false), has_scale(false), - has_activation(false), has_pooling(false) {} - ConvActivePoolingParam(ConvParam &conv_param_in) - : conv_param(conv_param_in) - , has_batchnorm(false), has_scale(false) - , has_activation(false), has_pooling(false) - {} - ConvActivePoolingParam(ConvParam &conv_param_in, - ActivationParam &activation_param_in) - : conv_param(conv_param_in), activation_param(activation_param_in) - , has_batchnorm(false), has_scale(false), has_activation(true), has_pooling(false) - {} - ConvActivePoolingParam(ConvParam &conv_param_in, - PoolingParam 
&pooling_param_in) - : conv_param(conv_param_in), pooling_param(pooling_param_in) - , has_batchnorm(false), has_scale(false), has_activation(false), has_pooling(false) - {} - ConvActivePoolingParam(ConvParam &conv_param_in - , ActivationParam &activation_param_in - , PoolingParam &pooling_param_in) - : conv_param(conv_param_in) - , activation_param(activation_param_in) - , pooling_param(pooling_param_in) - , has_batchnorm(false) - , has_scale(false) - , has_activation(true) - , has_pooling(true) - {} - ConvActivePoolingParam(ConvParam &conv_param_in - , BatchnormParam &batchnorm_param_in - , ScaleParam &scale_param_in - , ActivationParam &activation_param_in - , PoolingParam &pooling_param_in) - : conv_param(conv_param_in) - , batchnorm_param(batchnorm_param_in) - , scale_param(scale_param_in) - , activation_param(activation_param_in) - , pooling_param(pooling_param_in) - , has_batchnorm(true) - , has_scale(true) - , has_activation(true) - , has_pooling(true) - {} - ConvActivePoolingParam(const ConvActivePoolingParam &right) - : conv_param(right.conv_param) - , batchnorm_param(right.batchnorm_param) - , scale_param(right.scale_param) - , activation_param(right.activation_param) - , pooling_param(right.pooling_param) - , has_batchnorm(right.has_batchnorm) - , has_scale(right.has_scale) - , has_activation(right.has_activation) - , has_pooling(right.has_pooling) - {} - ConvActivePoolingParam &operator=(const ConvActivePoolingParam &right) { - conv_param = right.conv_param; - batchnorm_param = right.batchnorm_param; - scale_param = right.scale_param; - activation_param = right.activation_param; - pooling_param = right.pooling_param; - has_batchnorm = right.has_batchnorm; - has_scale = right.has_scale; - has_activation = right.has_activation; - has_pooling = right.has_pooling; + +template +struct LstmParam { + typedef Tensor opTensor; + LstmParam() : + weight_tensor(nullptr) + , bias_tensor(nullptr) + , init_hidden_tensor(nullptr) + , dropout_param(1.0f) + , 
num_direction(1) + , num_layers(1) + , is_reverse(false) + , input_activity(Active_unknow) + , gate_activity(Active_sigmoid) + , cell_activity(Active_tanh) + , candidate_activity(Active_tanh) + , with_peephole(true) + , skip_input(false) + + {} + + LstmParam(opTensor* weight_in, opTensor* bias_in, + opTensor* hidden_init_in = nullptr, + ActiveType input_activity = Active_unknow, + ActiveType gate_activity_in = Active_sigmoid, + ActiveType cell_activity_in = Active_tanh, + ActiveType candidate_activity_in = Active_tanh, + bool with_peephole_in = true, + bool skip_input_in = false, + bool is_reverse_in = false, + float dropout_param_in = 1.f, + int num_direction_in = 1, + int numLayers_in = 1) + : + weight_tensor(weight_in) + , bias_tensor(bias_in) + , dropout_param(dropout_param_in) + , num_direction(num_direction_in) + , num_layers(numLayers_in) + , is_reverse(is_reverse_in) + , input_activity(input_activity) + , gate_activity(gate_activity_in) + , candidate_activity(candidate_activity_in) + , cell_activity(cell_activity_in) + , init_hidden_tensor(hidden_init_in) + , with_peephole(with_peephole_in) + , skip_input(skip_input_in) + {} + + + LstmParam& operator=(const LstmParam& right) { + weight_tensor = right.weight_tensor; + dropout_param = right.dropout_param; + num_direction = right.num_direction; + num_layers = right.num_layers; + bias_tensor = right.bias_tensor; + input_activity = right.input_activity; + gate_activity = right.gate_activity; + cell_activity = right.cell_activity; + candidate_activity = right.candidate_activity; + with_peephole = right.with_peephole; + skip_input = right.skip_input; + is_reverse = right.is_reverse; + init_hidden_tensor = right.init_hidden_tensor; return *this; } - bool operator==(const ConvActivePoolingParam &right) { + + bool operator==(const LstmParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (conv_param == right.conv_param); - comp_eq = comp_eq && (batchnorm_param == right.batchnorm_param); - comp_eq = comp_eq && 
(scale_param == right.scale_param); - comp_eq = comp_eq && (activation_param == right.activation_param); - comp_eq = comp_eq && (pooling_param == right.pooling_param); - comp_eq = comp_eq && (has_batchnorm == right.has_batchnorm); - comp_eq = comp_eq && (has_scale == right.has_scale); - comp_eq = comp_eq && (has_activation == right.has_activation); - comp_eq = comp_eq && (has_pooling == right.has_pooling); + comp_eq = comp_eq && (weight_tensor == right.weight_tensor); + comp_eq = comp_eq && (dropout_param == right.dropout_param); + comp_eq = comp_eq && (num_direction == right.num_direction); + comp_eq = comp_eq && (num_layers == right.num_layers); + comp_eq = comp_eq && (bias_tensor == right.bias_tensor); + comp_eq = comp_eq && (input_activity == right.input_activity); + comp_eq = comp_eq && (gate_activity == right.gate_activity); + comp_eq = comp_eq && (cell_activity == right.cell_activity); + comp_eq = comp_eq && (with_peephole == right.with_peephole); + comp_eq = comp_eq && (skip_input == right.skip_input); + comp_eq = comp_eq && (candidate_activity == right.candidate_activity); + comp_eq = comp_eq && (is_reverse = right.is_reverse); + comp_eq = comp_eq && (init_hidden_tensor == right.init_hidden_tensor); return comp_eq; } - ConvParam conv_param; - BatchnormParam batchnorm_param; - ScaleParam scale_param; - ActivationParam activation_param; - PoolingParam pooling_param; - bool has_activation; - bool has_pooling; - bool has_batchnorm; - bool has_scale; -}; -template -struct ResizeParam { - ResizeParam() = default; - explicit ResizeParam(float scale_w, float scale_h){ - bool flag = scale_w > 0.f && scale_h > 0.f; - CHECK_EQ(flag, true) << "wrong parameters"; - width_scale = scale_w; - height_scale = scale_h; - } - ResizeParam(const ResizeParam& right){ - width_scale = right.width_scale; - height_scale = right.height_scale; + + inline const opTensor* weight() { + return weight_tensor; } - ResizeParam& operator=(const ResizeParam& right){ - this->width_scale = 
right.width_scale; - this->height_scale = right.height_scale; - return *this; + + inline const opTensor* bias() { + return bias_tensor; } - bool operator==(const ResizeParam right){ - float eps = 1e-6f; - bool flag = fabsf(width_scale - right.width_scale) < eps; - flag &= fabsf(height_scale - right.height_scale) < eps; - return flag; + + inline const opTensor* init_hidden() { + return init_hidden_tensor; } - float width_scale{0.f}; - float height_scale{0.f}; + + int num_direction; + float dropout_param; + int num_layers; + ActiveType input_activity; + ActiveType gate_activity; + ActiveType cell_activity; + ActiveType candidate_activity; + bool is_reverse; + bool with_peephole; + // skip input (X * [Wix, Wfx, Wcx, Wox]) or not; + // if true, the input's memory layout should be total_seq_len * (4 * hidden_size), + // and you should calc this information in fc layer before; + // otherwise the input's memory layout should be total_seq_len * input_size; + bool skip_input; +private: + opTensor* weight_tensor; + opTensor* bias_tensor; + opTensor* init_hidden_tensor; + }; -template -struct PreluParam { - PreluParam() = default; - PreluParam(bool is_channel_shared, opTensor* input_slope) { - channel_shared = is_channel_shared; - slope = input_slope; - } - PreluParam(const PreluParam& right) { - channel_shared = right.channel_shared; - slope = right.slope; - } - PreluParam& operator=(const PreluParam& right) { - this->channel_shared = right.channel_shared; - this->slope = right.slope; - return *this; + +template +struct MatMulParam { + MatMulParam(): _is_transpose_X(false), _is_transpose_Y(false) {} + MatMulParam(bool x, bool y): _is_transpose_X(x), _is_transpose_Y(y) {} + MatMulParam& operator=(const MatMulParam& right) { + _is_transpose_X = right._is_transpose_X; + _is_transpose_Y = right._is_transpose_Y; } - bool operator==(const PreluParam& right) { - bool flag = this->channel_shared == right.channel_shared; - return flag && (this->slope == right.slope); + bool 
operator==(const MatMulParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (_is_transpose_X == right._is_transpose_X); + comp_eq = comp_eq && (_is_transpose_Y == right._is_transpose_Y); + return comp_eq; } - bool channel_shared{false}; - opTensor* slope{nullptr}; + bool _is_transpose_X{false}; + bool _is_transpose_Y{false}; + int _m = 0; + int _n = 0; + int _k = 0; + int _b = 0;//batch_size }; -template +template struct MvnParam { - MvnParam() = default; - MvnParam(bool normalize_variance_in, bool across_channels_in, float eps_in) { normalize_variance = normalize_variance_in; across_channels = across_channels_in; eps = eps_in; } - - MvnParam(const MvnParam& right) { + MvnParam(const MvnParam& right) { normalize_variance = right.normalize_variance; across_channels = right.across_channels; eps = right.eps; } - - MvnParam& operator=(const MvnParam& right) { + MvnParam& operator=(const MvnParam& right) { this->normalize_variance = right.normalize_variance; this->across_channels = right.across_channels; this->eps = right.eps; return *this; } - - bool operator==(const MvnParam& right) { + bool operator==(const MvnParam& right) { bool flag = this->normalize_variance == right.normalize_variance; flag = flag && this->across_channels == right.across_channels; return flag && (this->eps == right.eps); } - bool normalize_variance{true}; bool across_channels{true}; float eps{1e-9}; }; -template -struct ArgmaxParam { - - ArgmaxParam() = default; - - ArgmaxParam(bool out_max_val_in,int top_k_in, int axis_in) { - out_max_val = out_max_val_in; - has_axis = true; - top_k = top_k_in; - axis = axis_in; - } - - ArgmaxParam(bool out_max_val_in,int top_k_in) { - out_max_val = out_max_val_in; - top_k = top_k_in; - has_axis = false; - axis = 3; - } - - - ArgmaxParam(const ArgmaxParam& right) { - out_max_val = right.out_max_val; - top_k = right.top_k; - has_axis = right.has_axis; - axis = right.axis; - } - - ArgmaxParam& operator=(const ArgmaxParam& right) { - this->out_max_val = 
right.out_max_val; - this->top_k = right.top_k; - this->axis = right.axis; - this->has_axis = right.has_axis; - return *this; - } - - bool operator==(const ArgmaxParam& right) { - bool flag = this->out_max_val == right.out_max_val; - flag = flag && this->top_k == right.top_k; - flag = flag && this->has_axis == right.has_axis; - return flag && (this->axis == right.axis); - } - bool out_max_val{false}; - bool has_axis{true}; - int top_k{1}; - int axis{3}; -}; - - - -template +template struct NormalizeParam { NormalizeParam() = default; @@ -1542,7 +1363,7 @@ struct NormalizeParam { CHECK_EQ(p == 2 || p == 1, true) << "only support L1 and L2 norm"; } NormalizeParam(bool is_across_spatial, bool is_shared_channel, \ - opTensor* input_scale, float eps_in = 1e-6f, int pin = 2) { + Tensor* input_scale, float eps_in = 1e-6f, int pin = 2) { across_spatial = is_across_spatial; channel_shared = is_shared_channel; @@ -1553,7 +1374,7 @@ struct NormalizeParam { CHECK_EQ(p == 2 || p == 1, true) << "only support L1 and L2 norm"; } - NormalizeParam(const NormalizeParam& right) { + NormalizeParam(const NormalizeParam& right) { channel_shared = right.channel_shared; across_spatial = right.across_spatial; p = right.p; @@ -1562,7 +1383,7 @@ struct NormalizeParam { eps = right.eps; } - NormalizeParam& operator=(const NormalizeParam& right) { + NormalizeParam& operator=(const NormalizeParam& right) { this->channel_shared = right.channel_shared; this->across_spatial = right.across_spatial; this->scale = right.scale; @@ -1572,7 +1393,7 @@ struct NormalizeParam { return *this; } - bool operator==(const NormalizeParam& right) { + bool operator==(const NormalizeParam& right) { bool flag = this->across_spatial == right.across_spatial; flag = flag && (this->channel_shared == right.channel_shared); flag = flag && (this->has_scale == right.has_scale); @@ -1591,167 +1412,216 @@ struct NormalizeParam { //! if channel_shared = true, use one scale data bool channel_shared{false}; //! 
scale tensor if has one - opTensor* scale{nullptr}; + Tensor* scale{nullptr}; float eps{1e-6f}; }; -template -struct FcParam { - FcParam() = default; - - FcParam(opTensor* input_weight, int output_num, int in_axis = 1, - bool trans = false) { - - num_output = output_num; - weights = input_weight; - bias = nullptr; - axis = in_axis; - is_transpose_weights = trans; - } - FcParam(opTensor* input_weight, opTensor* input_bias, int output_num, - int in_axis = 1, bool trans = false) { - - num_output = output_num; - weights = input_weight; - bias = input_bias; - axis = in_axis; - is_transpose_weights = trans; - } - - FcParam(const FcParam& right) { - weights = right.weights; - bias = right.bias; - num_output = right.num_output; - axis = right.axis; - is_transpose_weights = right.is_transpose_weights; - } - - FcParam& operator=(const FcParam& right) { - this->weights = right.weights; - this->bias = right.bias; - this->num_output = right.num_output; - this->axis = right.axis; - this->is_transpose_weights = right.is_transpose_weights; +template +struct PadParam { + PadParam() = default; + PadParam(std::vector pad_c_in, std::vector pad_h_in, std::vector pad_w_in) + : pad_c(pad_c_in) + , pad_h(pad_h_in) + , pad_w(pad_w_in) + {} + PadParam(const PadParam& right) + : pad_c(right.pad_c) + , pad_h(right.pad_h) + , pad_w(right.pad_w) + {} + PadParam& operator=(const PadParam& right) { + pad_c = right.pad_c; + pad_h = right.pad_h; + pad_w = right.pad_w; return *this; } - - bool operator==(const FcParam& right) { - bool flag = this->is_transpose_weights == right.is_transpose_weights; - flag = flag && (this->num_output == right.num_output) && (this->axis == right.axis); - return flag && (this->weights == right.weights) && (this->bias == right.bias); + bool operator==(const PadParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (pad_c == right.pad_c); + comp_eq = comp_eq && (pad_h == right.pad_h); + comp_eq = comp_eq && (pad_w == right.pad_w); + return comp_eq; } - - bool 
is_transpose_weights{false}; - int num_output; - int axis{1}; - opTensor* weights{nullptr}; - opTensor* bias{nullptr}; + std::vector pad_c; + std::vector pad_h; + std::vector pad_w; }; -template -struct EltwiseParam { - typedef typename opTensor::Dtype DataDtype; - EltwiseParam() - : operation(Eltwise_unknow) - , coeff() - {} - EltwiseParam(EltwiseType operation_in - , std::vector coeff_in = std::vector({1,1})) - : operation(operation_in) - , coeff(coeff_in) - { - if ((operation == Eltwise_sum) && (coeff.size() == 0)) { - coeff.push_back(1); - coeff.push_back(1); - } +template +struct PermuteParam { + PermuteParam() {} + PermuteParam(std::vector order): order(order) {} + PermuteParam(const PermuteParam& right): order(right.order) {} + PermuteParam& operator=(const PermuteParam& right) { + order = right.order; + return *this; } + bool operator==(const PermuteParam& right) { + bool comp_eq = true; + comp_eq = order.size() == right.order.size(); - EltwiseParam(const EltwiseParam& right) - : operation(right.operation) - , coeff(right.coeff) - {} - - EltwiseParam& operator=(const EltwiseParam& right) { - operation = right.operation; - coeff.resize(right.coeff.size()); - for (int i = 0; i < coeff.size(); ++i) { - coeff[i] = right.coeff[i]; + for (int i = 0; i < order.size(); ++i) { + comp_eq = comp_eq && (order[i] == right.order[i]); } - return *this; + + return comp_eq; } + std::vector order; +}; - bool operator==(const EltwiseParam& right) { +template +struct PermutePowerParam { + PermutePowerParam() {} + PermutePowerParam(PermuteParam permute_param): + power_param(power_param), has_power_param(false) {} + PermutePowerParam(PermuteParam permute_param, PowerParam power_param): + power_param(power_param), permute_param(permute_param), has_power_param(true) {} + PermutePowerParam(const PermutePowerParam& right): + power_param(right.power_param), permute_param(right.permute_param), + has_power_param(right.has_power_param) {} + bool operator==(const PermutePowerParam& 
right) { bool comp_eq = true; - comp_eq = comp_eq && (operation == right.operation); - comp_eq = comp_eq && (coeff.size() == right.coeff.size()); - if (!comp_eq) { - return comp_eq; - } - for (int i = 0; i < coeff.size(); ++i) { - comp_eq = comp_eq && (coeff[i] == right.coeff[i]); - } + comp_eq = comp_eq && (power_param == right.power_param); + comp_eq = comp_eq && (permute_param == right.permute_param); + return comp_eq; } - EltwiseType operation; - std::vector coeff; + PowerParam power_param; + PermuteParam permute_param; + bool has_power_param; }; -template -struct EltwiseActiveParam { - EltwiseActiveParam() - : eltwise_param() - , activation_param() - , has_activation(false) - {} - EltwiseActiveParam(EltwiseParam &eltwise_param_in, - ActivationParam &activation_param_in) - : eltwise_param(eltwise_param_in) - , activation_param(activation_param_in) - , has_activation(true) - {} - EltwiseActiveParam(const EltwiseActiveParam &right) - : eltwise_param(right.eltwise_param) - , activation_param(right.activation_param) - , has_activation(right.has_activation) +template +struct PoolingParam { + PoolingParam() : window_h(-1), window_w(-1) + , pad_h(-1), pad_w(-1) + , stride_h(-1), stride_w(-1) + , pooling_type(Pooling_unknow) + , global_pooling(false) + , cmp_out_shape_floor_as_conv(false) {} - EltwiseActiveParam &operator=(const EltwiseActiveParam &right) { - eltwise_param = right.eltwise_param; - activation_param = right.activation_param; - has_activation = right.has_activation; + PoolingParam(int window_h_in, int window_w_in, int pad_h_in + , int pad_w_in, int stride_h_in, int stride_w_in, PoolingType type + , bool global_pooling_in = false, bool cmp_out_shape_floor_as_conv_in = false) + : window_h(window_h_in), window_w(window_w_in) + , pad_h(pad_h_in), pad_w(pad_w_in) + , stride_h(stride_h_in), stride_w(stride_w_in) + , pooling_type(type) + , global_pooling(global_pooling_in) + , cmp_out_shape_floor_as_conv(cmp_out_shape_floor_as_conv_in) + {} + PoolingParam(const 
PoolingParam& right) + : window_h(right.window_h) + , window_w(right.window_w) + , pad_h(right.pad_h) + , pad_w(right.pad_w) + , stride_h(right.stride_h) + , stride_w(right.stride_w) + , pooling_type(right.pooling_type) + , global_pooling(right.global_pooling) + , cmp_out_shape_floor_as_conv(right.cmp_out_shape_floor_as_conv) + {} + PoolingParam& operator=(const PoolingParam& right) { + window_h = right.window_h; + window_w = right.window_w; + pad_h = right.pad_h; + pad_w = right.pad_w; + stride_h = right.stride_h; + stride_w = right.stride_w; + pooling_type = right.pooling_type; + global_pooling = right.global_pooling; + cmp_out_shape_floor_as_conv = right.cmp_out_shape_floor_as_conv; return *this; } - bool operator==(const EltwiseActiveParam &right) { + bool operator==(const PoolingParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (eltwise_param == right.eltwise_param); - comp_eq = comp_eq && (activation_param == right.activation_param); - comp_eq = comp_eq && (has_activation == right.has_activation); + comp_eq = comp_eq && (window_h == right.window_h); + comp_eq = comp_eq && (window_w == right.window_w); + comp_eq = comp_eq && (pad_h == right.pad_h); + comp_eq = comp_eq && (pad_w == right.pad_w); + comp_eq = comp_eq && (stride_h == right.stride_h); + comp_eq = comp_eq && (stride_w == right.stride_w); + comp_eq = comp_eq && (pooling_type == right.pooling_type); + comp_eq = comp_eq && (global_pooling == right.global_pooling); + comp_eq = comp_eq && (cmp_out_shape_floor_as_conv == right.cmp_out_shape_floor_as_conv); return comp_eq; } - - EltwiseParam eltwise_param; - ActivationParam activation_param; - bool has_activation; + inline bool pooling_padded() { + return (pad_h || pad_w); + } + int window_h; + int window_w; + int pad_h; + int pad_w; + int stride_h; + int stride_w; + PoolingType pooling_type; + bool global_pooling; + bool cmp_out_shape_floor_as_conv; }; -template -struct PriorBoxParam { - - PriorBoxParam(){} - PriorBoxParam(std::vector min_in, 
std::vector max_in, \ - std::vector aspect_in, std::vector variance_in, - bool flip, bool clip, int image_width, int image_height, \ - float step_width, float step_height, float offset_in) { - is_flip = flip; - is_clip = clip; +template +struct PowerParam { + PowerParam() {} + PowerParam(float power, float scale, float shift) : power(power), scale(scale), shift(shift) {} + PowerParam(const PowerParam& right) : power(right.power), scale(right.scale), shift(right.shift) {} + bool operator==(const PowerParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (power == right.power); + comp_eq = comp_eq && (scale == right.scale); + comp_eq = comp_eq && (shift == right.shift); + return comp_eq; + } + float power; + float scale; + float shift; +}; + +template +struct PreluParam { + PreluParam() = default; + PreluParam(bool is_channel_shared, Tensor* input_slope) { + channel_shared = is_channel_shared; + slope = input_slope; + } + PreluParam(const PreluParam& right) { + channel_shared = right.channel_shared; + slope = right.slope; + } + PreluParam& operator=(const PreluParam& right) { + this->channel_shared = right.channel_shared; + this->slope = right.slope; + return *this; + } + bool operator==(const PreluParam& right) { + bool flag = this->channel_shared == right.channel_shared; + return flag && (this->slope == right.slope); + } + bool channel_shared{false}; + Tensor* slope{nullptr}; +}; + +template +struct PriorBoxParam { + + PriorBoxParam() {} + PriorBoxParam(std::vector min_in, std::vector max_in, \ + std::vector aspect_in, std::vector variance_in, + bool flip, bool clip, int image_width, int image_height, \ + float step_width, float step_height, float offset_in, std::vector order_in) { + is_flip = flip; + is_clip = clip; min_size = min_in; img_w = image_width; img_h = image_height; step_w = step_width; step_h = step_height; offset = offset_in; + order = order_in; aspect_ratio.clear(); aspect_ratio.push_back(1.f); variance.clear(); + if (variance_in.size() == 1) 
{ variance.push_back(variance_in[0]); variance.push_back(variance_in[0]); @@ -1768,23 +1638,29 @@ struct PriorBoxParam { for (int i = 0; i < aspect_in.size(); ++i) { float ar = aspect_in[i]; bool already_exist = false; + for (int j = 0; j < aspect_ratio.size(); ++j) { if (fabs(ar - aspect_ratio[j]) < 1e-6) { already_exist = true; break; } } + if (!already_exist) { aspect_ratio.push_back(ar); + if (is_flip) { - aspect_ratio.push_back(1.f/ar); + aspect_ratio.push_back(1.f / ar); } } } + prior_num = min_size.size() * aspect_ratio.size(); max_size.clear(); + if (max_in.size() > 0) { CHECK_EQ(max_in.size(), min_size.size()) << "max_size num must = min_size num"; + for (int i = 0; i < max_in.size(); ++i) { CHECK_GT(max_in[i], min_size[i]) << "max_size val must > min_size val"; max_size.push_back(max_in[i]); @@ -1792,7 +1668,7 @@ struct PriorBoxParam { } } } - PriorBoxParam(const PriorBoxParam& right) { + PriorBoxParam(const PriorBoxParam& right) { is_flip = right.is_flip; is_clip = right.is_clip; min_size = right.min_size; @@ -1804,9 +1680,10 @@ struct PriorBoxParam { step_w = right.step_w; step_h = right.step_h; offset = right.offset; + order = right.order; prior_num = right.prior_num; } - PriorBoxParam& operator=(const PriorBoxParam& right) { + PriorBoxParam& operator=(const PriorBoxParam& right) { this->is_flip = right.is_flip; this->is_clip = right.is_clip; this->min_size = right.min_size; @@ -1818,49 +1695,60 @@ struct PriorBoxParam { this->step_w = right.step_w; this->step_h = right.step_h; this->offset = right.offset; + this->order = right.order; this->prior_num = right.prior_num; return *this; } - bool operator==(const PriorBoxParam& right) { + bool operator==(const PriorBoxParam& right) { bool flag = is_flip == right.is_flip; flag = flag && (is_clip == right.is_clip); + if (min_size.size() != right.min_size.size()) { return false; } + for (int i = 0; i < min_size.size(); ++i) { if (min_size[i] != right.min_size[i]) { return false; } } + if (max_size.size() != 
right.max_size.size()) { return false; } + for (int i = 0; i < max_size.size(); ++i) { if (max_size[i] != right.max_size[i]) { return false; } } + if (aspect_ratio.size() != right.aspect_ratio.size()) { return false; } + for (int i = 0; i < aspect_ratio.size(); ++i) { if (aspect_ratio[i] != right.aspect_ratio[i]) { return false; } } + if (variance.size() != right.variance.size()) { return false; } + for (int i = 0; i < variance.size(); ++i) { if (variance[i] != right.variance[i]) { return false; } } + flag = flag && (img_w == right.img_w); flag = flag && (img_h == right.img_h); flag = flag && (step_w == right.step_w); flag = flag && (step_h == right.step_h); flag = flag && (offset == right.offset); + flag = flag && (order == right.order); flag = flag && (prior_num == right.prior_num); return flag; } @@ -1877,531 +1765,99 @@ struct PriorBoxParam { float step_h{0}; float offset{0.5}; int prior_num{0}; + std::vector order; }; -template -struct DeformableConvParam { - - DeformableConvParam() : group(-1), pad_h(-1), pad_w(-1), - stride_h(-1), stride_w(-1), - dilation_h(-1), dilation_w(-1), axis(-1), - weight_tensor(NULL), bias_tensor(NULL), alpha(1.0), beta(0.0) {} - - DeformableConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, - int stride_w_in, int dilation_h_, int dilation_w_, opTensor* weight, - opTensor* bias, int axis_in = 1, float alpha_in = 1.0, float beta_in = 0.0) - : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) - , stride_h(stride_h_in), stride_w(stride_w_in) - , dilation_h(dilation_h_), dilation_w(dilation_w_) - , axis(axis_in) - , weight_tensor(weight), bias_tensor(bias) - , alpha(alpha_in), beta(beta_in) - {} - - DeformableConvParam(const DeformableConvParam &right) - : group(right.group), pad_h(right.pad_h) - , pad_w(right.pad_w), stride_h(right.stride_h) - , stride_w(right.stride_w), dilation_h(right.dilation_h) - , dilation_w(right.dilation_w) - , axis(right.axis) - , weight_tensor(right.weight_tensor) - , 
bias_tensor(right.bias_tensor) - , alpha(right.alpha) - , beta(right.beta) - {} - - DeformableConvParam &operator=(const DeformableConvParam &right) { - group = right.group; - pad_h = right.pad_h; - pad_w = right.pad_w; - stride_h = right.stride_h; - stride_w = right.stride_w; - dilation_h = right.dilation_h; - dilation_w = right.dilation_w; - axis = right.axis; - weight_tensor = right.weight_tensor; - bias_tensor = right.bias_tensor; - alpha = right.alpha; - beta = right.beta; - return *this; - } - - bool operator==(const DeformableConvParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (group == right.group); - comp_eq = comp_eq && (pad_h == right.pad_h); - comp_eq = comp_eq && (pad_w == right.pad_w); - comp_eq = comp_eq && (stride_h == right.stride_h); - comp_eq = comp_eq && (stride_w == right.stride_w); - comp_eq = comp_eq && (dilation_h == right.dilation_h); - comp_eq = comp_eq && (dilation_w == right.dilation_w); - comp_eq = comp_eq && (axis == right.axis); - comp_eq = comp_eq && (weight_tensor == right.weight_tensor); - comp_eq = comp_eq && (bias_tensor == right.bias_tensor); - comp_eq = comp_eq && (alpha == right.alpha); - comp_eq = comp_eq && (beta == right.beta); - return comp_eq; - } - - inline const opTensor* weight() { - return weight_tensor; - } - - inline const opTensor* bias() { - return bias_tensor; - } - - inline opTensor* mutable_weight() { - return weight_tensor; - } - - inline opTensor* mutable_bias() { - return bias_tensor; - } - - int group; - int pad_h; - int pad_w; - int stride_h; - int stride_w; - int dilation_h; - int dilation_w; - int axis; - float alpha; - float beta; - -private: - opTensor* weight_tensor; - opTensor* bias_tensor; -}; - -template -struct SPPParam { - SPPParam() = default; - SPPParam(int pyramid_height_in, PoolingType pool_type_in) - : pyramid_height(pyramid_height_in) - , pool_type(pool_type_in) - {} - SPPParam(const SPPParam &right) - : pyramid_height(right.pyramid_height) - , pool_type(right.pool_type) - {} 
- SPPParam &operator=(const SPPParam &right) { - pyramid_height = right.pyramid_height; - pool_type = right.pool_type; - return *this; - } - bool operator==(const SPPParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (pyramid_height == right.pyramid_height); - comp_eq = comp_eq && (pool_type == right.pool_type); - return comp_eq; - } - - int pyramid_height; - PoolingType pool_type; -}; - -template -struct CropParam { - CropParam() = default; - CropParam(int axis_in, std::vector offset_in, std::vector shape_in) - : axis(axis_in) - , offset(offset_in) - , shape(shape_in) - {} - CropParam(const CropParam &right) - : axis(right.axis) - , offset(right.offset) - , shape(right.shape) - {} - CropParam &operator=(const CropParam &right) { - axis = right.axis; - offset = right.offset; - shape = right.shape; - return *this; - } - bool operator==(const CropParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (axis == right.axis); - comp_eq = comp_eq && (offset == right.offset); - comp_eq = comp_eq && (shape == right.shape); - return comp_eq; - } - int axis = 1; - std::vector offset; - std::vector shape; -}; - - -template -struct PadParam { - PadParam() = default; - PadParam(std::vector pad_c_in, std::vector pad_h_in, std::vector pad_w_in) - : pad_c(pad_c_in) - , pad_h(pad_h_in) - , pad_w(pad_w_in) - {} - PadParam(const PadParam &right) - : pad_c(right.pad_c) - , pad_h(right.pad_h) - , pad_w(right.pad_w) - {} - PadParam &operator=(const PadParam &right) { - pad_c = right.pad_c; - pad_h = right.pad_h; - pad_w = right.pad_w; - return *this; - } - bool operator==(const PadParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (pad_c == right.pad_c); - comp_eq = comp_eq && (pad_h == right.pad_h); - comp_eq = comp_eq && (pad_w == right.pad_w); - return comp_eq; - } - std::vector pad_c; - std::vector pad_h; - std::vector pad_w; -}; - -template -struct LrnParam { - LrnParam() = default; - LrnParam(int local_size_in, float alpha_in, float beta_in, float k_in, 
NormRegion norm_region_in) - : local_size(local_size_in) - , alpha(alpha_in) - , beta(beta_in) - , k(k_in) - , norm_region(norm_region_in) - {} - LrnParam(const LrnParam &right) - : local_size(right.local_size) - , alpha(right.alpha) - , beta(right.beta) - , k(right.k) - , norm_region(right.norm_region) - {} - LrnParam &operator=(const LrnParam &right) { - local_size = right.local_size; - alpha = right.alpha; - beta = right.beta; - k = right.k; - norm_region = right.norm_region; - return *this; - } - bool operator==(const LrnParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (local_size == right.local_size); - comp_eq = comp_eq && (alpha == right.alpha); - comp_eq = comp_eq && (beta == right.beta); - comp_eq = comp_eq && (k == right.k); - comp_eq = comp_eq && (norm_region == right.norm_region); - return comp_eq; - } - int local_size{5}; - float alpha{1.}; - float beta{0.75}; - float k{1.}; - NormRegion norm_region{ACROSS_CHANNELS}; -}; - -template -struct MultiClassNMSParam { - MultiClassNMSParam() = default; - - MultiClassNMSParam(int bg_id, int keep_topk, int nms_topk, float nms_threshold, \ - float confidence_threshold, float eta = 1.f) { - background_id = bg_id; - keep_top_k = keep_topk; - nms_top_k = nms_topk; - nms_thresh = nms_threshold; - conf_thresh = confidence_threshold; - nms_eta = eta; - } - - void init(int bg_id, int keep_topk, int nms_topk, float nms_threshold, \ - float confidence_threshold, float eta = 1.f) { - background_id = bg_id; - keep_top_k = keep_topk; - nms_top_k = nms_topk; - nms_thresh = nms_threshold; - conf_thresh = confidence_threshold; - nms_eta = eta; - } - - MultiClassNMSParam(const MultiClassNMSParam &right) { - background_id = right.background_id; - keep_top_k = right.keep_top_k; - nms_top_k = right.nms_top_k; - nms_thresh = right.nms_thresh; - conf_thresh = right.conf_thresh; - nms_eta = right.nms_eta; - } - - MultiClassNMSParam &operator=(const MultiClassNMSParam &right) { - this->background_id = right.background_id; 
- this->keep_top_k = right.keep_top_k; - this->nms_top_k = right.nms_top_k; - this->nms_thresh = right.nms_thresh; - this->conf_thresh = right.conf_thresh; - this->nms_eta = right.nms_eta; - return *this; - } - - bool operator==(const MultiClassNMSParam &right) { - bool flag = (background_id == right.background_id); - flag = flag && (keep_top_k == right.keep_top_k); - flag = flag && (nms_top_k == right.nms_top_k); - flag = flag && (nms_thresh == right.nms_thresh); - flag = flag && (conf_thresh == right.conf_thresh); - flag = flag && (nms_eta == right.nms_eta); - return flag; - } - - int background_id{0}; - int keep_top_k{-1}; - float conf_thresh; - int nms_top_k; - float nms_thresh{0.3f}; - float nms_eta{1.f}; -}; - -template -struct DetectionOutputParam { - - DetectionOutputParam() = default; - - DetectionOutputParam(int classes, int bg_id, int keep_topk, int nms_topk, float nms_threshold, \ - float confidence_threshold, bool share_loc = true, bool variance_in_target = false, \ - int codetype = 1, float eta = 1.f) { - class_num = classes; - background_id = bg_id; - keep_top_k = keep_topk; - nms_top_k = nms_topk; - nms_thresh = nms_threshold; - conf_thresh = confidence_threshold; - share_location = share_loc; - variance_encode_in_target = variance_in_target; - type = (CodeType) codetype; - nms_eta = eta; - } - - void init(int classes, int bg_id, int keep_topk, int nms_topk, float nms_threshold, \ - float confidence_threshold, bool share_loc = true, bool variance_in_target = false, \ - int codetype = 1, float eta = 1.f) { - class_num = classes; - background_id = bg_id; - keep_top_k = keep_topk; - nms_top_k = nms_topk; - nms_thresh = nms_threshold; - conf_thresh = confidence_threshold; - share_location = share_loc; - variance_encode_in_target = variance_in_target; - type = (CodeType) codetype; - nms_eta = eta; - } - - DetectionOutputParam(const DetectionOutputParam &right) { - class_num = right.class_num; - background_id = right.background_id; - keep_top_k = 
right.keep_top_k; - nms_top_k = right.nms_top_k; - nms_thresh = right.nms_thresh; - conf_thresh = right.conf_thresh; - share_location = right.share_location; - variance_encode_in_target = right.variance_encode_in_target; - type = right.type; - nms_eta = right.nms_eta; - } - - DetectionOutputParam &operator=(const DetectionOutputParam &right) { - this->class_num = right.class_num; - this->background_id = right.background_id; - this->keep_top_k = right.keep_top_k; - this->nms_top_k = right.nms_top_k; - this->nms_thresh = right.nms_thresh; - this->conf_thresh = right.conf_thresh; - this->share_location = right.share_location; - this->variance_encode_in_target = right.variance_encode_in_target; - this->type = right.type; - this->nms_eta = right.nms_eta; - return *this; - } - - bool operator==(const DetectionOutputParam &right) { - bool flag = class_num == right.class_num; - flag = flag && (background_id == right.background_id); - flag = flag && (keep_top_k == right.keep_top_k); - flag = flag && (nms_top_k == right.nms_top_k); - flag = flag && (nms_thresh == right.nms_thresh); - flag = flag && (conf_thresh == right.conf_thresh); - flag = flag && (share_location == right.share_location); - flag = flag && (variance_encode_in_target == right.variance_encode_in_target); - flag = flag && (type == right.type); - flag = flag && (nms_eta == right.nms_eta); - return flag; - } - - bool share_location{true}; - bool variance_encode_in_target{false}; - int class_num; - int background_id{0}; - int keep_top_k{-1}; - CodeType type{CORNER}; - float conf_thresh; - int nms_top_k; - float nms_thresh{0.3f}; - float nms_eta{1.f}; -}; - -template -struct BoxCoderParam { +template +struct ReshapeParam { + ReshapeParam() = default; + explicit ReshapeParam(std::vector shape_param_in) { + int count = 0; - BoxCoderParam() = default; + for (int i = 0; i < shape_param_in.size(); ++i) { + if (shape_param_in[i] == -1) { + count ++; + } + } - BoxCoderParam(int coder_type) { - type = 
BoxCoderType(coder_type); + CHECK_LE(count, 1) << "shape parameter contains multiple -1 dims"; + shape_params = shape_param_in; } - - BoxCoderParam(const BoxCoderParam& right) { - type = right.type; + ReshapeParam(const ReshapeParam& right) { + shape_params = right.shape_params; } - - BoxCoderParam& operator=(const BoxCoderParam& right) { - this->type = right.type; + ReshapeParam& operator=(const ReshapeParam& right) { + shape_params = right.shape_params; return *this; } + bool operator==(const ReshapeParam& right) { + bool comp_eq = shape_params.size() == right.shape_params.size(); - bool operator==(const BoxCoderParam& right) { - return this->type == right.type; - } - - BoxCoderType type{ENCODE_CENTER}; -}; - -template -struct DfmbpsRoiAlignParam { - DfmbpsRoiAlignParam() - : heat_map_a(0) - , output_dim(0) - {} - DfmbpsRoiAlignParam(float heat_map_a_in, int output_dim_in, - float heat_map_b_in = 0, float pad_ratio_in = 0, - float trans_std_in = 0.1, int sample_per_part_in = 4, - int group_height_in = 7, int group_width_in = 7, - int pooled_height_in = 7, int pooled_width_in = 7, - int part_height_in = 7, int part_width_in = 7) - : heat_map_a(heat_map_a_in), output_dim(output_dim_in) - , heat_map_b(heat_map_b_in), pad_ratio(pad_ratio_in) - , trans_std(trans_std_in), sample_per_part(sample_per_part_in) - , group_height(group_height_in), group_width(group_width_in) - , pooled_height(pooled_height_in), pooled_width(pooled_width_in) + for (int i = 0; i < shape_params.size(); ++i) { + if (!comp_eq) { + return false; + } - {} - ~DfmbpsRoiAlignParam(){} - - DfmbpsRoiAlignParam(const DfmbpsRoiAlignParam &right) - : heat_map_a(right.heat_map_a) - , output_dim(right.output_dim) - , heat_map_b(right.heat_map_b) - , pad_ratio(right.pad_ratio) - , trans_std(right.trans_std) - , sample_per_part(right.sample_per_part) - , group_height(right.group_height) - , group_width(right.group_width) - , pooled_height(right.pooled_height) - , pooled_width(right.pooled_width) - {} + comp_eq 
= shape_params[i] == right.shape_params[i]; + } - DfmbpsRoiAlignParam &operator=(const DfmbpsRoiAlignParam &right) { - heat_map_a = right.heat_map_a; - output_dim = right.output_dim; - heat_map_b = right.heat_map_b; - pad_ratio = right.pad_ratio; - trans_std = right.trans_std; - sample_per_part = right.sample_per_part; - group_height = right.group_height; - group_width = right.group_width; - pooled_height = right.pooled_height; - pooled_width = right.pooled_width; - return *this; + return true; } + std::vector shape_params; +}; - bool operator==(const DfmbpsRoiAlignParam &right) { - bool comp_eq = true; - comp_eq &= comp_eq && (heat_map_a == right.heat_map_a); - comp_eq &= comp_eq && (output_dim == right.output_dim); - comp_eq &= comp_eq && (heat_map_b == right.heat_map_b); - comp_eq &= comp_eq && (pad_ratio == right.pad_ratio); - comp_eq &= comp_eq && (trans_std == right.trans_std); - comp_eq &= comp_eq && (sample_per_part == right.sample_per_part); - comp_eq &= comp_eq && (group_height == right.group_height); - comp_eq &= comp_eq && (group_width == right.group_width); - comp_eq &= comp_eq && (pooled_height == right.pooled_height); - comp_eq &= comp_eq && (pooled_width == right.pooled_width); - return comp_eq; +template +struct ResizeParam { + ResizeParam() = default; + explicit ResizeParam(float scale_w, float scale_h) { + bool flag = scale_w > 0.f && scale_h > 0.f; + CHECK_EQ(flag, true) << "wrong parameters"; + width_scale = scale_w; + height_scale = scale_h; } - - float heat_map_a; - int output_dim; - - float heat_map_b; - float pad_ratio; - float trans_std; - int sample_per_part; - int group_height; - int group_width; - int pooled_height; - int pooled_width; - int part_height; - int part_width; -}; -template -inline -bool compare_vectors(vectors &a, const vectors &b) { - if (a.size()!= b.size()){ - return false; + ResizeParam(const ResizeParam& right) { + width_scale = right.width_scale; + height_scale = right.height_scale; } - bool comp = true; - for (int i 
= 0; i < a.size(); ++i) { - comp &= comp && (a[i] == b[i]); + ResizeParam& operator=(const ResizeParam& right) { + this->width_scale = right.width_scale; + this->height_scale = right.height_scale; + return *this; } - return comp; -} -template -inline -void copy_vectors(vectors &out, const vectors &in) { - out.resize(in.size()); - for (int i = 0; i < out.size(); ++i) { - out[i] = in[i]; + bool operator==(const ResizeParam& right) { + float eps = 1e-6; + bool flag = fabsf(width_scale - right.width_scale) < eps; + flag &= fabsf(height_scale - right.height_scale) < eps; + return flag; } -} + float width_scale{0.0f}; + float height_scale{0.0f}; +}; -template +template struct RoiPoolParam { RoiPoolParam() = default; RoiPoolParam(int pooled_height_in, int pooled_width_in, float spatial_scale_in, - int height_in, int width_in) - : pooled_height(pooled_height_in) - , pooled_width(pooled_width_in) - , spatial_scale(spatial_scale_in) - , height(height_in) - , width(width_in) + int height_in, int width_in) + : pooled_height(pooled_height_in) + , pooled_width(pooled_width_in) + , spatial_scale(spatial_scale_in) + , height(height_in) + , width(width_in) {} RoiPoolParam(int pooled_height_in, int pooled_width_in, float spatial_scale_in) - : pooled_height(pooled_height_in) - , pooled_width(pooled_width_in) - , spatial_scale(spatial_scale_in) + : pooled_height(pooled_height_in) + , pooled_width(pooled_width_in) + , spatial_scale(spatial_scale_in) {} - RoiPoolParam(const RoiPoolParam &right) - : pooled_height(right.pooled_height) - , pooled_width(right.pooled_width) - , spatial_scale(right.spatial_scale) - , height(right.height) - , width(right.width) + RoiPoolParam(const RoiPoolParam& right) + : pooled_height(right.pooled_height) + , pooled_width(right.pooled_width) + , spatial_scale(right.spatial_scale) + , height(right.height) + , width(right.width) {} - RoiPoolParam &operator=(const RoiPoolParam &right) { + RoiPoolParam& operator=(const RoiPoolParam& right) { pooled_height = 
right.pooled_height; pooled_width = right.pooled_width; spatial_scale = right.spatial_scale; @@ -2409,7 +1865,7 @@ struct RoiPoolParam { width = right.width; return *this; } - bool operator==(const RoiPoolParam &right) { + bool operator==(const RoiPoolParam& right) { bool comp_eq = true; comp_eq = comp_eq && (pooled_height == right.pooled_height); comp_eq = comp_eq && (pooled_width == right.pooled_width); @@ -2425,246 +1881,222 @@ struct RoiPoolParam { int width{1}; }; -template -struct FlattenParam { - FlattenParam() = default; - FlattenParam(const FlattenParam& right) {} - FlattenParam& operator=(const FlattenParam& right){} - bool operator==(const FlattenParam& right){ - return true; - } -}; -template -struct AxpyParam { - AxpyParam() = default; - AxpyParam(const AxpyParam& right) {} - AxpyParam& operator=(const AxpyParam& right){} - bool operator==(const AxpyParam& right){ - return true; - } -}; -template -struct CtcAlignParam { - CtcAlignParam() = default; - CtcAlignParam(int blank_in, bool merge_repeated_in) - : blank(blank_in) - , merge_repeated(merge_repeated_in) +template +struct ScaleParam { + typedef float DataDtype; + ScaleParam() + : axis(1), num_axes(1) + , bias_term(false) {} - CtcAlignParam(const CtcAlignParam &right) - : blank(right.blank) - , merge_repeated(right.merge_repeated) + ScaleParam(std::vector scale_w_in, std::vector scale_b_in, + bool bias_term_in = true, int axis_in = 1, int num_axes_in = 1) + : scale_w(scale_w_in), scale_b(scale_b_in) + , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) {} - CtcAlignParam &operator=(const CtcAlignParam &right) { - blank = right.blank; - merge_repeated = right.merge_repeated; + ScaleParam(std::vector scale_w_in, + bool bias_term_in = false, int axis_in = 1, int num_axes_in = 1) + : scale_w(scale_w_in) + , bias_term(bias_term_in), axis(axis_in), num_axes(num_axes_in) + {} + ScaleParam(const ScaleParam& right) + : scale_w(right.scale_w), scale_b(right.scale_b) + , bias_term(right.bias_term), 
axis(right.axis), num_axes(right.num_axes) + {} + ScaleParam& operator=(const ScaleParam& right) { + scale_w = right.scale_w; + scale_b = right.scale_b; + bias_term = right.bias_term; + axis = right.axis; + num_axes = right.num_axes; return *this; } - bool operator==(const CtcAlignParam &right) { + bool operator==(const ScaleParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (blank == right.blank); - comp_eq = comp_eq && (merge_repeated == right.merge_repeated); + comp_eq = comp_eq && (scale_w == right.scale_w); + comp_eq = comp_eq && (scale_b == right.scale_b); + comp_eq = comp_eq && (bias_term == right.bias_term); + comp_eq = comp_eq && (axis == right.axis); + comp_eq = comp_eq && (num_axes == right.num_axes); return comp_eq; } - int blank; - bool merge_repeated; + int axis; // default is 1 + int num_axes; // default is 1 + bool bias_term; // default false + std::vector scale_w; + std::vector scale_b; }; -template -struct Im2SequenceParam { - Im2SequenceParam() = default; - Im2SequenceParam(int window_h_in, - int window_w_in, - int pad_up_in, - int pad_down_in, - int pad_left_in, - int pad_right_in, - int stride_h_in, - int stride_w_in, - int dilation_h_in, - int dilation_w_in) - : window_h(window_h_in) - , window_w(window_w_in) - , pad_up(pad_up_in) - , pad_down(pad_down_in) - , pad_left(pad_left_in) - , pad_right(pad_right_in) - , stride_h(stride_h_in) - , stride_w(stride_w_in) - , dilation_h(dilation_h_in) - , dilation_w(dilation_w_in) - {} - Im2SequenceParam(const Im2SequenceParam &right) - : window_h(right.window_h) - , window_w(right.window_w) - , pad_up(right.pad_up) - , pad_down(right.pad_down) - , pad_left(right.pad_left) - , pad_right(right.pad_right) - , stride_h(right.stride_h) - , stride_w(right.stride_w) - , dilation_h(right.dilation_h) - , dilation_w(right.dilation_w) - {} - Im2SequenceParam &operator=(const Im2SequenceParam &right) { - window_h = right.window_h; - window_w = right.window_w; - pad_up = right.pad_up; - pad_down = 
right.pad_down; - pad_left = right.pad_left; - pad_right = right.pad_right; - stride_h = right.stride_h; - stride_w = right.stride_w; - dilation_h = right.dilation_h; - dilation_w = right.dilation_w; +template +struct SequenceConvParam { + typedef Tensor opTensor; + + SequenceConvParam() + : filter_tensor(nullptr), + padding_tensor(nullptr), + context_length(1), + context_start(0), + context_stride(1), + padding_trainable(false) + {} + SequenceConvParam(opTensor* filter_tensor_in, int context_length_in, + int context_start_in = 0, int context_stride_in = 1, bool padding_trainable_in = false, + opTensor* padding_tensor_in = nullptr) + : filter_tensor(filter_tensor_in), + padding_tensor(padding_tensor_in), + context_length(context_length_in), + context_start(context_start_in), + context_stride(context_stride_in), + padding_trainable(padding_trainable_in) + {} + SequenceConvParam(const SequenceConvParam& right) + : filter_tensor(right.filter_tensor), + padding_tensor(right.padding_tensor), + context_length(right.context_length), + context_start(right.context_start), + context_stride(right.context_stride), + padding_trainable(right.padding_trainable) + {} + SequenceConvParam& operator=(const SequenceConvParam& right) { + filter_tensor = right.filter_tensor; + padding_tensor = right.padding_tensor; + context_length = right.context_length; + context_start = right.context_start; + context_stride = right.context_stride; + padding_trainable = right.padding_trainable; + return *this; } - bool operator==(const Im2SequenceParam &right) { + bool operator==(const SequenceConvParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (window_h == right.window_h); - comp_eq = comp_eq && (window_w == right.window_w); - comp_eq = comp_eq && (pad_up == right.pad_up); - comp_eq = comp_eq && (pad_down == right.pad_down); - comp_eq = comp_eq && (pad_left == right.pad_left); - comp_eq = comp_eq && (pad_right == right.pad_right); - comp_eq = comp_eq && (stride_h == right.stride_h); - 
comp_eq = comp_eq && (stride_w == right.stride_w); - comp_eq = comp_eq && (dilation_h == right.dilation_h); - comp_eq = comp_eq && (dilation_w == right.dilation_w); + comp_eq = comp_eq && (filter_tensor = right.filter_tensor); + comp_eq = comp_eq && (padding_tensor = right.padding_tensor); + comp_eq = comp_eq && (context_length = right.context_length); + comp_eq = comp_eq && (context_start = right.context_start); + comp_eq = comp_eq && (context_stride = right.context_stride); + comp_eq = comp_eq && (padding_trainable = right.padding_trainable); return comp_eq; } - int window_h; - int window_w; - int pad_up; - int pad_down; - int pad_left; - int pad_right; - int stride_h; - int stride_w; - int dilation_h; - int dilation_w; + + opTensor* filter_tensor; + opTensor* padding_tensor; + int context_length; + int context_start; + int context_stride; + bool padding_trainable; }; -template -struct CastParam { - CastParam() = default; - CastParam(int in_type_in, int out_type_in) - : in_type(in_type_in) - , out_type(out_type_in) +template +struct SequencePoolParam { + SequencePoolParam() + : sequence_pool_type(Sequence_pool_unknow) {} - CastParam(const CastParam &right) - : in_type(right.in_type) - , out_type(right.out_type) + SequencePoolParam(SequencePoolType sequence_pool_type_in) + : sequence_pool_type(sequence_pool_type_in) {} - CastParam &operator=(const CastParam &right) { - in_type = right.in_type; - out_type = right.out_type; + SequencePoolParam(const SequencePoolParam& right) + : sequence_pool_type(right.sequence_pool_type) + {} + SequencePoolParam& operator=(const SequencePoolParam& right) { + sequence_pool_type = right.sequence_pool_type; return *this; } - bool operator==(const CastParam &right) { + bool operator==(const SequencePoolParam& right) { bool comp_eq = true; - comp_eq = comp_eq && (in_type == right.in_type); - comp_eq = comp_eq && (out_type == right.out_type); + comp_eq = comp_eq && (sequence_pool_type == right.sequence_pool_type); return comp_eq; } - 
int in_type; - int out_type; + SequencePoolType sequence_pool_type; }; -template -struct EmbeddingParam { - EmbeddingParam() = default; - EmbeddingParam(int word_num_in, int emb_dim_in, int padding_idx_in, - opTensor* weight_tensor_in) - : word_num(word_num_in) - , emb_dim(emb_dim_in) - , padding_idx(padding_idx_in) - , weight_tensor(weight_tensor_in) - {} - EmbeddingParam(const EmbeddingParam &right) - : word_num(right.word_num) - , emb_dim(right.emb_dim) - , padding_idx(right.padding_idx) - , weight_tensor(right.weight_tensor) - {} - EmbeddingParam &operator=(const EmbeddingParam &right) { - word_num = right.word_num; - emb_dim = right.emb_dim; - padding_idx = right.padding_idx; - weight_tensor = right.weight_tensor; - return *this; + +template +struct SliceParam { + SliceParam() = default; + explicit SliceParam(int axis_in, std::vector slice_points_in) { + CHECK_GE(axis_in, 0) << "slice axis should >=0, current is " << axis_in; + axis = axis_in; + slice_points = slice_points_in; } - bool operator==(const EmbeddingParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (word_num == right.word_num); - comp_eq = comp_eq && (emb_dim == right.emb_dim); - comp_eq = comp_eq && (padding_idx == right.padding_idx); - comp_eq = comp_eq && (weight_tensor == right.weight_tensor); - return comp_eq; + SliceParam(const SliceParam& right) { + axis = right.axis; + slice_points = right.slice_points; } - inline const opTensor* weight() { - return weight_tensor; + SliceParam& operator=(const SliceParam& right) { + axis = right.axis; + slice_points = right.slice_points; + return *this; } + bool operator==(const SliceParam& right) { + bool comp_eq = slice_points.size() == right.slice_points.size(); - inline opTensor* mutable_weight() { - return weight_tensor; + for (int i = 0; i < slice_points.size(); ++i) { + if (!comp_eq) { + return false; + } + + comp_eq = slice_points[i] == right.slice_points[i]; + } + + return axis == right.axis; } - int emb_dim; - int word_num; - int 
padding_idx; -private: - opTensor* weight_tensor; + int axis; + std::vector slice_points; }; -template -struct LayerNormParam { - LayerNormParam() = default; - LayerNormParam(int axis_in, float eps_in, opTensor* weights_scale, opTensor* weights_bias) { +template +struct SoftmaxParam { + SoftmaxParam() = default; + explicit SoftmaxParam(int axis_in) { + CHECK_GE(axis_in, 0) << "input axis index should >= 0, current is " << axis_in; axis = axis_in; - eps = eps_in; - scale = weights_scale; - bias = weights_bias; } - LayerNormParam(const LayerNormParam &right) { + SoftmaxParam(const SoftmaxParam& right) { axis = right.axis; - eps = right.eps; - scale = right.scale; - bias = right.bias; } - LayerNormParam &operator=(const LayerNormParam &right) { + SoftmaxParam& operator=(const SoftmaxParam& right) { this->axis = right.axis; - this->eps = right.eps; - this->scale = right.scale; - this->bias = right.bias; return *this; } - bool operator==(const LayerNormParam &right) { - bool comp_eq = true; - comp_eq = comp_eq && (axis == right.axis); - comp_eq = comp_eq && (fabsf(eps - right.eps) < 1e-7f); - comp_eq = comp_eq && (scale == scale); - comp_eq = comp_eq && (bias == bias); - return comp_eq; - } - inline const opTensor* scale_weights() { - return scale; + bool operator==(const SoftmaxParam& right) { + return axis == right.axis; } + int axis; +}; - inline opTensor* mutable_scale_weights() { - return scale; +template +struct SPPParam { + SPPParam() = default; + SPPParam(int pyramid_height_in, PoolingType pool_type_in) + : pyramid_height(pyramid_height_in) + , pool_type(pool_type_in) + {} + SPPParam(const SPPParam& right) + : pyramid_height(right.pyramid_height) + , pool_type(right.pool_type) + {} + SPPParam& operator=(const SPPParam& right) { + pyramid_height = right.pyramid_height; + pool_type = right.pool_type; + return *this; } - - inline const opTensor* bias_weights() { - return bias; + bool operator==(const SPPParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && 
(pyramid_height == right.pyramid_height); + comp_eq = comp_eq && (pool_type == right.pool_type); + return comp_eq; } - inline opTensor* mutable_bias_weights() { - return bias; - } + int pyramid_height; + PoolingType pool_type; +}; - int axis; - float eps{1e-5f}; -private: - opTensor* scale; - opTensor* bias; +template +struct TransposeParam { + TransposeParam() = default; + TransposeParam(const TransposeParam& right) {} + TransposeParam& operator=(const TransposeParam& right) {} + bool operator==(const TransposeParam& right) { + return true; + } }; } diff --git a/saber/saber_types.h b/saber/saber_types.h index 3dccb5f3f..6b53b112e 100644 --- a/saber/saber_types.h +++ b/saber/saber_types.h @@ -1,20 +1,22 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. */ + limitations under the License. 
+*/ #ifndef ANAKIN_SABER_CORE_TYPES_H #define ANAKIN_SABER_CORE_TYPES_H +#include "anakin_config.h" namespace anakin{ namespace saber{ @@ -32,43 +34,126 @@ enum TargetTypeEnum { eX86 = 4, eNVHX86 = 5, eNVHARM = 6, - eBM = 7 + eARMGPU = 7, + eARMDSP =8, + eBM = 9 }; template struct TargetType {}; // NV device without pinned memory typedef TargetType NV; +typedef TargetType BM; typedef TargetType ARM; +typedef TargetType ARMGPU; typedef TargetType AMD; typedef TargetType X86; // NV device with pinned memory typedef TargetType NVHX86; //typedef TargetType NVHARM; -// Bitmain device support -typedef TargetType BM; // invalid target type, for target has only one memory block typedef TargetType INVLD; +enum LayoutType { + Layout_invalid = 0, + Layout_W = 1, + Layout_HW = 2, + Layout_WH = 3, + Layout_NW = 4, + Layout_NHW = 5, + Layout_NCHW = 6, + Layout_NHWC = 7, + Layout_NCHW_C4 = 8, + Layout_NCHW_C8 = 9, + Layout_NCHW_C16 = 10, + Layout_OIHW16I16O = 11, + Layout_GOIHW16I16O = 12 +}; + //! target_type struct -struct W{}; -struct HW{}; -struct WH{}; -struct NW{}; -struct NHW{}; -struct NCHW{}; -struct NHWC{}; -struct NCHW_C4{}; -struct NCHW_C8{}; -struct NCHW_C16{}; -struct OIHW16I16O {}; -struct GOIHW16I16O {}; -//!target_category struct -struct _5D{}; -struct _4D{}; -struct _3D{}; -struct _2D{}; -struct _1D{}; +struct Layout { + virtual int num_index() {return -1;} + virtual int channel_index() {return -1;} + virtual int height_index() {return -1;} + virtual int width_index() {return -1;} + virtual int depth_index() {return -1;} + virtual int inner_c() {return -1;} + virtual int dims() {return -1;} + virtual LayoutType type() {return Layout_invalid;} +}; +struct W : public Layout { + int width_index() {return 0;} + int dims() {return 1;} + LayoutType type() {return Layout_W;} +}; +struct HW : public Layout { + int height_index() {return 0;} + int width_index() {return 1;} + int dims() {return 2;} + LayoutType type() {return Layout_HW;} +}; +struct WH : public Layout { + 
int height_index() {return 1;} + int width_index() {return 0;} + int dims() {return 2;} + LayoutType type() {return Layout_WH;} +}; +struct NW : public Layout { + int num_index() {return 0;} + int width_index() {return 1;} + int dims() {return 2;} + LayoutType type() {return Layout_NW;} +}; +struct NHW : public Layout { + int num_index() {return 0;} + int height_index() {return 1;} + int width_index() {return 2;} + int dims() {return 3;} + LayoutType type() {return Layout_NHW;} +}; +struct NCHW : public Layout { + int num_index() {return 0;} + int channel_index() {return 1;} + int height_index() {return 2;} + int width_index() {return 3;} + int dims() {return 4;} + LayoutType type() {return Layout_NCHW;} +}; +struct NHWC : public Layout { + int num_index() {return 0;} + int height_index() {return 1;} + int width_index() {return 2;} + int channel_index() {return 3;} + int dims() {return 4;} + LayoutType type() {return Layout_NHWC;} +}; +struct NCHW_C4 : public Layout { + int num_index() {return 0;} + int channel_index() {return 1;} + int height_index() {return 2;} + int width_index() {return 3;} + int inner_c() {return 4;} + int dims() {return 5;} + LayoutType type() {return Layout_NCHW_C4;} +}; +struct NCHW_C8 : public Layout { + int num_index() {return 0;} + int channel_index() {return 1;} + int height_index() {return 2;} + int width_index() {return 3;} + int inner_c() {return 8;} + int dims() {return 5;} + LayoutType type() {return Layout_NCHW_C8;} +}; +struct NCHW_C16 : public Layout { + int num_index() {return 0;} + int channel_index() {return 1;} + int height_index() {return 2;} + int width_index() {return 3;} + int inner_c() {return 16;} + int dims() {return 5;} + LayoutType type() {return Layout_NCHW_C16;} +}; enum DataType { AK_INVALID = -1, @@ -85,8 +170,7 @@ enum DataType { AK_STRING = 10, AK_BOOL = 11, AK_SHAPE = 12, - AK_TENSOR = 13, - AK_BM = 14 + AK_TENSOR = 13 }; typedef enum { @@ -108,16 +192,43 @@ typedef enum{ UNKNOWN = 4 }SaberImplStrategy; + 
+//should design this one for pick_best_specify() +enum ImplEnum{ + VENDER_IMPL = 0, + SABER_IMPL +}; + +enum SequencePoolType{ + Sequence_pool_unknow = 0, + Sequence_pool_average, + Sequence_pool_sum, + Sequence_pool_sqrt, + Sequence_pool_last, + Sequence_pool_first, + Sequence_pool_max +}; + +/** + * GRU_Formula,origin for paddle,Cudnn for cudnn,difference is w_h_r and weighted mean + * weight for origin is [W_h_o][W_h_r,W_h_z] + * weight for cudnn is [W_h_o,W_h_r,W_h_z] + */ +enum GruFormula { + GRU_ORIGIN = 0, + GRU_CUDNN +}; + typedef enum{ Active_unknow = 0, Active_sigmoid = 1, Active_relu = 2, Active_tanh = 3, Active_clipped_relu = 4, - Active_elu=5, - Active_identity=6, - Active_sigmoid_fluid=7, - Active_tanh_fluid=8 + Active_elu = 5, + Active_identity = 6, + Active_stanh = 9, + Active_prelu = 10 } ActiveType; @@ -152,29 +263,6 @@ enum CodeType { CORNER_SIZE = 3 }; -typedef enum { - ATRS_NormType_NONE = 0, - ATRS_NormType_WIDTH = 1, - ATRS_NormType_HEIGHT = 2, - ATRS_NormType_WIDTH_LOG = 3, - ATRS_NormType_HEIGHT_LOG = 4, -} ATRS_NormType; - -typedef enum { - DetectionOutputSSD_HEIGHT_AND_WIDTH = 0, - DetectionOutputSSD_HEIGHT_OR_WIDTH = 1 -} DetectionOutputSSD_MIN_SIZE_MODE; - -typedef enum { - ProposalImgScaleToCamCoords_NormType_HEIGHT = 0, - ProposalImgScaleToCamCoords_NormType_HEIGHT_LOG = 1 -} ProposalImgScaleToCamCoords_NormType; - -typedef enum { - ProposalImgScaleToCamCoords_OrienType_PI = 0, - ProposalImgScaleToCamCoords_OrienType_PI2 = 1 -} ProposalImgScaleToCamCoords_OrienType; - typedef enum { SABER_POWER_HIGH = 0, SABER_POWER_LOW = 1, @@ -186,6 +274,18 @@ typedef enum { BORDER_REPLICATE } BorderType; +typedef enum { + PRIOR_MIN = 0, + PRIOR_MAX = 1, + PRIOR_COM = 2 +} PriorType; + +typedef enum{ + RANDOM=0, + SPECIAL, + CUSTOM +} TestDataType; + } //namespace saber } //namespace anakin diff --git a/saber/utils.h b/saber/utils.h index 31b575957..4e1ad87c7 100644 --- a/saber/utils.h +++ b/saber/utils.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 
Baidu, Inc. All Rights Reserved. +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/tags b/tags new file mode 100644 index 000000000..5569c5e83 --- /dev/null +++ b/tags @@ -0,0 +1,40601 @@ +!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/ +!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/ +!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/ +!_TAG_PROGRAM_NAME Exuberant Ctags // +!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/ +!_TAG_PROGRAM_VERSION 5.8 // +$ tags /^Licensor LICENSE \/^ names, trademarks, service marks, or product names of the Licensor,$\/;" v language:C++$/;" v language:C++ +$ tags /^Work LICENSE \/^ attribution notices from the Source form of the Work,$\/;" v language:C++$/;" v language:C++ +$ tags /^code LICENSE \/^ not limited to compiled object code, generated documentation,$\/;" v language:C++$/;" v language:C++ +$ tags /^import LICENSE \/^ use, offer to sell, sell, import, and otherwise transfer the Work,$\/;" v language:C++$/;" v language:C++ +$ tags /^made LICENSE \/^ (except as stated in this section) patent license to make, have made,$\/;" v language:C++$/;" v language:C++ +$ tags /^of LICENSE \/^ separable from, or merely link (or bind by name) to the interfaces of,$\/;" v language:C++$/;" v language:C++ +$ tags /^perpetual LICENSE \/^ this License, each Contributor hereby grants to You a perpetual,$\/;" v language:C++$/;" v language:C++ +$ tags /^reproduce LICENSE \/^ copyright license to reproduce, prepare Derivative Works of,$\/;" v language:C++$/;" v language:C++ +$ tags /^sell LICENSE \/^ use, offer to sell, sell, import, and otherwise transfer the Work,$\/;" v language:C++$/;" v language:C++ +$ tags /^systems LICENSE \/^ communication on electronic mailing lists, source code control systems,$\/;" v 
language:C++$/;" v language:C++ +$ tags /^trademarks LICENSE \/^ names, trademarks, service marks, or product names of the Licensor,$\/;" v language:C++$/;" v language:C++ +$ tags /^whole LICENSE \/^ for any such Derivative Works as a whole, provided Your use,$\/;" v language:C++$/;" v language:C++ +$I tools/external_converter_v2/parser/frontend/dash_board/static/echart/echarts.min.js /^},_judgeAutoMode:function(){var t=this.option,e=!1;iI(function(n){null!=t[n.axisIndex]&&(e=!0)},this);var n=t.orient;return null==n&&e?"orient":e?void 0:(null==n&&(t.orient="horizontal"),"axisIndex")},_autoSetAxisIndex:function(){var t=!0,e=this.get("orient",!0),n=this.option,i=this.dependentModels;if(t){var r="vertical"===e?"y":"x";i[r+"Axis"].length?(n[r+"AxisIndex"]=[0],t=!1):nI(i.singleAxis,function(i){t&&i.get("orient",!0)===e&&(n.singleAxisIndex=[i.componentIndex],t=!1)})}t&&iI(function(e){if(t){var i=[],r=this.dependentModels[e.axis];if(r.length&&!i.length)for(var o=0,a=r.length;a>o;o++)"category"===r[o].get("type")&&i.push(o);n[e.axisIndex]=i,i.length&&(t=!1)}},this),t&&this.ecModel.eachSeries(function(t){this._isSeriesHasAllAxesTypeOf(t,"value")&&iI(function(e){var i=n[e.axisIndex],r=t.get(e.axisIndex),o=t.get(e.axisId),a=t.ecModel.queryComponents({mainType:e.axis,index:r,id:o})[0];r=a.componentIndex,u(i,r)<0&&i.push(r)})},this)},_autoSetOrient:function(){var t;this.eachTargetAxis(function(e){!t&&(t=e.name)},this),this.option.orient="y"===t?"vertical":"horizontal"},_isSeriesHasAllAxesTypeOf:function(t,e){var n=!0;return iI(function(i){var r=t.get(i.axisIndex),o=this.dependentModels[i.axis][r];o&&o.get("type")===e||(n=!1)},this),n},_setDefaultThrottle:function(t){if(t.hasOwnProperty("throttle")&&(this._autoThrottle=!1),this._autoThrottle){var e=this.ecModel.option;this.option.throttle=e.animation&&e.animationDurationUpdate>0?100:20}},getFirstTargetAxisModel:function(){var t;return iI(function(e){if(null==t){var 
n=this.get(e.axisIndex);n.length&&(t=this.dependentModels[e.axis][n[0]])}},this),t},eachTargetAxis:function(t,e){var n=this.ecModel;iI(function(i){nI(this.get(i.axisIndex),function(r){t.call(e,i,r,this,n)},this)},this)},getAxisProxy:function(t,e){return this._axisProxies[t+"_"+e]},getAxisModel:function(t,e){var n=this.getAxisProxy(t,e);return n&&n.getAxisModel()},setRawRange:function(t,e){var n=this.option;nI([["start","startValue"],["end","endValue"]],function(e){(null!=t[e[0]]||null!=t[e[1]])&&(n[e[0]]=t[e[0]],n[e[1]]=t[e[1]])},this),!e&&cp(this,t)},getPercentRange:function(){var t=this.findRepresentativeAxisProxy();return t?t.getDataPercentWindow():void 0},getValueRange:function(t,e){if(null!=t||null!=e)return this.getAxisProxy(t,e).getDataValueWindow();var n=this.findRepresentativeAxisProxy();return n?n.getDataValueWindow():void 0},findRepresentativeAxisProxy:function(t){if(t)return t.__dzAxisProxy;var e=this._axisProxies;for(var n in e)if(e.hasOwnProperty(n)&&e[n].hostedBy(this))return e[n];for(var n in e)if(e.hasOwnProperty(n)&&!e[n].hostedBy(this))return e[n]},getRangePropMode:function(){return this._rangePropMode.slice()}}),oI=t_.extend({type:"dataZoom",render:function(t,e,n){this.dataZoomModel=t,this.ecModel=e,this.api=n},getTargetCoordInfo:function(){function t(t,e,n,i){for(var r,o=0;o')}}catch(yI){fI=function(t){return mI.createElement("<"+t+' xmlns="'+pI+'" class="zrvml">')}}var xI=Um.CMD,_I=Math.round,wI=Math.sqrt,bI=Math.abs,MI=Math.cos,SI=Math.sin,II=Math.max;if(!Jp.canvasSupported){var CI=",",TI="progid:DXImageTransform.Microsoft",AI=21600,DI=AI\/2,kI=1e5,PI=1e3,LI=function(t){t.style.cssText="position:absolute;left:0;top:0;width:1px;height:1px;",t.coordsize=AI+","+AI,t.coordorigin="0,0"},OI=function(t){return 
String(t).replace(\/&\/g,"&").replace(\/"\/g,""")},EI=function(t,e,n){return"rgb("+[t,e,n].join(",")+")"},RI=function(t,e){e&&t&&e.parentNode!==t&&t.appendChild(e)},zI=function(t,e){e&&t&&e.parentNode===t&&t.removeChild(e)},BI=function(t,e,n){return(parseFloat(t)||0)*kI+(parseFloat(e)||0)*PI+n},NI=function(t,e){return"string"==typeof t?t.lastIndexOf("%")>=0?parseFloat(t)\/100*e:parseFloat(t):t},VI=function(t,e,n){var i=Ee(e);n=+n,isNaN(n)&&(n=1),i&&(t.color=EI(i[0],i[1],i[2]),t.opacity=n*i[3])},FI=function(t){var e=Ee(t);return[EI(e[0],e[1],e[2]),e[3]]},HI=function(t,e,n){var i=e.fill;if(null!=i)if(i instanceof ky){var r,o=0,a=[0,0],s=0,l=1,u=n.getBoundingRect(),h=u.width,c=u.height;if("linear"===i.type){r="gradient";var d=n.transform,f=[i.x*h,i.y*c],p=[i.x2*h,i.y2*c];d&&(oe(f,f,d),oe(p,p,d));var g=p[0]-f[0],v=p[1]-f[1];o=180*Math.atan2(g,v)\/Math.PI,0>o&&(o+=360),1e-6>o&&(o=0)}else{r="gradientradial";var f=[i.x*h,i.y*c],d=n.transform,m=n.scale,y=h,x=c;a=[(f[0]-u.x)\/y,(f[1]-u.y)\/x],d&&oe(f,f,d),y\/=m[0]*AI,x\/=m[1]*AI;var _=II(y,x);s=0\/_,l=2*i.r\/_-s}var w=i.colorStops.slice();w.sort(function(t,e){return t.offset-e.offset});for(var b=w.length,M=[],S=[],I=0;b>I;I++){var C=w[I],T=FI(C.color);S.push(C.offset*l+s+" "+T[0]),(0===I||I===b-1)&&M.push(T)}if(b>=2){var A=M[0][0],D=M[1][0],k=M[0][1]*e.opacity,P=M[1][1]*e.opacity;t.type=r,t.method="none",t.focus="100%",t.angle=o,t.color=A,t.color2=D,t.colors=S.join(","),t.opacity=P,t.opacity2=k}"radial"===r&&(t.focusposition=a.join(","))}else VI(t,i,e.opacity)},WI=function(t,e){null!=e.lineDash&&(t.dashstyle=e.lineDash.join(" ")),null==e.stroke||e.stroke instanceof ky||VI(t,e.stroke,e.opacity)},GI=function(t,e,n,i){var r="fill"==e,o=t.getElementsByTagName(e)[0];null!=n[e]&&"none"!==n[e]&&(r||!r&&n.lineWidth)?(t[r?"filled":"stroked"]="true",n[e]instanceof ky&&zI(t,o),o||(o=mp(e)),r?HI(o,n,i):WI(o,n),RI(t,o)):(t[r?"filled":"stroked"]="false",zI(t,o))},UI=[[],[],[]],ZI=function(t,e){var 
n,i,r,o,a,s,l=xI.M,u=xI.C,h=xI.L,c=xI.A,d=xI.Q,f=[],p=t.data,g=t.len();for(o=0;g>o;){switch(r=p[o++],i="",n=0,r){case l:i=" m ",n=1,a=p[o++],s=p[o++],UI[0][0]=a,UI[0][1]=s;break;case h:i=" l ",n=1,a=p[o++],s=p[o++],UI[0][0]=a,UI[0][1]=s;break;case d:case u:i=" c ",n=3;var v,m,y=p[o++],x=p[o++],_=p[o++],w=p[o++];r===d?(v=_,m=w,_=(_+2*y)\/3,w=(w+2*x)\/3,y=(a+2*y)\/3,x=(s+2*x)\/3):(v=p[o++],m=p[o++]),UI[0][0]=y,UI[0][1]=x,UI[1][0]=_,UI[1][1]=w,UI[2][0]=v,UI[2][1]=m,a=v,s=m;break;case c:var b=0,M=0,S=1,I=1,C=0;e&&(b=e[4],M=e[5],S=wI(e[0]*e[0]+e[1]*e[1]),I=wI(e[2]*e[2]+e[3]*e[3]),C=Math.atan2(-e[1]\/I,e[0]\/S));var T=p[o++],A=p[o++],D=p[o++],k=p[o++],P=p[o++]+C,L=p[o++]+P+C;o++;var O=p[o++],E=T+MI(P)*D,R=A+SI(P)*k,y=T+MI(L)*D,x=A+SI(L)*k,z=O?" wa ":" at ";Math.abs(E-y)<1e-4&&(Math.abs(L-P)>.01?O&&(E+=270\/AI):Math.abs(R-A)<1e-4?O&&T>E||!O&&E>T?x-=270\/AI:x+=270\/AI:O&&A>R||!O&&R>A?y+=270\/AI:y-=270\/AI),f.push(z,_I(((T-D)*S+b)*AI-DI),CI,_I(((A-k)*I+M)*AI-DI),CI,_I(((T+D)*S+b)*AI-DI),CI,_I(((A+k)*I+M)*AI-DI),CI,_I((E*S+b)*AI-DI),CI,_I((R*I+M)*AI-DI),CI,_I((y*S+b)*AI-DI),CI,_I((x*I+M)*AI-DI)),a=y,s=x;break;case xI.R:var B=UI[0],N=UI[1];B[0]=p[o++],B[1]=p[o++],N[0]=B[0]+p[o++],N[1]=B[1]+p[o++],e&&(oe(B,B,e),oe(N,N,e)),B[0]=_I(B[0]*AI-DI),N[0]=_I(N[0]*AI-DI),B[1]=_I(B[1]*AI-DI),N[1]=_I(N[1]*AI-DI),f.push(" m ",B[0],CI,B[1]," l ",N[0],CI,B[1]," l ",N[0],CI,N[1]," l ",B[0],CI,N[1]);break;case xI.Z:f.push(" x ")}if(n>0){f.push(i);for(var V=0;n>V;V++){var F=UI[V];e&&oe(F,F,e),f.push(_I(F[0]*AI-DI),CI,_I(F[1]*AI-DI),n-1>V?CI:"")}}}return f.join("")};Lr.prototype.brushVML=function(t){var e=this.style,n=this._vmlEl;n||(n=mp("shape"),LI(n),this._vmlEl=n),GI(n,"fill",e,this),GI(n,"stroke",e,this);var i=this.transform,r=null!=i,o=n.getElementsByTagName("stroke")[0];if(o){var a=e.lineWidth;if(r&&!e.strokeNoScale){var s=i[0]*i[3]-i[1]*i[2];a*=wI(bI(s))}o.weight=a+"px"}var l=this.path||(this.path=new 
Um);this.__dirtyPath&&(l.beginPath(),this.buildPath(l,this.shape),l.toStatic(),this.__dirtyPath=!1),n.path=ZI(l,this.transform),n.style.zIndex=BI(this.zlevel,this.z,this.z2),RI(t,n),null!=e.text?this.drawRectText(t,this.getBoundingRect()):this.removeRectText(t)},Lr.prototype.onRemove=function(t){zI(t,this._vmlEl),this.removeRectText(t)},Lr.prototype.onAdd=function(t){RI(t,this._vmlEl),this.appendRectText(t)};var jI=function(t){return"object"==typeof t&&t.tagName&&"IMG"===t.tagName.toUpperCase()};ai.prototype.brushVML=function(t){var e,n,i=this.style,r=i.image;if(jI(r)){var o=r.src;if(o===this._imageSrc)e=this._imageWidth,n=this._imageHeight;else{var a=r.runtimeStyle,s=a.width,l=a.height;a.width="auto",a.height="auto",e=r.width,n=r.height,a.width=s,a.height=l,this._imageSrc=o,this._imageWidth=e,this._imageHeight=n}r=o}else r===this._imageSrc&&(e=this._imageWidth,n=this._imageHeight);if(r){var u=i.x||0,h=i.y||0,c=i.width,d=i.height,f=i.sWidth,p=i.sHeight,g=i.sx||0,v=i.sy||0,m=f&&p,y=this._vmlEl;y||(y=mI.createElement("div"),LI(y),this._vmlEl=y);var x,_=y.style,w=!1,b=1,M=1;if(this.transform&&(x=this.transform,b=wI(x[0]*x[0]+x[1]*x[1]),M=wI(x[2]*x[2]+x[3]*x[3]),w=x[1]||x[2]),w){var S=[u,h],I=[u+c,h],C=[u,h+d],T=[u+c,h+d];oe(S,S,x),oe(I,I,x),oe(C,C,x),oe(T,T,x);var A=II(S[0],I[0],C[0],T[0]),D=II(S[1],I[1],C[1],T[1]),k=[];k.push("M11=",x[0]\/b,CI,"M12=",x[2]\/M,CI,"M21=",x[1]\/b,CI,"M22=",x[3]\/M,CI,"Dx=",_I(u*b+x[4]),CI,"Dy=",_I(h*M+x[5])),_.padding="0 "+_I(A)+"px "+_I(D)+"px 0",_.filter=TI+".Matrix("+k.join("")+", SizingMethod=clip)"}else x&&(u=u*b+x[4],h=h*M+x[5]),_.filter="",_.left=_I(u)+"px",_.top=_I(h)+"px";var P=this._imageEl,L=this._cropEl;P||(P=mI.createElement("div"),this._imageEl=P);var O=P.style;if(m){if(e&&n)O.width=_I(b*e*c\/f)+"px",O.height=_I(M*n*d\/p)+"px";else{var E=new 
Image,R=this;E.onload=function(){E.onload=null,e=E.width,n=E.height,O.width=_I(b*e*c\/f)+"px",O.height=_I(M*n*d\/p)+"px",R._imageWidth=e,R._imageHeight=n,R._imageSrc=r},E.src=r}L||(L=mI.createElement("div"),L.style.overflow="hidden",this._cropEl=L);var z=L.style;z.width=_I((c+g*c\/f)*b),z.height=_I((d+v*d\/p)*M),z.filter=TI+".Matrix(Dx="+-g*c\/f*b+",Dy="+-v*d\/p*M+")",L.parentNode||y.appendChild(L),P.parentNode!=L&&L.appendChild(P)}else O.width=_I(b*c)+"px",O.height=_I(M*d)+"px",y.appendChild(P),L&&L.parentNode&&(y.removeChild(L),this._cropEl=null);var B="",N=i.opacity;1>N&&(B+=".Alpha(opacity="+_I(100*N)+") "),B+=TI+".AlphaImageLoader(src="+r+", SizingMethod=scale)",O.filter=B,y.style.zIndex=BI(this.zlevel,this.z,this.z2),RI(t,y),null!=i.text&&this.drawRectText(t,this.getBoundingRect())}},ai.prototype.onRemove=function(t){zI(t,this._vmlEl),this._vmlEl=null,this._cropEl=null,this._imageEl=null,this.removeRectText(t)},ai.prototype.onAdd=function(t){RI(t,this._vmlEl),this.appendRectText(t)};var XI,YI="normal",qI={},$I=0,KI=100,QI=document.createElement("div"),JI=function(t){var e=qI[t];if(!e){$I>KI&&($I=0,qI={});var n,i=QI.style;try{i.font=t,n=i.fontFamily.split(",")[0]}catch(r){}e={style:i.fontStyle||YI,variant:i.fontVariant||YI,weight:i.fontWeight||YI,size:0|parseFloat(i.fontSize||12),family:n||"Microsoft YaHei"},qI[t]=e,$I++}return e};bn("measureText",function(t,e){var n=mI;XI||(XI=n.createElement("div"),XI.style.cssText="position:absolute;top:-20000px;left:0;padding:0;margin:0;border:none;white-space:pre;",mI.body.appendChild(XI));try{XI.style.font=e}catch(i){}return XI.innerHTML="",XI.appendChild(n.createTextNode(t)),{width:XI.offsetWidth}});for(var tC=new rn,eC=function(t,e,n,i){var r=this.style;this.__dirty&&Hn(r,!0);var o=r.text;if(null!=o&&(o+=""),o){if(r.rich){var a=Bn(o,r);o=[];for(var s=0;sI;I++){var 
C=S[I];C?C.innerHTML="":(C=S[I]=bp("tspan"),o.appendChild(C),Ap(C,"alignment-baseline",d),Ap(C,"text-anchor",b)),Ap(C,"x",a),Ap(C,"y",s+I*f+M),C.appendChild(document.createTextNode(_[I]))}for(;II;++I){var C=t.__tspanList[I];C&&(Ap(C,"x",a),Ap(C,"y",s+I*f+M))}}};_C.drawRectText=bC,_C.brush=function(t){var e=t.style;null!=e.text&&(e.textPosition=[0,0],bC(t,{x:e.x||0,y:e.y||0,width:0,height:0},t.getBoundingRect()))},Op.prototype={diff:function(t,e,n){function i(){for(var n=-1*s;s>=n;n+=2){var i,l=u[n-1],h=u[n+1],c=(h?h.newPos:0)-n;l&&(u[n-1]=void 0);var d=l&&l.newPos+1=0&&a>c;if(d||f){if(!d||f&&l.newPos=o&&c+1>=a)return Ep(r,i.components,e,t);u[n]=i}else u[n]=void 0}s++}n||(n=function(t,e){return t===e}),this.equals=n;var r=this;t=t.slice(),e=e.slice();var o=e.length,a=t.length,s=1,l=o+a,u=[{newPos:-1,components:[]}],h=this.extractCommon(u[0],e,t,0);if(u[0].newPos+1>=o&&h+1>=a){for(var c=[],d=0;d=s;){var f=i();if(f)return f}},pushComponent:function(t,e,n){var i=t[t.length-1];i&&i.added===e&&i.removed===n?t[t.length-1]={count:i.count+1,added:e,removed:n}:t.push({count:1,added:e,removed:n})},extractCommon:function(t,e,n,i){for(var r=e.length,o=n.length,a=t.newPos,s=a-i,l=0;r>a+1&&o>s+1&&this.equals(e[a+1],n[s+1]);)a++,s++,l++;return l&&t.components.push({count:l}),t.newPos=a,s},tokenize:function(t){return t.slice()},join:function(t){return t.slice()}};var MC=new Op,SC=function(t,e,n){return MC.diff(t,e,n)},IC="0",CC="1";zp.prototype.createElement=bp,zp.prototype.getDefs=function(t){var e=this._svgRoot,n=this._svgRoot.getElementsByTagName("defs");return 0===n.length?t?(n=e.insertBefore(this.createElement("defs"),e.firstChild),n.contains||(n.contains=function(t){var e=n.children;if(!e)return!1;for(var i=e.length-1;i>=0;--i)if(e[i]===t)return!0;return!1}),n):null:n[0]},zp.prototype.update=function(t,e){if(t){var n=this.getDefs(!1);if(t[this._domName]&&n.contains(t[this._domName]))"function"==typeof e&&e(t);else{var 
i=this.add(t);i&&(t[this._domName]=i)}}},zp.prototype.addDom=function(t){var e=this.getDefs(!0);e.appendChild(t)},zp.prototype.removeDom=function(t){var e=this.getDefs(!1);e&&t[this._domName]&&(e.removeChild(t[this._domName]),t[this._domName]=null)},zp.prototype.getDoms=function(){var t=this.getDefs(!1);if(!t)return[];var e=[];return f(this._tagNames,function(n){var i=t.getElementsByTagName(n);e=e.concat([].slice.call(i))}),e},zp.prototype.markAllUnused=function(){var t=this.getDoms(),e=this;f(t,function(t){t[e._markLabel]=IC})},zp.prototype.markUsed=function(t){t&&(t[this._markLabel]=CC)},zp.prototype.removeUnused=function(){var t=this.getDefs(!1);if(t){var e=this.getDoms(),n=this;f(e,function(e){e[n._markLabel]!==CC&&t.removeChild(e)})}},zp.prototype.getSvgProxy=function(t){return t instanceof Lr?yC:t instanceof ai?xC:t instanceof py?_C:yC},zp.prototype.getTextSvgElement=function(t){return t.__textSvgEl},zp.prototype.getSvgElement=function(t){return t.__svgEl},h(Bp,zp),Bp.prototype.addWithoutUpdate=function(t,e){if(e&&e.style){var n=this;f(["fill","stroke"],function(i){if(e.style[i]&&("linear"===e.style[i].type||"radial"===e.style[i].type)){var r,o=e.style[i],a=n.getDefs(!0);o._dom?(r=o._dom,a.contains(o._dom)||n.addDom(r)):r=n.add(o),n.markUsed(e);var s=r.getAttribute("id");t.setAttribute(i,"url(#"+s+")")}})}},Bp.prototype.add=function(t){var e;if("linear"===t.type)e=this.createElement("linearGradient");else{if("radial"!==t.type)return $g("Illegal gradient type."),null;e=this.createElement("radialGradient")}return t.id=t.id||this.nextId++,e.setAttribute("id","zr"+this._zrId+"-gradient-"+t.id),this.updateDom(t,e),this.addDom(e),e},Bp.prototype.update=function(t){var e=this;zp.prototype.update.call(this,t,function(){var 
n=t.type,i=t._dom.tagName;"linear"===n&&"linearGradient"===i||"radial"===n&&"radialGradient"===i?e.updateDom(t,t._dom):(e.removeDom(t),e.add(t))})},Bp.prototype.updateDom=function(t,e){if("linear"===t.type)e.setAttribute("x1",t.x),e.setAttribute("y1",t.y),e.setAttribute("x2",t.x2),e.setAttribute("y2",t.y2);else{if("radial"!==t.type)return void $g("Illegal gradient type.");e.setAttribute("cx",t.x),e.setAttribute("cy",t.y),e.setAttribute("r",t.r)}t.global?e.setAttribute("gradientUnits","userSpaceOnUse"):e.setAttribute("gradientUnits","objectBoundingBox"),e.innerHTML="";for(var n=t.colorStops,i=0,r=n.length;r>i;++i){var o=this.createElement("stop");o.setAttribute("offset",100*n[i].offset+"%"),o.setAttribute("stop-color",n[i].color),e.appendChild(o)}t._dom=e},Bp.prototype.markUsed=function(t){if(t.style){var e=t.style.fill;e&&e._dom&&zp.prototype.markUsed.call(this,e._dom),e=t.style.stroke,e&&e._dom&&zp.prototype.markUsed.call(this,e._dom)}},h(Np,zp),Np.prototype.update=function(t){var e=this.getSvgElement(t);e&&this.updateDom(e,t.__clipPaths,!1);var n=this.getTextSvgElement(t);n&&this.updateDom(n,t.__clipPaths,!0),this.markUsed(t)},Np.prototype.updateDom=function(t,e,n){if(e&&e.length>0){var i,r,o=this.getDefs(!0),a=e[0],s=n?"_textDom":"_dom";a[s]?(r=a[s].getAttribute("id"),i=a[s],o.contains(i)||o.appendChild(i)):(r="zr"+this._zrId+"-clip-"+this.nextId,++this.nextId,i=this.createElement("clipPath"),i.setAttribute("id",r),o.appendChild(i),a[s]=i);var l=this.getSvgProxy(a);if(a.transform&&a.parent.invTransform&&!n){var u=Array.prototype.slice.call(a.transform);ve(a.transform,a.parent.invTransform,a.transform),l.brush(a),a.transform=u}else l.brush(a);var h=this.getSvgElement(a);i.innerHTML="",i.appendChild(h.cloneNode()),t.setAttribute("clip-path","url(#"+r+")"),e.length>1&&this.updateDom(i,e.slice(1),n)}else t&&t.setAttribute("clip-path","none")},Np.prototype.markUsed=function(t){var 
e=this;t.__clipPaths&&t.__clipPaths.length>0&&f(t.__clipPaths,function(t){t._dom&&zp.prototype.markUsed.call(e,t._dom),t._textDom&&zp.prototype.markUsed.call(e,t._textDom)})},h(Vp,zp),Vp.prototype.addWithoutUpdate=function(t,e){if(e&&Fp(e.style)){var n,i=e.style;if(i._shadowDom){n=i._shadowDom;var r=this.getDefs(!0);r.contains(i._shadowDom)||this.addDom(n)}else n=this.add(e);this.markUsed(e);var o=n.getAttribute("id");t.style.filter="url(#"+o+")"}},Vp.prototype.add=function(t){var e=this.createElement("filter"),n=t.style;return n._shadowDomId=n._shadowDomId||this.nextId++,e.setAttribute("id","zr"+this._zrId+"-shadow-"+n._shadowDomId),this.updateDom(t,e),this.addDom(e),e},Vp.prototype.update=function(t,e){var n=e.style;if(Fp(n)){var i=this;zp.prototype.update.call(this,e,function(t){i.updateDom(e,t._shadowDom)})}else this.remove(t,n)},Vp.prototype.remove=function(t,e){null!=e._shadowDomId&&(this.removeDom(e),t.style.filter="")},Vp.prototype.updateDom=function(t,e){var n=e.getElementsByTagName("feDropShadow");n=0===n.length?this.createElement("feDropShadow"):n[0];var i,r,o,a,s=t.style,l=t.scale?t.scale[0]||1:1,u=t.scale?t.scale[1]||1:1;if(s.shadowBlur||s.shadowOffsetX||s.shadowOffsetY)i=s.shadowOffsetX||0,r=s.shadowOffsetY||0,o=s.shadowBlur,a=s.shadowColor;else{if(!s.textShadowBlur)return void this.removeDom(e,s);i=s.textShadowOffsetX||0,r=s.textShadowOffsetY||0,o=s.textShadowBlur,a=s.textShadowColor}n.setAttribute("dx",i\/l),n.setAttribute("dy",r\/u),n.setAttribute("flood-color",a);var h=o\/2\/l,c=o\/2\/u,d=h+" "+c;n.setAttribute("stdDeviation",d),e.setAttribute("x","-100%"),e.setAttribute("y","-100%"),e.setAttribute("width",Math.ceil(o\/2*200)+"%"),e.setAttribute("height",Math.ceil(o\/2*200)+"%"),e.appendChild(n),s._shadowDom=e},Vp.prototype.markUsed=function(t){var e=t.style;e&&e._shadowDom&&zp.prototype.markUsed.call(this,e._shadowDom)};var TC=function(t,e,n,i){this.root=t,this.storage=e,this._opts=n=a({},n||{});var 
r=bp("svg");r.setAttribute("xmlns","http:\/\/www.w3.org\/2000\/svg"),r.setAttribute("version","1.1"),r.setAttribute("baseProfile","full"),r.style.cssText="user-select:none;position:absolute;left:0;top:0;",this.gradientManager=new Bp(i,r),this.clipPathManager=new Np(i,r),this.shadowManager=new Vp(i,r);var o=document.createElement("div");o.style.cssText="overflow:hidden;position:relative",this._svgRoot=r,this._viewport=o,t.appendChild(o),o.appendChild(r),this.resize(n.width,n.height),this._visibleList=[]};TC.prototype={constructor:TC,getType:function(){return"svg"},getViewportRoot:function(){return this._viewport},getViewportRootOffset:function(){var t=this.getViewportRoot();return t?{offsetLeft:t.offsetLeft||0,offsetTop:t.offsetTop||0}:void 0},refresh:function(){var t=this.storage.getDisplayList(!0);this._paintList(t)},setBackgroundColor:function(t){this._viewport.style.background=t},_paintList:function(t){this.gradientManager.markAllUnused(),this.clipPathManager.markAllUnused(),this.shadowManager.markAllUnused();var e,n=this._svgRoot,i=this._visibleList,r=t.length,o=[];for(e=0;r>e;e++){var a=t[e],s=Wp(a),l=Yp(a)||Xp(a);a.invisible||(a.__dirty&&(s&&s.brush(a),this.clipPathManager.update(a),a.style&&(this.gradientManager.update(a.style.fill),this.gradientManager.update(a.style.stroke),this.shadowManager.update(l,a)),a.__dirty=!1),o.push(a))}var u,h=SC(i,o);for(e=0;e=0;--i)if(e[i]===t)return!0;return!1}),n}return null}return n[0]},resize:function(t,e){var n=this._viewport;n.style.display="none";var i=this._opts;if(null!=t&&(i.width=t),null!=e&&(i.height=e),t=this._getSize(0),e=this._getSize(1),n.style.display="",this._width!==t||this._height!==e){this._width=t,this._height=e;var r=n.style;r.width=t+"px",r.height=e+"px";var o=this._svgRoot;o.setAttribute("width",t),o.setAttribute("height",e)}},getWidth:function(){return this._width},getHeight:function(){return this._height},_getSize:function(t){var 
e=this._opts,n=["width","height"][t],i=["clientWidth","clientHeight"][t],r=["paddingLeft","paddingTop"][t],o=["paddingRight","paddingBottom"][t];if(null!=e[n]&&"auto"!==e[n])return parseFloat(e[n]);var a=this.root,s=document.defaultView.getComputedStyle(a);return(a[i]||Hp(s[n])||Hp(a.style[n]))-(Hp(s[r])||0)-(Hp(s[o])||0)|0$/;" l language:C++ +$g tools/external_converter_v2/parser/frontend/dash_board/static/echart/echarts.min.js /^},kg.getLocalTransform=function(t){return Dg.getLocalTransform(this,t)},kg.setTransform=function(t){var e=this.transform,n=t.dpr||1;e?t.setTransform(n*e[0],n*e[1],n*e[2],n*e[3],n*e[4],n*e[5]):t.setTransform(n,0,0,n,0,0)},kg.restoreTransform=function(t){var e=t.dpr||1;t.setTransform(e,0,0,e,0,0)};var Pg=[];kg.decomposeTransform=function(){if(this.transform){var t=this.parent,e=this.transform;t&&t.transform&&(ve(Pg,t.invTransform,e),e=Pg);var n=e[0]*e[0]+e[1]*e[1],i=e[2]*e[2]+e[3]*e[3],r=this.position,o=this.scale;be(n-1)&&(n=Math.sqrt(n)),be(i-1)&&(i=Math.sqrt(i)),e[0]<0&&(n=-n),e[3]<0&&(i=-i),r[0]=e[4],r[1]=e[5],o[0]=n,o[1]=i,this.rotation=Math.atan2(-e[1]\/i,e[0]\/n)}},kg.getGlobalScale=function(){var t=this.transform;if(!t)return[1,1];var e=Math.sqrt(t[0]*t[0]+t[1]*t[1]),n=Math.sqrt(t[2]*t[2]+t[3]*t[3]);return t[0]<0&&(e=-e),t[3]<0&&(n=-n),[e,n]},kg.transformCoordToLocal=function(t,e){var n=[t,e],i=this.invTransform;return i&&oe(n,n,i),n},kg.transformCoordToGlobal=function(t,e){var n=[t,e],i=this.transform;return i&&oe(n,n,i),n},Dg.getLocalTransform=function(t,e){e=e||[],Tg(e);var n=t.origin,i=t.scale||[1,1],r=t.rotation||0,o=t.position||[0,0];return n&&(e[4]-=n[0],e[5]-=n[1]),xe(e,e,i),r&&ye(e,e,r),n&&(e[4]+=n[0],e[5]+=n[1]),e[4]+=o[0],e[5]+=o[1],e};var Lg={linear:function(t){return t},quadraticIn:function(t){return t*t},quadraticOut:function(t){return t*(2-t)},quadraticInOut:function(t){return(t*=2)<1?.5*t*t:-.5*(--t*(t-2)-1)},cubicIn:function(t){return 
t*t*t},cubicOut:function(t){return--t*t*t+1},cubicInOut:function(t){return(t*=2)<1?.5*t*t*t:.5*((t-=2)*t*t+2)},quarticIn:function(t){return t*t*t*t},quarticOut:function(t){return 1- --t*t*t*t},quarticInOut:function(t){return(t*=2)<1?.5*t*t*t*t:-.5*((t-=2)*t*t*t-2)},quinticIn:function(t){return t*t*t*t*t},quinticOut:function(t){return--t*t*t*t*t+1},quinticInOut:function(t){return(t*=2)<1?.5*t*t*t*t*t:.5*((t-=2)*t*t*t*t+2)},sinusoidalIn:function(t){return 1-Math.cos(t*Math.PI\/2)},sinusoidalOut:function(t){return Math.sin(t*Math.PI\/2)},sinusoidalInOut:function(t){return.5*(1-Math.cos(Math.PI*t))},exponentialIn:function(t){return 0===t?0:Math.pow(1024,t-1)},exponentialOut:function(t){return 1===t?1:1-Math.pow(2,-10*t)},exponentialInOut:function(t){return 0===t?0:1===t?1:(t*=2)<1?.5*Math.pow(1024,t-1):.5*(-Math.pow(2,-10*(t-1))+2)},circularIn:function(t){return 1-Math.sqrt(1-t*t)},circularOut:function(t){return Math.sqrt(1- --t*t)},circularInOut:function(t){return(t*=2)<1?-.5*(Math.sqrt(1-t*t)-1):.5*(Math.sqrt(1-(t-=2)*t)+1)},elasticIn:function(t){var e,n=.1,i=.4;return 0===t?0:1===t?1:(!n||1>n?(n=1,e=i\/4):e=i*Math.asin(1\/n)\/(2*Math.PI),-(n*Math.pow(2,10*(t-=1))*Math.sin(2*(t-e)*Math.PI\/i)))},elasticOut:function(t){var e,n=.1,i=.4;return 0===t?0:1===t?1:(!n||1>n?(n=1,e=i\/4):e=i*Math.asin(1\/n)\/(2*Math.PI),n*Math.pow(2,-10*t)*Math.sin(2*(t-e)*Math.PI\/i)+1)},elasticInOut:function(t){var e,n=.1,i=.4;return 0===t?0:1===t?1:(!n||1>n?(n=1,e=i\/4):e=i*Math.asin(1\/n)\/(2*Math.PI),(t*=2)<1?-.5*n*Math.pow(2,10*(t-=1))*Math.sin(2*(t-e)*Math.PI\/i):n*Math.pow(2,-10*(t-=1))*Math.sin(2*(t-e)*Math.PI\/i)*.5+1)},backIn:function(t){var e=1.70158;return t*t*((e+1)*t-e)},backOut:function(t){var e=1.70158;return--t*t*((e+1)*t+e)+1},backInOut:function(t){var e=2.5949095;return(t*=2)<1?.5*t*t*((e+1)*t-e):.5*((t-=2)*t*((e+1)*t+e)+2)},bounceIn:function(t){return 1-Lg.bounceOut(1-t)},bounceOut:function(t){return 
1\/2.75>t?7.5625*t*t:2\/2.75>t?7.5625*(t-=1.5\/2.75)*t+.75:2.5\/2.75>t?7.5625*(t-=2.25\/2.75)*t+.9375:7.5625*(t-=2.625\/2.75)*t+.984375},bounceInOut:function(t){return.5>t?.5*Lg.bounceIn(2*t):.5*Lg.bounceOut(2*t-1)+.5}};Me.prototype={constructor:Me,step:function(t,e){if(this._initialized||(this._startTime=t+this._delay,this._initialized=!0),this._paused)return void(this._pausedTime+=e);var n=(t-this._startTime-this._pausedTime)\/this._life;if(!(0>n)){n=Math.min(n,1);var i=this.easing,r="string"==typeof i?Lg[i]:i,o="function"==typeof r?r(n):n;return this.fire("frame",o),1==n?this.loop?(this.restart(t),"restart"):(this._needsRemove=!0,"destroy"):null}},restart:function(t){var e=(t-this._startTime-this._pausedTime)%this._life;this._startTime=t-e+this.gap,this._pausedTime=0,this._needsRemove=!1},fire:function(t,e){t="on"+t,this[t]&&this[t](this._target,e)},pause:function(){this._paused=!0},resume:function(){this._paused=!1}};var Og=function(){this.head=null,this.tail=null,this._len=0},Eg=Og.prototype;Eg.insert=function(t){var e=new Rg(t);return this.insertEntry(e),e},Eg.insertEntry=function(t){this.head?(this.tail.next=t,t.prev=this.tail,t.next=null,this.tail=t):this.head=this.tail=t,this._len++},Eg.remove=function(t){var e=t.prev,n=t.next;e?e.next=n:this.head=n,n?n.prev=e:this.tail=e,t.next=t.prev=null,this._len--},Eg.len=function(){return this._len},Eg.clear=function(){this.head=this.tail=null,this._len=0};var Rg=function(t){this.value=t,this.next,this.prev},zg=function(t){this._list=new Og,this._map={},this._maxSize=t||10,this._lastRemovedEntry=null},Bg=zg.prototype;Bg.put=function(t,e){var n=this._list,i=this._map,r=null;if(null==i[t]){var o=n.len(),a=this._lastRemovedEntry;if(o>=this._maxSize&&o>0){var s=n.head;n.remove(s),delete i[s.key],r=s.value,this._lastRemovedEntry=s}a?a.value=e:a=new Rg(e),a.key=t,n.insertEntry(a),i[t]=a}return r},Bg.get=function(t){var e=this._map[t],n=this._list;return null!=e?(e!==n.tail&&(n.remove(e),n.insertEntry(e)),e.value):void 
0},Bg.clear=function(){this._list.clear(),this._map={}};var Ng={transparent:[0,0,0,0],aliceblue:[240,248,255,1],antiquewhite:[250,235,215,1],aqua:[0,255,255,1],aquamarine:[127,255,212,1],azure:[240,255,255,1],beige:[245,245,220,1],bisque:[255,228,196,1],black:[0,0,0,1],blanchedalmond:[255,235,205,1],blue:[0,0,255,1],blueviolet:[138,43,226,1],brown:[165,42,42,1],burlywood:[222,184,135,1],cadetblue:[95,158,160,1],chartreuse:[127,255,0,1],chocolate:[210,105,30,1],coral:[255,127,80,1],cornflowerblue:[100,149,237,1],cornsilk:[255,248,220,1],crimson:[220,20,60,1],cyan:[0,255,255,1],darkblue:[0,0,139,1],darkcyan:[0,139,139,1],darkgoldenrod:[184,134,11,1],darkgray:[169,169,169,1],darkgreen:[0,100,0,1],darkgrey:[169,169,169,1],darkkhaki:[189,183,107,1],darkmagenta:[139,0,139,1],darkolivegreen:[85,107,47,1],darkorange:[255,140,0,1],darkorchid:[153,50,204,1],darkred:[139,0,0,1],darksalmon:[233,150,122,1],darkseagreen:[143,188,143,1],darkslateblue:[72,61,139,1],darkslategray:[47,79,79,1],darkslategrey:[47,79,79,1],darkturquoise:[0,206,209,1],darkviolet:[148,0,211,1],deeppink:[255,20,147,1],deepskyblue:[0,191,255,1],dimgray:[105,105,105,1],dimgrey:[105,105,105,1],dodgerblue:[30,144,255,1],firebrick:[178,34,34,1],floralwhite:[255,250,240,1],forestgreen:[34,139,34,1],fuchsia:[255,0,255,1],gainsboro:[220,220,220,1],ghostwhite:[248,248,255,1],gold:[255,215,0,1],goldenrod:[218,165,32,1],gray:[128,128,128,1],green:[0,128,0,1],greenyellow:[173,255,47,1],grey:[128,128,128,1],honeydew:[240,255,240,1],hotpink:[255,105,180,1],indianred:[205,92,92,1],indigo:[75,0,130,1],ivory:[255,255,240,1],khaki:[240,230,140,1],lavender:[230,230,250,1],lavenderblush:[255,240,245,1],lawngreen:[124,252,0,1],lemonchiffon:[255,250,205,1],lightblue:[173,216,230,1],lightcoral:[240,128,128,1],lightcyan:[224,255,255,1],lightgoldenrodyellow:[250,250,210,1],lightgray:[211,211,211,1],lightgreen:[144,238,144,1],lightgrey:[211,211,211,1],lightpink:[255,182,193,1],lightsalmon:[255,160,122,1],lightseagreen:[32,178,170,1
],lightskyblue:[135,206,250,1],lightslategray:[119,136,153,1],lightslategrey:[119,136,153,1],lightsteelblue:[176,196,222,1],lightyellow:[255,255,224,1],lime:[0,255,0,1],limegreen:[50,205,50,1],linen:[250,240,230,1],magenta:[255,0,255,1],maroon:[128,0,0,1],mediumaquamarine:[102,205,170,1],mediumblue:[0,0,205,1],mediumorchid:[186,85,211,1],mediumpurple:[147,112,219,1],mediumseagreen:[60,179,113,1],mediumslateblue:[123,104,238,1],mediumspringgreen:[0,250,154,1],mediumturquoise:[72,209,204,1],mediumvioletred:[199,21,133,1],midnightblue:[25,25,112,1],mintcream:[245,255,250,1],mistyrose:[255,228,225,1],moccasin:[255,228,181,1],navajowhite:[255,222,173,1],navy:[0,0,128,1],oldlace:[253,245,230,1],olive:[128,128,0,1],olivedrab:[107,142,35,1],orange:[255,165,0,1],orangered:[255,69,0,1],orchid:[218,112,214,1],palegoldenrod:[238,232,170,1],palegreen:[152,251,152,1],paleturquoise:[175,238,238,1],palevioletred:[219,112,147,1],papayawhip:[255,239,213,1],peachpuff:[255,218,185,1],peru:[205,133,63,1],pink:[255,192,203,1],plum:[221,160,221,1],powderblue:[176,224,230,1],purple:[128,0,128,1],red:[255,0,0,1],rosybrown:[188,143,143,1],royalblue:[65,105,225,1],saddlebrown:[139,69,19,1],salmon:[250,128,114,1],sandybrown:[244,164,96,1],seagreen:[46,139,87,1],seashell:[255,245,238,1],sienna:[160,82,45,1],silver:[192,192,192,1],skyblue:[135,206,235,1],slateblue:[106,90,205,1],slategray:[112,128,144,1],slategrey:[112,128,144,1],snow:[255,250,250,1],springgreen:[0,255,127,1],steelblue:[70,130,180,1],tan:[210,180,140,1],teal:[0,128,128,1],thistle:[216,191,216,1],tomato:[255,99,71,1],turquoise:[64,224,208,1],violet:[238,130,238,1],wheat:[245,222,179,1],white:[255,255,255,1],whitesmoke:[245,245,245,1],yellow:[255,255,0,1],yellowgreen:[154,205,50,1]},Vg=new 
zg(20),Fg=null,Hg=Ve,Wg=Fe,Gg=(Object.freeze||Object)({parse:Ee,lift:Be,toHex:Ne,fastLerp:Ve,fastMapToColor:Hg,lerp:Fe,mapToColor:Wg,modifyHSL:He,modifyAlpha:We,stringify:Ge}),Ug=Array.prototype.slice,Zg=function(t,e,n,i){this._tracks={},this._target=t,this._loop=e||!1,this._getter=n||Ue,this._setter=i||Ze,this._clipCount=0,this._delay=0,this._doneList=[],this._onframeList=[],this._clipList=[]};Zg.prototype={when:function(t,e){var n=this._tracks;for(var i in e)if(e.hasOwnProperty(i)){if(!n[i]){n[i]=[];var r=this._getter(this._target,i);if(null==r)continue;0!==t&&n[i].push({time:0,value:Je(r)})}n[i].push({time:t,value:e[i]})}return this},during:function(t){return this._onframeList.push(t),this},pause:function(){for(var t=0;tn;n++)t[n].call(this)},start:function(t,e){var n,i=this,r=0,o=function(){r--,r||i._doneCallback()};for(var a in this._tracks)if(this._tracks.hasOwnProperty(a)){var s=nn(this,t,o,this._tracks[a],a,e);s&&(this._clipList.push(s),r++,this.animation&&this.animation.addClip(s),n=s)}if(n){var l=n.onframe;n.onframe=function(t,e){l(t,e);for(var n=0;n1&&(qg=function(){for(var t in arguments)console.log(arguments[t])});var $g=qg,Kg=function(){this.animators=[]};Kg.prototype={constructor:Kg,animate:function(t,e){var n,i=!1,r=this,o=this.__zr;if(t){var a=t.split("."),s=r;i="shape"===a[0];for(var l=0,h=a.length;h>l;l++)s&&(s=s[a[l]]);s&&(n=s)}else n=r;if(!n)return void $g('Property "'+t+'" is not existed in element '+r.id);var c=r.animators,d=new Zg(n,e);return d.during(function(){r.dirty(i)}).done(function(){c.splice(u(c,d),1)}),c.push(d),o&&o.animation.addAnimator(d),d},stopAnimation:function(t){for(var e=this.animators,n=e.length,i=0;n>i;i++)e[i].stop(t);return e.length=0,this},animateTo:function(t,e,n,i,r,o){function a(){l--,l||r&&r()}b(n)?(r=i,i=n,n=0):w(i)?(r=i,i="linear",n=0):w(n)?(r=n,n=0):w(e)?(r=e,e=500):e||(e=500),this.stopAnimation(),this._animateToShallow("",this,t,e,n);var s=this.animators.slice(),l=s.length;l||r&&r();for(var 
u=0;u0&&this.animate(t,!1).when(null==i?500:i,o).delay(r||0),this}};var Qg=function(t){Dg.call(this,t),wg.call(this,t),Kg.call(this,t),this.id=t.id||Kp()};Qg.prototype={type:"element",name:"",__zr:null,ignore:!1,clipPath:null,isGroup:!1,drift:function(t,e){switch(this.draggable){case"horizontal":e=0;break;case"vertical":t=0}var n=this.transform;n||(n=this.transform=[1,0,0,1,0,0]),n[4]+=t,n[5]+=e,this.decomposeTransform(),this.dirty(!1)},beforeUpdate:function(){},afterUpdate:function(){},update:function(){this.updateTransform()},traverse:function(){},attrKV:function(t,e){if("position"===t||"scale"===t||"origin"===t){if(e){var n=this[t];n||(n=this[t]=[]),n[0]=e[0],n[1]=e[1]}}else this[t]=e},hide:function(){this.ignore=!0,this.__zr&&this.__zr.refresh()},show:function(){this.ignore=!1,this.__zr&&this.__zr.refresh()},attr:function(t,e){if("string"==typeof t)this.attrKV(t,e);else if(M(t))for(var n in t)t.hasOwnProperty(n)&&this.attrKV(n,t[n]);return this.dirty(!1),this},setClipPath:function(t){var e=this.__zr;e&&t.addSelfToZr(e),this.clipPath&&this.clipPath!==t&&this.removeClipPath(),this.clipPath=t,t.__zr=e,t.__clipTarget=this,this.dirty(!1)},removeClipPath:function(){var t=this.clipPath;t&&(t.__zr&&t.removeSelfFromZr(t.__zr),t.__zr=null,t.__clipTarget=null,this.clipPath=null,this.dirty(!1))},addSelfToZr:function(t){this.__zr=t;var e=this.animators;if(e)for(var n=0;ni||n>s||l>o||r>u)},contain:function(t,e){var n=this;return t>=n.x&&t<=n.x+n.width&&e>=n.y&&e<=n.y+n.height},clone:function(){return new rn(this.x,this.y,this.width,this.height)},copy:function(t){this.x=t.x,this.y=t.y,this.width=t.width,this.height=t.height},plain:function(){return{x:this.x,y:this.y,width:this.width,height:this.height}}},rn.create=function(t){return new rn(t.x,t.y,t.width,t.height)};var nv=function(t){t=t||{},Qg.call(this,t);for(var e in 
t)t.hasOwnProperty(e)&&(this[e]=t[e]);this._children=[],this.__storage=null,this.__dirty=!0};nv.prototype={constructor:nv,isGroup:!0,type:"group",silent:!1,children:function(){return this._children.slice()},childAt:function(t){return this._children[t]},childOfName:function(t){for(var e=this._children,n=0;n=0&&(n.splice(i,0,t),this._doAdd(t))}return this},_doAdd:function(t){t.parent&&t.parent.remove(t),t.parent=this;var e=this.__storage,n=this.__zr;e&&e!==t.__storage&&(e.addToStorage(t),t instanceof nv&&t.addChildrenToStorage(e)),n&&n.refresh()},remove:function(t){var e=this.__zr,n=this.__storage,i=this._children,r=u(i,t);return 0>r?this:(i.splice(r,1),t.parent=null,n&&(n.delFromStorage(t),t instanceof nv&&t.delChildrenFromStorage(n)),e&&e.refresh(),this)},removeAll:function(){var t,e,n=this._children,i=this.__storage;for(e=0;ei;i++)this._updateAndAddDisplayable(e[i],null,t);n.length=this._displayListLen,Jp.canvasSupported&&dn(n,fn)},_updateAndAddDisplayable:function(t,e,n){if(!t.ignore||n){t.beforeUpdate(),t.__dirty&&t.update(),t.afterUpdate();var i=t.clipPath;if(i){e=e?e.slice():[];for(var r=i,o=t;r;)r.parent=o,r.updateTransform(),e.push(r),o=r,r=r.clipPath}if(t.isGroup){for(var a=t._children,s=0;se;e++)this.delRoot(t[e]);else{var r=u(this._roots,t);r>=0&&(this.delFromStorage(t),this._roots.splice(r,1),t instanceof nv&&t.delChildrenFromStorage(this))}},addToStorage:function(t){return t&&(t.__storage=this,t.dirty(!1)),this},delFromStorage:function(t){return t&&(t.__storage=null),this},dispose:function(){this._renderList=this._roots=null},displayableSortFunc:fn};var av={shadowBlur:1,shadowOffsetX:1,shadowOffsetY:1,textShadowBlur:1,textShadowOffsetX:1,textShadowOffsetY:1,textBoxShadowBlur:1,textBoxShadowOffsetX:1,textBoxShadowOffsetY:1},sv=function(t,e,n){return 
av.hasOwnProperty(e)?n*=t.dpr:n},lv=[["shadowBlur",0],["shadowOffsetX",0],["shadowOffsetY",0],["shadowColor","#000"],["lineCap","butt"],["lineJoin","miter"],["miterLimit",10]],uv=function(t,e){this.extendFrom(t,!1),this.host=e};uv.prototype={constructor:uv,host:null,fill:"#000",stroke:null,opacity:1,lineDash:null,lineDashOffset:0,shadowBlur:0,shadowOffsetX:0,shadowOffsetY:0,lineWidth:1,strokeNoScale:!1,text:null,font:null,textFont:null,fontStyle:null,fontWeight:null,fontSize:null,fontFamily:null,textTag:null,textFill:"#000",textStroke:null,textWidth:null,textHeight:null,textStrokeWidth:0,textLineHeight:null,textPosition:"inside",textRect:null,textOffset:null,textAlign:null,textVerticalAlign:null,textDistance:5,textShadowColor:"transparent",textShadowBlur:0,textShadowOffsetX:0,textShadowOffsetY:0,textBoxShadowColor:"transparent",textBoxShadowBlur:0,textBoxShadowOffsetX:0,textBoxShadowOffsetY:0,transformText:!1,textRotation:0,textOrigin:null,textBackgroundColor:null,textBorderColor:null,textBorderWidth:0,textBorderRadius:0,textPadding:null,rich:null,truncate:null,blend:null,bind:function(t,e,n){for(var i=this,r=n&&n.style,o=!r,a=0;a0},extendFrom:function(t,e){if(t)for(var n in t)!t.hasOwnProperty(n)||e!==!0&&(e===!1?this.hasOwnProperty(n):null==t[n])||(this[n]=t[n])},set:function(t,e){"string"==typeof t?this[t]=e:this.extendFrom(t,!0)},clone:function(){var t=new this.constructor;return t.extendFrom(this,!0),t},getGradient:function(t,e,n){for(var i="radial"===e.type?gn:pn,r=i(t,e,n),o=e.colorStops,a=0;a=0&&n.splice(i,1),t.__hoverMir=null},clearHover:function(){for(var t=this._hoverElements,e=0;er;){var o=t[r],a=o.__from;a&&a.__zr?(r++,a.invisible||(o.transform=a.transform,o.invTransform=a.invTransform,o.__clipPaths=a.__clipPaths,this._doPaintEl(o,n,!0,i))):(t.splice(r,1),a.__hoverMir=null,e--)}n.ctx.restore()}},getHoverLayer:function(){return this.getLayer(Tv)},_paintList:function(t,e,n){if(this._redrawId===n){e=e||!1,this._updateLayerStatus(t);var 
i=this._doPaintList(t,e);if(this._needsManuallyCompositing&&this._compositeManually(),!i){var r=this;gv(function(){r._paintList(t,e,n)})}}},_compositeManually:function(){var t=this.getLayer(Av).ctx,e=this._domRoot.width,n=this._domRoot.height;t.clearRect(0,0,e,n),this.eachBuiltinLayer(function(i){i.virtual&&t.drawImage(i.dom,0,0,e,n)})},_doPaintList:function(t,e){for(var n=[],i=0;i15)break}}o.__drawIndex=v,o.__drawIndex0&&t>i[0]){for(a=0;r-1>a&&!(i[a]t);a++);o=n[i[a]]}if(i.splice(a+1,0,t),n[t]=e,!e.virtual)if(o){var l=o.dom;l.nextSibling?s.insertBefore(e.dom,l.nextSibling):s.appendChild(e.dom)}else s.firstChild?s.insertBefore(e.dom,s.firstChild):s.appendChild(e.dom)},eachLayer:function(t,e){var n,i,r=this._zlevelList;for(i=0;i0?Dv:0),this._needsManuallyCompositing),a.__builtin__||$g("ZLevel "+s+" has been used by unkown layer "+a.id),a!==r&&(a.__used=!0,a.__startIndex!==n&&(a.__dirty=!0),a.__startIndex=n,a.__drawIndex=a.incremental?-1:n,e(n),r=a),i.__dirty&&(a.__dirty=!0,a.incremental&&a.__drawIndex<0&&(a.__drawIndex=n))}e(n),this.eachBuiltinLayer(function(t){!t.__used&&t.getElementCount()>0&&(t.__dirty=!0,t.__startIndex=t.__endIndex=t.__drawIndex=0),t.__dirty&&t.__drawIndex<0&&(t.__drawIndex=t.__startIndex)})},clear:function(){return this.eachBuiltinLayer(this._clearLayer),this},_clearLayer:function(t){t.clear()},setBackgroundColor:function(t){this._backgroundColor=t},configLayer:function(t,e){if(e){var n=this._layerConfig;n[t]?r(n[t],e,!0):n[t]=e;for(var i=0;i=0&&this._clips.splice(e,1)},removeAnimator:function(t){for(var e=t.getClips(),n=0;na;a++){var s=n[a],l=s.step(t,e);l&&(r.push(l),o.push(s))}for(var a=0;i>a;)n[a]._needsRemove?(n[a]=n[i-1],n.pop(),i--):a++;i=r.length;for(var a=0;i>a;a++)o[a].fire(r[a]);this._time=t,this.onframe(e),this.trigger("frame",e),this.stage.update&&this.stage.update()},_startLoop:function(){function t(){e._running&&(gv(t),!e._paused&&e._update())}var e=this;this._running=!0,gv(t)},start:function(){this._time=(new 
Date).getTime(),this._pausedTime=0,this._startLoop()},stop:function(){this._running=!1},pause:function(){this._paused||(this._pauseStart=(new Date).getTime(),this._paused=!0)},resume:function(){this._paused&&(this._pausedTime+=(new Date).getTime()-this._pauseStart,this._paused=!1)},clear:function(){this._clips=[]},isFinished:function(){return!this._clips.length},animate:function(t,e){e=e||{};var n=new Zg(t,e.loop,e.getter,e.setter);return this.addAnimator(n),n}},c(Bv,wg);var Nv=function(){this._track=[]};Nv.prototype={constructor:Nv,recognize:function(t,e,n){return this._doTrack(t,e,n),this._recognize(t)},clear:function(){return this._track.length=0,this},_doTrack:function(t,e,n){var i=t.touches;if(i){for(var r={points:[],touches:[],target:e,event:t},o=0,a=i.length;a>o;o++){var s=i[o],l=pi(n,s,{});r.points.push([l.zrX,l.zrY]),r.touches.push(s)}this._track.push(r)}},_recognize:function(t){for(var e in Vv)if(Vv.hasOwnProperty(e)){var n=Vv[e](this._track,t);if(n)return n}}};var Vv={pinch:function(t,e){var n=t.length;if(n){var i=(t[n-1]||{}).points,r=(t[n-2]||{}).points||i;if(r&&r.length>1&&i&&i.length>1){var o=xi(i)\/xi(r);!isFinite(o)&&(o=1),e.pinchScale=o;var a=_i(i);return e.pinchX=a[0],e.pinchY=a[1],{type:"pinch",target:t[0].target,event:e}}}}},Fv=300,Hv=["click","dblclick","mousewheel","mouseout","mouseup","mousedown","mousemove","contextmenu"],Wv=["touchstart","touchend","touchmove"],Gv={pointerdown:1,pointerup:1,pointermove:1,pointerout:1},Uv=p(Hv,function(t){var e=t.replace("mouse","pointer");return Gv[e]?e:t}),Zv={mousemove:function(t){t=vi(this.dom,t),this.trigger("mousemove",t)},mouseout:function(t){t=vi(this.dom,t);var e=t.toElement||t.relatedTarget;if(e!=this.dom)for(;e&&9!=e.nodeType;){if(e===this.dom)return;e=e.parentNode}this.trigger("mouseout",t)},touchstart:function(t){t=vi(this.dom,t),t.zrByTouch=!0,this._lastTouchMoment=new 
Date,bi(this,t,"start"),Zv.mousemove.call(this,t),Zv.mousedown.call(this,t),Mi(this)},touchmove:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"change"),Zv.mousemove.call(this,t),Mi(this)},touchend:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"end"),Zv.mouseup.call(this,t),+new Date-this._lastTouchMoment=0||i&&u(i,a)<0)){var s=e.getShallow(a);null!=s&&(r[t[o][0]]=s)}}return r}},um=lm([["lineWidth","width"],["stroke","color"],["opacity"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["shadowColor"]]),hm={getLineStyle:function(t){var e=um(this,t),n=this.getLineDash(e.lineWidth);return n&&(e.lineDash=n),e},getLineDash:function(t){null==t&&(t=1);var e=this.get("type"),n=Math.max(t,2),i=4*t;return"solid"===e||null==e?null:"dashed"===e?[i,i]:[n,n]}},cm=lm([["fill","color"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["opacity"],["shadowColor"]]),dm={getAreaStyle:function(t,e){return cm(this,t,e)}},fm=Math.pow,pm=Math.sqrt,gm=1e-8,vm=1e-4,mm=pm(3),ym=1\/3,xm=H(),_m=H(),wm=H(),bm=Math.min,Mm=Math.max,Sm=Math.sin,Im=Math.cos,Cm=2*Math.PI,Tm=H(),Am=H(),Dm=H(),km=[],Pm=[],Lm={M:1,L:2,C:3,Q:4,A:5,Z:6,R:7},Om=[],Em=[],Rm=[],zm=[],Bm=Math.min,Nm=Math.max,Vm=Math.cos,Fm=Math.sin,Hm=Math.sqrt,Wm=Math.abs,Gm="undefined"!=typeof Float32Array,Um=function(t){this._saveData=!t,this._saveData&&(this.data=[]),this._ctx=null};Um.prototype={constructor:Um,_xi:0,_yi:0,_x0:0,_y0:0,_ux:0,_uy:0,_len:0,_lineDash:null,_dashOffset:0,_dashIdx:0,_dashSum:0,setScale:function(t,e){this._ux=Wm(1\/Yg\/t)||0,this._uy=Wm(1\/Yg\/e)||0},getContext:function(){return this._ctx},beginPath:function(t){return this._ctx=t,t&&t.beginPath(),t&&(this.dpr=t.dpr),this._saveData&&(this._len=0),this._lineDash&&(this._lineDash=null,this._dashOffset=0),this},moveTo:function(t,e){return this.addData(Lm.M,t,e),this._ctx&&this._ctx.moveTo(t,e),this._x0=t,this._y0=e,this._xi=t,this._yi=e,this},lineTo:function(t,e){var n=Wm(t-this._xi)>this._ux||Wm(e-this._yi)>this._uy||this._len<5;return 
this.addData(Lm.L,t,e),this._ctx&&n&&(this._needsDash()?this._dashedLineTo(t,e):this._ctx.lineTo(t,e)),n&&(this._xi=t,this._yi=e),this},bezierCurveTo:function(t,e,n,i,r,o){return this.addData(Lm.C,t,e,n,i,r,o),this._ctx&&(this._needsDash()?this._dashedBezierTo(t,e,n,i,r,o):this._ctx.bezierCurveTo(t,e,n,i,r,o)),this._xi=r,this._yi=o,this},quadraticCurveTo:function(t,e,n,i){return this.addData(Lm.Q,t,e,n,i),this._ctx&&(this._needsDash()?this._dashedQuadraticTo(t,e,n,i):this._ctx.quadraticCurveTo(t,e,n,i)),this._xi=n,this._yi=i,this},arc:function(t,e,n,i,r,o){return this.addData(Lm.A,t,e,n,n,i,r-i,0,o?0:1),this._ctx&&this._ctx.arc(t,e,n,i,r,o),this._xi=Vm(r)*n+t,this._yi=Fm(r)*n+t,this},arcTo:function(t,e,n,i,r){return this._ctx&&this._ctx.arcTo(t,e,n,i,r),this},rect:function(t,e,n,i){return this._ctx&&this._ctx.rect(t,e,n,i),this.addData(Lm.R,t,e,n,i),this},closePath:function(){this.addData(Lm.Z);var t=this._ctx,e=this._x0,n=this._y0;return t&&(this._needsDash()&&this._dashedLineTo(e,n),t.closePath()),this._xi=e,this._yi=n,this},fill:function(t){t&&t.fill(),this.toStatic()},stroke:function(t){t&&t.stroke(),this.toStatic()},setLineDash:function(t){if(t instanceof Array){this._lineDash=t,this._dashIdx=0;for(var e=0,n=0;nn;n++)this.data[n]=t[n];this._len=e},appendPath:function(t){t instanceof Array||(t=[t]);for(var e=t.length,n=0,i=this._len,r=0;e>r;r++)n+=t[r].len();Gm&&this.data instanceof Float32Array&&(this.data=new Float32Array(i+n));for(var r=0;e>r;r++)for(var o=t[r].data,a=0;ae.length&&(this._expandData(),e=this.data);for(var n=0;no&&(o=r+o),o%=r,f-=o*h,p-=o*c;h>0&&t>=f||0>h&&f>=t||0==h&&(c>0&&e>=p||0>c&&p>=e);)i=this._dashIdx,n=a[i],f+=h*n,p+=c*n,this._dashIdx=(i+1)%g,h>0&&l>f||0>h&&f>l||c>0&&u>p||0>c&&p>u||s[i%2?"moveTo":"lineTo"](h>=0?Bm(f,t):Nm(f,t),c>=0?Bm(p,e):Nm(p,e));h=f-t,c=p-e,this._dashOffset=-Hm(h*h+c*c)},_dashedBezierTo:function(t,e,n,i,r,o){var 
a,s,l,u,h,c=this._dashSum,d=this._dashOffset,f=this._lineDash,p=this._ctx,g=this._xi,v=this._yi,m=er,y=0,x=this._dashIdx,_=f.length,w=0;for(0>d&&(d=c+d),d%=c,a=0;1>a;a+=.1)s=m(g,t,n,r,a+.1)-m(g,t,n,r,a),l=m(v,e,i,o,a+.1)-m(v,e,i,o,a),y+=Hm(s*s+l*l);for(;_>x&&(w+=f[x],!(w>d));x++);for(a=(w-d)\/y;1>=a;)u=m(g,t,n,r,a),h=m(v,e,i,o,a),x%2?p.moveTo(u,h):p.lineTo(u,h),a+=f[x]\/y,x=(x+1)%_;x%2!==0&&p.lineTo(r,o),s=r-u,l=o-h,this._dashOffset=-Hm(s*s+l*l)},_dashedQuadraticTo:function(t,e,n,i){var r=n,o=i;n=(n+2*t)\/3,i=(i+2*e)\/3,t=(this._xi+2*t)\/3,e=(this._yi+2*e)\/3,this._dashedBezierTo(t,e,n,i,r,o)},toStatic:function(){var t=this.data;t instanceof Array&&(t.length=this._len,Gm&&(this.data=new Float32Array(t)))},getBoundingRect:function(){Om[0]=Om[1]=Rm[0]=Rm[1]=Number.MAX_VALUE,Em[0]=Em[1]=zm[0]=zm[1]=-Number.MAX_VALUE;for(var t=this.data,e=0,n=0,i=0,r=0,o=0;oc;){var d=s[c++];switch(1==c&&(i=s[c],r=s[c+1],e=i,n=r),d){case Lm.M:e=i=s[c++],n=r=s[c++],t.moveTo(i,r);break;case Lm.L:o=s[c++],a=s[c++],(Wm(o-i)>l||Wm(a-r)>u||c===h-1)&&(t.lineTo(o,a),i=o,r=a);break;case Lm.C:t.bezierCurveTo(s[c++],s[c++],s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.Q:t.quadraticCurveTo(s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.A:var f=s[c++],p=s[c++],g=s[c++],v=s[c++],m=s[c++],y=s[c++],x=s[c++],_=s[c++],w=g>v?g:v,b=g>v?1:g\/v,M=g>v?v\/g:1,S=Math.abs(g-v)>.001,I=m+y;S?(t.translate(f,p),t.rotate(x),t.scale(b,M),t.arc(0,0,w,m,I,1-_),t.scale(1\/b,1\/M),t.rotate(-x),t.translate(-f,-p)):t.arc(f,p,w,m,I,1-_),1==c&&(e=Vm(m)*g+f,n=Fm(m)*v+p),i=Vm(I)*g+f,r=Fm(I)*v+p;break;case Lm.R:e=i=s[c],n=r=s[c+1],t.rect(s[c++],s[c++],s[c++],s[c++]);break;case Lm.Z:t.closePath(),i=e,r=n}}}},Um.CMD=Lm;var Zm=2*Math.PI,jm=2*Math.PI,Xm=Um.CMD,Ym=2*Math.PI,qm=1e-4,$m=[-1,-1,-1],Km=[-1,-1],Qm=fv.prototype.getCanvasPattern,Jm=Math.abs,ty=new Um(!0);Lr.prototype={constructor:Lr,type:"path",__dirtyPath:!0,strokeContainThreshold:5,brush:function(t,e){var 
n=this.style,i=this.path||ty,r=n.hasStroke(),o=n.hasFill(),a=n.fill,s=n.stroke,l=o&&!!a.colorStops,u=r&&!!s.colorStops,h=o&&!!a.image,c=r&&!!s.image;if(n.bind(t,this,e),this.setTransform(t),this.__dirty){var d;l&&(d=d||this.getBoundingRect(),this._fillGradient=n.getGradient(t,a,d)),u&&(d=d||this.getBoundingRect(),this._strokeGradient=n.getGradient(t,s,d))}l?t.fillStyle=this._fillGradient:h&&(t.fillStyle=Qm.call(a,t)),u?t.strokeStyle=this._strokeGradient:c&&(t.strokeStyle=Qm.call(s,t));var f=n.lineDash,p=n.lineDashOffset,g=!!t.setLineDash,v=this.getGlobalScale();i.setScale(v[0],v[1]),this.__dirtyPath||f&&!g&&r?(i.beginPath(t),f&&!g&&(i.setLineDash(f),i.setLineDashOffset(p)),this.buildPath(i,this.shape,!1),this.path&&(this.__dirtyPath=!1)):(t.beginPath(),this.path.rebuildPath(t)),o&&i.fill(t),f&&g&&(t.setLineDash(f),t.lineDashOffset=p),r&&i.stroke(t),f&&g&&t.setLineDash([]),null!=n.text&&(this.restoreTransform(t),this.drawRectText(t,this.getBoundingRect()))},buildPath:function(){},createPathProxy:function(){this.path=new Um},getBoundingRect:function(){var t=this._rect,e=this.style,n=!t;if(n){var i=this.path;i||(i=this.path=new Um),this.__dirtyPath&&(i.beginPath(),this.buildPath(i,this.shape,!1)),t=i.getBoundingRect()}if(this._rect=t,e.hasStroke()){var r=this._rectWithStroke||(this._rectWithStroke=t.clone());if(this.__dirty||n){r.copy(t);var o=e.lineWidth,a=e.strokeNoScale?this.getLineScale():1;e.hasFill()||(o=Math.max(o,this.strokeContainThreshold||4)),a>1e-10&&(r.width+=o\/a,r.height+=o\/a,r.x-=o\/a\/2,r.y-=o\/a\/2)}return r}return t},contain:function(t,e){var n=this.transformCoordToLocal(t,e),i=this.getBoundingRect(),r=this.style;if(t=n[0],e=n[1],i.contain(t,e)){var o=this.path.data;if(r.hasStroke()){var a=r.lineWidth,s=r.strokeNoScale?this.getLineScale():1;if(s>1e-10&&(r.hasFill()||(a=Math.max(a,this.strokeContainThreshold)),Pr(o,a\/s,t,e)))return!0}if(r.hasFill())return 
kr(o,t,e)}return!1},dirty:function(t){null==t&&(t=!0),t&&(this.__dirtyPath=t,this._rect=null),this.__dirty=!0,this.__zr&&this.__zr.refresh(),this.__clipTarget&&this.__clipTarget.dirty()},animateShape:function(t){return this.animate("shape",t)},attrKV:function(t,e){"shape"===t?(this.setShape(e),this.__dirtyPath=!0,this._rect=null):oi.prototype.attrKV.call(this,t,e)},setShape:function(t,e){var n=this.shape;if(n){if(M(t))for(var i in t)t.hasOwnProperty(i)&&(n[i]=t[i]);else n[t]=e;this.dirty(!0)}return this},getLineScale:function(){var t=this.transform;return t&&Jm(t[0]-1)>1e-10&&Jm(t[3]-1)>1e-10?Math.sqrt(Jm(t[0]*t[3]-t[2]*t[1])):1}},Lr.extend=function(t){var e=function(e){Lr.call(this,e),t.style&&this.style.extendFrom(t.style,!1);var n=t.shape;if(n){this.shape=this.shape||{};var i=this.shape;for(var r in n)!i.hasOwnProperty(r)&&n.hasOwnProperty(r)&&(i[r]=n[r])}t.init&&t.init.call(this,e)};h(e,Lr);for(var n in t)"style"!==n&&"shape"!==n&&(e.prototype[n]=t[n]);return e},h(Lr,oi);var ey=Um.CMD,ny=[[],[],[]],iy=Math.sqrt,ry=Math.atan2,oy=function(t,e){var n,i,r,o,a,s,l=t.data,u=ey.M,h=ey.C,c=ey.L,d=ey.R,f=ey.A,p=ey.Q;for(r=0,o=0;ra;a++){var s=ny[a];s[0]=l[r++],s[1]=l[r++],oe(s,s,e),l[o++]=s[0],l[o++]=s[1]}}},ay=["m","M","l","L","v","V","h","H","z","Z","c","C","q","Q","t","T","s","S","a","A"],sy=Math.sqrt,ly=Math.sin,uy=Math.cos,hy=Math.PI,cy=function(t){return Math.sqrt(t[0]*t[0]+t[1]*t[1])},dy=function(t,e){return(t[0]*e[0]+t[1]*e[1])\/(cy(t)*cy(e))},fy=function(t,e){return(t[0]*e[1]=11?function(){var e,n=this.__clipPaths,i=this.style;if(n)for(var r=0;ro;o++)r+=ee(t[o-1],t[o]);var a=r\/2;a=n>a?n:a;for(var o=0;a>o;o++){var s,l,u,h=o\/(a-1)*(e?n:n-1),c=Math.floor(h),d=h-c,f=t[c%n];e?(s=t[(c-1+n)%n],l=t[(c+1)%n],u=t[(c+2)%n]):(s=t[0===c?c:c-1],l=t[c>n-2?n-1:c+1],u=t[c>n-3?n-1:c+2]);var p=d*d,g=d*p;i.push([Vr(s[0],f[0],l[0],u[0],d,p,g),Vr(s[1],f[1],l[1],u[1],d,p,g)])}return i},wy=function(t,e,n,i){var r,o,a,s,l=[],u=[],h=[],c=[];if(i){a=[1\/0,1\/0],s=[-1\/0,-1\/0];for(var 
d=0,f=t.length;f>d;d++)ae(a,a,t[d]),se(s,s,t[d]);ae(a,a,i[0]),se(s,s,i[1])}for(var d=0,f=t.length;f>d;d++){var p=t[d];if(n)r=t[d?d-1:f-1],o=t[(d+1)%f];else{if(0===d||d===f-1){l.push(G(t[d]));continue}r=t[d-1],o=t[d+1]}X(u,o,r),J(u,u,e);var g=ee(p,r),v=ee(p,o),m=g+v;0!==m&&(g\/=m,v\/=m),J(h,u,-g),J(c,u,v);var y=Z([],p,h),x=Z([],p,c);i&&(se(y,y,a),ae(y,y,s),se(x,x,a),ae(x,x,s)),l.push(y),l.push(x)}return n&&l.push(l.shift()),l},by=Lr.extend({type:"polygon",shape:{points:null,smooth:!1,smoothConstraint:null},buildPath:function(t,e){Fr(t,e,!0)}}),My=Lr.extend({type:"polyline",shape:{points:null,smooth:!1,smoothConstraint:null},style:{stroke:"#000",fill:null},buildPath:function(t,e){Fr(t,e,!1)}}),Sy=Lr.extend({type:"rect",shape:{r:0,x:0,y:0,width:0,height:0},buildPath:function(t,e){var n=e.x,i=e.y,r=e.width,o=e.height;e.r?Fn(t,e):t.rect(n,i,r,o),t.closePath()}}),Iy=Lr.extend({type:"line",shape:{x1:0,y1:0,x2:0,y2:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.percent;0!==a&&(t.moveTo(n,i),1>a&&(r=n*(1-a)+r*a,o=i*(1-a)+o*a),t.lineTo(r,o))},pointAt:function(t){var e=this.shape;return[e.x1*(1-t)+e.x2*t,e.y1*(1-t)+e.y2*t]}}),Cy=[],Ty=Lr.extend({type:"bezier-curve",shape:{x1:0,y1:0,x2:0,y2:0,cpx1:0,cpy1:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.cpx1,s=e.cpy1,l=e.cpx2,u=e.cpy2,h=e.percent;0!==h&&(t.moveTo(n,i),null==l||null==u?(1>h&&(cr(n,a,r,h,Cy),a=Cy[1],r=Cy[2],cr(i,s,o,h,Cy),s=Cy[1],o=Cy[2]),t.quadraticCurveTo(a,s,r,o)):(1>h&&(or(n,a,l,r,h,Cy),a=Cy[1],l=Cy[2],r=Cy[3],or(i,s,u,o,h,Cy),s=Cy[1],u=Cy[2],o=Cy[3]),t.bezierCurveTo(a,s,l,u,r,o)))},pointAt:function(t){return Hr(this.shape,t,!1)},tangentAt:function(t){var e=Hr(this.shape,t,!0);return te(e,e)}}),Ay=Lr.extend({type:"arc",shape:{cx:0,cy:0,r:0,startAngle:0,endAngle:2*Math.PI,clockwise:!0},style:{stroke:"#000",fill:null},buildPath:function(t,e){var 
n=e.cx,i=e.cy,r=Math.max(e.r,0),o=e.startAngle,a=e.endAngle,s=e.clockwise,l=Math.cos(o),u=Math.sin(o);t.moveTo(l*r+n,u*r+i),t.arc(n,i,r,o,a,!s)}}),Dy=Lr.extend({type:"compound",shape:{paths:null},_updatePathDirty:function(){for(var t=this.__dirtyPath,e=this.shape.paths,n=0;n<\/div>');$/;" l language:C++ +$v tools/external_converter_v2/parser/frontend/dash_board/static/echart/echarts.min.js /^break}}for(var r=null,o=0,n=0;n0?Dv:0),this._needsManuallyCompositing),a.__builtin__||$g("ZLevel "+s+" has been used by unkown layer "+a.id),a!==r&&(a.__used=!0,a.__startIndex!==n&&(a.__dirty=!0),a.__startIndex=n,a.__drawIndex=a.incremental?-1:n,e(n),r=a),i.__dirty&&(a.__dirty=!0,a.incremental&&a.__drawIndex<0&&(a.__drawIndex=n))}e(n),this.eachBuiltinLayer(function(t){!t.__used&&t.getElementCount()>0&&(t.__dirty=!0,t.__startIndex=t.__endIndex=t.__drawIndex=0),t.__dirty&&t.__drawIndex<0&&(t.__drawIndex=t.__startIndex)})},clear:function(){return this.eachBuiltinLayer(this._clearLayer),this},_clearLayer:function(t){t.clear()},setBackgroundColor:function(t){this._backgroundColor=t},configLayer:function(t,e){if(e){var n=this._layerConfig;n[t]?r(n[t],e,!0):n[t]=e;for(var i=0;i=0&&this._clips.splice(e,1)},removeAnimator:function(t){for(var e=t.getClips(),n=0;na;a++){var s=n[a],l=s.step(t,e);l&&(r.push(l),o.push(s))}for(var a=0;i>a;)n[a]._needsRemove?(n[a]=n[i-1],n.pop(),i--):a++;i=r.length;for(var a=0;i>a;a++)o[a].fire(r[a]);this._time=t,this.onframe(e),this.trigger("frame",e),this.stage.update&&this.stage.update()},_startLoop:function(){function t(){e._running&&(gv(t),!e._paused&&e._update())}var e=this;this._running=!0,gv(t)},start:function(){this._time=(new Date).getTime(),this._pausedTime=0,this._startLoop()},stop:function(){this._running=!1},pause:function(){this._paused||(this._pauseStart=(new Date).getTime(),this._paused=!0)},resume:function(){this._paused&&(this._pausedTime+=(new 
Date).getTime()-this._pauseStart,this._paused=!1)},clear:function(){this._clips=[]},isFinished:function(){return!this._clips.length},animate:function(t,e){e=e||{};var n=new Zg(t,e.loop,e.getter,e.setter);return this.addAnimator(n),n}},c(Bv,wg);var Nv=function(){this._track=[]};Nv.prototype={constructor:Nv,recognize:function(t,e,n){return this._doTrack(t,e,n),this._recognize(t)},clear:function(){return this._track.length=0,this},_doTrack:function(t,e,n){var i=t.touches;if(i){for(var r={points:[],touches:[],target:e,event:t},o=0,a=i.length;a>o;o++){var s=i[o],l=pi(n,s,{});r.points.push([l.zrX,l.zrY]),r.touches.push(s)}this._track.push(r)}},_recognize:function(t){for(var e in Vv)if(Vv.hasOwnProperty(e)){var n=Vv[e](this._track,t);if(n)return n}}};var Vv={pinch:function(t,e){var n=t.length;if(n){var i=(t[n-1]||{}).points,r=(t[n-2]||{}).points||i;if(r&&r.length>1&&i&&i.length>1){var o=xi(i)\/xi(r);!isFinite(o)&&(o=1),e.pinchScale=o;var a=_i(i);return e.pinchX=a[0],e.pinchY=a[1],{type:"pinch",target:t[0].target,event:e}}}}},Fv=300,Hv=["click","dblclick","mousewheel","mouseout","mouseup","mousedown","mousemove","contextmenu"],Wv=["touchstart","touchend","touchmove"],Gv={pointerdown:1,pointerup:1,pointermove:1,pointerout:1},Uv=p(Hv,function(t){var e=t.replace("mouse","pointer");return Gv[e]?e:t}),Zv={mousemove:function(t){t=vi(this.dom,t),this.trigger("mousemove",t)},mouseout:function(t){t=vi(this.dom,t);var e=t.toElement||t.relatedTarget;if(e!=this.dom)for(;e&&9!=e.nodeType;){if(e===this.dom)return;e=e.parentNode}this.trigger("mouseout",t)},touchstart:function(t){t=vi(this.dom,t),t.zrByTouch=!0,this._lastTouchMoment=new Date,bi(this,t,"start"),Zv.mousemove.call(this,t),Zv.mousedown.call(this,t),Mi(this)},touchmove:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"change"),Zv.mousemove.call(this,t),Mi(this)},touchend:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"end"),Zv.mouseup.call(this,t),+new Date-this._lastTouchMoment=0||i&&u(i,a)<0)){var 
s=e.getShallow(a);null!=s&&(r[t[o][0]]=s)}}return r}},um=lm([["lineWidth","width"],["stroke","color"],["opacity"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["shadowColor"]]),hm={getLineStyle:function(t){var e=um(this,t),n=this.getLineDash(e.lineWidth);return n&&(e.lineDash=n),e},getLineDash:function(t){null==t&&(t=1);var e=this.get("type"),n=Math.max(t,2),i=4*t;return"solid"===e||null==e?null:"dashed"===e?[i,i]:[n,n]}},cm=lm([["fill","color"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["opacity"],["shadowColor"]]),dm={getAreaStyle:function(t,e){return cm(this,t,e)}},fm=Math.pow,pm=Math.sqrt,gm=1e-8,vm=1e-4,mm=pm(3),ym=1\/3,xm=H(),_m=H(),wm=H(),bm=Math.min,Mm=Math.max,Sm=Math.sin,Im=Math.cos,Cm=2*Math.PI,Tm=H(),Am=H(),Dm=H(),km=[],Pm=[],Lm={M:1,L:2,C:3,Q:4,A:5,Z:6,R:7},Om=[],Em=[],Rm=[],zm=[],Bm=Math.min,Nm=Math.max,Vm=Math.cos,Fm=Math.sin,Hm=Math.sqrt,Wm=Math.abs,Gm="undefined"!=typeof Float32Array,Um=function(t){this._saveData=!t,this._saveData&&(this.data=[]),this._ctx=null};Um.prototype={constructor:Um,_xi:0,_yi:0,_x0:0,_y0:0,_ux:0,_uy:0,_len:0,_lineDash:null,_dashOffset:0,_dashIdx:0,_dashSum:0,setScale:function(t,e){this._ux=Wm(1\/Yg\/t)||0,this._uy=Wm(1\/Yg\/e)||0},getContext:function(){return this._ctx},beginPath:function(t){return this._ctx=t,t&&t.beginPath(),t&&(this.dpr=t.dpr),this._saveData&&(this._len=0),this._lineDash&&(this._lineDash=null,this._dashOffset=0),this},moveTo:function(t,e){return this.addData(Lm.M,t,e),this._ctx&&this._ctx.moveTo(t,e),this._x0=t,this._y0=e,this._xi=t,this._yi=e,this},lineTo:function(t,e){var n=Wm(t-this._xi)>this._ux||Wm(e-this._yi)>this._uy||this._len<5;return this.addData(Lm.L,t,e),this._ctx&&n&&(this._needsDash()?this._dashedLineTo(t,e):this._ctx.lineTo(t,e)),n&&(this._xi=t,this._yi=e),this},bezierCurveTo:function(t,e,n,i,r,o){return 
this.addData(Lm.C,t,e,n,i,r,o),this._ctx&&(this._needsDash()?this._dashedBezierTo(t,e,n,i,r,o):this._ctx.bezierCurveTo(t,e,n,i,r,o)),this._xi=r,this._yi=o,this},quadraticCurveTo:function(t,e,n,i){return this.addData(Lm.Q,t,e,n,i),this._ctx&&(this._needsDash()?this._dashedQuadraticTo(t,e,n,i):this._ctx.quadraticCurveTo(t,e,n,i)),this._xi=n,this._yi=i,this},arc:function(t,e,n,i,r,o){return this.addData(Lm.A,t,e,n,n,i,r-i,0,o?0:1),this._ctx&&this._ctx.arc(t,e,n,i,r,o),this._xi=Vm(r)*n+t,this._yi=Fm(r)*n+t,this},arcTo:function(t,e,n,i,r){return this._ctx&&this._ctx.arcTo(t,e,n,i,r),this},rect:function(t,e,n,i){return this._ctx&&this._ctx.rect(t,e,n,i),this.addData(Lm.R,t,e,n,i),this},closePath:function(){this.addData(Lm.Z);var t=this._ctx,e=this._x0,n=this._y0;return t&&(this._needsDash()&&this._dashedLineTo(e,n),t.closePath()),this._xi=e,this._yi=n,this},fill:function(t){t&&t.fill(),this.toStatic()},stroke:function(t){t&&t.stroke(),this.toStatic()},setLineDash:function(t){if(t instanceof Array){this._lineDash=t,this._dashIdx=0;for(var e=0,n=0;nn;n++)this.data[n]=t[n];this._len=e},appendPath:function(t){t instanceof Array||(t=[t]);for(var e=t.length,n=0,i=this._len,r=0;e>r;r++)n+=t[r].len();Gm&&this.data instanceof Float32Array&&(this.data=new Float32Array(i+n));for(var r=0;e>r;r++)for(var o=t[r].data,a=0;ae.length&&(this._expandData(),e=this.data);for(var n=0;no&&(o=r+o),o%=r,f-=o*h,p-=o*c;h>0&&t>=f||0>h&&f>=t||0==h&&(c>0&&e>=p||0>c&&p>=e);)i=this._dashIdx,n=a[i],f+=h*n,p+=c*n,this._dashIdx=(i+1)%g,h>0&&l>f||0>h&&f>l||c>0&&u>p||0>c&&p>u||s[i%2?"moveTo":"lineTo"](h>=0?Bm(f,t):Nm(f,t),c>=0?Bm(p,e):Nm(p,e));h=f-t,c=p-e,this._dashOffset=-Hm(h*h+c*c)},_dashedBezierTo:function(t,e,n,i,r,o){var 
a,s,l,u,h,c=this._dashSum,d=this._dashOffset,f=this._lineDash,p=this._ctx,g=this._xi,v=this._yi,m=er,y=0,x=this._dashIdx,_=f.length,w=0;for(0>d&&(d=c+d),d%=c,a=0;1>a;a+=.1)s=m(g,t,n,r,a+.1)-m(g,t,n,r,a),l=m(v,e,i,o,a+.1)-m(v,e,i,o,a),y+=Hm(s*s+l*l);for(;_>x&&(w+=f[x],!(w>d));x++);for(a=(w-d)\/y;1>=a;)u=m(g,t,n,r,a),h=m(v,e,i,o,a),x%2?p.moveTo(u,h):p.lineTo(u,h),a+=f[x]\/y,x=(x+1)%_;x%2!==0&&p.lineTo(r,o),s=r-u,l=o-h,this._dashOffset=-Hm(s*s+l*l)},_dashedQuadraticTo:function(t,e,n,i){var r=n,o=i;n=(n+2*t)\/3,i=(i+2*e)\/3,t=(this._xi+2*t)\/3,e=(this._yi+2*e)\/3,this._dashedBezierTo(t,e,n,i,r,o)},toStatic:function(){var t=this.data;t instanceof Array&&(t.length=this._len,Gm&&(this.data=new Float32Array(t)))},getBoundingRect:function(){Om[0]=Om[1]=Rm[0]=Rm[1]=Number.MAX_VALUE,Em[0]=Em[1]=zm[0]=zm[1]=-Number.MAX_VALUE;for(var t=this.data,e=0,n=0,i=0,r=0,o=0;oc;){var d=s[c++];switch(1==c&&(i=s[c],r=s[c+1],e=i,n=r),d){case Lm.M:e=i=s[c++],n=r=s[c++],t.moveTo(i,r);break;case Lm.L:o=s[c++],a=s[c++],(Wm(o-i)>l||Wm(a-r)>u||c===h-1)&&(t.lineTo(o,a),i=o,r=a);break;case Lm.C:t.bezierCurveTo(s[c++],s[c++],s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.Q:t.quadraticCurveTo(s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.A:var f=s[c++],p=s[c++],g=s[c++],v=s[c++],m=s[c++],y=s[c++],x=s[c++],_=s[c++],w=g>v?g:v,b=g>v?1:g\/v,M=g>v?v\/g:1,S=Math.abs(g-v)>.001,I=m+y;S?(t.translate(f,p),t.rotate(x),t.scale(b,M),t.arc(0,0,w,m,I,1-_),t.scale(1\/b,1\/M),t.rotate(-x),t.translate(-f,-p)):t.arc(f,p,w,m,I,1-_),1==c&&(e=Vm(m)*g+f,n=Fm(m)*v+p),i=Vm(I)*g+f,r=Fm(I)*v+p;break;case Lm.R:e=i=s[c],n=r=s[c+1],t.rect(s[c++],s[c++],s[c++],s[c++]);break;case Lm.Z:t.closePath(),i=e,r=n}}}},Um.CMD=Lm;var Zm=2*Math.PI,jm=2*Math.PI,Xm=Um.CMD,Ym=2*Math.PI,qm=1e-4,$m=[-1,-1,-1],Km=[-1,-1],Qm=fv.prototype.getCanvasPattern,Jm=Math.abs,ty=new Um(!0);Lr.prototype={constructor:Lr,type:"path",__dirtyPath:!0,strokeContainThreshold:5,brush:function(t,e){var 
n=this.style,i=this.path||ty,r=n.hasStroke(),o=n.hasFill(),a=n.fill,s=n.stroke,l=o&&!!a.colorStops,u=r&&!!s.colorStops,h=o&&!!a.image,c=r&&!!s.image;if(n.bind(t,this,e),this.setTransform(t),this.__dirty){var d;l&&(d=d||this.getBoundingRect(),this._fillGradient=n.getGradient(t,a,d)),u&&(d=d||this.getBoundingRect(),this._strokeGradient=n.getGradient(t,s,d))}l?t.fillStyle=this._fillGradient:h&&(t.fillStyle=Qm.call(a,t)),u?t.strokeStyle=this._strokeGradient:c&&(t.strokeStyle=Qm.call(s,t));var f=n.lineDash,p=n.lineDashOffset,g=!!t.setLineDash,v=this.getGlobalScale();i.setScale(v[0],v[1]),this.__dirtyPath||f&&!g&&r?(i.beginPath(t),f&&!g&&(i.setLineDash(f),i.setLineDashOffset(p)),this.buildPath(i,this.shape,!1),this.path&&(this.__dirtyPath=!1)):(t.beginPath(),this.path.rebuildPath(t)),o&&i.fill(t),f&&g&&(t.setLineDash(f),t.lineDashOffset=p),r&&i.stroke(t),f&&g&&t.setLineDash([]),null!=n.text&&(this.restoreTransform(t),this.drawRectText(t,this.getBoundingRect()))},buildPath:function(){},createPathProxy:function(){this.path=new Um},getBoundingRect:function(){var t=this._rect,e=this.style,n=!t;if(n){var i=this.path;i||(i=this.path=new Um),this.__dirtyPath&&(i.beginPath(),this.buildPath(i,this.shape,!1)),t=i.getBoundingRect()}if(this._rect=t,e.hasStroke()){var r=this._rectWithStroke||(this._rectWithStroke=t.clone());if(this.__dirty||n){r.copy(t);var o=e.lineWidth,a=e.strokeNoScale?this.getLineScale():1;e.hasFill()||(o=Math.max(o,this.strokeContainThreshold||4)),a>1e-10&&(r.width+=o\/a,r.height+=o\/a,r.x-=o\/a\/2,r.y-=o\/a\/2)}return r}return t},contain:function(t,e){var n=this.transformCoordToLocal(t,e),i=this.getBoundingRect(),r=this.style;if(t=n[0],e=n[1],i.contain(t,e)){var o=this.path.data;if(r.hasStroke()){var a=r.lineWidth,s=r.strokeNoScale?this.getLineScale():1;if(s>1e-10&&(r.hasFill()||(a=Math.max(a,this.strokeContainThreshold)),Pr(o,a\/s,t,e)))return!0}if(r.hasFill())return 
kr(o,t,e)}return!1},dirty:function(t){null==t&&(t=!0),t&&(this.__dirtyPath=t,this._rect=null),this.__dirty=!0,this.__zr&&this.__zr.refresh(),this.__clipTarget&&this.__clipTarget.dirty()},animateShape:function(t){return this.animate("shape",t)},attrKV:function(t,e){"shape"===t?(this.setShape(e),this.__dirtyPath=!0,this._rect=null):oi.prototype.attrKV.call(this,t,e)},setShape:function(t,e){var n=this.shape;if(n){if(M(t))for(var i in t)t.hasOwnProperty(i)&&(n[i]=t[i]);else n[t]=e;this.dirty(!0)}return this},getLineScale:function(){var t=this.transform;return t&&Jm(t[0]-1)>1e-10&&Jm(t[3]-1)>1e-10?Math.sqrt(Jm(t[0]*t[3]-t[2]*t[1])):1}},Lr.extend=function(t){var e=function(e){Lr.call(this,e),t.style&&this.style.extendFrom(t.style,!1);var n=t.shape;if(n){this.shape=this.shape||{};var i=this.shape;for(var r in n)!i.hasOwnProperty(r)&&n.hasOwnProperty(r)&&(i[r]=n[r])}t.init&&t.init.call(this,e)};h(e,Lr);for(var n in t)"style"!==n&&"shape"!==n&&(e.prototype[n]=t[n]);return e},h(Lr,oi);var ey=Um.CMD,ny=[[],[],[]],iy=Math.sqrt,ry=Math.atan2,oy=function(t,e){var n,i,r,o,a,s,l=t.data,u=ey.M,h=ey.C,c=ey.L,d=ey.R,f=ey.A,p=ey.Q;for(r=0,o=0;ra;a++){var s=ny[a];s[0]=l[r++],s[1]=l[r++],oe(s,s,e),l[o++]=s[0],l[o++]=s[1]}}},ay=["m","M","l","L","v","V","h","H","z","Z","c","C","q","Q","t","T","s","S","a","A"],sy=Math.sqrt,ly=Math.sin,uy=Math.cos,hy=Math.PI,cy=function(t){return Math.sqrt(t[0]*t[0]+t[1]*t[1])},dy=function(t,e){return(t[0]*e[0]+t[1]*e[1])\/(cy(t)*cy(e))},fy=function(t,e){return(t[0]*e[1]=11?function(){var e,n=this.__clipPaths,i=this.style;if(n)for(var r=0;ro;o++)r+=ee(t[o-1],t[o]);var a=r\/2;a=n>a?n:a;for(var o=0;a>o;o++){var s,l,u,h=o\/(a-1)*(e?n:n-1),c=Math.floor(h),d=h-c,f=t[c%n];e?(s=t[(c-1+n)%n],l=t[(c+1)%n],u=t[(c+2)%n]):(s=t[0===c?c:c-1],l=t[c>n-2?n-1:c+1],u=t[c>n-3?n-1:c+2]);var p=d*d,g=d*p;i.push([Vr(s[0],f[0],l[0],u[0],d,p,g),Vr(s[1],f[1],l[1],u[1],d,p,g)])}return i},wy=function(t,e,n,i){var r,o,a,s,l=[],u=[],h=[],c=[];if(i){a=[1\/0,1\/0],s=[-1\/0,-1\/0];for(var 
d=0,f=t.length;f>d;d++)ae(a,a,t[d]),se(s,s,t[d]);ae(a,a,i[0]),se(s,s,i[1])}for(var d=0,f=t.length;f>d;d++){var p=t[d];if(n)r=t[d?d-1:f-1],o=t[(d+1)%f];else{if(0===d||d===f-1){l.push(G(t[d]));continue}r=t[d-1],o=t[d+1]}X(u,o,r),J(u,u,e);var g=ee(p,r),v=ee(p,o),m=g+v;0!==m&&(g\/=m,v\/=m),J(h,u,-g),J(c,u,v);var y=Z([],p,h),x=Z([],p,c);i&&(se(y,y,a),ae(y,y,s),se(x,x,a),ae(x,x,s)),l.push(y),l.push(x)}return n&&l.push(l.shift()),l},by=Lr.extend({type:"polygon",shape:{points:null,smooth:!1,smoothConstraint:null},buildPath:function(t,e){Fr(t,e,!0)}}),My=Lr.extend({type:"polyline",shape:{points:null,smooth:!1,smoothConstraint:null},style:{stroke:"#000",fill:null},buildPath:function(t,e){Fr(t,e,!1)}}),Sy=Lr.extend({type:"rect",shape:{r:0,x:0,y:0,width:0,height:0},buildPath:function(t,e){var n=e.x,i=e.y,r=e.width,o=e.height;e.r?Fn(t,e):t.rect(n,i,r,o),t.closePath()}}),Iy=Lr.extend({type:"line",shape:{x1:0,y1:0,x2:0,y2:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.percent;0!==a&&(t.moveTo(n,i),1>a&&(r=n*(1-a)+r*a,o=i*(1-a)+o*a),t.lineTo(r,o))},pointAt:function(t){var e=this.shape;return[e.x1*(1-t)+e.x2*t,e.y1*(1-t)+e.y2*t]}}),Cy=[],Ty=Lr.extend({type:"bezier-curve",shape:{x1:0,y1:0,x2:0,y2:0,cpx1:0,cpy1:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.cpx1,s=e.cpy1,l=e.cpx2,u=e.cpy2,h=e.percent;0!==h&&(t.moveTo(n,i),null==l||null==u?(1>h&&(cr(n,a,r,h,Cy),a=Cy[1],r=Cy[2],cr(i,s,o,h,Cy),s=Cy[1],o=Cy[2]),t.quadraticCurveTo(a,s,r,o)):(1>h&&(or(n,a,l,r,h,Cy),a=Cy[1],l=Cy[2],r=Cy[3],or(i,s,u,o,h,Cy),s=Cy[1],u=Cy[2],o=Cy[3]),t.bezierCurveTo(a,s,l,u,r,o)))},pointAt:function(t){return Hr(this.shape,t,!1)},tangentAt:function(t){var e=Hr(this.shape,t,!0);return te(e,e)}}),Ay=Lr.extend({type:"arc",shape:{cx:0,cy:0,r:0,startAngle:0,endAngle:2*Math.PI,clockwise:!0},style:{stroke:"#000",fill:null},buildPath:function(t,e){var 
n=e.cx,i=e.cy,r=Math.max(e.r,0),o=e.startAngle,a=e.endAngle,s=e.clockwise,l=Math.cos(o),u=Math.sin(o);t.moveTo(l*r+n,u*r+i),t.arc(n,i,r,o,a,!s)}}),Dy=Lr.extend({type:"compound",shape:{paths:null},_updatePathDirty:function(){for(var t=this.__dirtyPath,e=this.shape.paths,n=0;n"'])\/g,Jy={"&":"&","<":"<",">":">",'"':""","'":"'"},tx=["a","b","c","d","e","f","g"],ex=function(t,e){return"{"+t+(null==e?"":e)+"}"},nx=kn,ix=Sn,rx=(Object.freeze||Object)({addCommas:ea,toCamelCase:na,normalizeCssArray:Ky,encodeHTML:ia,formatTpl:ra,formatTplSimple:oa,getTooltipMarker:aa,formatTime:la,capitalFirst:ua,truncateText:nx,getTextRect:ix}),ox=f,ax=["left","right","top","bottom","width","height"],sx=[["width","left","right"],["height","top","bottom"]],lx=ha,ux=(x(ha,"vertical"),x(ha,"horizontal"),{getBoxLayoutParams:function(){return{left:this.get("left"),top:this.get("top"),right:this.get("right"),bottom:this.get("bottom"),width:this.get("width"),height:this.get("height")}}}),hx=Hi(),cx=Lo.extend({type:"component",id:"",name:"",mainType:"",subType:"",componentIndex:0,defaultOption:null,ecModel:null,dependentModels:[],uid:null,layoutMode:null,$constructor:function(t,e,n,i){Lo.call(this,t,e,n,i),this.uid=Ro("ec_cpt_model")},init:function(t,e,n){this.mergeDefaultAndTheme(t,n)},mergeDefaultAndTheme:function(t,e){var n=this.layoutMode,i=n?pa(t):{},o=e.getTheme();r(t,o.get(this.mainType)),r(t,this.getDefaultOption()),n&&fa(t,i,n)},mergeOption:function(t){r(this.option,t,!0);var e=this.layoutMode;e&&fa(this.option,t,e)},optionUpdated:function(){},getDefaultOption:function(){var t=hx(this);if(!t.defaultOption){for(var e=[],n=this.constructor;n;){var i=n.prototype.defaultOption;i&&e.push(i),n=n.superClass}for(var o={},a=e.length-1;a>=0;a--)o=r(o,e[a],!0);t.defaultOption=o}return t.defaultOption},getReferringComponents:function(t){return this.ecModel.queryComponents({mainType:t,index:this.get(t+"Index",!0),id:this.get(t+"Id",!0)})}});Qi(cx,{registerWhenExtend:!0}),zo(cx),Bo(cx,va),c(cx,ux);var 
dx="";"undefined"!=typeof navigator&&(dx=navigator.platform||"");var fx={color:["#c23531","#2f4554","#61a0a8","#d48265","#91c7ae","#749f83","#ca8622","#bda29a","#6e7074","#546570","#c4ccd3"],gradientColor:["#f6efa6","#d88273","#bf444c"],textStyle:{fontFamily:dx.match(\/^Win\/)?"Microsoft YaHei":"sans-serif",fontSize:12,fontStyle:"normal",fontWeight:"normal"},blendMode:null,animation:"auto",animationDuration:1e3,animationDurationUpdate:300,animationEasing:"exponentialOut",animationEasingUpdate:"cubicOut",animationThreshold:2e3,progressiveThreshold:3e3,progressive:400,hoverLayerThreshold:3e3,useUTC:!1},px=Hi(),gx={clearColorPalette:function(){px(this).colorIdx=0,px(this).colorNameMap={}},getColorFromPalette:function(t,e,n){e=e||this;var i=px(e),r=i.colorIdx||0,o=i.colorNameMap=i.colorNameMap||{};if(o.hasOwnProperty(t))return o[t];var a=Li(this.get("color",!0)),s=this.get("colorLayer",!0),l=null!=n&&s?ma(s,n):a;if(l=l||a,l&&l.length){var u=l[r];return t&&(o[t]=u),i.colorIdx=(r+1)%l.length,u}}},vx={cartesian2d:function(t,e,n,i){var r=t.getReferringComponents("xAxis")[0],o=t.getReferringComponents("yAxis")[0];e.coordSysDims=["x","y"],n.set("x",r),n.set("y",o),xa(r)&&(i.set("x",r),e.firstCategoryDimIndex=0),xa(o)&&(i.set("y",o),e.firstCategoryDimIndex=1)},singleAxis:function(t,e,n,i){var r=t.getReferringComponents("singleAxis")[0];e.coordSysDims=["single"],n.set("single",r),xa(r)&&(i.set("single",r),e.firstCategoryDimIndex=0)},polar:function(t,e,n,i){var r=t.getReferringComponents("polar")[0],o=r.findAxisModel("radiusAxis"),a=r.findAxisModel("angleAxis");e.coordSysDims=["radius","angle"],n.set("radius",o),n.set("angle",a),xa(o)&&(i.set("radius",o),e.firstCategoryDimIndex=0),xa(a)&&(i.set("angle",a),e.firstCategoryDimIndex=1)},geo:function(t,e){e.coordSysDims=["lng","lat"]},parallel:function(t,e,n,i){var r=t.ecModel,o=r.getComponent("parallel",t.get("parallelIndex")),a=e.coordSysDims=o.dimensions.slice();f(o.parallelAxisIndex,function(t,o){var 
s=r.getComponent("parallelAxis",t),l=a[o];n.set(l,s),xa(s)&&null==e.firstCategoryDimIndex&&(i.set(l,s),e.firstCategoryDimIndex=o)})}},mx="original",yx="arrayRows",xx="objectRows",_x="keyedColumns",bx="unknown",Mx="typedArray",Sx="column",Ix="row";_a.seriesDataToSource=function(t){return new _a({data:t,sourceFormat:I(t)?Mx:mx,fromDataset:!1})},qi(_a);var Cx=Hi(),Tx="\\x00_ec_inner",Ax=Lo.extend({init:function(t,e,n,i){n=n||{},this.option=null,this._theme=new Lo(n),this._optionManager=i},setOption:function(t,e){O(!(Tx in t),"please use chart.getOption()"),this._optionManager.setOption(t,e),this.resetOption(null)},resetOption:function(t){var e=!1,n=this._optionManager;if(!t||"recreate"===t){var i=n.mountOption("recreate"===t);this.option&&"recreate"!==t?(this.restoreData(),this.mergeOption(i)):Ra.call(this,i),e=!0}if(("timeline"===t||"media"===t)&&this.restoreData(),!t||"recreate"===t||"timeline"===t){var r=n.getTimelineOption(this);r&&(this.mergeOption(r),e=!0)}if(!t||"recreate"===t||"media"===t){var o=n.getMediaOption(this,this._api);o.length&&f(o,function(t){this.mergeOption(t,e=!0)},this)}return e},mergeOption:function(t){function e(e,i){var r=Li(t[e]),s=zi(o.get(e),r);Bi(s),f(s,function(t){var n=t.option;M(n)&&(t.keyInfo.mainType=e,t.keyInfo.subType=Ba(e,n,t.exist))});var l=za(o,i);n[e]=[],o.set(e,[]),f(s,function(t,i){var r=t.exist,s=t.option;if(O(M(s)||r,"Empty component definition"),s){var u=cx.getClass(e,t.keyInfo.subType,!0);if(r&&r instanceof u)r.name=t.keyInfo.name,r.mergeOption(s,this),r.optionUpdated(s,!1);else{var h=a({dependentModels:l,componentIndex:i},t.keyInfo);r=new u(s,this,this,h),a(r,h),r.init(s,this,this,h),r.optionUpdated(null,!0)}}else r.mergeOption({},this),r.optionUpdated({},!1);o.get(e)[i]=r,n[e][i]=r.option},this),"series"===e&&Na(this,o.get("series"))}var 
n=this.option,o=this._componentsMap,s=[];Ma(this),f(t,function(t,e){null!=t&&(cx.hasClass(e)?e&&s.push(e):n[e]=null==n[e]?i(t):r(n[e],t,!0))}),cx.topologicalTravel(s,cx.getAllClassMainTypes(),e,this),this._seriesIndicesMap=N(this._seriesIndices=this._seriesIndices||[])},getOption:function(){var t=i(this.option);return f(t,function(e,n){if(cx.hasClass(n)){for(var e=Li(e),i=e.length-1;i>=0;i--)Vi(e[i])&&e.splice(i,1);t[n]=e}}),delete t[Tx],t},getTheme:function(){return this._theme},getComponent:function(t,e){var n=this._componentsMap.get(t);return n?n[e||0]:void 0},queryComponents:function(t){var e=t.mainType;if(!e)return[];var n=t.index,i=t.id,r=t.name,o=this._componentsMap.get(e);if(!o||!o.length)return[];var a;if(null!=n)_(n)||(n=[n]),a=v(p(n,function(t){return o[t]}),function(t){return!!t});else if(null!=i){var s=_(i);a=v(o,function(t){return s&&u(i,t.id)>=0||!s&&t.id===i})}else if(null!=r){var l=_(r);a=v(o,function(t){return l&&u(r,t.name)>=0||!l&&t.name===r})}else a=o.slice();return Va(a,t)},findComponents:function(t){function e(t){var e=r+"Index",n=r+"Id",i=r+"Name";return!t||null==t[e]&&null==t[n]&&null==t[i]?null:{mainType:r,index:t[e],id:t[n],name:t[i]}}function n(e){return t.filter?v(e,t.filter):e}var i=t.query,r=t.mainType,o=e(i),a=o?this.queryComponents(o):this._componentsMap.get(r);return n(Va(a,t))},eachComponent:function(t,e,n){var i=this._componentsMap;if("function"==typeof t)n=e,e=t,i.each(function(t,i){f(t,function(t,r){e.call(n,i,t,r)})});else if(b(t))f(i.get(t),e,n);else if(M(t)){var r=this.findComponents(t);f(r,e,n)}},getSeriesByName:function(t){var e=this._componentsMap.get("series");return v(e,function(e){return e.name===t})},getSeriesByIndex:function(t){return this._componentsMap.get("series")[t]},getSeriesByType:function(t){var e=this._componentsMap.get("series");return v(e,function(e){return e.subType===t})},getSeries:function(){return this._componentsMap.get("series").slice()},getSeriesCount:function(){return 
this._componentsMap.get("series").length},eachSeries:function(t,e){f(this._seriesIndices,function(n){var i=this._componentsMap.get("series")[n];t.call(e,i,n)},this)},eachRawSeries:function(t,e){f(this._componentsMap.get("series"),t,e)},eachSeriesByType:function(t,e,n){f(this._seriesIndices,function(i){var r=this._componentsMap.get("series")[i];r.subType===t&&e.call(n,r,i)},this)},eachRawSeriesByType:function(t,e,n){return f(this.getSeriesByType(t),e,n)},isSeriesFiltered:function(t){return null==this._seriesIndicesMap.get(t.componentIndex)},getCurrentSeriesIndices:function(){return(this._seriesIndices||[]).slice()},filterSeries:function(t,e){var n=v(this._componentsMap.get("series"),t,e);Na(this,n)},restoreData:function(t){var e=this._componentsMap;Na(this,e.get("series"));var n=[];e.each(function(t,e){n.push(e)}),cx.topologicalTravel(n,cx.getAllClassMainTypes(),function(n){f(e.get(n),function(e){("series"!==n||!Oa(e,t))&&e.restoreData()})})}});c(Ax,gx);var Dx=["getDom","getZr","getWidth","getHeight","getDevicePixelRatio","dispatchAction","isDisposed","on","off","getDataURL","getConnectedDataURL","getModel","getOption","getViewOfComponentModel","getViewOfSeriesModel"],kx={};Ha.prototype={constructor:Ha,create:function(t,e){var n=[];f(kx,function(i){var r=i.create(t,e);n=n.concat(r||[])}),this._coordinateSystems=n},update:function(t,e){f(this._coordinateSystems,function(n){n.update&&n.update(t,e)})},getCoordinateSystems:function(){return this._coordinateSystems.slice()}},Ha.register=function(t,e){kx[t]=e},Ha.get=function(t){return kx[t]};var Px=f,Lx=i,Ox=p,Ex=r,Rx=\/^(min|max)?(.+)$\/;Wa.prototype={constructor:Wa,setOption:function(t,e){t&&f(Li(t.series),function(t){t&&t.data&&I(t.data)&&R(t.data)}),t=Lx(t,!0);var 
n=this._optionBackup,i=Ga.call(this,t,e,!n);this._newBaseOption=i.baseOption,n?(Xa(n.baseOption,i.baseOption),i.timelineOptions.length&&(n.timelineOptions=i.timelineOptions),i.mediaList.length&&(n.mediaList=i.mediaList),i.mediaDefault&&(n.mediaDefault=i.mediaDefault)):this._optionBackup=i},mountOption:function(t){var e=this._optionBackup;return this._timelineOptions=Ox(e.timelineOptions,Lx),this._mediaList=Ox(e.mediaList,Lx),this._mediaDefault=Lx(e.mediaDefault),this._currentMediaIndices=[],Lx(t?e.baseOption:this._newBaseOption)},getTimelineOption:function(t){var e,n=this._timelineOptions;if(n.length){var i=t.getComponent("timeline");i&&(e=Lx(n[i.getCurrentIndex()],!0))}return e},getMediaOption:function(){var t=this._api.getWidth(),e=this._api.getHeight(),n=this._mediaList,i=this._mediaDefault,r=[],o=[];if(!n.length&&!i)return o;for(var a=0,s=n.length;s>a;a++)Ua(n[a].query,t,e)&&r.push(a);return!r.length&&i&&(r=[-1]),r.length&&!ja(r,this._currentMediaIndices)&&(o=Ox(r,function(t){return Lx(-1===t?i.option:n[t].option)})),this._currentMediaIndices=r,o}};var zx=f,Bx=M,Nx=["areaStyle","lineStyle","nodeStyle","linkStyle","chordStyle","label","labelLine"],Vx=function(t,e){zx(ts(t.series),function(t){Bx(t)&&Ja(t)});var n=["xAxis","yAxis","radiusAxis","angleAxis","singleAxis","parallelAxis","radar"];e&&n.push("valueAxis","categoryAxis","logAxis","timeAxis"),zx(n,function(e){zx(ts(t[e]),function(t){t&&(Ka(t,"axisLabel"),Ka(t.axisPointer,"label"))})}),zx(ts(t.parallel),function(t){var e=t&&t.parallelAxisDefault;Ka(e,"axisLabel"),Ka(e&&e.axisPointer,"label")}),zx(ts(t.calendar),function(t){qa(t,"itemStyle"),Ka(t,"dayLabel"),Ka(t,"monthLabel"),Ka(t,"yearLabel")}),zx(ts(t.radar),function(t){Ka(t,"name")}),zx(ts(t.geo),function(t){Bx(t)&&(Qa(t),zx(ts(t.regions),function(t){Qa(t)}))}),zx(ts(t.timeline),function(t){Qa(t),qa(t,"label"),qa(t,"itemStyle"),qa(t,"controlStyle",!0);var 
e=t.data;_(e)&&f(e,function(t){M(t)&&(qa(t,"label"),qa(t,"itemStyle"))})}),zx(ts(t.toolbox),function(t){qa(t,"iconStyle"),zx(t.feature,function(t){qa(t,"iconStyle")})}),Ka(es(t.axisPointer),"label"),Ka(es(t.tooltip).axisPointer,"label")},Fx=[["x","left"],["y","top"],["x2","right"],["y2","bottom"]],Hx=["grid","geo","parallel","legend","toolbox","title","visualMap","dataZoom","timeline"],Wx=function(t,e){Vx(t,e),t.series=Li(t.series),f(t.series,function(t){if(M(t)){var e=t.type;if(("pie"===e||"gauge"===e)&&null!=t.clockWise&&(t.clockwise=t.clockWise),"gauge"===e){var n=ns(t,"pointer.color");null!=n&&is(t,"itemStyle.normal.color",n)}rs(t)}}),t.dataRange&&(t.visualMap=t.dataRange),f(Hx,function(e){var n=t[e];n&&(_(n)||(n=[n]),f(n,function(t){rs(t)}))})},Gx=function(t){var e=N();t.eachSeries(function(t){var n=t.get("stack");if(n){var i=e.get(n)||e.set(n,[]),r=t.getData(),o={stackResultDimension:r.getCalculationInfo("stackResultDimension"),stackedOverDimension:r.getCalculationInfo("stackedOverDimension"),stackedDimension:r.getCalculationInfo("stackedDimension"),stackedByDimension:r.getCalculationInfo("stackedByDimension"),isStackedByIndex:r.getCalculationInfo("isStackedByIndex"),data:r,seriesModel:t};if(!o.stackedDimension||!o.isStackedByIndex&&!o.stackedByDimension)return;i.length&&r.setCalculationInfo("stackedOnSeries",i[i.length-1].seriesModel),i.push(o)}}),e.each(os)},Ux=as.prototype;Ux.pure=!1,Ux.persistent=!0,Ux.getSource=function(){return this._source};var Zx={arrayRows_column:{pure:!0,count:function(){return Math.max(0,this._data.length-this._source.startIndex)},getItem:function(t){return this._data[t+this._source.startIndex]},appendData:us},arrayRows_row:{pure:!0,count:function(){var t=this._data[0];return t?Math.max(0,t.length-this._source.startIndex):0},getItem:function(t){t+=this._source.startIndex;for(var e=[],n=this._data,i=0;i=1)&&(t=1),t}var n=this._upstream,i=t&&t.skip;if(this._dirty&&n){var 
r=this.context;r.data=r.outputData=n.context.outputData}this.__pipeline&&(this.__pipeline.currentTask=this);var o;this._plan&&!i&&(o=this._plan(this.context));var a=e(this._modBy),s=this._modDataCount||0,l=e(t&&t.modBy),u=t&&t.modDataCount||0;(a!==l||s!==u)&&(o="reset");var h;(this._dirty||"reset"===o)&&(this._dirty=!1,h=ys(this,i)),this._modBy=l,this._modDataCount=u;var c=t&&t.step;if(this._dueEnd=n?n._outputDueEnd:this._count?this._count(this.context):1\/0,this._progress){var d=this._dueIndex,f=Math.min(null!=c?this._dueIndex+c:1\/0,this._dueEnd);if(!i&&(h||f>d)){var p=this._progress;if(_(p))for(var g=0;gi?i++:null}function e(){var t=i%a*r+Math.ceil(i\/a),e=i>=n?null:o>t?t:i;return i++,e}var n,i,r,o,a,s={reset:function(l,u,h,c){i=l,n=u,r=h,o=c,a=Math.ceil(o\/r),s.next=r>1&&o>0?e:t}};return s}();$x.dirty=function(){this._dirty=!0,this._onDirty&&this._onDirty(this.context)},$x.unfinished=function(){return this._progress&&this._dueIndex":"")+s.join(a?"":", ")}function i(t){return ia(ea(t))}var r=this.getData(),o=r.mapDimension("defaultedTooltip",!0),a=o.length,s=this.getRawValue(t),l=_(s),u=r.getItemVisual(t,"color");M(u)&&u.colorStops&&(u=(u.colorStops[0]||{}).color),u=u||"transparent";var h=a>1||l&&!a?n(s):i(a?fs(r,t,o[0]):l?s[0]:s),c=aa(u),d=r.getName(t),p=this.name;return Ni(this)||(p=""),p=p?ia(p)+(e?": ":""):"",e?c+p+h:p+c+(d?ia(d)+": "+h:h)},isAnimationEnabled:function(){if(Jp.node)return!1;var t=this.getShallow("animation");return t&&this.getData().count()>this.getShallow("animationThreshold")&&(t=!1),t},restoreData:function(){this.dataTask.dirty()},getColorFromPalette:function(t,e,n){var i=this.ecModel,r=gx.getColorFromPalette.call(this,t,e,n);return r||(r=i.getColorFromPalette(t,e,n)),r},coordDimToDataDim:function(t){return this.getRawData().mapDimension(t,!0)},getProgressive:function(){return this.get("progressive")},getProgressiveThreshold:function(){return 
this.get("progressiveThreshold")},getAxisTooltipData:null,getTooltipPosition:null,pipeTask:null,preventIncremental:null,pipelineContext:null});c(Jx,qx),c(Jx,gx);var t_=function(){this.group=new nv,this.uid=Ro("viewComponent")};t_.prototype={constructor:t_,init:function(){},render:function(){},dispose:function(){}};var e_=t_.prototype;e_.updateView=e_.updateLayout=e_.updateVisual=function(){},Yi(t_),Qi(t_,{registerWhenExtend:!0});var n_=function(){var t=Hi();return function(e){var n=t(e),i=e.pipelineContext,r=n.large,o=n.progressiveRender,a=n.large=i.large,s=n.progressiveRender=i.progressiveRender;return!!(r^a||o^s)&&"reset"}},i_=Hi(),r_=n_();Ts.prototype={type:"chart",init:function(){},render:function(){},highlight:function(t,e,n,i){Ds(t.getData(),i,"emphasis")},downplay:function(t,e,n,i){Ds(t.getData(),i,"normal")},remove:function(){this.group.removeAll()},dispose:function(){},incrementalPrepareRender:null,incrementalRender:null,updateTransform:null};var o_=Ts.prototype;o_.updateView=o_.updateLayout=o_.updateVisual=function(t,e,n,i){this.render(t,e,n,i)},Yi(Ts,["dispose"]),Qi(Ts,{registerWhenExtend:!0}),Ts.markUpdateMethod=function(t,e){i_(t).updateMethod=e};var a_={incrementalPrepareRender:{progress:function(t,e){e.view.incrementalRender(t,e.model,e.ecModel,e.api,e.payload)}},render:{forceFirstProgress:!0,progress:function(t,e){e.view.render(e.model,e.ecModel,e.api,e.payload)}}},s_="\\x00__throttleOriginMethod",l_="\\x00__throttleRate",u_="\\x00__throttleType",h_={createOnAllSeries:!0,performRawSeries:!0,reset:function(t,e){var n=t.getData(),i=(t.visualColorAccessPath||"itemStyle.color").split("."),r=t.get(i)||t.getColorFromPalette(t.name,null,e.getSeriesCount());if(n.setVisual("color",r),!e.isSeriesFiltered(t)){"function"!=typeof r||r instanceof ky||n.each(function(e){n.setItemVisual(e,"color",r(t.getDataParams(e)))});var o=function(t,e){var 
n=t.getItemModel(e),r=n.get(i,!0);null!=r&&t.setItemVisual(e,"color",r)};return{dataEach:n.hasItemOption?o:null}}}},c_={toolbox:{brush:{title:{rect:"矩形选择",polygon:"圈选",lineX:"横向选择",lineY:"纵向选择",keep:"保持选择",clear:"清除选择"}},dataView:{title:"数据视图",lang:["数据视图","关闭","刷新"]},dataZoom:{title:{zoom:"区域缩放",back:"区域缩放还原"}},magicType:{title:{line:"切换为折线图",bar:"切换为柱状图",stack:"切换为堆叠",tiled:"切换为平铺"}},restore:{title:"还原"},saveAsImage:{title:"保存为图片",lang:["右键另存为图片"]}},series:{typeNames:{pie:"饼图",bar:"柱状图",line:"折线图",scatter:"散点图",effectScatter:"涟漪散点图",radar:"雷达图",tree:"树图",treemap:"矩形树图",boxplot:"箱型图",candlestick:"K线图",k:"K线图",heatmap:"热力图",map:"地图",parallel:"平行坐标图",lines:"线图",graph:"关系图",sankey:"桑基图",funnel:"漏斗图",gauge:"仪表盘图",pictorialBar:"象形柱图",themeRiver:"主题河流图",sunburst:"旭日图"}},aria:{general:{withTitle:"这是一个关于“{title}”的图表。",withoutTitle:"这是一个图表,"},series:{single:{prefix:"",withName:"图表类型是{seriesType},表示{seriesName}。",withoutName:"图表类型是{seriesType}。"},multiple:{prefix:"它由{seriesCount}个图表系列组成。",withName:"第{seriesId}个系列是一个表示{seriesName}的{seriesType},",withoutName:"第{seriesId}个系列是一个{seriesType},",separator:{middle:";",end:"。"}}},data:{allData:"其数据是——",partialData:"其中,前{displayCnt}项是——",withName:"{name}的数据是{value}",withoutName:"{value}",separator:{middle:",",end:""}}}},d_=function(t,e){function n(t,e){if("string"!=typeof t)return t;var n=t;return f(e,function(t,e){n=n.replace(new RegExp("\\\\{\\\\s*"+e+"\\\\s*\\\\}","g"),t)}),n}function i(t){var e=a.get(t);if(null==e){for(var n=t.split("."),i=c_.aria,r=0;rs)){var d=r();l=d?n(i("general.withTitle"),{title:d}):i("general.withoutTitle");var p=[],g=s>1?"series.multiple.prefix":"series.single.prefix";l+=n(i(g),{seriesCount:s}),e.eachSeries(function(t,e){if(c>e){var r,a=t.get("name"),l="series."+(s>1?"multiple":"single")+".";r=i(a?l+"withName":l+"withoutName"),r=n(r,{seriesId:t.seriesIndex,seriesName:t.get("name"),seriesType:o(t.subType)});var 
h=t.getData();window.data=h,r+=h.count()>u?n(i("data.partialData"),{displayCnt:u}):i("data.allData");for(var d=[],f=0;ff){var g=h.getName(f),v=fs(h,f);d.push(n(i(g?"data.withName":"data.withoutName"),{name:g,value:v}))}r+=d.join(i("data.separator.middle"))+i("data.separator.end"),p.push(r)}}),l+=p.join(i("series.multiple.separator.middle"))+i("series.multiple.separator.end"),t.setAttribute("aria-label",l)}}},f_=Math.PI,p_=function(t,e){e=e||{},s(e,{text:"loading",color:"#c23531",textColor:"#000",maskColor:"rgba(255, 255, 255, 0.8)",zlevel:0});var n=new Sy({style:{fill:e.maskColor},zlevel:e.zlevel,z:1e4}),i=new Ay({shape:{startAngle:-f_\/2,endAngle:-f_\/2+.1,r:10},style:{stroke:e.color,lineCap:"round",lineWidth:5},zlevel:e.zlevel,z:10001}),r=new Sy({style:{fill:"none",text:e.text,textPosition:"right",textDistance:10,textFill:e.textColor},zlevel:e.zlevel,z:10001});i.animateShape(!0).when(1e3,{endAngle:3*f_\/2}).start("circularInOut"),i.animateShape(!0).when(1e3,{startAngle:3*f_\/2}).delay(300).start("circularInOut");var o=new nv;return o.add(i),o.add(r),o.add(n),o.resize=function(){var e=t.getWidth()\/2,o=t.getHeight()\/2;i.setShape({cx:e,cy:o});var a=i.shape.r;r.setShape({x:e-a,y:o-a,width:2*a,height:2*a}),n.setShape({x:0,y:0,width:t.getWidth(),height:t.getHeight()})},o.resize(),o},g_=Es.prototype;g_.restoreData=function(t,e){t.restoreData(e),this._stageTaskMap.each(function(t){var e=t.overallTask;e&&e.dirty()})},g_.getPerformArgs=function(t,e){if(t.__pipeline){var n=this._pipelineMap.get(t.__pipeline.id),i=n.context,r=!e&&n.progressiveEnabled&&(!i||i.progressiveRender)&&t.__idxInPipeline>n.blockIndex,o=r?n.step:null,a=i&&i.modDataCount,s=null!=a?Math.ceil(a\/o):null;return{step:o,modBy:s,modDataCount:a}}},g_.getPipeline=function(t){return this._pipelineMap.get(t)},g_.updateStreamModes=function(t,e){var 
n=this._pipelineMap.get(t.uid),i=t.getData(),r=i.count(),o=n.progressiveEnabled&&e.incrementalPrepareRender&&r>=n.threshold,a=t.get("large")&&r>=t.get("largeThreshold"),s="mod"===t.get("progressiveChunkMode")?r:null;t.pipelineContext=n.context={progressiveRender:o,modDataCount:s,large:a}},g_.restorePipelines=function(t){var e=this,n=e._pipelineMap=N();t.eachSeries(function(t){var i=t.getProgressive(),r=t.uid;n.set(r,{id:r,head:null,tail:null,threshold:t.getProgressiveThreshold(),progressiveEnabled:i&&!(t.preventIncremental&&t.preventIncremental()),blockIndex:-1,step:Math.round(i||700),count:0}),js(e,t,t.dataTask)})},g_.prepareStageTasks=function(){var t=this._stageTaskMap,e=this.ecInstance.getModel(),n=this.api;f(this._allHandlers,function(i){var r=t.get(i.uid)||t.set(i.uid,[]);i.reset&&zs(this,i,r,e,n),i.overallReset&&Bs(this,i,r,e,n)},this)},g_.prepareView=function(t,e,n,i){var r=t.renderTask,o=r.context;o.model=e,o.ecModel=n,o.api=i,r.__block=!t.incrementalPrepareRender,js(this,e,r)},g_.performDataProcessorTasks=function(t,e){Rs(this,this._dataProcessorHandlers,t,e,{block:!0})},g_.performVisualTasks=function(t,e,n){Rs(this,this._visualHandlers,t,e,n)},g_.performSeriesTasks=function(t){var e;t.eachSeries(function(t){e|=t.dataTask.perform()}),this.unfinished|=e},g_.plan=function(){this._pipelineMap.each(function(t){var e=t.tail;do{if(e.__block){t.blockIndex=e.__idxInPipeline;break}e=e.getUpstream()}while(e)})};var v_=g_.updatePayload=function(t,e){"remain"!==e&&(t.context.payload=e)},m_=Us(0);Es.wrapStageHandler=function(t,e){return w(t)&&(t={overallReset:t,seriesType:Xs(t)}),t.uid=Ro("stageHandler"),e&&(t.visualType=e),t};var y_,x_={},__={};Ys(x_,Ax),Ys(__,Fa),x_.eachSeriesByType=x_.eachRawSeriesByType=function(t){y_=t},x_.eachComponent=function(t){"series"===t.mainType&&t.subType&&(y_=t.subType)};var 
w_=["#37A2DA","#32C5E9","#67E0E3","#9FE6B8","#FFDB5C","#ff9f7f","#fb7293","#E062AE","#E690D1","#e7bcf3","#9d96f5","#8378EA","#96BFFF"],b_={color:w_,colorLayer:[["#37A2DA","#ffd85c","#fd7b5f"],["#37A2DA","#67E0E3","#FFDB5C","#ff9f7f","#E062AE","#9d96f5"],["#37A2DA","#32C5E9","#9FE6B8","#FFDB5C","#ff9f7f","#fb7293","#e7bcf3","#8378EA","#96BFFF"],w_]},M_="#eee",S_=function(){return{axisLine:{lineStyle:{color:M_}},axisTick:{lineStyle:{color:M_}},axisLabel:{textStyle:{color:M_}},splitLine:{lineStyle:{type:"dashed",color:"#aaa"}},splitArea:{areaStyle:{color:M_}}}},I_=["#dd6b66","#759aa0","#e69d87","#8dc1a9","#ea7e53","#eedd78","#73a373","#73b9bc","#7289ab","#91ca8c","#f49f42"],C_={color:I_,backgroundColor:"#333",tooltip:{axisPointer:{lineStyle:{color:M_},crossStyle:{color:M_}}},legend:{textStyle:{color:M_}},textStyle:{color:M_},title:{textStyle:{color:M_}},toolbox:{iconStyle:{normal:{borderColor:M_}}},dataZoom:{textStyle:{color:M_}},visualMap:{textStyle:{color:M_}},timeline:{lineStyle:{color:M_},itemStyle:{normal:{color:I_[1]}},label:{normal:{textStyle:{color:M_}}},controlStyle:{normal:{color:M_,borderColor:M_}}},timeAxis:S_(),logAxis:S_(),valueAxis:S_(),categoryAxis:S_(),line:{symbol:"circle"},graph:{color:I_},gauge:{title:{textStyle:{color:M_}}},candlestick:{itemStyle:{normal:{color:"#FD1050",color0:"#0CF49B",borderColor:"#FD1050",borderColor0:"#0CF49B"}}}};C_.categoryAxis.splitLine.show=!1,cx.extend({type:"dataset",defaultOption:{seriesLayoutBy:Sx,sourceHeader:null,dimensions:null,source:null},optionUpdated:function(){wa(this)}}),t_.extend({type:"dataset"});var T_=O,A_=f,D_=w,k_=M,P_=cx.parseClassType,L_="4.1.0",O_={zrender:"4.0.4"},E_=1,R_=1e3,z_=5e3,B_=1e3,N_=2e3,V_=3e3,F_=4e3,H_=5e3,W_={PROCESSOR:{FILTER:R_,STATISTIC:z_},VISUAL:{LAYOUT:B_,GLOBAL:N_,CHART:V_,COMPONENT:F_,BRUSH:H_}},G_="__flagInMainProcess",U_="__optionUpdated",Z_=\/^[a-zA-Z0-9_]+$\/;$s.prototype.on=qs("on"),$s.prototype.off=qs("off"),$s.prototype.one=qs("one"),c($s,wg);var 
j_=Ks.prototype;j_._onframe=function(){if(!this._disposed){var t=this._scheduler;if(this[U_]){var e=this[U_].silent;this[G_]=!0,Js(this),X_.update.call(this),this[G_]=!1,this[U_]=!1,il.call(this,e),rl.call(this,e)}else if(t.unfinished){var n=E_,i=this._model,r=this._api;t.unfinished=!1;do{var o=+new Date;t.performSeriesTasks(i),t.performDataProcessorTasks(i),el(this,i),t.performVisualTasks(i),hl(this,this._model,r,"remain"),n-=+new Date-o}while(n>0&&t.unfinished);t.unfinished||this._zr.flush()}}},j_.getDom=function(){return this._dom},j_.getZr=function(){return this._zr},j_.setOption=function(t,e,n){var i;if(k_(e)&&(n=e.lazyUpdate,i=e.silent,e=e.notMerge),this[G_]=!0,!this._model||e){var r=new Wa(this._api),o=this._theme,a=this._model=new Ax(null,null,o,r);$/;" l language:C++ +A saber/funcs/impl/amd/lib/wino_conv_3x3.so /^,N/;" v language:C++ +A saber/funcs/impl/amd/lib/wino_conv_3x3.so /^/;" v language:C++ +A$ tags /^TITLE LICENSE \/^ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A$\/;" v language:C++$/;" v language:C++ +A2 tools/external_converter_v2/parser/frontend/dash_board/static/bootstraptest/fonts/glyphicons-halflings-regular.eot /^jC.4Ya_{eA2=r+/;" v language:C++ +ABORT_S utils/logger/logger.h 80;" d language:C++ +ACQUIRED_AFTER framework/core/thread_safe_macros.h 44;" d language:C++ +ACQUIRED_AFTER framework/core/thread_safe_macros.h 82;" d language:C++ +ACQUIRED_BEFORE framework/core/thread_safe_macros.h 45;" d language:C++ +ACQUIRED_BEFORE framework/core/thread_safe_macros.h 83;" d language:C++ +ACROSS_CHANNELS saber/saber_types.h /^ ACROSS_CHANNELS = 0,$/;" e language:C++ enum:anakin::saber::__anon23 +ACTIVATION saber/funcs/impl/bm/device/bmkernel_base.h /^ ACTIVATION, $/;" e language:C++ enum:BmOpType +ACTIVATION saber/funcs/impl/cuda/cuda_inline_activation.h /^struct ACTIVATION {$/;" s language:C++ namespace:anakin::saber +ACTIVATION saber/funcs/impl/x86/saber_normal_activation.h /^struct ACTIVATION {$/;" s language:C++ 
namespace:anakin::saber +ACTIVATION test/saber/test_saber_gru.cpp /^struct ACTIVATION{$/;" s language:C++ file: +ACTIVATION test/saber/test_saber_lstm.cpp /^struct ACTIVATION{$/;" s language:C++ file: +ACTIVATION::Act test/saber/test_saber_gru.cpp /^ typedef Dtype(*Act)(const Dtype);$/;" t language:C++ struct:ACTIVATION file: access:public +ACTIVATION::Act test/saber/test_saber_lstm.cpp /^ typedef Dtype(*Act)(const Dtype);$/;" t language:C++ struct:ACTIVATION file: access:public +ADD framework/graph/llvm/base.h /^ ADD = -7, \/\/\/< -7 stand for ADD operation$/;" e language:C++ enum:anakin::graph::OpType +AK_ALIGNED framework/core/common_macros.h 44;" d language:C++ +AK_ALIGNED framework/core/common_macros.h 46;" d language:C++ +AK_ALIGNED framework/core/common_macros.h 48;" d language:C++ +AK_ATTRIBUTE_UNUSED framework/core/common_macros.h 58;" d language:C++ +AK_ATTRIBUTE_UNUSED framework/core/common_macros.h 60;" d language:C++ +AK_BOOL saber/saber_types.h /^ AK_BOOL = 11,$/;" e language:C++ enum:anakin::saber::DataType +AK_CONCAT framework/core/common_macros.h 76;" d language:C++ +AK_CONCAT_IMPL framework/core/common_macros.h 74;" d language:C++ +AK_DEPRECATED framework/core/common_macros.h 28;" d language:C++ +AK_DEPRECATED framework/core/common_macros.h 30;" d language:C++ +AK_DEPRECATED framework/core/common_macros.h 32;" d language:C++ +AK_DOUBLE saber/saber_types.h /^ AK_DOUBLE = 2,$/;" e language:C++ enum:anakin::saber::DataType +AK_EXPORT framework/core/common_macros.h 20;" d language:C++ +AK_EXPORT framework/core/common_macros.h 22;" d language:C++ +AK_EXPORT framework/core/common_macros.h 24;" d language:C++ +AK_FLOAT saber/saber_types.h /^ AK_FLOAT = 1,$/;" e language:C++ enum:anakin::saber::DataType +AK_HALF saber/saber_types.h /^ AK_HALF = 0,$/;" e language:C++ enum:anakin::saber::DataType +AK_INT16 saber/saber_types.h /^ AK_INT16 = 4,$/;" e language:C++ enum:anakin::saber::DataType +AK_INT32 saber/saber_types.h /^ AK_INT32 = 5,$/;" e language:C++ 
enum:anakin::saber::DataType +AK_INT64 saber/saber_types.h /^ AK_INT64 = 6,$/;" e language:C++ enum:anakin::saber::DataType +AK_INT8 saber/saber_types.h /^ AK_INT8 = 3,$/;" e language:C++ enum:anakin::saber::DataType +AK_INVALID saber/saber_types.h /^ AK_INVALID = -1,$/;" e language:C++ enum:anakin::saber::DataType +AK_MAKE_UNIQ_OPERATOR_NAME framework/core/common_macros.h 80;" d language:C++ +AK_NORETURN framework/core/common_macros.h 36;" d language:C++ +AK_NORETURN framework/core/common_macros.h 38;" d language:C++ +AK_NORETURN framework/core/common_macros.h 40;" d language:C++ +AK_NO_NULL framework/core/common_macros.h 52;" d language:C++ +AK_NO_NULL framework/core/common_macros.h 54;" d language:C++ +AK_SHAPE saber/saber_types.h /^ AK_SHAPE = 12,$/;" e language:C++ enum:anakin::saber::DataType +AK_STRING saber/saber_types.h /^ AK_STRING = 10,$/;" e language:C++ enum:anakin::saber::DataType +AK_TENSOR saber/saber_types.h /^ AK_TENSOR = 13$/;" e language:C++ enum:anakin::saber::DataType +AK_THREAD_LOCAL framework/core/common_macros.h 65;" d language:C++ +AK_THREAD_LOCAL framework/core/common_macros.h 67;" d language:C++ +AK_THREAD_LOCAL framework/core/common_macros.h 71;" d language:C++ +AK_UINT16 saber/saber_types.h /^ AK_UINT16 = 8,$/;" e language:C++ enum:anakin::saber::DataType +AK_UINT32 saber/saber_types.h /^ AK_UINT32 = 9,$/;" e language:C++ enum:anakin::saber::DataType +AK_UINT8 saber/saber_types.h /^ AK_UINT8 = 7,$/;" e language:C++ enum:anakin::saber::DataType +AK_UNIQ_NAME framework/core/common_macros.h 78;" d language:C++ +ALIGN saber/funcs/impl/bm/device/bm_common.h 33;" d language:C++ +ALIGN32_BEG saber/funcs/impl/x86/saber_avx2_math.h 49;" d language:C++ +ALIGN32_END saber/funcs/impl/x86/saber_avx2_math.h 50;" d language:C++ +ALLOC_ROUND saber/lite/funcs/neon/impl/sgemm_arm.cpp /^const int ALLOC_ROUND = 128;$/;" m language:C++ namespace:anakin::saber::lite file: +AMD docs/Manual/addCustomDevice.md /^typedef TargetType AMD;$/;" t language:C++ file: 
+AMD saber/saber_types.h /^typedef TargetType AMD;$/;" t language:C++ namespace:anakin::saber +AMD_API saber/core/impl/amd/amd_device.cpp /^typedef TargetWrapper AMD_API;$/;" t language:C++ namespace:anakin::saber file: +AMD_API saber/core/impl/amd/amd_env.cpp /^typedef TargetWrapper AMD_API;$/;" t language:C++ namespace:anakin::saber file: +AMD_API saber/core/impl/amd/amd_impl.cpp /^typedef TargetWrapper AMD_API;$/;" t language:C++ namespace:anakin::saber file: +AMD_API saber/core/impl/amd/tensor_op_amd.cpp /^typedef TargetWrapper AMD_API;$/;" t language:C++ namespace:anakin::saber file: +AMD_API saber/funcs/impl/amd/saber_activation.cpp /^typedef TargetWrapper AMD_API;$/;" t language:C++ namespace:anakin::saber file: +AMD_API saber/funcs/timer.h /^typedef TargetWrapper AMD_API;$/;" t language:C++ namespace:anakin::saber +AMD_CHECK saber/core/common.h 161;" d language:C++ +AMD_CHECK_MSG saber/core/common.h 153;" d language:C++ +AMD_ENV saber/core/impl/amd/amd_env.cpp /^typedef Env AMD_ENV;$/;" t language:C++ namespace:anakin::saber file: +AMD_ENV saber/core/impl/amd/amd_impl.cpp /^typedef Env AMD_ENV;$/;" t language:C++ namespace:anakin::saber file: +AMD_GPU_EXTENSION saber/core/impl/amd/amd_impl.cpp 10;" d language:C++ file: +ANAKIN2_UTILS_TEST_TENSOR_OPS_H utils/unit_test/tensor_ops.h 17;" d language:C++ +ANAKIN_ALGO_H framework/graph/algorithm.h 17;" d language:C++ +ANAKIN_ANY_H framework/core/any.h 17;" d language:C++ +ANAKIN_ARC_H framework/graph/arc.h 17;" d language:C++ +ANAKIN_BASE_H framework/core/base.h 17;" d language:C++ +ANAKIN_COMMON_MACROS_H framework/core/common_macros.h 17;" d language:C++ +ANAKIN_CONV_FUNC_HELPER_H test/saber/conv_func_helper.h 17;" d language:C++ +ANAKIN_DATA_TYPES_H framework/core/data_types.h 17;" d language:C++ +ANAKIN_DEVICE_INFO_H framework/service/device_info.h 17;" d language:C++ +ANAKIN_FACTORY_H framework/core/factory.h 17;" d language:C++ +ANAKIN_FRAMEWORK_LITE_BINARY_WRITTER_H framework/lite/binary_writter.h 17;" d 
language:C++ +ANAKIN_FRAMEWORK_LITE_CODE_GENERATE_CPP_H framework/lite/code_gen_cpp.h 17;" d language:C++ +ANAKIN_FRAMEWORK_LITE_CODE_GEN_BASE_H framework/lite/code_gen_base.h 17;" d language:C++ +ANAKIN_FRAMEWORK_LITE_CODE_WRITTER_H framework/lite/code_writter.h 17;" d language:C++ +ANAKIN_FRAMEWORK_LITE_FILE_STREAM_H framework/lite/file_stream.h 17;" d language:C++ +ANAKIN_FRAMEWORK_LITE_OPERATION_MAP_H framework/lite/op_map.h 17;" d language:C++ +ANAKIN_FRAMEWORK_OPERATOR_ARGMAX_H framework/operators/arg_max.h 17;" d language:C++ +ANAKIN_FRAMEWORK_UTILS_PARAMETER_FUSION_H framework/utils/parameter_fusion.h 17;" d language:C++ +ANAKIN_FUNCTOR_H framework/core/functor.h 17;" d language:C++ +ANAKIN_GRAPH_BASE_H framework/graph/graph_base.h 17;" d language:C++ +ANAKIN_GRAPH_GLOBAL_MEM_H framework/graph/graph_global_mem.h 17;" d language:C++ +ANAKIN_GRAPH_H framework/graph/graph.h 17;" d language:C++ +ANAKIN_GRAPH_TEST_H test/framework/core/core_test.h 17;" d language:C++ +ANAKIN_GRAPH_TEST_H test/framework/graph/graph_test.h 17;" d language:C++ +ANAKIN_LLVM_BASE_H framework/graph/llvm/base.h 17;" d language:C++ +ANAKIN_LLVM_FUSION_GRAPH_PATTERN_H framework/graph/llvm/fusion/graph_pattern.h 17;" d language:C++ +ANAKIN_LLVM_SCHEDULER_BASE_H framework/graph/llvm/schedule_base.h 17;" d language:C++ +ANAKIN_LLVM_SCHEDULER_CONV_ELEWISE_FUSION_H framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.h 17;" d language:C++ +ANAKIN_LLVM_SCHEDULER_H framework/graph/llvm/scheduler.h 17;" d language:C++ +ANAKIN_LLVM_SCHEDULER_MEMORY_H framework/graph/llvm/optimizer/memory_scheduler.h 17;" d language:C++ +ANAKIN_LLVM_SCHEDULER_PARALLEL_H framework/graph/llvm/optimizer/parall_scheduler.h 17;" d language:C++ +ANAKIN_LLVM_VIRTUAL_GRAPH_H framework/graph/llvm/virtual_graph.h 17;" d language:C++ +ANAKIN_MEM_INFO_H framework/core/mem_info.h 17;" d language:C++ +ANAKIN_MODEL_IO_H framework/model_parser/parser/model_io.h 17;" d language:C++ +ANAKIN_MODEL_PARSER_H 
framework/model_parser/parser/parser.h 17;" d language:C++ +ANAKIN_MONITOR_H framework/service/monitor.h 17;" d language:C++ +ANAKIN_NET_H framework/core/net/net.h 17;" d language:C++ +ANAKIN_NET_TEST_H test/framework/net/net_test.h 17;" d language:C++ +ANAKIN_NODE_H framework/graph/node.h 17;" d language:C++ +ANAKIN_OPERATORS_H framework/operators/ops.h 17;" d language:C++ +ANAKIN_OPERATOR_ACTIVATION_H framework/operators/activation.h 17;" d language:C++ +ANAKIN_OPERATOR_ATTR_H framework/core/operator/operator_attr.h 17;" d language:C++ +ANAKIN_OPERATOR_AXPY_H framework/operators/axpy.h 17;" d language:C++ +ANAKIN_OPERATOR_BATCHNORM_SCALE_H framework/operators/fusion_ops/batchnorm_scale.h 17;" d language:C++ +ANAKIN_OPERATOR_BATCH_NORM_H framework/operators/batch_norm.h 17;" d language:C++ +ANAKIN_OPERATOR_CONCAT_H framework/operators/concat.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_BATCHNORM_SCALE_H framework/operators/fusion_ops/conv_batchnorm_scale.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_BATCHNORM_SCALE_RELU_H framework/operators/fusion_ops/conv_batchnorm_scale_relu.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_BATCHNORM_SCALE_RELU_POOL_H framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_H framework/operators/convolution.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_RELU_H framework/operators/fusion_ops/conv_relu.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_RELU_POOL_H framework/operators/fusion_ops/conv_relu_pool.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_H framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_RELU_H framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_RELU_POOL_H framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_SASS_H framework/operators/conv_3x3.h 17;" d language:C++ 
+ANAKIN_OPERATOR_CONV_SASS_RELU_H framework/operators/fusion_ops/conv_3x3_relu.h 17;" d language:C++ +ANAKIN_OPERATOR_CONV_SASS_RELU_POOL_H framework/operators/fusion_ops/conv_3x3_relu_pool.h 17;" d language:C++ +ANAKIN_OPERATOR_CRF_DECODING_H framework/operators/crf_decoding.h 17;" d language:C++ +ANAKIN_OPERATOR_CTC_ALIGN_H framework/operators/ctc_align.h 17;" d language:C++ +ANAKIN_OPERATOR_DECONV_BATCHNORM_SCALE_RELU_H framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h 17;" d language:C++ +ANAKIN_OPERATOR_DECONV_H framework/operators/deconvolution.h 17;" d language:C++ +ANAKIN_OPERATOR_DECONV_RELU_H framework/operators/fusion_ops/deconv_relu.h 17;" d language:C++ +ANAKIN_OPERATOR_DEFORMABLE_CONV_H framework/operators/deformconvolution.h 17;" d language:C++ +ANAKIN_OPERATOR_DENSE_H framework/operators/dense.h 17;" d language:C++ +ANAKIN_OPERATOR_DETECTION_OUTPUT_H framework/operators/detection_output.h 16;" d language:C++ +ANAKIN_OPERATOR_ELTWISE_H framework/operators/eltwise_op.h 17;" d language:C++ +ANAKIN_OPERATOR_ELTWISE_RELU_H framework/operators/fusion_ops/eltwise_relu.h 17;" d language:C++ +ANAKIN_OPERATOR_EMBEDDING_H framework/operators/embedding.h 17;" d language:C++ +ANAKIN_OPERATOR_FLATTEN_H framework/operators/flatten.h 17;" d language:C++ +ANAKIN_OPERATOR_FUNC_H framework/core/net/operator_func.h 17;" d language:C++ +ANAKIN_OPERATOR_GATHER_H framework/operators/gather.h 17;" d language:C++ +ANAKIN_OPERATOR_GRU_H framework/operators/bk/gru.h 19;" d language:C++ +ANAKIN_OPERATOR_H framework/core/operator/operator.h 17;" d language:C++ +ANAKIN_OPERATOR_HELP_H framework/core/operator/operator_help.h 17;" d language:C++ +ANAKIN_OPERATOR_IM2SEQUENCE_H framework/operators/im2sequence.h 17;" d language:C++ +ANAKIN_OPERATOR_INPUT_H framework/operators/input.h 17;" d language:C++ +ANAKIN_OPERATOR_LAYER_NORM_H framework/operators/layer_norm.h 17;" d language:C++ +ANAKIN_OPERATOR_LRN_H framework/operators/lrn.h 17;" d language:C++ 
+ANAKIN_OPERATOR_LSTM_H framework/operators/lstm.h 19;" d language:C++ +ANAKIN_OPERATOR_NORMALIZE_H framework/operators/normalize.h 17;" d language:C++ +ANAKIN_OPERATOR_OUTPUT_H framework/operators/output.h 17;" d language:C++ +ANAKIN_OPERATOR_PERMUTE_H framework/operators/permute.h 17;" d language:C++ +ANAKIN_OPERATOR_PERMUTE_POWER_H framework/operators/fusion_ops/permute_power.h 17;" d language:C++ +ANAKIN_OPERATOR_POOLING_H framework/operators/pooling.h 17;" d language:C++ +ANAKIN_OPERATOR_POWER_H framework/operators/power.h 17;" d language:C++ +ANAKIN_OPERATOR_PRIORBOX_H framework/operators/priorbox.h 16;" d language:C++ +ANAKIN_OPERATOR_RELU_H framework/operators/relu.h 17;" d language:C++ +ANAKIN_OPERATOR_RESHAPE_H framework/operators/reshape.h 17;" d language:C++ +ANAKIN_OPERATOR_SCALE_H framework/operators/scale.h 17;" d language:C++ +ANAKIN_OPERATOR_SEQUENCE_CONV_H framework/operators/sequence_conv.h 17;" d language:C++ +ANAKIN_OPERATOR_SEQUENCE_POOL_H framework/operators/sequence_pool.h 17;" d language:C++ +ANAKIN_OPERATOR_SLICE_H framework/operators/slice.h 17;" d language:C++ +ANAKIN_OPERATOR_SOFTMAX_H framework/operators/softmax.h 17;" d language:C++ +ANAKIN_OPERATOR_SPLIT_H framework/operators/split.h 17;" d language:C++ +ANAKIN_OPERATOR_TESTS_H test/framework/operators/operator_tests.h 17;" d language:C++ +ANAKIN_OPERATOR_UTILS_H framework/core/operator/request.h 17;" d language:C++ +ANAKIN_PARAMETER_H framework/core/parameter.h 17;" d language:C++ +ANAKIN_PBLOCK_TO_TYPE_ID framework/core/data_types.h 120;" d language:C++ +ANAKIN_REGISTER_OP framework/core/operator/operator.h 275;" d language:C++ +ANAKIN_REGISTER_OP_HELPER docs/Manual/addCustomOp.md /^ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER docs/Manual/addCustomOp.md /^ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, 
Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/core/operator/operator.h 279;" d language:C++ +ANAKIN_REGISTER_OP_HELPER framework/operators/activation.cpp /^ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Activation, ActivationHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/activation.cpp /^ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Activation, ActivationHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/activation.cpp /^ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Activation, ActivationHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/activation.cpp /^ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Activation, ActivationHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/arg_max.cpp /^ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Argmax, ArgmaxHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/arg_max.cpp /^ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Argmax, ArgmaxHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/arg_max.cpp /^ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Argmax, ArgmaxHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/axpy.cpp 
/^ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Axpy, AxpyHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/axpy.cpp /^ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Axpy, AxpyHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/axpy.cpp /^ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Axpy, AxpyHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/batch_norm.cpp /^ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(BatchNorm, BatchNormHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/batch_norm.cpp /^ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(BatchNorm, BatchNormHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/batch_norm.cpp /^ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(BatchNorm, BatchNormHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/bk/gru.cpp /^ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Gru, GruHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/bk/gru.cpp /^ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Gru, GruHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/bk/gru.cpp /^ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Gru, GruHelper, X86, 
Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/concat.cpp /^ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Concat, ConcatHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/concat.cpp /^ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Concat, ConcatHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/concat.cpp /^ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Concat, ConcatHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/conv_3x3.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvolution, SassConvolutionHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/conv_3x3.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvolution, SassConvolutionHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/convolution.cpp /^ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Convolution, ConvolutionHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/convolution.cpp /^ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Convolution, ConvolutionHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/convolution.cpp /^ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Convolution, 
ConvolutionHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/convolution.cpp /^ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Convolution, ConvolutionHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/crf_decoding.cpp /^ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(CrfDecoding, CrfDecodingHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/crf_decoding.cpp /^ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(CrfDecoding, CrfDecodingHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/crf_decoding.cpp /^ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(CrfDecoding, CrfDecodingHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/ctc_align.cpp /^ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(CtcAlign, CtcAlignHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/ctc_align.cpp /^ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(CtcAlign, CtcAlignHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/deconvolution.cpp /^ANAKIN_REGISTER_OP_HELPER(Deconvolution, DeconvolutionHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Deconvolution, DeconvolutionHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/deconvolution.cpp /^ANAKIN_REGISTER_OP_HELPER(Deconvolution, DeconvolutionHelper, NV, Precision::FP32);$/;" p language:C++ 
namespace:anakin::ops file: signature:(Deconvolution, DeconvolutionHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/deformconvolution.cpp /^ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DeformConvolution, DeformConvolutionHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/deformconvolution.cpp /^ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DeformConvolution, DeformConvolutionHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/dense.cpp /^ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Dense, DenseHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/dense.cpp /^ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Dense, DenseHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/dense.cpp /^ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Dense, DenseHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/dense.cpp /^ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Dense, DenseHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/detection_output.cpp /^ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DetectionOutput, DetectionOutputHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/detection_output.cpp /^ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, NV, 
Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DetectionOutput, DetectionOutputHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/detection_output.cpp /^ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DetectionOutput, DetectionOutputHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/eltwise_op.cpp /^ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Eltwise, EltwiseHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/eltwise_op.cpp /^ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Eltwise, EltwiseHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/eltwise_op.cpp /^ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Eltwise, EltwiseHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/embedding.cpp /^ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Embedding, EmbeddingHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/embedding.cpp /^ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Embedding, EmbeddingHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/embedding.cpp /^ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Embedding, EmbeddingHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/flatten.cpp /^ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, ARM, 
Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Flatten, FlattenHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/flatten.cpp /^ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Flatten, FlattenHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/flatten.cpp /^ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Flatten, FlattenHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/batchnorm_scale.cpp /^ANAKIN_REGISTER_OP_HELPER(BatchnormScale, BatchnormScaleHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(BatchnormScale, BatchnormScaleHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/batchnorm_scale.cpp /^ANAKIN_REGISTER_OP_HELPER(BatchnormScale, BatchnormScaleHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(BatchnormScale, BatchnormScaleHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvBatchnormScale, SassConvBatchnormScaleHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvBatchnormScale, SassConvBatchnormScaleHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, 
ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvRelu, SassConvReluHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvRelu, SassConvReluHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops 
file: signature:(SassConvRelu, SassConvReluHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvReluPool, SassConvReluPoolHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvReluPool, SassConvReluPoolHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_3x3_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SassConvReluPool, SassConvReluPoolHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_batchnorm_scale.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvBatchnormScale, ConvBatchnormScaleHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_batchnorm_scale.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvBatchnormScale, ConvBatchnormScaleHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp 
/^ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvRelu, ConvReluHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvRelu, ConvReluHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvRelu, ConvReluHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvReluPool, ConvReluPoolHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER 
framework/operators/fusion_ops/conv_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvReluPool, ConvReluPoolHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/conv_relu_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ConvReluPool, ConvReluPoolHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/deconv_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(DeconvRelu, DeconvReluHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DeconvRelu, DeconvReluHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/deconv_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(DeconvRelu, DeconvReluHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(DeconvRelu, DeconvReluHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/eltwise_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(EltwiseRelu, EltwiseReluHelper, ARM, Precision::FP32) 
+ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/eltwise_relu.cpp /^ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(EltwiseRelu, EltwiseReluHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/permute_power.cpp /^ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(PermutePower, PermutePowerHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/fusion_ops/permute_power.cpp /^ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(PermutePower, PermutePowerHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, ARM, Precision::FP16);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, ARM, Precision::FP16) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, ARM, Precision::INT8);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, ARM, Precision::INT8) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, NV, Precision::FP16);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, NV, Precision::FP16) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, NV, Precision::FP32) 
+ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, NV, Precision::INT8);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, NV, Precision::INT8) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::FP16);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, X86, Precision::FP16) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/gather.cpp /^ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::INT8);$/;" p language:C++ namespace:anakin::ops file: signature:(Gather, GatherHelper, X86, Precision::INT8) +ANAKIN_REGISTER_OP_HELPER framework/operators/im2sequence.cpp /^ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Im2Sequence, Im2SequenceHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/im2sequence.cpp /^ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Im2Sequence, Im2SequenceHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/input.cpp /^ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Input, InputHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/input.cpp /^ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Input, InputHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/input.cpp /^ANAKIN_REGISTER_OP_HELPER(Input, 
InputHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Input, InputHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/input.cpp /^ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Input, InputHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/layer_norm.cpp /^ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(LayerNorm, LayerNormHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/layer_norm.cpp /^ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(LayerNorm, LayerNormHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/layer_norm.cpp /^ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(LayerNorm, LayerNormHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/lrn.cpp /^ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Lrn, LrnHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/lrn.cpp /^ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Lrn, LrnHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/lstm.cpp /^ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Lstm, LstmHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/lstm.cpp /^ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Lstm, LstmHelper, NV, Precision::FP32) 
+ANAKIN_REGISTER_OP_HELPER framework/operators/lstm.cpp /^ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Lstm, LstmHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/normalize.cpp /^ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Normalize, NormalizeHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/normalize.cpp /^ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Normalize, NormalizeHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/output.cpp /^ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Output, OutputHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/output.cpp /^ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Output, OutputHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/output.cpp /^ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Output, OutputHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/permute.cpp /^ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Permute, PermuteHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/permute.cpp /^ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Permute, PermuteHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/permute.cpp /^ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, 
X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Permute, PermuteHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/pooling.cpp /^ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Pooling, PoolingHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/pooling.cpp /^ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Pooling, PoolingHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/pooling.cpp /^ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Pooling, PoolingHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/pooling.cpp /^ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Pooling, PoolingHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/power.cpp /^ANAKIN_REGISTER_OP_HELPER(Power, PowerHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Power, PowerHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/power.cpp /^ANAKIN_REGISTER_OP_HELPER(Power, PowerHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Power, PowerHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/priorbox.cpp /^ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(PriorBox, PriorBoxHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/priorbox.cpp /^ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(PriorBox, PriorBoxHelper, NV, 
Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/priorbox.cpp /^ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(PriorBox, PriorBoxHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ReLU, ReLUHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ReLU, ReLUHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ReLU, ReLUHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/relu.cpp /^ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(ReLU, ReLUHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/reshape.cpp /^ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Reshape, ReshapeHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/reshape.cpp /^ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Reshape, ReshapeHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/reshape.cpp /^ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Reshape, ReshapeHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/scale.cpp /^ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, ARM, Precision::FP32);$/;" p 
language:C++ namespace:anakin::ops file: signature:(Scale, ScaleHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/scale.cpp /^ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Scale, ScaleHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/scale.cpp /^ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Scale, ScaleHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/sequence_conv.cpp /^ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SequenceConv, SequenceConvHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/sequence_conv.cpp /^ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SequenceConv, SequenceConvHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/sequence_conv.cpp /^ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SequenceConv, SequenceConvHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/sequence_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SequencePool, SequencePoolHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/sequence_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SequencePool, SequencePoolHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/sequence_pool.cpp /^ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, X86, 
Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(SequencePool, SequencePoolHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/slice.cpp /^ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Slice, SliceHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/slice.cpp /^ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Slice, SliceHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/slice.cpp /^ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Slice, SliceHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/softmax.cpp /^ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, AMD, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Softmax, SoftmaxHelper, AMD, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/softmax.cpp /^ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Softmax, SoftmaxHelper, ARM, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/softmax.cpp /^ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Softmax, SoftmaxHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/softmax.cpp /^ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Softmax, SoftmaxHelper, X86, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/split.cpp /^ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, ARM, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Split, SplitHelper, ARM, Precision::FP32) 
+ANAKIN_REGISTER_OP_HELPER framework/operators/split.cpp /^ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, NV, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Split, SplitHelper, NV, Precision::FP32) +ANAKIN_REGISTER_OP_HELPER framework/operators/split.cpp /^ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, X86, Precision::FP32);$/;" p language:C++ namespace:anakin::ops file: signature:(Split, SplitHelper, X86, Precision::FP32) +ANAKIN_SABER_AVX512_MATH_H saber/funcs/impl/x86/saber_avx512_math.h 13;" d language:C++ +ANAKIN_SABER_CORE_BUFFER_H saber/core/buffer.h 17;" d language:C++ +ANAKIN_SABER_CORE_COMMON_H saber/core/common.h 17;" d language:C++ +ANAKIN_SABER_CORE_CONTEXT_H saber/core/context.h 17;" d language:C++ +ANAKIN_SABER_CORE_DATA_TRAITS_H saber/core/data_traits.h 17;" d language:C++ +ANAKIN_SABER_CORE_DEVICE_H saber/core/device.h 17;" d language:C++ +ANAKIN_SABER_CORE_ENV_H saber/core/env.h 17;" d language:C++ +ANAKIN_SABER_CORE_EVENTS_H saber/core/events.h 17;" d language:C++ +ANAKIN_SABER_CORE_SHAPE_H saber/core/shape.h 17;" d language:C++ +ANAKIN_SABER_CORE_TARGET_TRAITS_H saber/core/target_traits.h 17;" d language:C++ +ANAKIN_SABER_CORE_TARGET_WRAPPER_H saber/core/target_wrapper.h 17;" d language:C++ +ANAKIN_SABER_CORE_TENSOR_H saber/core/tensor.h 17;" d language:C++ +ANAKIN_SABER_CORE_TYPES_H saber/saber_types.h 17;" d language:C++ +ANAKIN_SABER_CUDNN_HELPER_H saber/funcs/impl/cuda/cudnn_helper.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_ACTIVATION_H saber/funcs/activation.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_AMD_IMPL_H saber/funcs/impl/amd/amd_impl.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_ARGMAX_H saber/funcs/argmax.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H saber/funcs/impl/arm/impl/neon_mathfun.h 28;" d language:C++ +ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H saber/lite/funcs/neon/impl/neon_mathfun.h 28;" d language:C++ +ANAKIN_SABER_FUNCS_ARM_IMPL_SGEMV_ARM_H saber/lite/funcs/neon/impl/sgemv_arm.h 
15;" d language:C++ +ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H saber/funcs/impl/arm/impl/utils_arm.h 15;" d language:C++ +ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H saber/lite/funcs/utils_arm.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_AXPY_H saber/funcs/axpy.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_BASE_H saber/funcs/base.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_CALIBRATE_H saber/funcs/calibrate.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_CAST_H saber/funcs/cast.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_CONCAT_H saber/funcs/concat.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_CONV_ELTWISE_H saber/funcs/conv_eltwise.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_CONV_H saber/funcs/conv.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_CONV_POOLING_H saber/funcs/conv_pooling.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_CONV_UNPADDING_PADDING_H saber/funcs/conv_unpadding_padding.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_CRF_DECODING_H saber/funcs/crf_decoding.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_CROP_H saber/funcs/crop.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_CTC_ALIGN_H saber/funcs/ctc_align.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_CUDA_SABER_FC_H saber/funcs/impl/cuda/saber_fc.h 15;" d language:C++ +ANAKIN_SABER_FUNCS_CUDA_SABER_RESIZE_H saber/funcs/impl/cuda/saber_resize.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_CUDA_SABER_SLICE_H saber/funcs/impl/cuda/saber_slice.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_CUDA_SABER_TRANSPOSE_H saber/funcs/impl/cuda/saber_transpose.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_DEBUG_H saber/funcs/debug.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_DECONV_H saber/funcs/deconv.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_DEFORMABLE_CONV_H saber/funcs/deformable_conv.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_DETECTION_OUTPUT_H saber/funcs/detection_output.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_ELTWISE_H saber/funcs/eltwise.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_EMBEDDING_H saber/funcs/embedding.h 14;" d language:C++ +ANAKIN_SABER_FUNCS_FC_H saber/funcs/fc.h 
15;" d language:C++ +ANAKIN_SABER_FUNCS_FLATTEN_H saber/funcs/flatten.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_GEMM_H saber/funcs/gemm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_GRU_H saber/funcs/gru.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IM2SEQUENCE_H saber/funcs/im2sequence.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ACTIVATION_H saber/funcs/impl/impl_activation.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_AMD_SABER_ACTIVATION_H saber/funcs/impl/amd/saber_activation.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ARGMAX_H saber/funcs/impl/impl_argmax.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ARGMAX_H saber/funcs/impl/impl_axpy.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_ACTIVATION_H saber/funcs/impl/arm/saber_activation.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_CONCAT_H saber/funcs/impl/arm/saber_concat.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_BASE_IMPL_H saber/funcs/impl/impl_base.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_BM_CONV2D_H saber/funcs/impl/bm/vender_conv.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BMKERNEL_BASE_H saber/funcs/impl/bm/device/bmkernel_base.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_COMMON_H saber/funcs/impl/bm/device/bm_common.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_BM_DEVICE_BM_MEMMAP_H saber/funcs/impl/bm/device/bm_memmap.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_BOXCODER_H saber/funcs/impl/impl_box_coder.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CAST_H saber/funcs/impl/impl_cast.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CONCAT_H saber/funcs/impl/impl_concat.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CONV2D_H saber/funcs/impl/impl_conv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CONV_ELTWISE_H saber/funcs/impl/impl_conv_eltwise.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CONV_POOLING_H saber/funcs/impl/impl_conv_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CRF_DECODING_H saber/funcs/impl/impl_crf_decoding.h 17;" d 
language:C++ +ANAKIN_SABER_FUNCS_IMPL_CROP_H saber/funcs/impl/impl_crop.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CTCALIGN_H saber/funcs/impl/impl_ctc_align.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV2D_H saber/funcs/impl/cuda/vender_conv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_ELTWISE_H saber/funcs/impl/cuda/vender_conv_eltwise.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_CONV_POOLING_H saber/funcs/impl/cuda/vender_conv_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_DECONV_H saber/funcs/impl/cuda/vender_deconv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_PERMUTE_POWER_H saber/funcs/impl/cuda/vender_permute_power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_CUDNN_SOFTMAX_H saber/funcs/impl/cuda/vender_softmax.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_MAT_MUL_H saber/funcs/impl/cuda/saber_mat_mul.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_POOLING_H saber/funcs/impl/cuda/vender_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ACTIVATION_H saber/funcs/impl/cuda/saber_activation.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ARGMAX_H saber/funcs/impl/cuda/saber_argmax.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_AXPY_H saber/funcs/impl/cuda/saber_axpy.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CAST_H saber/funcs/impl/cuda/saber_cast.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONCAT_H saber/funcs/impl/cuda/saber_concat.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV2D_H saber/funcs/impl/cuda/saber_conv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_ELTWISE_H saber/funcs/impl/cuda/saber_conv_eltwise.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_POOLING_H saber/funcs/impl/cuda/saber_conv_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_UPADDING_PADDING_H saber/funcs/impl/cuda/saber_conv_upadding_padding.h 17;" d 
language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CRFDECODING_H saber/funcs/impl/cuda/saber_crf_decoding.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CROP_H saber/funcs/impl/cuda/saber_crop.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_DECONV_H saber/funcs/impl/cuda/saber_deconv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ELTWISE_H saber/funcs/impl/cuda/saber_eltwise.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_EMBEDDING_H saber/funcs/impl/cuda/saber_embedding.h 15;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GRU_H saber/funcs/impl/cuda/saber_gru.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_IM2SEQUENCE_H saber/funcs/impl/cuda/saber_im2sequence.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LAYER_NORM_H saber/funcs/impl/cuda/saber_layer_norm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LRN_H saber/funcs/impl/cuda/saber_lrn.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LSTM_H saber/funcs/impl/cuda/saber_lstm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_MVN_H saber/funcs/impl/cuda/saber_mvn.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_NORMALIZE_H saber/funcs/impl/cuda/saber_normalize.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PAD_H saber/funcs/impl/cuda/saber_pad.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PERMUTE_H saber/funcs/impl/cuda/saber_permute.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PERMUTE_POWER_H saber/funcs/impl/cuda/saber_permute_power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POOLING_H saber/funcs/impl/cuda/saber_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POOLING_WITH_INDEX_H saber/funcs/impl/cuda/saber_pooling_with_index.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_POWER_H saber/funcs/impl/cuda/saber_power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REVERSE_INPUT_H saber/funcs/impl/cuda/saber_reverse_input.h 
18;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REVERSE_SEQUENCE_H saber/funcs/impl/cuda/saber_reverse_sequence.h 18;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SASS_FUNCS_H saber/funcs/impl/cuda/base/sass_funcs.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SCALE_H saber/funcs/impl/cuda/saber_scale.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_POOL_H saber/funcs/impl/cuda/saber_sequence_pool.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SOFTMAX_H saber/funcs/impl/cuda/saber_softmax.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_UNPOOL_H saber/funcs/impl/cuda/saber_unpool.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_ACTIVATION_H saber/funcs/impl/cuda/vender_activation.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_GRU_H saber/funcs/impl/cuda/vender_gru.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_DECONV_H saber/funcs/impl/impl_deconv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_DEFORMABLECONV_H saber/funcs/impl/impl_deformable_conv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_DETECTIONOUTPUT_H saber/funcs/impl/impl_detection_output.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ELTWISEACT_H saber/funcs/impl/impl_eltwise_act.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ELTWISE_H saber/funcs/impl/impl_eltwise.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_EMBEDDING_H saber/funcs/impl/impl_embedding.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_FC_H saber/funcs/impl/impl_fc.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_FLATTEN_H saber/funcs/impl/impl_flatten.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_GRU_H saber/funcs/impl/impl_gru.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_IM2S_H saber/funcs/impl/impl_im2sequence.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_IMPL_CONV_UNPADDING_PADDING_H saber/funcs/impl/impl_conv_unpadding_padding.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_IMPL_REVERSE_SEQUENCE_H saber/funcs/impl/impl_reverse_sequence.h 18;" d 
language:C++ +ANAKIN_SABER_FUNCS_IMPL_LAYER_NORM_H saber/funcs/impl/impl_layer_norm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_LRN_H saber/funcs/impl/impl_lrn.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_LSTM_H saber/funcs/impl/impl_lstm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_MATMUL_H saber/funcs/impl/impl_mat_mul.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_MULTICLASSNMS_H saber/funcs/impl/impl_multiclass_nms.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_MVN_H saber/funcs/impl/impl_mvn.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_NORMALIZE_H saber/funcs/impl/impl_normalize.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_PAD_H saber/funcs/impl/impl_pad.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_PERMUTEPOWER_H saber/funcs/impl/impl_permute_power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_PERMUTE_H saber/funcs/impl/impl_permute.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_POOLINGWITHINDEX_H saber/funcs/impl/impl_pooling_with_index.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_POOLING_H saber/funcs/impl/impl_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_POWER_H saber/funcs/impl/impl_power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_PRIORBOX_H saber/funcs/impl/impl_priorbox.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_RESHAPE_H saber/funcs/impl/impl_reshape.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_RESIZE_H saber/funcs/impl/impl_resize.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_REVERSE_INPUT_H saber/funcs/impl/impl_reverse_input.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_ROIPOOLING_H saber/funcs/impl/impl_roi_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_SABER_UTIL_H saber/funcs/saber_util.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_SCALE_H saber/funcs/impl/impl_scale.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_SEQUENCECONV_H saber/funcs/impl/impl_sequence_conv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_SEQUENCEPOOL_H saber/funcs/impl/impl_sequence_pool.h 17;" d language:C++ 
+ANAKIN_SABER_FUNCS_IMPL_SLICE_H saber/funcs/impl/impl_slice.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_SOFTMAX_H saber/funcs/impl/impl_softmax.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_SPP_H saber/funcs/impl/impl_spp.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_TRANSPOSE_H saber/funcs/impl/impl_transpose.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_UNPOOL_H saber/funcs/impl/impl_unpool.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_H saber/funcs/impl/x86/jit_avx2_conv.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV1X1_H saber/funcs/impl/x86/jit_avx512_conv1x1.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CONV_H saber/funcs/impl/x86/jit_avx512_conv.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_JIT_CALL_CONF_H saber/funcs/impl/x86/jit_call_conf.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_1X1_CONV_UTIL_H saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h 15;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_JIT_UNI_DW_CONVOLUTION_H saber/funcs/impl/x86/jit_uni_dwconv.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX2_CONV_KERNEL_H saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX512_CONV_KERNEL_H saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_UNI_DW_CONV_KERNEL_F32_H saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_MAT_MUL_H saber/funcs/impl/x86/vender_mat_mul.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_MKL_PACKED_WEIGHT_H saber/funcs/impl/x86/mkl_packed_weight.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ACTIVATION_H saber/funcs/impl/x86/saber_activation.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ARGMAX_H saber/funcs/impl/x86/saber_argmax.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AXPY_H saber/funcs/impl/x86/saber_axpy.h 17;" d language:C++ 
+ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CAST_H saber/funcs/impl/x86/saber_cast.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONCAT_H saber/funcs/impl/x86/saber_concat.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_H saber/funcs/impl/x86/saber_conv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CRF_DECODING_H saber/funcs/impl/x86/saber_crf_decoding.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CROP_H saber/funcs/impl/x86/saber_crop.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ELTWISE_H saber/funcs/impl/x86/saber_eltwise.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_EMBEDDING_H saber/funcs/impl/x86/saber_embedding.h 13;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_GRU_H saber/funcs/impl/x86/saber_gru.h 4;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_IM2SEQUENCE_H saber/funcs/impl/x86/saber_im2sequence.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LAYER_NORM_H saber/funcs/impl/x86/saber_layer_norm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTM_H saber/funcs/impl/x86/saber_lstm.h 2;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_MVN_H saber/funcs/impl/x86/saber_mvn.h 18;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_NORMALIZE_H saber/funcs/impl/x86/saber_normalize.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_H saber/funcs/impl/x86/saber_permute.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PERMUTE_POWER_H saber/funcs/impl/x86/saber_permute_power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POOLING_H saber/funcs/impl/x86/saber_pooling.h 18;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POOLING_WITH_INDEX_H saber/funcs/impl/x86/saber_pooling_with_index.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_POWER_H saber/funcs/impl/x86/saber_power.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REVERSE_INPUT_H saber/funcs/impl/x86/saber_reverse_input.h 18;" d language:C++ 
+ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REVERSE_SEQUENCE_H saber/funcs/impl/x86/saber_reverse_sequence.h 18;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SOFTMAX_H saber/funcs/impl/x86/saber_softmax.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_SABER_VENDER_FC_H saber/funcs/impl/x86/vender_fc.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_GRU_H saber/funcs/impl/x86/vender_gru.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_LSTM_H saber/funcs/impl/x86/vender_lstm.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_JIT_AVX512_CONV1X1_KERNEL_H saber/funcs/impl/x86/kernel/jit_avx512_conv1x1_kernel.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_JIT_AVX512_RTUS_DRIVER_H saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_LAYER_NORM_H saber/funcs/layer_norm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_LRN_H saber/funcs/lrn.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_LSTM_H saber/funcs/lstm.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_MAT_MUL_H saber/funcs/mat_mul.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_MVN_H saber/funcs/mvn.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_NORMALIZE_H saber/funcs/normalize.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_PAD_H saber/funcs/pad.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_PARAM_H saber/saber_funcs_param.h 14;" d language:C++ +ANAKIN_SABER_FUNCS_PERMUTE_H saber/funcs/permute.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_PERMUTE_POWER_H saber/funcs/permute_power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_POOLING_H saber/funcs/pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_POOLING_WITH_INDEX_H saber/funcs/pooling_with_index.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_POWER_H saber/funcs/power.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_PRIORBOX_H saber/funcs/priorbox.h 16;" d language:C++ +ANAKIN_SABER_FUNCS_RESHAPE_H saber/funcs/reshape.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_RESIZE_H saber/funcs/resize.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_REVERSE_INPUT_H saber/funcs/reverse_input.h 17;" d 
language:C++ +ANAKIN_SABER_FUNCS_REVERSE_SEQUENCE_H saber/funcs/reverse_sequence.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_ROI_POOL_H saber/funcs/roi_pooling.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_SCALE_H saber/funcs/scale.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_SEQUENCE_CONV_H saber/funcs/sequence_conv.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_SEQUENCE_POOL_H saber/funcs/sequence_pool.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_SLICE_H saber/funcs/slice.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_SOFTMAX_H saber/funcs/softmax.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_SPP_H saber/funcs/spp.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_TIMER_H saber/funcs/timer.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_TRANSPOSE_H saber/funcs/transpose.h 17;" d language:C++ +ANAKIN_SABER_FUNCS_UNPOOL_H saber/funcs/unpool.h 16;" d language:C++ +ANAKIN_SABER_FUNC_IMPL_AMD_UTILS_H saber/funcs/impl/amd/amd_utils.h 16;" d language:C++ +ANAKIN_SABER_FUNC_IMPL_X86_MATH_SEQUENCE_BATCH_H saber/funcs/impl/x86/sequence2batch.h 2;" d language:C++ +ANAKIN_SABER_LITE_CORE_ARM_DEVICE_H saber/lite/core/arm_device.h 17;" d language:C++ +ANAKIN_SABER_LITE_CORE_BUFFER_LITE_H saber/lite/core/buffer_lite.h 16;" d language:C++ +ANAKIN_SABER_LITE_CORE_COMMON_H saber/lite/core/common_lite.h 16;" d language:C++ +ANAKIN_SABER_LITE_CORE_DEVICE_LITE_H saber/lite/core/context_lite.h 16;" d language:C++ +ANAKIN_SABER_LITE_CORE_SHAPE_LITE_H saber/lite/core/shape_lite.h 16;" d language:C++ +ANAKIN_SABER_LITE_CORE_TENSOR_LITE_H saber/lite/core/tensor_lite.h 17;" d language:C++ +ANAKIN_SABER_LITE_CORE_TENSOR_OP_H saber/lite/core/tensor_op_lite.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_DETECTION_LITE_H saber/lite/funcs/detection_lite.h 17;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_CONV_ARM_IMPL_H saber/lite/funcs/neon/impl/conv_arm_impl.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_POOLING_ARM_IMPL_H saber/lite/funcs/neon/impl/pooling_arm_impl.h 15;" d language:C++ 
+ANAKIN_SABER_LITE_FUNCS_NEON_IMPL_SGEMM_ARM_H saber/lite/funcs/neon/impl/sgemm_arm.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_NEON_SABER_PRELU_H saber/lite/funcs/saber_prelu.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_OP_BASE_H saber/lite/funcs/op_base.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_ACTIVATION_H saber/lite/funcs/saber_activation.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_CONCAT_H saber/lite/funcs/saber_concat.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_CONV_ACT_H saber/lite/funcs/saber_conv_act.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_H saber/lite/funcs/saber_conv_batchnorm_scale.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_CONV_BATCHNORM_SCALE_RELU_H saber/lite/funcs/saber_conv_batchnorm_scale_relu.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_CONV_H saber/lite/funcs/saber_conv.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_DETECTION_OUTPUT_H saber/lite/funcs/saber_detection_output.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_ELTWISE_H saber/lite/funcs/saber_eltwise.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_FC_H saber/lite/funcs/saber_fc.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_PERMUTE_H saber/lite/funcs/saber_permute.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_POOLING_H saber/lite/funcs/saber_pooling.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_PRIORBOX_H saber/lite/funcs/saber_priorbox.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_SLICE_H saber/lite/funcs/saber_slice.h 15;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_SABER_SOFTMAX_H saber/lite/funcs/saber_softmax.h 16;" d language:C++ +ANAKIN_SABER_LITE_FUNCS_TIMER_LITE_H saber/lite/funcs/timer_lite.h 16;" d language:C++ +ANAKIN_SABER_NORMAL_ACTIVATION_H saber/funcs/impl/x86/saber_normal_activation.h 3;" d language:C++ +ANAKIN_SABER_SABER_H saber/saber.h 17;" d language:C++ +ANAKIN_SABER_SSE_MATH_H saber/funcs/impl/x86/saber_sse_math.h 13;" d language:C++ 
+ANAKIN_SABER_TENSOR_OP_H saber/core/tensor_op.h 17;" d language:C++ +ANAKIN_SABER_UTILS_H saber/utils.h 17;" d language:C++ +ANAKIN_SERVICE_DAEMON_H framework/service/service_daemon.h 17;" d language:C++ +ANAKIN_SERVICE_H framework/service/anakin_service.h 17;" d language:C++ +ANAKIN_SERVICE_TEST_H test/framework/service/service_test.h 17;" d language:C++ +ANAKIN_SINGLETON_H framework/core/singleton.h 17;" d language:C++ +ANAKIN_TEST_SABER_BASE_H test/saber/test_saber_base.h 15;" d language:C++ +ANAKIN_TEST_SABER_FUNC_H test/saber/test_saber_func.h 17;" d language:C++ +ANAKIN_THREAD_POOL_H framework/core/thread_pool.h 17;" d language:C++ +ANAKIN_THREAD_SAFE_MACROS_H framework/core/thread_safe_macros.h 17;" d language:C++ +ANAKIN_TLS_H framework/core/tls.h 17;" d language:C++ +ANAKIN_TO_TYPE_ID framework/core/data_types.h 79;" d language:C++ +ANAKIN_TYPES_H framework/core/types.h 17;" d language:C++ +ANAKIN_TYPE_TRAITS_EXTEND_H framework/core/type_traits_extend.h 17;" d language:C++ +ANAKIN_VERSION cmake/config/anakin_config.h.in 19;" d language:C++ file: +ANAKIN_WORKER_H framework/core/net/worker.h 17;" d language:C++ +ANDROID_NDK tools/anakin-lite/andrid_build.sh /^export ANDROID_NDK=\/home\/public\/android-ndk-r14b$/;" v language:C++ +ANDROID_NDK tools/andrid_build.sh /^export ANDROID_NDK=\/home\/public\/android-ndk-r14b$/;" v language:C++ +API saber/core/buffer.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::Buffer access:public +API saber/core/context.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::final access:private +API saber/core/device.h /^ typedef TargetWrapper API;$/;" t language:C++ struct:anakin::saber::Device access:public +API saber/core/device.h /^ typedef TargetWrapper API;$/;" t language:C++ struct:anakin::saber::Device access:public +API saber/core/env.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::Env access:public +API saber/core/env.h /^ typedef TargetWrapper API;$/;" t 
language:C++ class:anakin::saber::Env access:public +API saber/core/events.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::Events access:public +API saber/core/events.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::EventsTree access:public +API saber/core/impl/bm/bm_device.cpp /^ typedef TargetWrapper API;$/;" t language:C++ file: +API saber/core/impl/cuda/cuda_device.cpp /^ typedef TargetWrapper API;$/;" t language:C++ file: +API saber/core/impl/cuda/cuda_device.cpp /^ typedef TargetWrapper API;$/;" t language:C++ file: +API saber/core/impl/x86/x86_device.cpp /^ typedef TargetWrapper API;$/;" t language:C++ file: +API saber/core/tensor.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::Tensor access:public +API saber/funcs/impl/cuda/saber_softmax.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::SaberSoftmax access:public +API saber/funcs/priorbox.h /^ typedef TargetWrapper API;$/;" t language:C++ class:anakin::saber::PriorBox access:public +API_t saber/core/buffer.h /^ typedef TargetWrapper API_t;$/;" t language:C++ +API_t saber/core/tensor.h /^ typedef TargetWrapper API_t;$/;" t language:C++ +APIs docs/Manual/Tutorial_en.md /^This tutorial will briefly explain how Anakin works, some of the basic Anakin APIs, and how to call these APIs.$/;" v language:C++ +ARM docs/Manual/addCustomDevice.md /^typedef TargetType ARM;$/;" t language:C++ file: +ARM saber/saber_types.h /^typedef TargetType ARM;$/;" t language:C++ namespace:anakin::saber +ARMGPU saber/saber_types.h /^typedef TargetType ARMGPU;$/;" t language:C++ namespace:anakin::saber +ARMType saber/lite/core/common_lite.h /^enum ARMType{$/;" g language:C++ namespace:anakin::saber::lite +ATTR_HAS tools/external_converter_v2/parser/frontend/dash_board/static/cytoscape/qtip2/jquery.qtip.js /^ATTR_HAS = 'data-hasqtip',$/;" l language:C++ +AVG framework/operators/pooling.h /^ AVG, \/\/\/< AVG stand for avg-pooling operation$/;" m 
language:C++ class:anakin::ops::PoolingType access:private +A_INTERLEAVE saber/lite/funcs/neon/impl/sgemm_arm.cpp /^const int A_INTERLEAVE = 8;$/;" m language:C++ namespace:anakin::saber::lite file: +A_TRANSPOSE saber/lite/funcs/neon/impl/sgemm_arm.cpp /^const bool A_TRANSPOSE = false;$/;" m language:C++ namespace:anakin::saber::lite file: +Act saber/funcs/impl/cuda/cuda_inline_activation.h /^ typedef Dtype(*Act)(const Dtype);$/;" t language:C++ struct:anakin::saber::ACTIVATION access:public +Act saber/funcs/impl/x86/saber_normal_activation.h /^ typedef Dtype(*Act)(const Dtype);$/;" t language:C++ struct:anakin::saber::ACTIVATION access:public +Act test/saber/test_saber_gru.cpp /^ typedef Dtype(*Act)(const Dtype);$/;" t language:C++ struct:ACTIVATION file: access:public +Act test/saber/test_saber_lstm.cpp /^ typedef Dtype(*Act)(const Dtype);$/;" t language:C++ struct:ACTIVATION file: access:public +Activate test/saber/test_saber_gru.cpp /^inline typename ACTIVATION::Act Activate(ActiveType type){$/;" f language:C++ signature:(ActiveType type) +Activate test/saber/test_saber_lstm.cpp /^inline typename ACTIVATION::Act Activate(ActiveType type){$/;" f language:C++ signature:(ActiveType type) +Activate_inner saber/funcs/impl/cuda/cuda_inline_activation.h /^__device__ inline typename ACTIVATION::Act Activate_inner(ActiveType type) {$/;" f language:C++ namespace:anakin::saber signature:(ActiveType type) +Activate_inner saber/funcs/impl/x86/saber_normal_activation.h /^inline typename ACTIVATION::Act Activate_inner(ActiveType type) {$/;" f language:C++ namespace:anakin::saber signature:(ActiveType type) +Activate_inner saber/funcs/impl/x86/saber_normal_activation.h /^static inline Dtype Activate_inner(Dtype value,ActiveType type) {$/;" f language:C++ namespace:anakin::saber signature:(Dtype value,ActiveType type) +Activation framework/operators/activation.h /^ Activation() {}$/;" f language:C++ class:anakin::ops::Activation access:public signature:() +Activation 
framework/operators/activation.h /^class Activation : public Operator {$/;" c language:C++ namespace:anakin::ops inherits:Operator +Activation saber/funcs/activation.h /^class Activation : public BaseFunc<$/;" c language:C++ namespace:anakin::saber inherits:BaseFunc +ActivationHelper docs/Manual/addCustomDevice.md /^template class ActivationHelper;$/;" x language:C++ file: +ActivationHelper framework/operators/activation.cpp /^template class ActivationHelper;$/;" x language:C++ file: +ActivationHelper framework/operators/activation.cpp /^template class ActivationHelper;$/;" x language:C++ file: +ActivationHelper framework/operators/activation.cpp /^template class ActivationHelper;$/;" x language:C++ file: +ActivationHelper framework/operators/activation.cpp /^template class ActivationHelper;$/;" x language:C++ file: +ActivationHelper framework/operators/activation.cpp /^template class ActivationHelper;$/;" x language:C++ file: +ActivationHelper framework/operators/activation.h /^ ActivationHelper()=default;$/;" p language:C++ class:anakin::ops::ActivationHelper access:public signature:() +ActivationHelper framework/operators/activation.h /^ friend class ActivationHelper;$/;" x language:C++ +ActivationHelper framework/operators/activation.h /^class ActivationHelper : public OperatorHelper {$/;" c language:C++ namespace:anakin::ops inherits:OperatorHelper +ActivationHelper framework/operators/activation.h /^class ActivationHelper;$/;" x language:C++ +ActivationHelper::Init docs/Manual/addCustomDevice.md /^Status ActivationHelper::Init(OpContext &ctx,\\$/;" f language:C++ class:ActivationHelper signature:(OpContext &ctx,\ const std::vector >& ins, \ std::vector >& outs) +ActivationParam saber/saber_funcs_param.h /^ ActivationParam()$/;" f language:C++ struct:anakin::saber::ActivationParam access:public signature:() +ActivationParam saber/saber_funcs_param.h /^ ActivationParam(ActiveType act, float n_slope = float(0),$/;" f language:C++ 
struct:anakin::saber::ActivationParam access:public signature:(ActiveType act, float n_slope = float(0), float co = float(1), PreluParam prelu = PreluParam(false, nullptr)) +ActivationParam saber/saber_funcs_param.h /^ ActivationParam(ActiveType act, float n_slope,$/;" f language:C++ struct:anakin::saber::ActivationParam access:public signature:(ActiveType act, float n_slope, float co, PreluParam prelu, bool has) +ActivationParam saber/saber_funcs_param.h /^ ActivationParam(const ActivationParam& right)$/;" f language:C++ struct:anakin::saber::ActivationParam access:public signature:(const ActivationParam& right) +ActivationParam saber/saber_funcs_param.h /^struct ActivationParam {$/;" s language:C++ namespace:anakin::saber +ActiveType saber/saber_types.h /^} ActiveType;$/;" t language:C++ namespace:anakin::saber typeref:enum:anakin::saber::__anon20 +Active_clipped_relu saber/saber_types.h /^ Active_clipped_relu = 4,$/;" e language:C++ enum:anakin::saber::__anon20 +Active_elu saber/saber_types.h /^ Active_elu = 5,$/;" e language:C++ enum:anakin::saber::__anon20 +Active_identity saber/saber_types.h /^ Active_identity = 6,$/;" e language:C++ enum:anakin::saber::__anon20 +Active_prelu saber/saber_types.h /^ Active_prelu = 10$/;" e language:C++ enum:anakin::saber::__anon20 +Active_relu saber/saber_types.h /^ Active_relu = 2,$/;" e language:C++ enum:anakin::saber::__anon20 +Active_sigmoid saber/saber_types.h /^ Active_sigmoid = 1,$/;" e language:C++ enum:anakin::saber::__anon20 +Active_stanh saber/saber_types.h /^ Active_stanh = 9,$/;" e language:C++ enum:anakin::saber::__anon20 +Active_tanh saber/saber_types.h /^ Active_tanh = 3,$/;" e language:C++ enum:anakin::saber::__anon20 +Active_unknow saber/saber_types.h /^ Active_unknow = 0,$/;" e language:C++ enum:anakin::saber::__anon20 +AddConnect framework/graph/llvm/fusion/graph_pattern.cpp /^Pattern& Pattern::AddConnect(std::string node_name_btm, std::string node_name_top) {$/;" f language:C++ class:anakin::graph::Pattern 
signature:(std::string node_name_btm, std::string node_name_top) +AddConnect framework/graph/llvm/fusion/graph_pattern.h /^ Pattern& AddConnect(std::string, std::string);$/;" p language:C++ class:anakin::graph::Pattern access:public signature:(std::string, std::string) +AddOpNode framework/graph/llvm/fusion/graph_pattern.cpp /^Pattern& Pattern::AddOpNode(std::string node_name, std::string op_name) {$/;" f language:C++ class:anakin::graph::Pattern signature:(std::string node_name, std::string op_name) +AddOpNode framework/graph/llvm/fusion/graph_pattern.h /^ Pattern& AddOpNode(std::string, std::string);$/;" p language:C++ class:anakin::graph::Pattern access:public signature:(std::string, std::string) +Ag tools/external_converter_v2/parser/frontend/dash_board/static/echart/echarts.min.js /^}}function ed(t){var e=t.axis.model,n={},i=n.axisDim=t.axis.dim;return n.axisIndex=n[i+"AxisIndex"]=e.componentIndex,n.axisName=n[i+"AxisName"]=e.name,n.axisId=n[i+"AxisId"]=e.id,n}function nd(t){return!t||null==t[0]||isNaN(t[0])||null==t[1]||isNaN(t[1])}function id(t,e,n){if(!Jp.node){var i=e.getZr();UM(i).records||(UM(i).records={}),rd(i,e);var r=UM(i).records[t]||(UM(i).records[t]={});r.handler=n}}function rd(t,e){function n(n,i){t.on(n,function(n){var r=ld(e);ZM(UM(t).records,function(t){t&&i(t,n,r.dispatchAction)}),od(r.pendings,e)})}UM(t).initialized||(UM(t).initialized=!0,n("click",x(sd,"click")),n("mousemove",x(sd,"mousemove")),n("globalout",ad))}function od(t,e){var n,i=t.showTip.length,r=t.hideTip.length;i?n=t.showTip[i-1]:r&&(n=t.hideTip[r-1]),n&&(n.dispatchAction=null,e.dispatchAction(n))}function ad(t,e,n){t.handler("leave",null,n)}function sd(t,e,n,i){e.handler(t,n,i)}function ld(t){var e={showTip:[],hideTip:[]},n=function(i){var r=e[i.type];r?r.push(i):(i.dispatchAction=n,t.dispatchAction(i))};return{dispatchAction:n,pendings:e}}function ud(t,e){if(!Jp.node){var n=e.getZr(),i=(UM(n).records||{})[t];i&&(UM(n).records[t]=null)}}function hd(){}function 
cd(t,e,n,i){dd(XM(n).lastProp,i)||(XM(n).lastProp=i,e?Mo(n,i,t):(n.stopAnimation(),n.attr(i)))}function dd(t,e){if(M(t)&&M(e)){var n=!0;return f(e,function(e,i){n=n&&dd(t[i],e)}),!!n}return t===e}function fd(t,e){t[e.get("label.show")?"show":"hide"]()}function pd(t){return{position:t.position.slice(),rotation:t.rotation||0}}function gd(t,e,n){var i=e.get("z"),r=e.get("zlevel");t&&t.traverse(function(t){"group"!==t.type&&(null!=i&&(t.z=i),null!=r&&(t.zlevel=r),t.silent=n)})}function vd(t){var e,n=t.get("type"),i=t.getModel(n+"Style");return"line"===n?(e=i.getLineStyle(),e.fill=null):"shadow"===n&&(e=i.getAreaStyle(),e.stroke=null),e}function md(t,e,n,i,r){var o=n.get("value"),a=xd(o,e.axis,e.ecModel,n.get("seriesDataIndices"),{precision:n.get("label.precision"),formatter:n.get("label.formatter")}),s=n.getModel("label"),l=Ky(s.get("padding")||0),u=s.getFont(),h=Sn(a,u),c=r.position,d=h.width+l[1]+l[3],f=h.height+l[0]+l[2],p=r.align;"right"===p&&(c[0]-=d),"center"===p&&(c[0]-=d\/2);var g=r.verticalAlign;"bottom"===g&&(c[1]-=f),"middle"===g&&(c[1]-=f\/2),yd(c,d,f,i);var v=s.get("backgroundColor");v&&"auto"!==v||(v=e.get("axisLine.lineStyle.color")),t.label={shape:{x:0,y:0,width:d,height:f,r:s.get("borderRadius")},position:c.slice(),style:{text:a,textFont:u,textFill:s.getTextColor(),textPosition:"inside",fill:v,stroke:s.get("borderColor")||"transparent",lineWidth:s.get("borderWidth")||0,shadowBlur:s.get("shadowBlur"),shadowColor:s.get("shadowColor"),shadowOffsetX:s.get("shadowOffsetX"),shadowOffsetY:s.get("shadowOffsetY")},z2:10}}function yd(t,e,n,i){var r=i.getWidth(),o=i.getHeight();t[0]=Math.min(t[0]+e,r)-e,t[1]=Math.min(t[1]+n,o)-n,t[0]=Math.max(t[0],0),t[1]=Math.max(t[1],0)}function xd(t,e,n,i,r){t=e.scale.parse(t);var o=e.scale.getLabel(t,{precision:r.precision}),a=r.formatter;if(a){var s={value:Fu(e,t),seriesData:[]};f(i,function(t){var 
e=n.getSeriesByIndex(t.seriesIndex),i=t.dataIndexInside,r=e&&e.getDataParams(i);r&&s.seriesData.push(r)}),b(a)?o=a.replace("{value}",o):w(a)&&(o=a(s))}return o}function _d(t,e,n){var i=fe();return ye(i,i,n.rotation),me(i,i,n.position),Co([t.dataToCoord(e),(n.labelOffset||0)+(n.labelDirection||1)*(n.labelMargin||0)],i)}function wd(t,e,n,i,r,o){var a=Rb.innerTextLayout(n.rotation,0,n.labelDirection);n.labelMargin=r.get("label.margin"),md(e,i,r,o,{position:_d(i.axis,t,n),align:a.textAlign,verticalAlign:a.textVerticalAlign})}function bd(t,e,n){return n=n||0,{x1:t[n],y1:t[1-n],x2:e[n],y2:e[1-n]}}function Md(t,e,n){return n=n||0,{x:t[n],y:t[1-n],width:e[n],height:e[1-n]}}function Sd(t,e){var n={};return n[e.dim+"AxisIndex"]=e.index,t.getCartesian(n)}function Id(t){return"x"===t.dim?0:1}function Cd(t){var e="cubic-bezier(0.23, 1, 0.32, 1)",n="left "+t+"s "+e+",top "+t+"s "+e;return p(tS,function(t){return t+"transition:"+n}).join(";")}function Td(t){var e=[],n=t.get("fontSize"),i=t.getTextColor();return i&&e.push("color:"+i),e.push("font:"+t.getFont()),n&&e.push("line-height:"+Math.round(3*n\/2)+"px"),QM(["decoration","align"],function(n){var i=t.get(n);i&&e.push("text-"+n+":"+i)}),e.join(";")}function Ad(t){var e=[],n=t.get("transitionDuration"),i=t.get("backgroundColor"),r=t.getModel("textStyle"),o=t.get("padding");return n&&e.push(Cd(n)),i&&(Jp.canvasSupported?e.push("background-Color:"+i):(e.push("background-Color:#"+Ne(i)),e.push("filter:alpha(opacity=70)"))),QM(["width","color","radius"],function(n){var i="border-"+n,r=JM(i),o=t.get(r);null!=o&&e.push(i+":"+o+("color"===n?"":"px"))}),e.push(Td(r)),null!=o&&e.push("padding:"+Ky(o).join("px ")+"px"),e.join(";")+";"}function Dd(t,e){if(Jp.wxa)return null;var n=document.createElement("div"),i=this._zr=e.getZr();this.el=n,this._x=e.getWidth()\/2,this._y=e.getHeight()\/2,t.appendChild(n),this._container=t,this._show=!1,this._hideTimeout;var 
r=this;n.onmouseenter=function(){r._enterable&&(clearTimeout(r._hideTimeout),r._show=!0),r._inContent=!0},n.onmousemove=function(e){if(e=e||window.event,!r._enterable){var n=i.handler;vi(t,e,!0),n.dispatch("mousemove",e)}},n.onmouseleave=function(){r._enterable&&r._show&&r.hideLater(r._hideDelay),r._inContent=!1}}function kd(t){for(var e=t.pop();t.length;){var n=t.pop();n&&(Lo.isInstance(n)&&(n=n.get("tooltip",!0)),"string"==typeof n&&(n={formatter:n}),e=new Lo(n,e,e.ecModel))}return e}function Pd(t,e){return t.dispatchAction||y(e.dispatchAction,e)}function Ld(t,e,n,i,r,o,a){var s=Ed(n),l=s.width,u=s.height;return null!=o&&(t+l+o>i?t-=l+o:t+=o),null!=a&&(e+u+a>r?e-=u+a:e+=a),[t,e]}function Od(t,e,n,i,r){var o=Ed(n),a=o.width,s=o.height;return t=Math.min(t+a,i)-a,e=Math.min(e+s,r)-s,t=Math.max(t,0),e=Math.max(e,0),[t,e]}function Ed(t){var e=t.clientWidth,n=t.clientHeight;if(document.defaultView&&document.defaultView.getComputedStyle){var i=document.defaultView.getComputedStyle(t);i&&(e+=parseInt(i.paddingLeft,10)+parseInt(i.paddingRight,10)+parseInt(i.borderLeftWidth,10)+parseInt(i.borderRightWidth,10),n+=parseInt(i.paddingTop,10)+parseInt(i.paddingBottom,10)+parseInt(i.borderTopWidth,10)+parseInt(i.borderBottomWidth,10))}return{width:e,height:n}}function Rd(t,e,n){var i=n[0],r=n[1],o=5,a=0,s=0,l=e.width,u=e.height;switch(t){case"inside":a=e.x+l\/2-i\/2,s=e.y+u\/2-r\/2;break;case"top":a=e.x+l\/2-i\/2,s=e.y-r-o;break;case"bottom":a=e.x+l\/2-i\/2,s=e.y+u+o;break;case"left":a=e.x-i-o,s=e.y+u\/2-r\/2;break;case"right":a=e.x+l+o,s=e.y+u\/2-r\/2}return[a,s]}function zd(t){return"center"===t||"middle"===t}function Bd(t,e){aS[t]=e}function Nd(t){return aS[t]}function Vd(t){return 0===t.indexOf("my")}function Fd(t){this.model=t}function Hd(t){this.model=t}function Wd(t){var e={},n=[],i=[];return t.eachRawSeries(function(t){var r=t.coordinateSystem;if(!r||"cartesian2d"!==r.type&&"polar"!==r.type)n.push(t);else{var o=r.getBaseAxis();if("category"===o.type){var 
a=o.dim+"_"+o.index;e[a]||(e[a]={categoryAxis:o,valueAxis:r.getOtherAxis(o),series:[]},i.push({axisDim:o.dim,axisIndex:o.index})),e[a].series.push(t)}else n.push(t)}}),{seriesGroupByCategoryAxis:e,other:n,meta:i}}function Gd(t){var e=[];return f(t,function(t){var n=t.categoryAxis,i=t.valueAxis,r=i.dim,o=[" "].concat(p(t.series,function(t){return t.name})),a=[n.model.getCategories()];f(t.series,function(t){a.push(t.getRawData().mapArray(r,function(t){return t}))});for(var s=[o.join(vS)],l=0;la;a++)i[a]=arguments[a];n.push((o?o+vS:"")+i.join(vS))}),n.join("\\n")}).join("\\n\\n"+gS+"\\n\\n")}function Zd(t){var e=Wd(t);return{value:v([Gd(e.seriesGroupByCategoryAxis),Ud(e.other)],function(t){return t.replace(\/[\\n\\t\\s]\/g,"")}).join("\\n\\n"+gS+"\\n\\n"),meta:e.meta}}function jd(t){return t.replace(\/^\\s\\s*\/,"").replace(\/\\s\\s*$\/,"")}function Xd(t){var e=t.slice(0,t.indexOf("\\n"));return e.indexOf(vS)>=0?!0:void 0}function Yd(t){for(var e=t.split(\/\\n+\/g),n=jd(e.shift()).split(mS),i=[],r=p(n,function(t){return{name:t,data:[]}}),o=0;oCS}function mf(t){var e=t.length-1;return 0>e&&(e=0),[t[0],t[e]]}function yf(t,e,n,i){var r=new nv;return r.add(new Sy({name:"main",style:bf(n),silent:!0,draggable:!0,cursor:"move",drift:xS(t,e,r,"nswe"),ondragend:xS(gf,e,{isEnd:!0})})),_S(i,function(n){r.add(new Sy({name:n,style:{opacity:0},draggable:!0,silent:!0,invisible:!0,drift:xS(t,e,r,n),ondragend:xS(gf,e,{isEnd:!0})}))}),r}function xf(t,e,n,i){var r=i.brushStyle.lineWidth||0,o=MS(r,TS),a=n[0][0],s=n[1][0],l=a-r\/2,u=s-r\/2,h=n[0][1],c=n[1][1],d=h-o+r\/2,f=c-o+r\/2,p=h-a,g=c-s,v=p+r,m=g+r;wf(t,e,"main",a,s,p,g),i.transformable&&(wf(t,e,"w",l,u,o,m),wf(t,e,"e",d,u,o,m),wf(t,e,"n",l,u,v,o),wf(t,e,"s",l,f,v,o),wf(t,e,"nw",l,u,o,o),wf(t,e,"ne",d,u,o,o),wf(t,e,"sw",l,f,o,o),wf(t,e,"se",d,f,o,o))}function _f(t,e){var 
n=e.__brushOption,i=n.transformable,r=e.childAt(0);r.useStyle(bf(n)),r.attr({silent:!i,cursor:i?"move":"default"}),_S(["w","e","n","s","se","sw","ne","nw"],function(n){var r=e.childOfName(n),o=If(t,n);r&&r.attr({silent:!i,invisible:!i,cursor:i?kS[o]+"-resize":null})})}function wf(t,e,n,i,r,o,a){var s=e.childOfName(n);s&&s.setShape(kf(Df(t,e,[[i,r],[i+o,r+a]])))}function bf(t){return s({strokeNoScale:!0},t.brushStyle)}function Mf(t,e,n,i){var r=[bS(t,n),bS(e,i)],o=[MS(t,n),MS(e,i)];return[[r[0],o[0]],[r[1],o[1]]]}function Sf(t){return Io(t.group)}function If(t,e){if(e.length>1){e=e.split("");var n=[If(t,e[0]),If(t,e[1])];return("e"===n[0]||"w"===n[0])&&n.reverse(),n.join("")}var i={w:"left",e:"right",n:"top",s:"bottom"},r={left:"w",right:"e",top:"n",bottom:"s"},n=To(i[e],Sf(t));return r[n]}function Cf(t,e,n,i,r,o,a){var s=i.__brushOption,l=t(s.range),u=Af(n,o,a);_S(r.split(""),function(t){var e=DS[t];l[e[0]][e[1]]+=u[e[0]]}),s.range=e(Mf(l[0][0],l[1][0],l[0][1],l[1][1])),hf(n,i),gf(n,{isEnd:!1})}function Tf(t,e,n,i){var r=e.__brushOption.range,o=Af(t,n,i);_S(r,function(t){t[0]+=o[0],t[1]+=o[1]}),hf(t,e),gf(t,{isEnd:!1})}function Af(t,e,n){var i=t.group,r=i.transformCoordToLocal(e,n),o=i.transformCoordToLocal(0,0);return[r[0]-o[0],r[1]-o[1]]}function Df(t,e,n){var r=ff(t,e);return r&&r!==!0?r.clipPath(n,t._transform):i(n)}function kf(t){var e=bS(t[0][0],t[1][0]),n=bS(t[0][1],t[1][1]),i=MS(t[0][0],t[1][0]),r=MS(t[0][1],t[1][1]);return{x:e,y:n,width:i-e,height:r-n}}function Pf(t,e,n){if(t._brushType){var i=t._zr,r=t._covers,o=df(t,e,n);if(!t._dragging)for(var a=0;a=0)&&t(o,i,r)})}function Uf(t){return t[0]>t[1]&&t.reverse(),t}function Zf(t,e){return Wi(t,e,{includeMainTypes:FS})}function jf(t,e,n,i){var r=n.getAxis(["x","y"][t]),o=Uf(p([0,1],function(t){return e?r.coordToData(r.toLocalCoord(i[t])):r.toGlobalCoord(r.dataToCoord(i[t]))})),a=[];return a[t]=o,a[1-t]=[0\/0,0\/0],{values:o,xyMinMax:a}}function Xf(t,e,n,i){return[e[0]-i[t]*n[0],e[1]-i[t]*n[1]]}function 
Yf(t,e){var n=qf(t),i=qf(e),r=[n[0]\/i[0],n[1]\/i[1]];return isNaN(r[0])&&(r[0]=1),isNaN(r[1])&&(r[1]=1),r}function qf(t){return t?[t[0][1]-t[0][0],t[1][1]-t[1][0]]:[0\/0,0\/0]}function $f(t,e){var n=tp(t);XS(e,function(e,i){for(var r=n.length-1;r>=0;r--){var o=n[r];if(o[i])break}if(0>r){var a=t.queryComponents({mainType:"dataZoom",subType:"select",id:i})[0];if(a){var s=a.getPercentRange();n[0][i]={dataZoomId:i,start:s[0],end:s[1]}}}}),n.push(e)}function Kf(t){var e=tp(t),n=e[e.length-1];e.length>1&&e.pop();var i={};return XS(n,function(t,n){for(var r=e.length-1;r>=0;r--){var t=e[r][n];if(t){i[n]=t;break}}}),i}function Qf(t){t[YS]=null}function Jf(t){return tp(t).length}function tp(t){var e=t[YS];return e||(e=t[YS]=[{}]),e}function ep(t,e){var n=t[e]-t[1-e];return{span:Math.abs(n),sign:n>0?-1:0>n?1:e?-1:1}}function np(t,e){return Math.min(e[1],Math.max(e[0],t))}function ip(t){return u(KS,t)>=0}function rp(t,e){t=t.slice();var n=p(t,ua);e=(e||[]).slice();var i=p(e,ua);return function(r,o){f(t,function(t,a){for(var s={name:t,capital:n[a]},l=0;l=0}function r(t,i){var r=!1;return e(function(e){f(n(t,e)||[],function(t){i.records[e.name][t]&&(r=!0)})}),r}function o(t,i){i.nodes.push(t),e(function(e){f(n(t,e)||[],function(t){i.records[e.name][t]=!0})})}return function(n){function a(t){!i(t,s)&&r(t,s)&&(o(t,s),l=!0)}var s={nodes:[],records:{}};if(e(function(t){s.records[t.name]={}}),!n)return s;o(n,s);var l;do l=!1,t(a);while(l);return s}}function ap(t,e,n){var i=[1\/0,-1\/0];return JS(n,function(t){var n=t.getData();n&&JS(n.mapDimension(e,!0),function(t){var e=n.getApproximateExtent(t);e[0]i[1]&&(i[1]=e[1])})}),i[1]0?0:0\/0);var a=n.getMax(!0);return null!=a&&"dataMax"!==a&&"function"!=typeof a?e[1]=a:r&&(e[1]=o>0?o-1:0\/0),n.get("scale",!0)||(e[0]>0&&(e[0]=0),e[1]<0&&(e[1]=0)),e}function lp(t,e){var n=t.getAxisModel(),i=t._percentWindow,r=t._valueWindow;if(i){var o=Zo(r,[0,500]);o=Math.min(o,20);var 
a=e||0===i[0]&&100===i[1];n.setRange(a?null:+r[0].toFixed(o),a?null:+r[1].toFixed(o))}}function up(t){var e=t._minMaxSpan={},n=t._dataZoomModel;JS(["min","max"],function(i){e[i+"Span"]=n.get(i+"Span");var r=n.get(i+"ValueSpan");if(null!=r&&(e[i+"ValueSpan"]=r,r=t.getAxisModel().axis.scale.parse(r),null!=r)){var o=t._dataExtent;e[i+"Span"]=Vo(o[0]+r,o,[0,100],!0)}})}function hp(t){var e={};return nI(["start","end","startValue","endValue","throttle"],function(n){t.hasOwnProperty(n)&&(e[n]=t[n])}),e}function cp(t,e){var n=t._rangePropMode,i=t.get("rangeMode");nI([["start","startValue"],["end","endValue"]],function(t,r){var o=null!=e[t[0]],a=null!=e[t[1]];o&&!a?n[r]="percent":!o&&a?n[r]="value":i?n[r]=i[r]:o&&(n[r]="percent")})}function dp(t,e,n){(this._brushController=new nf(n.getZr())).on("brush",y(this._onBrush,this)).mount(),this._isZoomActive}function fp(t){var e={};return f(["xAxisIndex","yAxisIndex"],function(n){e[n]=t[n],null==e[n]&&(e[n]="all"),(e[n]===!1||"none"===e[n])&&(e[n]=[])}),e}function pp(t,e){t.setIconStatus("back",Jf(e)>1?"emphasis":"normal")}function gp(t,e,n,i,r){var o=n._isZoomActive;i&&"takeGlobalCursor"===i.type&&(o="dataZoomSelect"===i.key?i.dataZoomSelectActive:!1),n._isZoomActive=o,t.setIconStatus("zoom",o?"emphasis":"normal");var a=new Gf(fp(t.option),e,{include:["grid"]});n._brushController.setPanels(a.makePanelOpts(r,function(t){return t.xAxisDeclared&&!t.yAxisDeclared?"lineX":!t.xAxisDeclared&&t.yAxisDeclared?"lineY":"rect"})).enableBrush(o?{brushType:"auto",brushStyle:{lineWidth:0,fill:"rgba(0,0,0,0.2)"}}:!1)}function vp(t){this.model=t}function mp(t){return fI(t)}function yp(){if(!vI&&mI){vI=!0;var t=mI.styleSheets;t.length<31?mI.createStyleSheet().addRule(".zrvml","behavior:url(#default#VML)"):t[0].addRule(".zrvml","behavior:url(#default#VML)")}}function xp(t){return parseInt(t,10)}function _p(t,e){yp(),this.root=t,this.storage=e;var 
n=document.createElement("div"),i=document.createElement("div");n.style.cssText="display:inline-block;overflow:hidden;position:relative;width:300px;height:150px;",i.style.cssText="position:absolute;left:0;top:0;",t.appendChild(n),this._vmlRoot=i,this._vmlViewport=n,this.resize();var r=e.delFromStorage,o=e.addToStorage;e.delFromStorage=function(t){r.call(e,t),t&&t.onRemove&&t.onRemove(i)},e.addToStorage=function(t){t.onAdd&&t.onAdd(i),o.call(e,t)},this._firstPaint=!0}function wp(t){return function(){$g('In IE8.0 VML mode painter not support method "'+t+'"')}}function bp(t){return document.createElementNS(sC,t)}function Mp(t){return cC(1e4*t)\/1e4}function Sp(t){return mC>t&&t>-mC}function Ip(t,e){var n=e?t.textFill:t.fill;return null!=n&&n!==hC}function Cp(t,e){var n=e?t.textStroke:t.stroke;return null!=n&&n!==hC}function Tp(t,e){e&&Ap(t,"transform","matrix("+uC.call(e,",")+")")}function Ap(t,e,n){(!n||"linear"!==n.type&&"radial"!==n.type)&&("string"==typeof n&&n.indexOf("NaN")>-1&&console.log(n),t.setAttribute(e,n))}function Dp(t,e,n){t.setAttributeNS("http:\/\/www.w3.org\/1999\/xlink",e,n)}function kp(t,e,n){if(Ip(e,n)){var i=n?e.textFill:e.fill;i="transparent"===i?hC:i,"none"!==t.getAttribute("clip-path")&&i===hC&&(i="rgba(0, 0, 0, 0.002)"),Ap(t,"fill",i),Ap(t,"fill-opacity",e.opacity)}else Ap(t,"fill",hC);if(Cp(e,n)){var r=n?e.textStroke:e.stroke;r="transparent"===r?hC:r,Ap(t,"stroke",r);var o=n?e.textStrokeWidth:e.lineWidth,a=!n&&e.strokeNoScale?e.host.getLineScale():1;Ap(t,"stroke-width",o\/a),Ap(t,"paint-order",n?"stroke":"fill"),Ap(t,"stroke-opacity",e.opacity);var s=e.lineDash;s?(Ap(t,"stroke-dasharray",e.lineDash.join(",")),Ap(t,"stroke-dashoffset",cC(e.lineDashOffset||0))):Ap(t,"stroke-dasharray",""),e.lineCap&&Ap(t,"stroke-linecap",e.lineCap),e.lineJoin&&Ap(t,"stroke-linejoin",e.lineJoin),e.miterLimit&&Ap(t,"stroke-miterlimit",e.miterLimit)}else Ap(t,"stroke",hC)}function Pp(t){for(var e=[],n=t.data,i=t.len(),r=0;i>r;){var 
o=n[r++],a="",s=0;switch(o){case lC.M:a="M",s=2;break;case lC.L:a="L",s=2;break;case lC.Q:a="Q",s=4;break;case lC.C:a="C",s=6;break;case lC.A:var l=n[r++],u=n[r++],h=n[r++],c=n[r++],d=n[r++],f=n[r++],p=n[r++],g=n[r++],v=Math.abs(f),m=Sp(v-gC)&&!Sp(v),y=!1;y=v>=gC?!0:Sp(v)?!1:(f>-pC&&0>f||f>pC)==!!g;var x=Mp(l+h*fC(d)),_=Mp(u+c*dC(d));m&&(f=g?gC-1e-4:-gC+1e-4,y=!0,9===r&&e.push("M",x,_));var w=Mp(l+h*fC(d+f)),b=Mp(u+c*dC(d+f));e.push("A",Mp(h),Mp(c),cC(p*vC),+y,+g,w,b);break;case lC.Z:a="Z";break;case lC.R:var w=Mp(n[r++]),b=Mp(n[r++]),M=Mp(n[r++]),S=Mp(n[r++]);e.push("M",w,b,"L",w+M,b,"L",w+M,b+S,"L",w,b+S,"L",w,b)}a&&e.push(a);for(var I=0;s>I;I++)e.push(Mp(n[r++]))}return e.join(" ")}function Lp(t){return"middle"===t?"middle":"bottom"===t?"baseline":"hanging"}function Op(){}function Ep(t,e){for(var n=0,i=e.length,r=0,o=0;i>n;n++){var a=e[n];if(a.removed){for(var s=[],l=o;lr;r++)n[t][r].h!=e&&i.push(n[t][r]);n[t]=i}n[t]&&0===n[t].length&&delete n[t]}else delete n[t];return this},trigger:function(t){if(this._$handlers[t]){var e=arguments,n=e.length;n>3&&(e=_g.call(e,1));for(var i=this._$handlers[t],r=i.length,o=0;r>o;){switch(n){case 1:i[o].h.call(i[o].ctx);break;case 2:i[o].h.call(i[o].ctx,e[1]);break;case 3:i[o].h.call(i[o].ctx,e[1],e[2]);break;default:i[o].h.apply(i[o].ctx,e)}i[o].one?(i.splice(o,1),r--):o++}}return this},triggerWithContext:function(t){if(this._$handlers[t]){var e=arguments,n=e.length;n>4&&(e=_g.call(e,1,e.length-1));for(var i=e[e.length-1],r=this._$handlers[t],o=r.length,a=0;o>a;){switch(n){case 1:r[a].h.call(i);break;case 2:r[a].h.call(i,e[1]);break;case 3:r[a].h.call(i,e[1],e[2]);break;default:r[a].h.apply(i,e)}r[a].one?(r.splice(a,1),o--):a++}}return this}};var bg="silent";ce.prototype.dispose=function(){};var Mg=["click","dblclick","mousewheel","mouseout","mouseup","mousedown","mousemove","contextmenu"],Sg=function(t,e,n,i){wg.call(this),this.storage=t,this.painter=e,this.painterRoot=i,n=n||new 
ce,this.proxy=null,this._hovered={},this._lastTouchMoment,this._lastX,this._lastY,le.call(this),this.setHandlerProxy(n)};Sg.prototype={constructor:Sg,setHandlerProxy:function(t){this.proxy&&this.proxy.dispose(),t&&(f(Mg,function(e){t.on&&t.on(e,this[e],this)},this),t.handler=this),this.proxy=t},mousemove:function(t){var e=t.zrX,n=t.zrY,i=this._hovered,r=i.target;r&&!r.__zr&&(i=this.findHover(i.x,i.y),r=i.target);var o=this._hovered=this.findHover(e,n),a=o.target,s=this.proxy;s.setCursor&&s.setCursor(a?a.cursor:"default"),r&&a!==r&&this.dispatchToElement(i,"mouseout",t),this.dispatchToElement(o,"mousemove",t),a&&a!==r&&this.dispatchToElement(o,"mouseover",t)},mouseout:function(t){this.dispatchToElement(this._hovered,"mouseout",t);var e,n=t.toElement||t.relatedTarget;do n=n&&n.parentNode;while(n&&9!=n.nodeType&&!(e=n===this.painterRoot));!e&&this.trigger("globalout",{event:t})},resize:function(){this._hovered={}},dispatch:function(t,e){var n=this[t];n&&n.call(this,e)},dispose:function(){this.proxy.dispose(),this.storage=this.proxy=this.painter=null},setCursorStyle:function(t){var e=this.proxy;e.setCursor&&e.setCursor(t)},dispatchToElement:function(t,e,n){t=t||{};var i=t.target;if(!i||!i.silent){for(var r="on"+e,o=he(e,t,n);i&&(i[r]&&(o.cancelBubble=i[r].call(i,o)),i.trigger(e,o),i=i.parent,!o.cancelBubble););o.cancelBubble||(this.trigger(e,o),this.painter&&this.painter.eachOtherLayer(function(t){"function"==typeof t[r]&&t[r].call(t,o),t.trigger&&t.trigger(e,o)}))}},findHover:function(t,e,n){for(var i=this.storage.getDisplayList(),r={x:t,y:e},o=i.length-1;o>=0;o--){var a;if(i[o]!==n&&!i[o].ignore&&(a=de(i[o],t,e))&&(!r.topTarget&&(r.topTarget=i[o]),a!==bg)){r.target=i[o];break}}return r}},f(["click","mousedown","mouseup","mousewheel","dblclick","contextmenu"],function(t){Sg.prototype[t]=function(e){var n=this.findHover(e.zrX,e.zrY),i=n.target;if("mousedown"===t)this._downEl=i,this._downPoint=[e.zrX,e.zrY],this._upEl=i;else if("mouseup"===t)this._upEl=i;else 
if("click"===t){if(this._downEl!==this._upEl||!this._downPoint||mg(this._downPoint,[e.zrX,e.zrY])>4)return;this._downPoint=null}this.dispatchToElement(n,t,e)}}),c(Sg,wg),c(Sg,le);var Ig="undefined"==typeof Float32Array?Array:Float32Array,Cg=(Object.freeze||Object)({create:fe,identity:pe,copy:ge,mul:ve,translate:me,rotate:ye,scale:xe,invert:_e,clone:we}),Tg=pe,Ag=5e-5,Dg=function(t){t=t||{},t.position||(this.position=[0,0]),null==t.rotation&&(this.rotation=0),t.scale||(this.scale=[1,1]),this.origin=this.origin||null},kg=Dg.prototype;kg.transform=null,kg.needLocalTransform=function(){return be(this.rotation)||be(this.position[0])||be(this.position[1])||be(this.scale[0]-1)||be(this.scale[1]-1)},kg.updateTransform=function(){var t=this.parent,e=t&&t.transform,n=this.needLocalTransform(),i=this.transform;return n||e?(i=i||fe(),n?this.getLocalTransform(i):Tg(i),e&&(n?ve(i,t.transform,i):ge(i,t.transform)),this.transform=i,this.invTransform=this.invTransform||fe(),void _e(this.invTransform,i)):void(i&&Tg(i))$/;" l language:C++ +Airedale examples/labels.txt /^n02096051 Airedale, Airedale terrier$/;" v language:C++ +Algorithm framework/graph/algorithm.h /^ explicit Algorithm(GraphBase* graph):_graph(graph) {}$/;" f language:C++ class:anakin::graph::Algorithm access:public signature:(GraphBase* graph) +Algorithm framework/graph/algorithm.h /^class Algorithm {$/;" c language:C++ namespace:anakin::graph +AlignedUtils saber/funcs/impl/x86/x86_utils.h /^class AlignedUtils {$/;" c language:C++ namespace:anakin::saber::utils +Am tools/external_converter_v2/parser/frontend/dash_board/static/echart/echarts.min.js /^break}}for(var r=null,o=0,n=0;n0?Dv:0),this._needsManuallyCompositing),a.__builtin__||$g("ZLevel "+s+" has been used by unkown layer 
"+a.id),a!==r&&(a.__used=!0,a.__startIndex!==n&&(a.__dirty=!0),a.__startIndex=n,a.__drawIndex=a.incremental?-1:n,e(n),r=a),i.__dirty&&(a.__dirty=!0,a.incremental&&a.__drawIndex<0&&(a.__drawIndex=n))}e(n),this.eachBuiltinLayer(function(t){!t.__used&&t.getElementCount()>0&&(t.__dirty=!0,t.__startIndex=t.__endIndex=t.__drawIndex=0),t.__dirty&&t.__drawIndex<0&&(t.__drawIndex=t.__startIndex)})},clear:function(){return this.eachBuiltinLayer(this._clearLayer),this},_clearLayer:function(t){t.clear()},setBackgroundColor:function(t){this._backgroundColor=t},configLayer:function(t,e){if(e){var n=this._layerConfig;n[t]?r(n[t],e,!0):n[t]=e;for(var i=0;i=0&&this._clips.splice(e,1)},removeAnimator:function(t){for(var e=t.getClips(),n=0;na;a++){var s=n[a],l=s.step(t,e);l&&(r.push(l),o.push(s))}for(var a=0;i>a;)n[a]._needsRemove?(n[a]=n[i-1],n.pop(),i--):a++;i=r.length;for(var a=0;i>a;a++)o[a].fire(r[a]);this._time=t,this.onframe(e),this.trigger("frame",e),this.stage.update&&this.stage.update()},_startLoop:function(){function t(){e._running&&(gv(t),!e._paused&&e._update())}var e=this;this._running=!0,gv(t)},start:function(){this._time=(new Date).getTime(),this._pausedTime=0,this._startLoop()},stop:function(){this._running=!1},pause:function(){this._paused||(this._pauseStart=(new Date).getTime(),this._paused=!0)},resume:function(){this._paused&&(this._pausedTime+=(new Date).getTime()-this._pauseStart,this._paused=!1)},clear:function(){this._clips=[]},isFinished:function(){return!this._clips.length},animate:function(t,e){e=e||{};var n=new Zg(t,e.loop,e.getter,e.setter);return this.addAnimator(n),n}},c(Bv,wg);var Nv=function(){this._track=[]};Nv.prototype={constructor:Nv,recognize:function(t,e,n){return this._doTrack(t,e,n),this._recognize(t)},clear:function(){return this._track.length=0,this},_doTrack:function(t,e,n){var i=t.touches;if(i){for(var r={points:[],touches:[],target:e,event:t},o=0,a=i.length;a>o;o++){var 
s=i[o],l=pi(n,s,{});r.points.push([l.zrX,l.zrY]),r.touches.push(s)}this._track.push(r)}},_recognize:function(t){for(var e in Vv)if(Vv.hasOwnProperty(e)){var n=Vv[e](this._track,t);if(n)return n}}};var Vv={pinch:function(t,e){var n=t.length;if(n){var i=(t[n-1]||{}).points,r=(t[n-2]||{}).points||i;if(r&&r.length>1&&i&&i.length>1){var o=xi(i)\/xi(r);!isFinite(o)&&(o=1),e.pinchScale=o;var a=_i(i);return e.pinchX=a[0],e.pinchY=a[1],{type:"pinch",target:t[0].target,event:e}}}}},Fv=300,Hv=["click","dblclick","mousewheel","mouseout","mouseup","mousedown","mousemove","contextmenu"],Wv=["touchstart","touchend","touchmove"],Gv={pointerdown:1,pointerup:1,pointermove:1,pointerout:1},Uv=p(Hv,function(t){var e=t.replace("mouse","pointer");return Gv[e]?e:t}),Zv={mousemove:function(t){t=vi(this.dom,t),this.trigger("mousemove",t)},mouseout:function(t){t=vi(this.dom,t);var e=t.toElement||t.relatedTarget;if(e!=this.dom)for(;e&&9!=e.nodeType;){if(e===this.dom)return;e=e.parentNode}this.trigger("mouseout",t)},touchstart:function(t){t=vi(this.dom,t),t.zrByTouch=!0,this._lastTouchMoment=new Date,bi(this,t,"start"),Zv.mousemove.call(this,t),Zv.mousedown.call(this,t),Mi(this)},touchmove:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"change"),Zv.mousemove.call(this,t),Mi(this)},touchend:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"end"),Zv.mouseup.call(this,t),+new Date-this._lastTouchMoment=0||i&&u(i,a)<0)){var s=e.getShallow(a);null!=s&&(r[t[o][0]]=s)}}return r}},um=lm([["lineWidth","width"],["stroke","color"],["opacity"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["shadowColor"]]),hm={getLineStyle:function(t){var e=um(this,t),n=this.getLineDash(e.lineWidth);return n&&(e.lineDash=n),e},getLineDash:function(t){null==t&&(t=1);var 
e=this.get("type"),n=Math.max(t,2),i=4*t;return"solid"===e||null==e?null:"dashed"===e?[i,i]:[n,n]}},cm=lm([["fill","color"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["opacity"],["shadowColor"]]),dm={getAreaStyle:function(t,e){return cm(this,t,e)}},fm=Math.pow,pm=Math.sqrt,gm=1e-8,vm=1e-4,mm=pm(3),ym=1\/3,xm=H(),_m=H(),wm=H(),bm=Math.min,Mm=Math.max,Sm=Math.sin,Im=Math.cos,Cm=2*Math.PI,Tm=H(),Am=H(),Dm=H(),km=[],Pm=[],Lm={M:1,L:2,C:3,Q:4,A:5,Z:6,R:7},Om=[],Em=[],Rm=[],zm=[],Bm=Math.min,Nm=Math.max,Vm=Math.cos,Fm=Math.sin,Hm=Math.sqrt,Wm=Math.abs,Gm="undefined"!=typeof Float32Array,Um=function(t){this._saveData=!t,this._saveData&&(this.data=[]),this._ctx=null};Um.prototype={constructor:Um,_xi:0,_yi:0,_x0:0,_y0:0,_ux:0,_uy:0,_len:0,_lineDash:null,_dashOffset:0,_dashIdx:0,_dashSum:0,setScale:function(t,e){this._ux=Wm(1\/Yg\/t)||0,this._uy=Wm(1\/Yg\/e)||0},getContext:function(){return this._ctx},beginPath:function(t){return this._ctx=t,t&&t.beginPath(),t&&(this.dpr=t.dpr),this._saveData&&(this._len=0),this._lineDash&&(this._lineDash=null,this._dashOffset=0),this},moveTo:function(t,e){return this.addData(Lm.M,t,e),this._ctx&&this._ctx.moveTo(t,e),this._x0=t,this._y0=e,this._xi=t,this._yi=e,this},lineTo:function(t,e){var n=Wm(t-this._xi)>this._ux||Wm(e-this._yi)>this._uy||this._len<5;return this.addData(Lm.L,t,e),this._ctx&&n&&(this._needsDash()?this._dashedLineTo(t,e):this._ctx.lineTo(t,e)),n&&(this._xi=t,this._yi=e),this},bezierCurveTo:function(t,e,n,i,r,o){return this.addData(Lm.C,t,e,n,i,r,o),this._ctx&&(this._needsDash()?this._dashedBezierTo(t,e,n,i,r,o):this._ctx.bezierCurveTo(t,e,n,i,r,o)),this._xi=r,this._yi=o,this},quadraticCurveTo:function(t,e,n,i){return this.addData(Lm.Q,t,e,n,i),this._ctx&&(this._needsDash()?this._dashedQuadraticTo(t,e,n,i):this._ctx.quadraticCurveTo(t,e,n,i)),this._xi=n,this._yi=i,this},arc:function(t,e,n,i,r,o){return 
this.addData(Lm.A,t,e,n,n,i,r-i,0,o?0:1),this._ctx&&this._ctx.arc(t,e,n,i,r,o),this._xi=Vm(r)*n+t,this._yi=Fm(r)*n+t,this},arcTo:function(t,e,n,i,r){return this._ctx&&this._ctx.arcTo(t,e,n,i,r),this},rect:function(t,e,n,i){return this._ctx&&this._ctx.rect(t,e,n,i),this.addData(Lm.R,t,e,n,i),this},closePath:function(){this.addData(Lm.Z);var t=this._ctx,e=this._x0,n=this._y0;return t&&(this._needsDash()&&this._dashedLineTo(e,n),t.closePath()),this._xi=e,this._yi=n,this},fill:function(t){t&&t.fill(),this.toStatic()},stroke:function(t){t&&t.stroke(),this.toStatic()},setLineDash:function(t){if(t instanceof Array){this._lineDash=t,this._dashIdx=0;for(var e=0,n=0;nn;n++)this.data[n]=t[n];this._len=e},appendPath:function(t){t instanceof Array||(t=[t]);for(var e=t.length,n=0,i=this._len,r=0;e>r;r++)n+=t[r].len();Gm&&this.data instanceof Float32Array&&(this.data=new Float32Array(i+n));for(var r=0;e>r;r++)for(var o=t[r].data,a=0;ae.length&&(this._expandData(),e=this.data);for(var n=0;no&&(o=r+o),o%=r,f-=o*h,p-=o*c;h>0&&t>=f||0>h&&f>=t||0==h&&(c>0&&e>=p||0>c&&p>=e);)i=this._dashIdx,n=a[i],f+=h*n,p+=c*n,this._dashIdx=(i+1)%g,h>0&&l>f||0>h&&f>l||c>0&&u>p||0>c&&p>u||s[i%2?"moveTo":"lineTo"](h>=0?Bm(f,t):Nm(f,t),c>=0?Bm(p,e):Nm(p,e));h=f-t,c=p-e,this._dashOffset=-Hm(h*h+c*c)},_dashedBezierTo:function(t,e,n,i,r,o){var a,s,l,u,h,c=this._dashSum,d=this._dashOffset,f=this._lineDash,p=this._ctx,g=this._xi,v=this._yi,m=er,y=0,x=this._dashIdx,_=f.length,w=0;for(0>d&&(d=c+d),d%=c,a=0;1>a;a+=.1)s=m(g,t,n,r,a+.1)-m(g,t,n,r,a),l=m(v,e,i,o,a+.1)-m(v,e,i,o,a),y+=Hm(s*s+l*l);for(;_>x&&(w+=f[x],!(w>d));x++);for(a=(w-d)\/y;1>=a;)u=m(g,t,n,r,a),h=m(v,e,i,o,a),x%2?p.moveTo(u,h):p.lineTo(u,h),a+=f[x]\/y,x=(x+1)%_;x%2!==0&&p.lineTo(r,o),s=r-u,l=o-h,this._dashOffset=-Hm(s*s+l*l)},_dashedQuadraticTo:function(t,e,n,i){var r=n,o=i;n=(n+2*t)\/3,i=(i+2*e)\/3,t=(this._xi+2*t)\/3,e=(this._yi+2*e)\/3,this._dashedBezierTo(t,e,n,i,r,o)},toStatic:function(){var t=this.data;t instanceof 
Array&&(t.length=this._len,Gm&&(this.data=new Float32Array(t)))},getBoundingRect:function(){Om[0]=Om[1]=Rm[0]=Rm[1]=Number.MAX_VALUE,Em[0]=Em[1]=zm[0]=zm[1]=-Number.MAX_VALUE;for(var t=this.data,e=0,n=0,i=0,r=0,o=0;oc;){var d=s[c++];switch(1==c&&(i=s[c],r=s[c+1],e=i,n=r),d){case Lm.M:e=i=s[c++],n=r=s[c++],t.moveTo(i,r);break;case Lm.L:o=s[c++],a=s[c++],(Wm(o-i)>l||Wm(a-r)>u||c===h-1)&&(t.lineTo(o,a),i=o,r=a);break;case Lm.C:t.bezierCurveTo(s[c++],s[c++],s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.Q:t.quadraticCurveTo(s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.A:var f=s[c++],p=s[c++],g=s[c++],v=s[c++],m=s[c++],y=s[c++],x=s[c++],_=s[c++],w=g>v?g:v,b=g>v?1:g\/v,M=g>v?v\/g:1,S=Math.abs(g-v)>.001,I=m+y;S?(t.translate(f,p),t.rotate(x),t.scale(b,M),t.arc(0,0,w,m,I,1-_),t.scale(1\/b,1\/M),t.rotate(-x),t.translate(-f,-p)):t.arc(f,p,w,m,I,1-_),1==c&&(e=Vm(m)*g+f,n=Fm(m)*v+p),i=Vm(I)*g+f,r=Fm(I)*v+p;break;case Lm.R:e=i=s[c],n=r=s[c+1],t.rect(s[c++],s[c++],s[c++],s[c++]);break;case Lm.Z:t.closePath(),i=e,r=n}}}},Um.CMD=Lm;var Zm=2*Math.PI,jm=2*Math.PI,Xm=Um.CMD,Ym=2*Math.PI,qm=1e-4,$m=[-1,-1,-1],Km=[-1,-1],Qm=fv.prototype.getCanvasPattern,Jm=Math.abs,ty=new Um(!0);Lr.prototype={constructor:Lr,type:"path",__dirtyPath:!0,strokeContainThreshold:5,brush:function(t,e){var n=this.style,i=this.path||ty,r=n.hasStroke(),o=n.hasFill(),a=n.fill,s=n.stroke,l=o&&!!a.colorStops,u=r&&!!s.colorStops,h=o&&!!a.image,c=r&&!!s.image;if(n.bind(t,this,e),this.setTransform(t),this.__dirty){var d;l&&(d=d||this.getBoundingRect(),this._fillGradient=n.getGradient(t,a,d)),u&&(d=d||this.getBoundingRect(),this._strokeGradient=n.getGradient(t,s,d))}l?t.fillStyle=this._fillGradient:h&&(t.fillStyle=Qm.call(a,t)),u?t.strokeStyle=this._strokeGradient:c&&(t.strokeStyle=Qm.call(s,t));var 
f=n.lineDash,p=n.lineDashOffset,g=!!t.setLineDash,v=this.getGlobalScale();i.setScale(v[0],v[1]),this.__dirtyPath||f&&!g&&r?(i.beginPath(t),f&&!g&&(i.setLineDash(f),i.setLineDashOffset(p)),this.buildPath(i,this.shape,!1),this.path&&(this.__dirtyPath=!1)):(t.beginPath(),this.path.rebuildPath(t)),o&&i.fill(t),f&&g&&(t.setLineDash(f),t.lineDashOffset=p),r&&i.stroke(t),f&&g&&t.setLineDash([]),null!=n.text&&(this.restoreTransform(t),this.drawRectText(t,this.getBoundingRect()))},buildPath:function(){},createPathProxy:function(){this.path=new Um},getBoundingRect:function(){var t=this._rect,e=this.style,n=!t;if(n){var i=this.path;i||(i=this.path=new Um),this.__dirtyPath&&(i.beginPath(),this.buildPath(i,this.shape,!1)),t=i.getBoundingRect()}if(this._rect=t,e.hasStroke()){var r=this._rectWithStroke||(this._rectWithStroke=t.clone());if(this.__dirty||n){r.copy(t);var o=e.lineWidth,a=e.strokeNoScale?this.getLineScale():1;e.hasFill()||(o=Math.max(o,this.strokeContainThreshold||4)),a>1e-10&&(r.width+=o\/a,r.height+=o\/a,r.x-=o\/a\/2,r.y-=o\/a\/2)}return r}return t},contain:function(t,e){var n=this.transformCoordToLocal(t,e),i=this.getBoundingRect(),r=this.style;if(t=n[0],e=n[1],i.contain(t,e)){var o=this.path.data;if(r.hasStroke()){var a=r.lineWidth,s=r.strokeNoScale?this.getLineScale():1;if(s>1e-10&&(r.hasFill()||(a=Math.max(a,this.strokeContainThreshold)),Pr(o,a\/s,t,e)))return!0}if(r.hasFill())return kr(o,t,e)}return!1},dirty:function(t){null==t&&(t=!0),t&&(this.__dirtyPath=t,this._rect=null),this.__dirty=!0,this.__zr&&this.__zr.refresh(),this.__clipTarget&&this.__clipTarget.dirty()},animateShape:function(t){return this.animate("shape",t)},attrKV:function(t,e){"shape"===t?(this.setShape(e),this.__dirtyPath=!0,this._rect=null):oi.prototype.attrKV.call(this,t,e)},setShape:function(t,e){var n=this.shape;if(n){if(M(t))for(var i in t)t.hasOwnProperty(i)&&(n[i]=t[i]);else n[t]=e;this.dirty(!0)}return this},getLineScale:function(){var t=this.transform;return 
t&&Jm(t[0]-1)>1e-10&&Jm(t[3]-1)>1e-10?Math.sqrt(Jm(t[0]*t[3]-t[2]*t[1])):1}},Lr.extend=function(t){var e=function(e){Lr.call(this,e),t.style&&this.style.extendFrom(t.style,!1);var n=t.shape;if(n){this.shape=this.shape||{};var i=this.shape;for(var r in n)!i.hasOwnProperty(r)&&n.hasOwnProperty(r)&&(i[r]=n[r])}t.init&&t.init.call(this,e)};h(e,Lr);for(var n in t)"style"!==n&&"shape"!==n&&(e.prototype[n]=t[n]);return e},h(Lr,oi);var ey=Um.CMD,ny=[[],[],[]],iy=Math.sqrt,ry=Math.atan2,oy=function(t,e){var n,i,r,o,a,s,l=t.data,u=ey.M,h=ey.C,c=ey.L,d=ey.R,f=ey.A,p=ey.Q;for(r=0,o=0;ra;a++){var s=ny[a];s[0]=l[r++],s[1]=l[r++],oe(s,s,e),l[o++]=s[0],l[o++]=s[1]}}},ay=["m","M","l","L","v","V","h","H","z","Z","c","C","q","Q","t","T","s","S","a","A"],sy=Math.sqrt,ly=Math.sin,uy=Math.cos,hy=Math.PI,cy=function(t){return Math.sqrt(t[0]*t[0]+t[1]*t[1])},dy=function(t,e){return(t[0]*e[0]+t[1]*e[1])\/(cy(t)*cy(e))},fy=function(t,e){return(t[0]*e[1]=11?function(){var e,n=this.__clipPaths,i=this.style;if(n)for(var r=0;ro;o++)r+=ee(t[o-1],t[o]);var a=r\/2;a=n>a?n:a;for(var o=0;a>o;o++){var s,l,u,h=o\/(a-1)*(e?n:n-1),c=Math.floor(h),d=h-c,f=t[c%n];e?(s=t[(c-1+n)%n],l=t[(c+1)%n],u=t[(c+2)%n]):(s=t[0===c?c:c-1],l=t[c>n-2?n-1:c+1],u=t[c>n-3?n-1:c+2]);var p=d*d,g=d*p;i.push([Vr(s[0],f[0],l[0],u[0],d,p,g),Vr(s[1],f[1],l[1],u[1],d,p,g)])}return i},wy=function(t,e,n,i){var r,o,a,s,l=[],u=[],h=[],c=[];if(i){a=[1\/0,1\/0],s=[-1\/0,-1\/0];for(var d=0,f=t.length;f>d;d++)ae(a,a,t[d]),se(s,s,t[d]);ae(a,a,i[0]),se(s,s,i[1])}for(var d=0,f=t.length;f>d;d++){var p=t[d];if(n)r=t[d?d-1:f-1],o=t[(d+1)%f];else{if(0===d||d===f-1){l.push(G(t[d]));continue}r=t[d-1],o=t[d+1]}X(u,o,r),J(u,u,e);var g=ee(p,r),v=ee(p,o),m=g+v;0!==m&&(g\/=m,v\/=m),J(h,u,-g),J(c,u,v);var y=Z([],p,h),x=Z([],p,c);i&&(se(y,y,a),ae(y,y,s),se(x,x,a),ae(x,x,s)),l.push(y),l.push(x)}return 
n&&l.push(l.shift()),l},by=Lr.extend({type:"polygon",shape:{points:null,smooth:!1,smoothConstraint:null},buildPath:function(t,e){Fr(t,e,!0)}}),My=Lr.extend({type:"polyline",shape:{points:null,smooth:!1,smoothConstraint:null},style:{stroke:"#000",fill:null},buildPath:function(t,e){Fr(t,e,!1)}}),Sy=Lr.extend({type:"rect",shape:{r:0,x:0,y:0,width:0,height:0},buildPath:function(t,e){var n=e.x,i=e.y,r=e.width,o=e.height;e.r?Fn(t,e):t.rect(n,i,r,o),t.closePath()}}),Iy=Lr.extend({type:"line",shape:{x1:0,y1:0,x2:0,y2:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.percent;0!==a&&(t.moveTo(n,i),1>a&&(r=n*(1-a)+r*a,o=i*(1-a)+o*a),t.lineTo(r,o))},pointAt:function(t){var e=this.shape;return[e.x1*(1-t)+e.x2*t,e.y1*(1-t)+e.y2*t]}}),Cy=[],Ty=Lr.extend({type:"bezier-curve",shape:{x1:0,y1:0,x2:0,y2:0,cpx1:0,cpy1:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.cpx1,s=e.cpy1,l=e.cpx2,u=e.cpy2,h=e.percent;0!==h&&(t.moveTo(n,i),null==l||null==u?(1>h&&(cr(n,a,r,h,Cy),a=Cy[1],r=Cy[2],cr(i,s,o,h,Cy),s=Cy[1],o=Cy[2]),t.quadraticCurveTo(a,s,r,o)):(1>h&&(or(n,a,l,r,h,Cy),a=Cy[1],l=Cy[2],r=Cy[3],or(i,s,u,o,h,Cy),s=Cy[1],u=Cy[2],o=Cy[3]),t.bezierCurveTo(a,s,l,u,r,o)))},pointAt:function(t){return Hr(this.shape,t,!1)},tangentAt:function(t){var e=Hr(this.shape,t,!0);return te(e,e)}}),Ay=Lr.extend({type:"arc",shape:{cx:0,cy:0,r:0,startAngle:0,endAngle:2*Math.PI,clockwise:!0},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.cx,i=e.cy,r=Math.max(e.r,0),o=e.startAngle,a=e.endAngle,s=e.clockwise,l=Math.cos(o),u=Math.sin(o);t.moveTo(l*r+n,u*r+i),t.arc(n,i,r,o,a,!s)}}),Dy=Lr.extend({type:"compound",shape:{paths:null},_updatePathDirty:function(){for(var t=this.__dirtyPath,e=this.shape.paths,n=0;n;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService 
framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.cpp /^template class AnakinService;$/;" x language:C++ file: +AnakinService framework/service/anakin_service.h /^class AnakinService : public RPCService {$/;" c language:C++ namespace:anakin::rpc inherits:RPCService +AnakinThreadLocalVar framework/core/tls.h /^ AnakinThreadLocalVar() = default;$/;" p language:C++ 
class:anakin::AnakinThreadLocalVar access:private signature:() +AnakinThreadLocalVar framework/core/tls.h /^class AnakinThreadLocalVar {$/;" c language:C++ namespace:anakin +Angora examples/labels.txt /^n02328150 Angora, Angora rabbit$/;" v language:C++ +Arc framework/graph/arc.h /^ Arc() {}$/;" f language:C++ class:anakin::graph::Arc access:public signature:() +Arc framework/graph/arc.h /^ Arc(VertexNameType vertex_1, VertexNameType vertex_2);$/;" p language:C++ class:anakin::graph::Arc access:public signature:(VertexNameType vertex_1, VertexNameType vertex_2) +Arc framework/graph/arc.h /^ Arc(VertexNameType vertex_1, VertexNameType vertex_2, WeightType weight);$/;" p language:C++ class:anakin::graph::Arc access:public signature:(VertexNameType vertex_1, VertexNameType vertex_2, WeightType weight) +Arc framework/graph/arc.h /^ Arc(const Arc& otherArc);$/;" p language:C++ class:anakin::graph::Arc access:public signature:(const Arc& otherArc) +Arc framework/graph/arc.h /^class Arc {$/;" c language:C++ namespace:anakin::graph +Arc framework/graph/arc.inl /^Arc::Arc(VertexNameType vertex_1, VertexNameType vertex_2) {$/;" f language:C++ class:anakin::graph::Arc signature:(VertexNameType vertex_1, VertexNameType vertex_2) +Arc framework/graph/arc.inl /^Arc::Arc(VertexNameType vertex_1, VertexNameType vertex_2, WeightType weight) {$/;" f language:C++ class:anakin::graph::Arc signature:(VertexNameType vertex_1, VertexNameType vertex_2, WeightType weight) +Arc framework/graph/arc.inl /^Arc::Arc(const Arc& otherArc) {$/;" f language:C++ class:anakin::graph::Arc signature:(const Arc& otherArc) +Arc_iterator framework/graph/arc.h /^ Arc_iterator() {}$/;" f language:C++ class:anakin::graph::Arc_iterator access:public signature:() +Arc_iterator framework/graph/arc.h /^ Arc_iterator(const Arc_iterator& rhs):_arc_it(rhs._arc_it) {}$/;" f language:C++ class:anakin::graph::Arc_iterator access:public signature:(const Arc_iterator& rhs) +Arc_iterator framework/graph/arc.h /^ 
Arc_iterator(typename std::list::iterator& it):_arc_it(it) {}$/;" f language:C++ class:anakin::graph::Arc_iterator access:public signature:(typename std::list::iterator& it) +Arc_iterator framework/graph/arc.h /^ Arc_iterator(typename std::list::iterator&& it):_arc_it(it) {}$/;" f language:C++ class:anakin::graph::Arc_iterator access:public signature:(typename std::list::iterator&& it) +Arc_iterator framework/graph/arc.h /^class Arc_iterator {$/;" c language:C++ namespace:anakin::graph +ArcsIteratorList framework/graph/graph_base.h /^ typedef std::vector> ArcsIteratorList;$/;" t language:C++ class:anakin::graph::GraphBase access:private +ArcsList framework/graph/graph_base.h /^ typedef std::list ArcsList;$/;" t language:C++ class:anakin::graph::GraphBase access:private +Argmax framework/operators/arg_max.h /^ Argmax() {}$/;" f language:C++ class:anakin::ops::Argmax access:public signature:() +Argmax framework/operators/arg_max.h /^class Argmax : public Operator {$/;" c language:C++ namespace:anakin::ops inherits:Operator +Argmax saber/funcs/argmax.h /^class Argmax : public BaseFunc<$/;" c language:C++ namespace:anakin::saber inherits:BaseFunc +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper 
framework/operators/arg_max.cpp /^template class ArgmaxHelper;$/;" x language:C++ file: +ArgmaxHelper framework/operators/arg_max.h /^ ArgmaxHelper()=default;$/;" p language:C++ class:anakin::ops::ArgmaxHelper access:public signature:() +ArgmaxHelper framework/operators/arg_max.h /^ friend class ArgmaxHelper;$/;" x language:C++ +ArgmaxHelper framework/operators/arg_max.h /^class ArgmaxHelper : public OperatorHelper {$/;" c language:C++ namespace:anakin::ops inherits:OperatorHelper +ArgmaxHelper framework/operators/arg_max.h /^class ArgmaxHelper;$/;" x language:C++ +ArgmaxParam saber/saber_funcs_param.h /^ ArgmaxParam() = default;$/;" p language:C++ struct:anakin::saber::ArgmaxParam access:public signature:() +ArgmaxParam saber/saber_funcs_param.h /^ ArgmaxParam(bool out_max_val_in, int top_k_in) {$/;" f language:C++ struct:anakin::saber::ArgmaxParam access:public signature:(bool out_max_val_in, int top_k_in) +ArgmaxParam saber/saber_funcs_param.h /^ ArgmaxParam(bool out_max_val_in, int top_k_in, bool has_axis_in, int axis_in) {$/;" f language:C++ struct:anakin::saber::ArgmaxParam access:public signature:(bool out_max_val_in, int top_k_in, bool has_axis_in, int axis_in) +ArgmaxParam saber/saber_funcs_param.h /^ ArgmaxParam(bool out_max_val_in, int top_k_in, int axis_in) {$/;" f language:C++ struct:anakin::saber::ArgmaxParam access:public signature:(bool out_max_val_in, int top_k_in, int axis_in) +ArgmaxParam saber/saber_funcs_param.h /^ ArgmaxParam(const ArgmaxParam& right) {$/;" f language:C++ struct:anakin::saber::ArgmaxParam access:public signature:(const ArgmaxParam& right) +ArgmaxParam saber/saber_funcs_param.h /^struct ArgmaxParam {$/;" s language:C++ namespace:anakin::saber +Argmax_kernel saber/funcs/impl/x86/saber_argmax.cpp /^void Argmax_kernel(const dtype* din, dtype* dout, int num, int in_channel, \\$/;" f language:C++ namespace:anakin::saber signature:(const dtype* din, dtype* dout, int num, int in_channel, \ int out_channel, int top, bool out_max) 
+Argmax_kernel_axis saber/funcs/impl/x86/saber_argmax.cpp /^void Argmax_kernel_axis(const dtype* din, dtype* dout, int num, int in_stride, \\$/;" f language:C++ namespace:anakin::saber signature:(const dtype* din, dtype* dout, int num, int in_stride, \ int out_stride, int size, int in_ss, int out_ss, int top, bool out_max) +Args framework/core/operator/operator_attr.h /^ OpAttrWarpper& Args(const std::string& arg_name, const std::string& arg_doc = "") {$/;" f language:C++ class:anakin::OpAttrWarpper access:public signature:(const std::string& arg_name, const std::string& arg_doc = ) +Args_map framework/core/operator/operator_attr.h /^ std::unordered_map Args_map;$/;" m language:C++ struct:anakin::OperatorAttr access:public +Argument framework/core/operator/operator_attr.h /^struct Argument {$/;" s language:C++ namespace:anakin +AttrInfo framework/graph/node.h /^ AttrInfo() {$/;" f language:C++ struct:anakin::graph::AttrInfo access:public signature:() +AttrInfo framework/graph/node.h /^struct AttrInfo {$/;" s language:C++ namespace:anakin::graph +Av tools/external_converter_v2/parser/frontend/dash_board/static/echart/echarts.min.js /^},kg.getLocalTransform=function(t){return Dg.getLocalTransform(this,t)},kg.setTransform=function(t){var e=this.transform,n=t.dpr||1;e?t.setTransform(n*e[0],n*e[1],n*e[2],n*e[3],n*e[4],n*e[5]):t.setTransform(n,0,0,n,0,0)},kg.restoreTransform=function(t){var e=t.dpr||1;t.setTransform(e,0,0,e,0,0)};var Pg=[];kg.decomposeTransform=function(){if(this.transform){var t=this.parent,e=this.transform;t&&t.transform&&(ve(Pg,t.invTransform,e),e=Pg);var n=e[0]*e[0]+e[1]*e[1],i=e[2]*e[2]+e[3]*e[3],r=this.position,o=this.scale;be(n-1)&&(n=Math.sqrt(n)),be(i-1)&&(i=Math.sqrt(i)),e[0]<0&&(n=-n),e[3]<0&&(i=-i),r[0]=e[4],r[1]=e[5],o[0]=n,o[1]=i,this.rotation=Math.atan2(-e[1]\/i,e[0]\/n)}},kg.getGlobalScale=function(){var t=this.transform;if(!t)return[1,1];var e=Math.sqrt(t[0]*t[0]+t[1]*t[1]),n=Math.sqrt(t[2]*t[2]+t[3]*t[3]);return 
t[0]<0&&(e=-e),t[3]<0&&(n=-n),[e,n]},kg.transformCoordToLocal=function(t,e){var n=[t,e],i=this.invTransform;return i&&oe(n,n,i),n},kg.transformCoordToGlobal=function(t,e){var n=[t,e],i=this.transform;return i&&oe(n,n,i),n},Dg.getLocalTransform=function(t,e){e=e||[],Tg(e);var n=t.origin,i=t.scale||[1,1],r=t.rotation||0,o=t.position||[0,0];return n&&(e[4]-=n[0],e[5]-=n[1]),xe(e,e,i),r&&ye(e,e,r),n&&(e[4]+=n[0],e[5]+=n[1]),e[4]+=o[0],e[5]+=o[1],e};var Lg={linear:function(t){return t},quadraticIn:function(t){return t*t},quadraticOut:function(t){return t*(2-t)},quadraticInOut:function(t){return(t*=2)<1?.5*t*t:-.5*(--t*(t-2)-1)},cubicIn:function(t){return t*t*t},cubicOut:function(t){return--t*t*t+1},cubicInOut:function(t){return(t*=2)<1?.5*t*t*t:.5*((t-=2)*t*t+2)},quarticIn:function(t){return t*t*t*t},quarticOut:function(t){return 1- --t*t*t*t},quarticInOut:function(t){return(t*=2)<1?.5*t*t*t*t:-.5*((t-=2)*t*t*t-2)},quinticIn:function(t){return t*t*t*t*t},quinticOut:function(t){return--t*t*t*t*t+1},quinticInOut:function(t){return(t*=2)<1?.5*t*t*t*t*t:.5*((t-=2)*t*t*t*t+2)},sinusoidalIn:function(t){return 1-Math.cos(t*Math.PI\/2)},sinusoidalOut:function(t){return Math.sin(t*Math.PI\/2)},sinusoidalInOut:function(t){return.5*(1-Math.cos(Math.PI*t))},exponentialIn:function(t){return 0===t?0:Math.pow(1024,t-1)},exponentialOut:function(t){return 1===t?1:1-Math.pow(2,-10*t)},exponentialInOut:function(t){return 0===t?0:1===t?1:(t*=2)<1?.5*Math.pow(1024,t-1):.5*(-Math.pow(2,-10*(t-1))+2)},circularIn:function(t){return 1-Math.sqrt(1-t*t)},circularOut:function(t){return Math.sqrt(1- --t*t)},circularInOut:function(t){return(t*=2)<1?-.5*(Math.sqrt(1-t*t)-1):.5*(Math.sqrt(1-(t-=2)*t)+1)},elasticIn:function(t){var e,n=.1,i=.4;return 0===t?0:1===t?1:(!n||1>n?(n=1,e=i\/4):e=i*Math.asin(1\/n)\/(2*Math.PI),-(n*Math.pow(2,10*(t-=1))*Math.sin(2*(t-e)*Math.PI\/i)))},elasticOut:function(t){var e,n=.1,i=.4;return 
0===t?0:1===t?1:(!n||1>n?(n=1,e=i\/4):e=i*Math.asin(1\/n)\/(2*Math.PI),n*Math.pow(2,-10*t)*Math.sin(2*(t-e)*Math.PI\/i)+1)},elasticInOut:function(t){var e,n=.1,i=.4;return 0===t?0:1===t?1:(!n||1>n?(n=1,e=i\/4):e=i*Math.asin(1\/n)\/(2*Math.PI),(t*=2)<1?-.5*n*Math.pow(2,10*(t-=1))*Math.sin(2*(t-e)*Math.PI\/i):n*Math.pow(2,-10*(t-=1))*Math.sin(2*(t-e)*Math.PI\/i)*.5+1)},backIn:function(t){var e=1.70158;return t*t*((e+1)*t-e)},backOut:function(t){var e=1.70158;return--t*t*((e+1)*t+e)+1},backInOut:function(t){var e=2.5949095;return(t*=2)<1?.5*t*t*((e+1)*t-e):.5*((t-=2)*t*((e+1)*t+e)+2)},bounceIn:function(t){return 1-Lg.bounceOut(1-t)},bounceOut:function(t){return 1\/2.75>t?7.5625*t*t:2\/2.75>t?7.5625*(t-=1.5\/2.75)*t+.75:2.5\/2.75>t?7.5625*(t-=2.25\/2.75)*t+.9375:7.5625*(t-=2.625\/2.75)*t+.984375},bounceInOut:function(t){return.5>t?.5*Lg.bounceIn(2*t):.5*Lg.bounceOut(2*t-1)+.5}};Me.prototype={constructor:Me,step:function(t,e){if(this._initialized||(this._startTime=t+this._delay,this._initialized=!0),this._paused)return void(this._pausedTime+=e);var n=(t-this._startTime-this._pausedTime)\/this._life;if(!(0>n)){n=Math.min(n,1);var i=this.easing,r="string"==typeof i?Lg[i]:i,o="function"==typeof r?r(n):n;return this.fire("frame",o),1==n?this.loop?(this.restart(t),"restart"):(this._needsRemove=!0,"destroy"):null}},restart:function(t){var e=(t-this._startTime-this._pausedTime)%this._life;this._startTime=t-e+this.gap,this._pausedTime=0,this._needsRemove=!1},fire:function(t,e){t="on"+t,this[t]&&this[t](this._target,e)},pause:function(){this._paused=!0},resume:function(){this._paused=!1}};var Og=function(){this.head=null,this.tail=null,this._len=0},Eg=Og.prototype;Eg.insert=function(t){var e=new Rg(t);return this.insertEntry(e),e},Eg.insertEntry=function(t){this.head?(this.tail.next=t,t.prev=this.tail,t.next=null,this.tail=t):this.head=this.tail=t,this._len++},Eg.remove=function(t){var 
e=t.prev,n=t.next;e?e.next=n:this.head=n,n?n.prev=e:this.tail=e,t.next=t.prev=null,this._len--},Eg.len=function(){return this._len},Eg.clear=function(){this.head=this.tail=null,this._len=0};var Rg=function(t){this.value=t,this.next,this.prev},zg=function(t){this._list=new Og,this._map={},this._maxSize=t||10,this._lastRemovedEntry=null},Bg=zg.prototype;Bg.put=function(t,e){var n=this._list,i=this._map,r=null;if(null==i[t]){var o=n.len(),a=this._lastRemovedEntry;if(o>=this._maxSize&&o>0){var s=n.head;n.remove(s),delete i[s.key],r=s.value,this._lastRemovedEntry=s}a?a.value=e:a=new Rg(e),a.key=t,n.insertEntry(a),i[t]=a}return r},Bg.get=function(t){var e=this._map[t],n=this._list;return null!=e?(e!==n.tail&&(n.remove(e),n.insertEntry(e)),e.value):void 0},Bg.clear=function(){this._list.clear(),this._map={}};var Ng={transparent:[0,0,0,0],aliceblue:[240,248,255,1],antiquewhite:[250,235,215,1],aqua:[0,255,255,1],aquamarine:[127,255,212,1],azure:[240,255,255,1],beige:[245,245,220,1],bisque:[255,228,196,1],black:[0,0,0,1],blanchedalmond:[255,235,205,1],blue:[0,0,255,1],blueviolet:[138,43,226,1],brown:[165,42,42,1],burlywood:[222,184,135,1],cadetblue:[95,158,160,1],chartreuse:[127,255,0,1],chocolate:[210,105,30,1],coral:[255,127,80,1],cornflowerblue:[100,149,237,1],cornsilk:[255,248,220,1],crimson:[220,20,60,1],cyan:[0,255,255,1],darkblue:[0,0,139,1],darkcyan:[0,139,139,1],darkgoldenrod:[184,134,11,1],darkgray:[169,169,169,1],darkgreen:[0,100,0,1],darkgrey:[169,169,169,1],darkkhaki:[189,183,107,1],darkmagenta:[139,0,139,1],darkolivegreen:[85,107,47,1],darkorange:[255,140,0,1],darkorchid:[153,50,204,1],darkred:[139,0,0,1],darksalmon:[233,150,122,1],darkseagreen:[143,188,143,1],darkslateblue:[72,61,139,1],darkslategray:[47,79,79,1],darkslategrey:[47,79,79,1],darkturquoise:[0,206,209,1],darkviolet:[148,0,211,1],deeppink:[255,20,147,1],deepskyblue:[0,191,255,1],dimgray:[105,105,105,1],dimgrey:[105,105,105,1],dodgerblue:[30,144,255,1],firebrick:[178,34,34,1],floralwhite:[255,250,240
,1],forestgreen:[34,139,34,1],fuchsia:[255,0,255,1],gainsboro:[220,220,220,1],ghostwhite:[248,248,255,1],gold:[255,215,0,1],goldenrod:[218,165,32,1],gray:[128,128,128,1],green:[0,128,0,1],greenyellow:[173,255,47,1],grey:[128,128,128,1],honeydew:[240,255,240,1],hotpink:[255,105,180,1],indianred:[205,92,92,1],indigo:[75,0,130,1],ivory:[255,255,240,1],khaki:[240,230,140,1],lavender:[230,230,250,1],lavenderblush:[255,240,245,1],lawngreen:[124,252,0,1],lemonchiffon:[255,250,205,1],lightblue:[173,216,230,1],lightcoral:[240,128,128,1],lightcyan:[224,255,255,1],lightgoldenrodyellow:[250,250,210,1],lightgray:[211,211,211,1],lightgreen:[144,238,144,1],lightgrey:[211,211,211,1],lightpink:[255,182,193,1],lightsalmon:[255,160,122,1],lightseagreen:[32,178,170,1],lightskyblue:[135,206,250,1],lightslategray:[119,136,153,1],lightslategrey:[119,136,153,1],lightsteelblue:[176,196,222,1],lightyellow:[255,255,224,1],lime:[0,255,0,1],limegreen:[50,205,50,1],linen:[250,240,230,1],magenta:[255,0,255,1],maroon:[128,0,0,1],mediumaquamarine:[102,205,170,1],mediumblue:[0,0,205,1],mediumorchid:[186,85,211,1],mediumpurple:[147,112,219,1],mediumseagreen:[60,179,113,1],mediumslateblue:[123,104,238,1],mediumspringgreen:[0,250,154,1],mediumturquoise:[72,209,204,1],mediumvioletred:[199,21,133,1],midnightblue:[25,25,112,1],mintcream:[245,255,250,1],mistyrose:[255,228,225,1],moccasin:[255,228,181,1],navajowhite:[255,222,173,1],navy:[0,0,128,1],oldlace:[253,245,230,1],olive:[128,128,0,1],olivedrab:[107,142,35,1],orange:[255,165,0,1],orangered:[255,69,0,1],orchid:[218,112,214,1],palegoldenrod:[238,232,170,1],palegreen:[152,251,152,1],paleturquoise:[175,238,238,1],palevioletred:[219,112,147,1],papayawhip:[255,239,213,1],peachpuff:[255,218,185,1],peru:[205,133,63,1],pink:[255,192,203,1],plum:[221,160,221,1],powderblue:[176,224,230,1],purple:[128,0,128,1],red:[255,0,0,1],rosybrown:[188,143,143,1],royalblue:[65,105,225,1],saddlebrown:[139,69,19,1],salmon:[250,128,114,1],sandybrown:[244,164,96,1],seagreen:[46
,139,87,1],seashell:[255,245,238,1],sienna:[160,82,45,1],silver:[192,192,192,1],skyblue:[135,206,235,1],slateblue:[106,90,205,1],slategray:[112,128,144,1],slategrey:[112,128,144,1],snow:[255,250,250,1],springgreen:[0,255,127,1],steelblue:[70,130,180,1],tan:[210,180,140,1],teal:[0,128,128,1],thistle:[216,191,216,1],tomato:[255,99,71,1],turquoise:[64,224,208,1],violet:[238,130,238,1],wheat:[245,222,179,1],white:[255,255,255,1],whitesmoke:[245,245,245,1],yellow:[255,255,0,1],yellowgreen:[154,205,50,1]},Vg=new zg(20),Fg=null,Hg=Ve,Wg=Fe,Gg=(Object.freeze||Object)({parse:Ee,lift:Be,toHex:Ne,fastLerp:Ve,fastMapToColor:Hg,lerp:Fe,mapToColor:Wg,modifyHSL:He,modifyAlpha:We,stringify:Ge}),Ug=Array.prototype.slice,Zg=function(t,e,n,i){this._tracks={},this._target=t,this._loop=e||!1,this._getter=n||Ue,this._setter=i||Ze,this._clipCount=0,this._delay=0,this._doneList=[],this._onframeList=[],this._clipList=[]};Zg.prototype={when:function(t,e){var n=this._tracks;for(var i in e)if(e.hasOwnProperty(i)){if(!n[i]){n[i]=[];var r=this._getter(this._target,i);if(null==r)continue;0!==t&&n[i].push({time:0,value:Je(r)})}n[i].push({time:t,value:e[i]})}return this},during:function(t){return this._onframeList.push(t),this},pause:function(){for(var t=0;tn;n++)t[n].call(this)},start:function(t,e){var n,i=this,r=0,o=function(){r--,r||i._doneCallback()};for(var a in this._tracks)if(this._tracks.hasOwnProperty(a)){var s=nn(this,t,o,this._tracks[a],a,e);s&&(this._clipList.push(s),r++,this.animation&&this.animation.addClip(s),n=s)}if(n){var l=n.onframe;n.onframe=function(t,e){l(t,e);for(var n=0;n1&&(qg=function(){for(var t in arguments)console.log(arguments[t])});var $g=qg,Kg=function(){this.animators=[]};Kg.prototype={constructor:Kg,animate:function(t,e){var n,i=!1,r=this,o=this.__zr;if(t){var a=t.split("."),s=r;i="shape"===a[0];for(var l=0,h=a.length;h>l;l++)s&&(s=s[a[l]]);s&&(n=s)}else n=r;if(!n)return void $g('Property "'+t+'" is not existed in element '+r.id);var c=r.animators,d=new 
Zg(n,e);return d.during(function(){r.dirty(i)}).done(function(){c.splice(u(c,d),1)}),c.push(d),o&&o.animation.addAnimator(d),d},stopAnimation:function(t){for(var e=this.animators,n=e.length,i=0;n>i;i++)e[i].stop(t);return e.length=0,this},animateTo:function(t,e,n,i,r,o){function a(){l--,l||r&&r()}b(n)?(r=i,i=n,n=0):w(i)?(r=i,i="linear",n=0):w(n)?(r=n,n=0):w(e)?(r=e,e=500):e||(e=500),this.stopAnimation(),this._animateToShallow("",this,t,e,n);var s=this.animators.slice(),l=s.length;l||r&&r();for(var u=0;u0&&this.animate(t,!1).when(null==i?500:i,o).delay(r||0),this}};var Qg=function(t){Dg.call(this,t),wg.call(this,t),Kg.call(this,t),this.id=t.id||Kp()};Qg.prototype={type:"element",name:"",__zr:null,ignore:!1,clipPath:null,isGroup:!1,drift:function(t,e){switch(this.draggable){case"horizontal":e=0;break;case"vertical":t=0}var n=this.transform;n||(n=this.transform=[1,0,0,1,0,0]),n[4]+=t,n[5]+=e,this.decomposeTransform(),this.dirty(!1)},beforeUpdate:function(){},afterUpdate:function(){},update:function(){this.updateTransform()},traverse:function(){},attrKV:function(t,e){if("position"===t||"scale"===t||"origin"===t){if(e){var n=this[t];n||(n=this[t]=[]),n[0]=e[0],n[1]=e[1]}}else this[t]=e},hide:function(){this.ignore=!0,this.__zr&&this.__zr.refresh()},show:function(){this.ignore=!1,this.__zr&&this.__zr.refresh()},attr:function(t,e){if("string"==typeof t)this.attrKV(t,e);else if(M(t))for(var n in t)t.hasOwnProperty(n)&&this.attrKV(n,t[n]);return this.dirty(!1),this},setClipPath:function(t){var e=this.__zr;e&&t.addSelfToZr(e),this.clipPath&&this.clipPath!==t&&this.removeClipPath(),this.clipPath=t,t.__zr=e,t.__clipTarget=this,this.dirty(!1)},removeClipPath:function(){var t=this.clipPath;t&&(t.__zr&&t.removeSelfFromZr(t.__zr),t.__zr=null,t.__clipTarget=null,this.clipPath=null,this.dirty(!1))},addSelfToZr:function(t){this.__zr=t;var e=this.animators;if(e)for(var n=0;ni||n>s||l>o||r>u)},contain:function(t,e){var n=this;return 
t>=n.x&&t<=n.x+n.width&&e>=n.y&&e<=n.y+n.height},clone:function(){return new rn(this.x,this.y,this.width,this.height)},copy:function(t){this.x=t.x,this.y=t.y,this.width=t.width,this.height=t.height},plain:function(){return{x:this.x,y:this.y,width:this.width,height:this.height}}},rn.create=function(t){return new rn(t.x,t.y,t.width,t.height)};var nv=function(t){t=t||{},Qg.call(this,t);for(var e in t)t.hasOwnProperty(e)&&(this[e]=t[e]);this._children=[],this.__storage=null,this.__dirty=!0};nv.prototype={constructor:nv,isGroup:!0,type:"group",silent:!1,children:function(){return this._children.slice()},childAt:function(t){return this._children[t]},childOfName:function(t){for(var e=this._children,n=0;n=0&&(n.splice(i,0,t),this._doAdd(t))}return this},_doAdd:function(t){t.parent&&t.parent.remove(t),t.parent=this;var e=this.__storage,n=this.__zr;e&&e!==t.__storage&&(e.addToStorage(t),t instanceof nv&&t.addChildrenToStorage(e)),n&&n.refresh()},remove:function(t){var e=this.__zr,n=this.__storage,i=this._children,r=u(i,t);return 0>r?this:(i.splice(r,1),t.parent=null,n&&(n.delFromStorage(t),t instanceof nv&&t.delChildrenFromStorage(n)),e&&e.refresh(),this)},removeAll:function(){var t,e,n=this._children,i=this.__storage;for(e=0;ei;i++)this._updateAndAddDisplayable(e[i],null,t);n.length=this._displayListLen,Jp.canvasSupported&&dn(n,fn)},_updateAndAddDisplayable:function(t,e,n){if(!t.ignore||n){t.beforeUpdate(),t.__dirty&&t.update(),t.afterUpdate();var i=t.clipPath;if(i){e=e?e.slice():[];for(var r=i,o=t;r;)r.parent=o,r.updateTransform(),e.push(r),o=r,r=r.clipPath}if(t.isGroup){for(var a=t._children,s=0;se;e++)this.delRoot(t[e]);else{var r=u(this._roots,t);r>=0&&(this.delFromStorage(t),this._roots.splice(r,1),t instanceof nv&&t.delChildrenFromStorage(this))}},addToStorage:function(t){return t&&(t.__storage=this,t.dirty(!1)),this},delFromStorage:function(t){return t&&(t.__storage=null),this},dispose:function(){this._renderList=this._roots=null},displayableSortFunc:fn};var 
av={shadowBlur:1,shadowOffsetX:1,shadowOffsetY:1,textShadowBlur:1,textShadowOffsetX:1,textShadowOffsetY:1,textBoxShadowBlur:1,textBoxShadowOffsetX:1,textBoxShadowOffsetY:1},sv=function(t,e,n){return av.hasOwnProperty(e)?n*=t.dpr:n},lv=[["shadowBlur",0],["shadowOffsetX",0],["shadowOffsetY",0],["shadowColor","#000"],["lineCap","butt"],["lineJoin","miter"],["miterLimit",10]],uv=function(t,e){this.extendFrom(t,!1),this.host=e};uv.prototype={constructor:uv,host:null,fill:"#000",stroke:null,opacity:1,lineDash:null,lineDashOffset:0,shadowBlur:0,shadowOffsetX:0,shadowOffsetY:0,lineWidth:1,strokeNoScale:!1,text:null,font:null,textFont:null,fontStyle:null,fontWeight:null,fontSize:null,fontFamily:null,textTag:null,textFill:"#000",textStroke:null,textWidth:null,textHeight:null,textStrokeWidth:0,textLineHeight:null,textPosition:"inside",textRect:null,textOffset:null,textAlign:null,textVerticalAlign:null,textDistance:5,textShadowColor:"transparent",textShadowBlur:0,textShadowOffsetX:0,textShadowOffsetY:0,textBoxShadowColor:"transparent",textBoxShadowBlur:0,textBoxShadowOffsetX:0,textBoxShadowOffsetY:0,transformText:!1,textRotation:0,textOrigin:null,textBackgroundColor:null,textBorderColor:null,textBorderWidth:0,textBorderRadius:0,textPadding:null,rich:null,truncate:null,blend:null,bind:function(t,e,n){for(var i=this,r=n&&n.style,o=!r,a=0;a0},extendFrom:function(t,e){if(t)for(var n in t)!t.hasOwnProperty(n)||e!==!0&&(e===!1?this.hasOwnProperty(n):null==t[n])||(this[n]=t[n])},set:function(t,e){"string"==typeof t?this[t]=e:this.extendFrom(t,!0)},clone:function(){var t=new this.constructor;return t.extendFrom(this,!0),t},getGradient:function(t,e,n){for(var i="radial"===e.type?gn:pn,r=i(t,e,n),o=e.colorStops,a=0;a=0&&n.splice(i,1),t.__hoverMir=null},clearHover:function(){for(var t=this._hoverElements,e=0;er;){var 
o=t[r],a=o.__from;a&&a.__zr?(r++,a.invisible||(o.transform=a.transform,o.invTransform=a.invTransform,o.__clipPaths=a.__clipPaths,this._doPaintEl(o,n,!0,i))):(t.splice(r,1),a.__hoverMir=null,e--)}n.ctx.restore()}},getHoverLayer:function(){return this.getLayer(Tv)},_paintList:function(t,e,n){if(this._redrawId===n){e=e||!1,this._updateLayerStatus(t);var i=this._doPaintList(t,e);if(this._needsManuallyCompositing&&this._compositeManually(),!i){var r=this;gv(function(){r._paintList(t,e,n)})}}},_compositeManually:function(){var t=this.getLayer(Av).ctx,e=this._domRoot.width,n=this._domRoot.height;t.clearRect(0,0,e,n),this.eachBuiltinLayer(function(i){i.virtual&&t.drawImage(i.dom,0,0,e,n)})},_doPaintList:function(t,e){for(var n=[],i=0;i15)break}}o.__drawIndex=v,o.__drawIndex0&&t>i[0]){for(a=0;r-1>a&&!(i[a]t);a++);o=n[i[a]]}if(i.splice(a+1,0,t),n[t]=e,!e.virtual)if(o){var l=o.dom;l.nextSibling?s.insertBefore(e.dom,l.nextSibling):s.appendChild(e.dom)}else s.firstChild?s.insertBefore(e.dom,s.firstChild):s.appendChild(e.dom)},eachLayer:function(t,e){var n,i,r=this._zlevelList;for(i=0;i {$/;" c language:C++ namespace:anakin::ops inherits:Operator +Axpy saber/funcs/axpy.h /^class Axpy : public BaseFunc<$/;" c language:C++ namespace:anakin::saber inherits:BaseFunc +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class 
AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.cpp /^template class AxpyHelper;$/;" x language:C++ file: +AxpyHelper framework/operators/axpy.h /^ AxpyHelper()=default;$/;" p language:C++ class:anakin::ops::AxpyHelper access:public signature:() +AxpyHelper framework/operators/axpy.h /^ friend class AxpyHelper;$/;" x language:C++ +AxpyHelper framework/operators/axpy.h /^class AxpyHelper : public OperatorHelper {$/;" c language:C++ namespace:anakin::ops inherits:OperatorHelper +AxpyHelper framework/operators/axpy.h /^class AxpyHelper;$/;" x language:C++ +AxpyParam saber/saber_funcs_param.h /^ AxpyParam() = default;$/;" p language:C++ struct:anakin::saber::AxpyParam access:public signature:() +AxpyParam saber/saber_funcs_param.h /^ AxpyParam(const AxpyParam& right) { }$/;" f language:C++ struct:anakin::saber::AxpyParam access:public signature:(const AxpyParam& right) +AxpyParam saber/saber_funcs_param.h /^struct AxpyParam {$/;" s language:C++ namespace:anakin::saber +Ay tools/external_converter_v2/parser/frontend/dash_board/static/echart/echarts.min.js /^break}}for(var r=null,o=0,n=0;n0?Dv:0),this._needsManuallyCompositing),a.__builtin__||$g("ZLevel "+s+" has been used by unkown layer "+a.id),a!==r&&(a.__used=!0,a.__startIndex!==n&&(a.__dirty=!0),a.__startIndex=n,a.__drawIndex=a.incremental?-1:n,e(n),r=a),i.__dirty&&(a.__dirty=!0,a.incremental&&a.__drawIndex<0&&(a.__drawIndex=n))}e(n),this.eachBuiltinLayer(function(t){!t.__used&&t.getElementCount()>0&&(t.__dirty=!0,t.__startIndex=t.__endIndex=t.__drawIndex=0),t.__dirty&&t.__drawIndex<0&&(t.__drawIndex=t.__startIndex)})},clear:function(){return this.eachBuiltinLayer(this._clearLayer),this},_clearLayer:function(t){t.clear()},setBackgroundColor:function(t){this._backgroundColor=t},configLayer:function(t,e){if(e){var n=this._layerConfig;n[t]?r(n[t],e,!0):n[t]=e;for(var i=0;i=0&&this._clips.splice(e,1)},removeAnimator:function(t){for(var e=t.getClips(),n=0;na;a++){var 
s=n[a],l=s.step(t,e);l&&(r.push(l),o.push(s))}for(var a=0;i>a;)n[a]._needsRemove?(n[a]=n[i-1],n.pop(),i--):a++;i=r.length;for(var a=0;i>a;a++)o[a].fire(r[a]);this._time=t,this.onframe(e),this.trigger("frame",e),this.stage.update&&this.stage.update()},_startLoop:function(){function t(){e._running&&(gv(t),!e._paused&&e._update())}var e=this;this._running=!0,gv(t)},start:function(){this._time=(new Date).getTime(),this._pausedTime=0,this._startLoop()},stop:function(){this._running=!1},pause:function(){this._paused||(this._pauseStart=(new Date).getTime(),this._paused=!0)},resume:function(){this._paused&&(this._pausedTime+=(new Date).getTime()-this._pauseStart,this._paused=!1)},clear:function(){this._clips=[]},isFinished:function(){return!this._clips.length},animate:function(t,e){e=e||{};var n=new Zg(t,e.loop,e.getter,e.setter);return this.addAnimator(n),n}},c(Bv,wg);var Nv=function(){this._track=[]};Nv.prototype={constructor:Nv,recognize:function(t,e,n){return this._doTrack(t,e,n),this._recognize(t)},clear:function(){return this._track.length=0,this},_doTrack:function(t,e,n){var i=t.touches;if(i){for(var r={points:[],touches:[],target:e,event:t},o=0,a=i.length;a>o;o++){var s=i[o],l=pi(n,s,{});r.points.push([l.zrX,l.zrY]),r.touches.push(s)}this._track.push(r)}},_recognize:function(t){for(var e in Vv)if(Vv.hasOwnProperty(e)){var n=Vv[e](this._track,t);if(n)return n}}};var Vv={pinch:function(t,e){var n=t.length;if(n){var i=(t[n-1]||{}).points,r=(t[n-2]||{}).points||i;if(r&&r.length>1&&i&&i.length>1){var o=xi(i)\/xi(r);!isFinite(o)&&(o=1),e.pinchScale=o;var a=_i(i);return e.pinchX=a[0],e.pinchY=a[1],{type:"pinch",target:t[0].target,event:e}}}}},Fv=300,Hv=["click","dblclick","mousewheel","mouseout","mouseup","mousedown","mousemove","contextmenu"],Wv=["touchstart","touchend","touchmove"],Gv={pointerdown:1,pointerup:1,pointermove:1,pointerout:1},Uv=p(Hv,function(t){var e=t.replace("mouse","pointer");return 
Gv[e]?e:t}),Zv={mousemove:function(t){t=vi(this.dom,t),this.trigger("mousemove",t)},mouseout:function(t){t=vi(this.dom,t);var e=t.toElement||t.relatedTarget;if(e!=this.dom)for(;e&&9!=e.nodeType;){if(e===this.dom)return;e=e.parentNode}this.trigger("mouseout",t)},touchstart:function(t){t=vi(this.dom,t),t.zrByTouch=!0,this._lastTouchMoment=new Date,bi(this,t,"start"),Zv.mousemove.call(this,t),Zv.mousedown.call(this,t),Mi(this)},touchmove:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"change"),Zv.mousemove.call(this,t),Mi(this)},touchend:function(t){t=vi(this.dom,t),t.zrByTouch=!0,bi(this,t,"end"),Zv.mouseup.call(this,t),+new Date-this._lastTouchMoment=0||i&&u(i,a)<0)){var s=e.getShallow(a);null!=s&&(r[t[o][0]]=s)}}return r}},um=lm([["lineWidth","width"],["stroke","color"],["opacity"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["shadowColor"]]),hm={getLineStyle:function(t){var e=um(this,t),n=this.getLineDash(e.lineWidth);return n&&(e.lineDash=n),e},getLineDash:function(t){null==t&&(t=1);var e=this.get("type"),n=Math.max(t,2),i=4*t;return"solid"===e||null==e?null:"dashed"===e?[i,i]:[n,n]}},cm=lm([["fill","color"],["shadowBlur"],["shadowOffsetX"],["shadowOffsetY"],["opacity"],["shadowColor"]]),dm={getAreaStyle:function(t,e){return cm(this,t,e)}},fm=Math.pow,pm=Math.sqrt,gm=1e-8,vm=1e-4,mm=pm(3),ym=1\/3,xm=H(),_m=H(),wm=H(),bm=Math.min,Mm=Math.max,Sm=Math.sin,Im=Math.cos,Cm=2*Math.PI,Tm=H(),Am=H(),Dm=H(),km=[],Pm=[],Lm={M:1,L:2,C:3,Q:4,A:5,Z:6,R:7},Om=[],Em=[],Rm=[],zm=[],Bm=Math.min,Nm=Math.max,Vm=Math.cos,Fm=Math.sin,Hm=Math.sqrt,Wm=Math.abs,Gm="undefined"!=typeof Float32Array,Um=function(t){this._saveData=!t,this._saveData&&(this.data=[]),this._ctx=null};Um.prototype={constructor:Um,_xi:0,_yi:0,_x0:0,_y0:0,_ux:0,_uy:0,_len:0,_lineDash:null,_dashOffset:0,_dashIdx:0,_dashSum:0,setScale:function(t,e){this._ux=Wm(1\/Yg\/t)||0,this._uy=Wm(1\/Yg\/e)||0},getContext:function(){return this._ctx},beginPath:function(t){return 
this._ctx=t,t&&t.beginPath(),t&&(this.dpr=t.dpr),this._saveData&&(this._len=0),this._lineDash&&(this._lineDash=null,this._dashOffset=0),this},moveTo:function(t,e){return this.addData(Lm.M,t,e),this._ctx&&this._ctx.moveTo(t,e),this._x0=t,this._y0=e,this._xi=t,this._yi=e,this},lineTo:function(t,e){var n=Wm(t-this._xi)>this._ux||Wm(e-this._yi)>this._uy||this._len<5;return this.addData(Lm.L,t,e),this._ctx&&n&&(this._needsDash()?this._dashedLineTo(t,e):this._ctx.lineTo(t,e)),n&&(this._xi=t,this._yi=e),this},bezierCurveTo:function(t,e,n,i,r,o){return this.addData(Lm.C,t,e,n,i,r,o),this._ctx&&(this._needsDash()?this._dashedBezierTo(t,e,n,i,r,o):this._ctx.bezierCurveTo(t,e,n,i,r,o)),this._xi=r,this._yi=o,this},quadraticCurveTo:function(t,e,n,i){return this.addData(Lm.Q,t,e,n,i),this._ctx&&(this._needsDash()?this._dashedQuadraticTo(t,e,n,i):this._ctx.quadraticCurveTo(t,e,n,i)),this._xi=n,this._yi=i,this},arc:function(t,e,n,i,r,o){return this.addData(Lm.A,t,e,n,n,i,r-i,0,o?0:1),this._ctx&&this._ctx.arc(t,e,n,i,r,o),this._xi=Vm(r)*n+t,this._yi=Fm(r)*n+t,this},arcTo:function(t,e,n,i,r){return this._ctx&&this._ctx.arcTo(t,e,n,i,r),this},rect:function(t,e,n,i){return this._ctx&&this._ctx.rect(t,e,n,i),this.addData(Lm.R,t,e,n,i),this},closePath:function(){this.addData(Lm.Z);var t=this._ctx,e=this._x0,n=this._y0;return t&&(this._needsDash()&&this._dashedLineTo(e,n),t.closePath()),this._xi=e,this._yi=n,this},fill:function(t){t&&t.fill(),this.toStatic()},stroke:function(t){t&&t.stroke(),this.toStatic()},setLineDash:function(t){if(t instanceof Array){this._lineDash=t,this._dashIdx=0;for(var e=0,n=0;nn;n++)this.data[n]=t[n];this._len=e},appendPath:function(t){t instanceof Array||(t=[t]);for(var e=t.length,n=0,i=this._len,r=0;e>r;r++)n+=t[r].len();Gm&&this.data instanceof Float32Array&&(this.data=new Float32Array(i+n));for(var r=0;e>r;r++)for(var o=t[r].data,a=0;ae.length&&(this._expandData(),e=this.data);for(var 
n=0;no&&(o=r+o),o%=r,f-=o*h,p-=o*c;h>0&&t>=f||0>h&&f>=t||0==h&&(c>0&&e>=p||0>c&&p>=e);)i=this._dashIdx,n=a[i],f+=h*n,p+=c*n,this._dashIdx=(i+1)%g,h>0&&l>f||0>h&&f>l||c>0&&u>p||0>c&&p>u||s[i%2?"moveTo":"lineTo"](h>=0?Bm(f,t):Nm(f,t),c>=0?Bm(p,e):Nm(p,e));h=f-t,c=p-e,this._dashOffset=-Hm(h*h+c*c)},_dashedBezierTo:function(t,e,n,i,r,o){var a,s,l,u,h,c=this._dashSum,d=this._dashOffset,f=this._lineDash,p=this._ctx,g=this._xi,v=this._yi,m=er,y=0,x=this._dashIdx,_=f.length,w=0;for(0>d&&(d=c+d),d%=c,a=0;1>a;a+=.1)s=m(g,t,n,r,a+.1)-m(g,t,n,r,a),l=m(v,e,i,o,a+.1)-m(v,e,i,o,a),y+=Hm(s*s+l*l);for(;_>x&&(w+=f[x],!(w>d));x++);for(a=(w-d)\/y;1>=a;)u=m(g,t,n,r,a),h=m(v,e,i,o,a),x%2?p.moveTo(u,h):p.lineTo(u,h),a+=f[x]\/y,x=(x+1)%_;x%2!==0&&p.lineTo(r,o),s=r-u,l=o-h,this._dashOffset=-Hm(s*s+l*l)},_dashedQuadraticTo:function(t,e,n,i){var r=n,o=i;n=(n+2*t)\/3,i=(i+2*e)\/3,t=(this._xi+2*t)\/3,e=(this._yi+2*e)\/3,this._dashedBezierTo(t,e,n,i,r,o)},toStatic:function(){var t=this.data;t instanceof Array&&(t.length=this._len,Gm&&(this.data=new Float32Array(t)))},getBoundingRect:function(){Om[0]=Om[1]=Rm[0]=Rm[1]=Number.MAX_VALUE,Em[0]=Em[1]=zm[0]=zm[1]=-Number.MAX_VALUE;for(var t=this.data,e=0,n=0,i=0,r=0,o=0;oc;){var d=s[c++];switch(1==c&&(i=s[c],r=s[c+1],e=i,n=r),d){case Lm.M:e=i=s[c++],n=r=s[c++],t.moveTo(i,r);break;case Lm.L:o=s[c++],a=s[c++],(Wm(o-i)>l||Wm(a-r)>u||c===h-1)&&(t.lineTo(o,a),i=o,r=a);break;case Lm.C:t.bezierCurveTo(s[c++],s[c++],s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.Q:t.quadraticCurveTo(s[c++],s[c++],s[c++],s[c++]),i=s[c-2],r=s[c-1];break;case Lm.A:var f=s[c++],p=s[c++],g=s[c++],v=s[c++],m=s[c++],y=s[c++],x=s[c++],_=s[c++],w=g>v?g:v,b=g>v?1:g\/v,M=g>v?v\/g:1,S=Math.abs(g-v)>.001,I=m+y;S?(t.translate(f,p),t.rotate(x),t.scale(b,M),t.arc(0,0,w,m,I,1-_),t.scale(1\/b,1\/M),t.rotate(-x),t.translate(-f,-p)):t.arc(f,p,w,m,I,1-_),1==c&&(e=Vm(m)*g+f,n=Fm(m)*v+p),i=Vm(I)*g+f,r=Fm(I)*v+p;break;case 
Lm.R:e=i=s[c],n=r=s[c+1],t.rect(s[c++],s[c++],s[c++],s[c++]);break;case Lm.Z:t.closePath(),i=e,r=n}}}},Um.CMD=Lm;var Zm=2*Math.PI,jm=2*Math.PI,Xm=Um.CMD,Ym=2*Math.PI,qm=1e-4,$m=[-1,-1,-1],Km=[-1,-1],Qm=fv.prototype.getCanvasPattern,Jm=Math.abs,ty=new Um(!0);Lr.prototype={constructor:Lr,type:"path",__dirtyPath:!0,strokeContainThreshold:5,brush:function(t,e){var n=this.style,i=this.path||ty,r=n.hasStroke(),o=n.hasFill(),a=n.fill,s=n.stroke,l=o&&!!a.colorStops,u=r&&!!s.colorStops,h=o&&!!a.image,c=r&&!!s.image;if(n.bind(t,this,e),this.setTransform(t),this.__dirty){var d;l&&(d=d||this.getBoundingRect(),this._fillGradient=n.getGradient(t,a,d)),u&&(d=d||this.getBoundingRect(),this._strokeGradient=n.getGradient(t,s,d))}l?t.fillStyle=this._fillGradient:h&&(t.fillStyle=Qm.call(a,t)),u?t.strokeStyle=this._strokeGradient:c&&(t.strokeStyle=Qm.call(s,t));var f=n.lineDash,p=n.lineDashOffset,g=!!t.setLineDash,v=this.getGlobalScale();i.setScale(v[0],v[1]),this.__dirtyPath||f&&!g&&r?(i.beginPath(t),f&&!g&&(i.setLineDash(f),i.setLineDashOffset(p)),this.buildPath(i,this.shape,!1),this.path&&(this.__dirtyPath=!1)):(t.beginPath(),this.path.rebuildPath(t)),o&&i.fill(t),f&&g&&(t.setLineDash(f),t.lineDashOffset=p),r&&i.stroke(t),f&&g&&t.setLineDash([]),null!=n.text&&(this.restoreTransform(t),this.drawRectText(t,this.getBoundingRect()))},buildPath:function(){},createPathProxy:function(){this.path=new Um},getBoundingRect:function(){var t=this._rect,e=this.style,n=!t;if(n){var i=this.path;i||(i=this.path=new Um),this.__dirtyPath&&(i.beginPath(),this.buildPath(i,this.shape,!1)),t=i.getBoundingRect()}if(this._rect=t,e.hasStroke()){var r=this._rectWithStroke||(this._rectWithStroke=t.clone());if(this.__dirty||n){r.copy(t);var o=e.lineWidth,a=e.strokeNoScale?this.getLineScale():1;e.hasFill()||(o=Math.max(o,this.strokeContainThreshold||4)),a>1e-10&&(r.width+=o\/a,r.height+=o\/a,r.x-=o\/a\/2,r.y-=o\/a\/2)}return r}return t},contain:function(t,e){var 
n=this.transformCoordToLocal(t,e),i=this.getBoundingRect(),r=this.style;if(t=n[0],e=n[1],i.contain(t,e)){var o=this.path.data;if(r.hasStroke()){var a=r.lineWidth,s=r.strokeNoScale?this.getLineScale():1;if(s>1e-10&&(r.hasFill()||(a=Math.max(a,this.strokeContainThreshold)),Pr(o,a\/s,t,e)))return!0}if(r.hasFill())return kr(o,t,e)}return!1},dirty:function(t){null==t&&(t=!0),t&&(this.__dirtyPath=t,this._rect=null),this.__dirty=!0,this.__zr&&this.__zr.refresh(),this.__clipTarget&&this.__clipTarget.dirty()},animateShape:function(t){return this.animate("shape",t)},attrKV:function(t,e){"shape"===t?(this.setShape(e),this.__dirtyPath=!0,this._rect=null):oi.prototype.attrKV.call(this,t,e)},setShape:function(t,e){var n=this.shape;if(n){if(M(t))for(var i in t)t.hasOwnProperty(i)&&(n[i]=t[i]);else n[t]=e;this.dirty(!0)}return this},getLineScale:function(){var t=this.transform;return t&&Jm(t[0]-1)>1e-10&&Jm(t[3]-1)>1e-10?Math.sqrt(Jm(t[0]*t[3]-t[2]*t[1])):1}},Lr.extend=function(t){var e=function(e){Lr.call(this,e),t.style&&this.style.extendFrom(t.style,!1);var n=t.shape;if(n){this.shape=this.shape||{};var i=this.shape;for(var r in n)!i.hasOwnProperty(r)&&n.hasOwnProperty(r)&&(i[r]=n[r])}t.init&&t.init.call(this,e)};h(e,Lr);for(var n in t)"style"!==n&&"shape"!==n&&(e.prototype[n]=t[n]);return e},h(Lr,oi);var ey=Um.CMD,ny=[[],[],[]],iy=Math.sqrt,ry=Math.atan2,oy=function(t,e){var n,i,r,o,a,s,l=t.data,u=ey.M,h=ey.C,c=ey.L,d=ey.R,f=ey.A,p=ey.Q;for(r=0,o=0;ra;a++){var s=ny[a];s[0]=l[r++],s[1]=l[r++],oe(s,s,e),l[o++]=s[0],l[o++]=s[1]}}},ay=["m","M","l","L","v","V","h","H","z","Z","c","C","q","Q","t","T","s","S","a","A"],sy=Math.sqrt,ly=Math.sin,uy=Math.cos,hy=Math.PI,cy=function(t){return Math.sqrt(t[0]*t[0]+t[1]*t[1])},dy=function(t,e){return(t[0]*e[0]+t[1]*e[1])\/(cy(t)*cy(e))},fy=function(t,e){return(t[0]*e[1]=11?function(){var e,n=this.__clipPaths,i=this.style;if(n)for(var r=0;ro;o++)r+=ee(t[o-1],t[o]);var a=r\/2;a=n>a?n:a;for(var o=0;a>o;o++){var 
s,l,u,h=o\/(a-1)*(e?n:n-1),c=Math.floor(h),d=h-c,f=t[c%n];e?(s=t[(c-1+n)%n],l=t[(c+1)%n],u=t[(c+2)%n]):(s=t[0===c?c:c-1],l=t[c>n-2?n-1:c+1],u=t[c>n-3?n-1:c+2]);var p=d*d,g=d*p;i.push([Vr(s[0],f[0],l[0],u[0],d,p,g),Vr(s[1],f[1],l[1],u[1],d,p,g)])}return i},wy=function(t,e,n,i){var r,o,a,s,l=[],u=[],h=[],c=[];if(i){a=[1\/0,1\/0],s=[-1\/0,-1\/0];for(var d=0,f=t.length;f>d;d++)ae(a,a,t[d]),se(s,s,t[d]);ae(a,a,i[0]),se(s,s,i[1])}for(var d=0,f=t.length;f>d;d++){var p=t[d];if(n)r=t[d?d-1:f-1],o=t[(d+1)%f];else{if(0===d||d===f-1){l.push(G(t[d]));continue}r=t[d-1],o=t[d+1]}X(u,o,r),J(u,u,e);var g=ee(p,r),v=ee(p,o),m=g+v;0!==m&&(g\/=m,v\/=m),J(h,u,-g),J(c,u,v);var y=Z([],p,h),x=Z([],p,c);i&&(se(y,y,a),ae(y,y,s),se(x,x,a),ae(x,x,s)),l.push(y),l.push(x)}return n&&l.push(l.shift()),l},by=Lr.extend({type:"polygon",shape:{points:null,smooth:!1,smoothConstraint:null},buildPath:function(t,e){Fr(t,e,!0)}}),My=Lr.extend({type:"polyline",shape:{points:null,smooth:!1,smoothConstraint:null},style:{stroke:"#000",fill:null},buildPath:function(t,e){Fr(t,e,!1)}}),Sy=Lr.extend({type:"rect",shape:{r:0,x:0,y:0,width:0,height:0},buildPath:function(t,e){var n=e.x,i=e.y,r=e.width,o=e.height;e.r?Fn(t,e):t.rect(n,i,r,o),t.closePath()}}),Iy=Lr.extend({type:"line",shape:{x1:0,y1:0,x2:0,y2:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.percent;0!==a&&(t.moveTo(n,i),1>a&&(r=n*(1-a)+r*a,o=i*(1-a)+o*a),t.lineTo(r,o))},pointAt:function(t){var e=this.shape;return[e.x1*(1-t)+e.x2*t,e.y1*(1-t)+e.y2*t]}}),Cy=[],Ty=Lr.extend({type:"bezier-curve",shape:{x1:0,y1:0,x2:0,y2:0,cpx1:0,cpy1:0,percent:1},style:{stroke:"#000",fill:null},buildPath:function(t,e){var 
n=e.x1,i=e.y1,r=e.x2,o=e.y2,a=e.cpx1,s=e.cpy1,l=e.cpx2,u=e.cpy2,h=e.percent;0!==h&&(t.moveTo(n,i),null==l||null==u?(1>h&&(cr(n,a,r,h,Cy),a=Cy[1],r=Cy[2],cr(i,s,o,h,Cy),s=Cy[1],o=Cy[2]),t.quadraticCurveTo(a,s,r,o)):(1>h&&(or(n,a,l,r,h,Cy),a=Cy[1],l=Cy[2],r=Cy[3],or(i,s,u,o,h,Cy),s=Cy[1],u=Cy[2],o=Cy[3]),t.bezierCurveTo(a,s,l,u,r,o)))},pointAt:function(t){return Hr(this.shape,t,!1)},tangentAt:function(t){var e=Hr(this.shape,t,!0);return te(e,e)}}),Ay=Lr.extend({type:"arc",shape:{cx:0,cy:0,r:0,startAngle:0,endAngle:2*Math.PI,clockwise:!0},style:{stroke:"#000",fill:null},buildPath:function(t,e){var n=e.cx,i=e.cy,r=Math.max(e.r,0),o=e.startAngle,a=e.endAngle,s=e.clockwise,l=Math.cos(o),u=Math.sin(o);t.moveTo(l*r+n,u*r+i),t.arc(n,i,r,o,a,!s)}}),Dy=Lr.extend({type:"compound",shape:{paths:null},_updatePathDirty:function(){for(var t=this.__dirtyPath,e=this.shape.paths,n=0;n& BFS(functor& func, ParamTypes&& ...args);$/;" p language:C++ class:anakin::graph::Algorithm access:public signature:(functor& func, ParamTypes&& ...args) +BFS framework/graph/algorithm.h /^Algorithm& Algorithm::BFS(functor& func, ParamTypes&& ...args) {$/;" f language:C++ class:anakin::graph::Algorithm signature:(functor& func, ParamTypes&& ...args) +BFS framework/graph/graph.cpp /^ graph.Scanner->BFS(shallow_copy_edge);$/;" p language:C++ file: signature:(shallow_copy_edge) +BFS framework/graph/graph.cpp /^ graph.Scanner->BFS(shallow_copy_node);$/;" p language:C++ file: signature:(shallow_copy_node) +BFS framework/graph/graph.cpp /^ this->Scanner->BFS(merge_node_attrs);$/;" p language:C++ file: signature:(merge_node_attrs) +BFS framework/graph/graph.cpp /^ this->Scanner->BFS(print_Node_debug_string);$/;" p language:C++ file: signature:(print_Node_debug_string) +BFS framework/graph/graph.cpp /^ this->Scanner->BFS(set_edge_io_in);$/;" p language:C++ file: signature:(set_edge_io_in) +BFS framework/graph/graph.cpp /^ this->Scanner->BFS(set_edge_io_out);$/;" p language:C++ file: signature:(set_edge_io_out) 
+BFS framework/graph/graph.cpp /^ this->Scanner->BFS(set_nodes);$/;" p language:C++ file: signature:(set_nodes) +BFS framework/graph/graph.cpp /^ vgraph->Scanner->BFS(interpreter_io_in); \/\/ this will change this real graph$/;" p language:C++ file: signature:(interpreter_io_in) +BFS framework/graph/graph.cpp /^ vgraph->Scanner->BFS(interpreter_io_out); \/\/ this will change this real graph$/;" p language:C++ file: signature:(interpreter_io_out) +BFS framework/graph/graph.cpp /^ vgraph->Scanner->BFS(interpreter_node);$/;" p language:C++ file: signature:(interpreter_node) +BFS framework/graph/llvm/scheduler.cpp /^ vgraph->Scanner->BFS(push_wait_que_f);$/;" p language:C++ file: signature:(push_wait_que_f) +BFS framework/lite/code_gen_base.cpp /^ graph.Scanner->BFS(change_node_name);$/;" p language:C++ file: signature:(change_node_name) +BFS framework/model_parser/parser/parser.cpp /^ graph->Scanner->BFS(insert_edge);$/;" p language:C++ file: signature:(insert_edge) +BFS_Edge framework/core/net/net.cpp /^ this->_graph_p->Scanner->BFS_Edge(analysis_used_of_temp_mem);$/;" p language:C++ file: signature:(analysis_used_of_temp_mem) +BFS_Edge framework/core/net/net.cpp /^ _graph_p->Scanner->BFS_Edge(alloc_memory);$/;" p language:C++ file: signature:(alloc_memory) +BFS_Edge framework/core/net/net.cpp /^ _graph_p->Scanner->BFS_Edge(share_memory);$/;" p language:C++ file: signature:(share_memory) +BFS_Edge framework/graph/algorithm.h /^ Algorithm& BFS_Edge(functor& func, ParamTypes&& ...args);$/;" p language:C++ class:anakin::graph::Algorithm access:public signature:(functor& func, ParamTypes&& ...args) +BFS_Edge framework/graph/algorithm.h /^Algorithm& Algorithm::BFS_Edge(functor& func, ParamTypes&& ...args) {$/;" f language:C++ class:anakin::graph::Algorithm signature:(functor& func, ParamTypes&& ...args) +BFS_Edge framework/graph/graph.cpp /^ this->Scanner->BFS_Edge(print_edge_debug_string);$/;" p language:C++ file: signature:(print_edge_debug_string) +BFS_Edge 
framework/graph/graph.cpp /^ this->Scanner->BFS_Edge(register_edge);$/;" p language:C++ file: signature:(register_edge) +BFS_Edge framework/graph/llvm/optimizer/memory_scheduler.cpp /^ vgraph_p->Scanner->BFS_Edge(find_unshared_io_one);$/;" p language:C++ file: signature:(find_unshared_io_one) +BFS_Edge framework/graph/llvm/optimizer/memory_scheduler.cpp /^ vgraph_p->Scanner->BFS_Edge(find_unshared_io_two);$/;" p language:C++ file: signature:(find_unshared_io_two) +BFS_Edge framework/graph/llvm/optimizer/memory_scheduler.cpp /^ vgraph_p->Scanner->BFS_Edge(replace_arc);$/;" p language:C++ file: signature:(replace_arc) +BFS_Edge framework/graph/llvm/optimizer/parall_scheduler.cpp /^ vgraph->Scanner->BFS_Edge(map_io);$/;" p language:C++ file: signature:(map_io) +BFS_Edge framework/graph/llvm/scheduler.cpp /^ vgraph->Scanner->BFS_Edge(register_io_f);$/;" p language:C++ file: signature:(register_io_f) +BFS_Edge framework/lite/code_gen_base.cpp /^ graph.Scanner->BFS_Edge(change_edge_name);$/;" p language:C++ file: signature:(change_edge_name) +BFS_Edge framework/lite/code_gen_base.cpp /^ _graph.Scanner->BFS_Edge(alloc_memory);$/;" p language:C++ file: signature:(alloc_memory) +BFS_Edge framework/lite/code_gen_base.cpp /^ _graph.Scanner->BFS_Edge(share_memory); $/;" p language:C++ file: signature:(share_memory) +BGIFRAME tools/external_converter_v2/parser/frontend/dash_board/static/cytoscape/qtip2/jquery.qtip.js /^BGIFRAME = '