diff --git a/.gitignore b/.gitignore
index bbcdfd1d6..3d6100c30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,6 @@ android_build
ios_build
gpu_build
output
+
+.idea
+.vscode
diff --git a/.travis.yml b/.travis.yml
index bf0ada02b..a1d2785ee 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,8 +7,8 @@ os:
env:
- JOB="-p NVIDIA-GPU -o Centos"
- JOB="-p NVIDIA-GPU -o Ubuntu"
- #- JOB="-p AMD_GPU -o Centos"
- #- JOB="-p AMD_GPU -o Ubuntu"
+ #- JOB="-p AMD-GPU -o Centos"
+ #- JOB="-p AMD-GPU -o Ubuntu"
#- JOB="-p X86-ONLY -o Centos"
#- JOB="-p X86-ONLY -o Ubuntu"
#- JOB="-p ARM -o Centos"
@@ -31,6 +31,8 @@ branches:
only:
- master
- developing
+ - AMD
+ - dev_v2
notifications:
email:
diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 000000000..bcc5f3ead
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,27 @@
+| Github account | name |
+|---|---|
+| chenjiaoAngel | Jiao Chen |
+| cyj1986 | Yujuan Cheng |
+| feifei14119 | Fei Wang |
+| jackyh | Chengjie He |
+| Jayoprell | Xiaocheng Luo |
+| jjsbear | Jingsong Ji |
+| LittleMaer | Yi Zhuang |
+| mengkai94 | Kai Meng |
+| micytw | Michael Wu |
+| pangge | Chaowen Cui |
+| perchbird | Xiaokun Yu |
+| PeterJkPeng | Junyi Peng |
+| qq332982511 | Junjie Liu |
+| Shixiaowei02 | Xiaowei Shi |
+| sogalin | Soga Lin |
+| throneclay | Shuai Zhang |
+| vin-huang | Vin Huang |
+| wgy0804 | Guoya Wang |
+| xklnono | Kailu Xu |
+| xyoungli | Xiaoyang Li |
+| yanan1112 | Yanan Liu |
+| yao-matrix | Weifeng Yao |
+| zdcocnftcp10 | Dachuan Zhao |
+| zhouhuan2009 | Huan Zhou |
+| zoooooooyuan | Yuan Zu |
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccb37468f..189a3414f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,12 +1,19 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
-# @file root cmakefile
-# @auther cuichaowen
-# @date 2017-10-24
-# ----------------------------------------------------------------------------
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
include(cmake/thirdparty_version.cmake)
-cmake_minimum_required(VERSION ${MIN_CMAKE_V} FATAL_ERROR)
-project(ANAKIN C CXX)
+project(ANAKIN C CXX)
include(cmake/msg_color.cmake)
include(cmake/utils.cmake)
include(cmake/statistic.cmake)
@@ -14,10 +21,12 @@ include(cmake/statistic.cmake)
# ----------------------------------------------------------------------------
# section: global anakin version and lib name
# ----------------------------------------------------------------------------
-# global anakin version 2.0.1
-set(VERSION_MAJOR "2")
-set(VERSION_MINOR "0")
-set(VERSION_PATCH "1")
+cmake_minimum_required(VERSION ${MIN_CMAKE_V} FATAL_ERROR)
+
+# global anakin version 0.1.0
+set(VERSION_MAJOR "0")
+set(VERSION_MINOR "1")
+set(VERSION_PATCH "0")
set(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
# anakin lib name and global directories
@@ -28,12 +37,15 @@ set(ANAKIN_ROOT ${PROJECT_SOURCE_DIR})
include_directories(${ANAKIN_ROOT})
set(ANAKIN_FRAMEWORK ${ANAKIN_ROOT}/framework)
-set(ANAKIN_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party)
+set(ANAKIN_LITE ${ANAKIN_FRAMEWORK}/lite)
set(ANAKIN_UTILS ${ANAKIN_ROOT}/utils)
set(ANAKIN_THIRD_PARTY_PATH ${ANAKIN_ROOT}/third-party)
set(ANAKIN_MODEL_PARSER ${ANAKIN_FRAMEWORK}/model_parser)
+set(ANAKIN_SERVICE ${ANAKIN_FRAMEWORK}/service)
set(ANAKIN_SABER ${ANAKIN_ROOT}/saber)
set(ANAKIN_UNIT_TEST ${ANAKIN_ROOT}/test)
+set(ANAKIN_EXAMPLES ${ANAKIN_ROOT}/examples)
+
# ----------------------------------------------------------------------------
# section: options for anakin
@@ -48,12 +60,13 @@ anakin_option(ANAKIN_TYPE_INT8 "define the INT8 for data precision." NO)
anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." YES)
anakin_option(USE_X86_PLACE "Select the build mode for X86 place." YES)
anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
+anakin_option(USE_BM_PLACE "Select the build mode for BM place." NO)
# plantfrom details
anakin_option(NVIDIA_GPU "Use NVIDIA GPU place." YES if USE_GPU_PLACE)
anakin_option(AMD_GPU "Use AMD GPU place." NO if USE_GPU_PLACE AND NOT NVIDIA_GPU)
-anakin_option(TARGET_ANDROID "" NO if USE_ARM_PLACE)
-anakin_option(TARGET_IOS "" NO if USE_ARM_PLACE)
+anakin_option(TARGET_ANDROID "build for android" YES if USE_ARM_PLACE)
+anakin_option(TARGET_IOS "not supported now" YES if USE_ARM_PLACE AND NOT TARGET_ANDROID)
# compile options for NVIDIA_GPU place
anakin_option(USE_CUDA "Use Cuda libs." YES if NVIDIA_GPU)
@@ -64,60 +77,52 @@ anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_CUDA)
anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_CUDA)
anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM)
-# compile options for BM place
-#anakin_option(USE_BM "Use Cuda libs." YES if NVIDIA_GPU)
-#anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_BM)
-#anakin_option(USE_CURAND "Use Curand libs." YES if USE_BM)
-#anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_BM)
-#anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_BM)
-#anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_BM)
-
-
if(USE_CUDA)
# Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
set(SELECTED_SASS_TARGET_ARCH "61")
-elseif(USE_BM)
- # Select gpu target arch for local high performance implement sass code . Now we have checked on sm_61 sm_50 and it works well.
- #set(SELECTED_SASS_TARGET_ARCH "61")
endif()
if((NOT BUILD_FAT_BIN) AND (NOT BUILD_CROSS_PLANTFORM) AND USE_CUDA)
# Select the only nvidia gpu arch you want to be built on
- set(TARGET_GPUARCH 6.1)
+ set(TARGET_GPUARCH 6.1)
endif()
# build options for cuda.
anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_CUDA)
anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_CUDA)
-# build options for BM.
-anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if USE_BM)
-anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_BM)
-
-
# common build options
-anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." NO)
+anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." YES)
anakin_option(ENABLE_VERBOSE_MSG "Enable verbose=1 : compile msg during make." NO)
anakin_option(DISABLE_ALL_WARNINGS "Disable all the warning msg during compile." YES)
anakin_option(ENABLE_NOISY_WARNINGS "Enable noisy warning msg during compile." NO if DISABLE_ALL_WARNINGS)
# using 3rd party libs
-anakin_option(USE_GLOG "Build Glog components." NO)
+anakin_option(USE_LOGGER "Build native logger components." YES)
+anakin_option(USE_GLOG "Build Glog components." NO if NOT USE_LOGGER)
anakin_option(USE_PROTOBUF "Build Google protobuf components." YES)
anakin_option(USE_OPENCV "Use static opencv libs." NO)
anakin_option(USE_BOOST "Use static BOOST libs." NO)
-anakin_option(USE_OPENMP "Use Openmp when in andriod environment." YES if TARGET_ANDROID)
+anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID)
anakin_option(USE_GTEST "Use googletest libs." NO if BUILD_WITH_UNIT_TEST)
anakin_option(USE_PYTHON "Generate py wrappers." NO)
-anakin_option(USE_OPENCL "Use OpenCL ." NO)
+anakin_option(USE_OPENCL "Use OpenCL." YES if AMD_GPU)
anakin_option(USE_GFLAGS "Build Google gflags components." NO)
anakin_option(USE_MKL "Use mkl libs." NO if USE_X86_PLACE)
anakin_option(USE_MKLML "Use MKLML libs." YES if USE_X86_PLACE)
anakin_option(USE_XBYAK "Use XBYAK libs." YES if USE_X86_PLACE)
-anakin_option(USE_OPENMP "Use Openmp when in andriod environment." YES if TARGET_ANDROID)
+anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID)
# build components
anakin_option(BUILD_WITH_UNIT_TEST "Build anakin unit test components." YES)
+anakin_option(BUILD_WITH_FRAMEWORK "Build anakin framework" YES)
+
+anakin_option(BUILD_RPC "Build anakin rpc service components." NO if BUILD_WITH_FRAMEWORK)
+anakin_option(BUILD_WITH_LITE "Build anakin lite components." YES if USE_GPU_PLACE AND BUILD_WITH_FRAMEWORK)
+
+# build examples
+anakin_option(BUILD_EXAMPLES "build detection and classification examples" NO)
+
# build target
anakin_option(BUILD_SHARED "Build anakin shared lib." YES)
anakin_option(BUILD_STATIC "Build anakin static lib." YES if NOT BUILD_SHARED)
@@ -127,10 +132,16 @@ anakin_option(ENABLE_OP_TIMER "Enable op timer mode." NO)
# ----------------------------------------------------------------------------
# section: anakin compiler and linker options
# ----------------------------------------------------------------------------
if(ENABLE_DEBUG)
- set(CMAKE_BUILD_TYPE Debug FORCE)
+ set(CMAKE_BUILD_TYPE Debug FORCE)
else()
- set(CMAKE_BUILD_TYPE Release FORCE)
+ set(CMAKE_BUILD_TYPE Release FORCE)
+endif()
+
+if(USE_LOGGER)
+ anakin_option(ENABLE_STACKTRACES "Enable stacktraces in the local logger." YES if NOT USE_ARM_PLACE)
+ anakin_option(SUPPORT_PTHREADS "Enable pthread support in the local logger." YES)
endif()
# ----------------------------------------------------------------------------
@@ -138,8 +149,8 @@ endif()
# code
# ----------------------------------------------------------------------------
configure_file (
- "${PROJECT_SOURCE_DIR}/cmake/config/anakin_config.h.in"
- "${PROJECT_BINARY_DIR}/anakin_config.h"
+ "${PROJECT_SOURCE_DIR}/cmake/config/anakin_config.h.in"
+ "${PROJECT_BINARY_DIR}/anakin_config.h"
)
# add the binary tree to the search path so that anakin will find ak_config.h
include_directories(${PROJECT_BINARY_DIR})
@@ -157,10 +168,6 @@ if(USE_CUDA)
include(cmake/cuda.cmake)
endif()
-if(USE_BM)
- #include(cmake/cuda.cmake)
-endif()
-
if(USE_X86_PLACE)
set(ANAKIN_TEMP_THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/third-party)
if(USE_MKLML)
@@ -172,6 +179,10 @@ if(USE_X86_PLACE)
#include(cmake/external/mkldnn.cmake)
endif()
+if(AMD_GPU)
+ include(cmake/amd.cmake)
+endif()
+
# gather all the config options to anakin
include(cmake/gather.cmake)
@@ -181,14 +192,35 @@ include(cmake/gather.cmake)
# ----------------------------------------------------------------------------
# add source sub_directory whick holds the cmake build module
# fetch files of model_parser
-add_subdirectory(${ANAKIN_MODEL_PARSER})
+
+
add_subdirectory(${ANAKIN_SABER})
-add_subdirectory(${ANAKIN_FRAMEWORK})
+
+if(USE_BM_PLACE)
+ add_subdirectory(${ANAKIN_SABER}/funcs/impl/bm)
+endif()
+
+if(BUILD_WITH_FRAMEWORK)
+ add_subdirectory(${ANAKIN_MODEL_PARSER})
+ if(BUILD_RPC)
+ add_subdirectory(${ANAKIN_SERVICE})
+ endif()
+ if(BUILD_WITH_LITE)
+ add_subdirectory(${ANAKIN_LITE})
+ endif()
+ add_subdirectory(${ANAKIN_FRAMEWORK})
+endif()
if(BUILD_WITH_UNIT_TEST)
add_subdirectory(${ANAKIN_UNIT_TEST})
endif()
+if (BUILD_EXAMPLES)
+ if(BUILD_WITH_FRAMEWORK)
+ add_subdirectory(${ANAKIN_EXAMPLES})
+ endif()
+endif()
+
anakin_print_statistic()
diff --git a/README.md b/README.md
index 4cabf240b..fcbfe9ae6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Anakin
+# Anakin
[](https://travis-ci.org/PaddlePaddle/Anakin)
[](LICENSE)
@@ -7,63 +7,65 @@
Welcome to the Anakin GitHub.
-Anakin is an cross-platform, high-performance inference engine, which is originally
+Anakin is a cross-platform, high-performance inference engine, which is originally
developed by Baidu engineers and is a large-scale application of industrial products.
-Please refer to our [release announcement]() to track the latest feature of Anakin.
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Anakin/releases) to track the latest feature of Anakin.
## Features
- **Flexibility**
Anakin supports a wide range of neural network architectures and
- diffrent hardware platform. It is easy to run Anakin at GPU/x86/ARM platform.
+ different hardware platforms. It is easy to run Anakin on GPU / x86 / ARM platform.
- **High performance**
- In order to giving full play to the performance of hardware, we optimize the
- forward prediction at diffrent levels.
- - Automatic graph fusion. The goal of all performance optimization under a
- given algorithm is to make ALU as busy as possible, Operator fusion
- can effectively reduce memory access and keep ALU busy.
-
- - Memory reuse. Forward prediction is a one-way calculation. We reuse
- the memory between the input and output of different operators, thus
+ In order to give full play to the performance of hardware, we optimized the
+ forward prediction at different levels.
+ - Automatic graph fusion. The goal of all performance optimizations under a
+ given algorithm is to make the ALU as busy as possible. Operator fusion
+ can effectively reduce memory access and keep the ALU busy (see the sketch after this list).
+
+ - Memory reuse. Forward prediction is a one-way calculation. We reuse
+ the memory between the input and output of different operators, thus
reducing the overall memory overhead.
- - Assembly level optimization. Saber is Anakin's underlying DNN library, which
+ - Assembly level optimization. Saber is the underlying DNN library for Anakin, which
is deeply optimized at assembly level. Performance comparison between Anakin, TensorRT
- and Tensorflow-lite, please refer to the benchmark tests.
+ and Tensorflow-lite can be found in the [benchmark tests](benchmark/README.md).
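+
+A minimal sketch of the fusion idea (illustrative only; `fuse_conv_bn` is a hypothetical helper, not Anakin's API): a `Conv -> BatchNorm` pair can be folded into a single convolution by rescaling its weights, so the fused graph reads and writes memory once instead of twice.
+
+```python
+import numpy as np
+
+def fuse_conv_bn(w, b, gamma, beta, mean, var, eps=1e-5):
+    """Fold BatchNorm parameters into the preceding conv's weights and bias."""
+    scale = gamma / np.sqrt(var + eps)        # per-output-channel rescale
+    w_fused = w * scale.reshape(-1, 1, 1, 1)  # scale each output kernel
+    b_fused = (b - mean) * scale + beta       # absorb the BN shift into the bias
+    return w_fused, b_fused
+```
+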
## Installation
It is recommended to check out the
-[Docker installation guide](docker/README.md).
+[Docker installation guide](docker/README.md)
before looking into the
[build from source guide](docs/Manual/INSTALL_en.md).
+For ARM, please refer to [run on ARM](docs/Manual/run_on_arm_en.md).
+
## Benchmark
-It is recommended to check out the [Benchmark Readme](benchmark/README.md)
+It is recommended to check out the [benchmark README](benchmark/README.md).
## Documentation
-We provide [English](docs/Manual/Tutorial_en.md) and
-[Chinese](docs/Manual/Tutorial_ch.md) documentation.
+We provide [English](docs/Manual/Tutorial_en.md) and [Chinese](docs/Manual/Tutorial_ch.md) documentation.
-- [Anakin developer guide]()
+- Developer guide
- You might want to know more details of Anakin and make it better.
+ You might want to know more details of Anakin and make it better. Please refer to [how to add custom devices](docs/Manual/addCustomDevice.md) and [how to add custom device operators](docs/Manual/addCustomOp.md).
-- [C++ API]()
+- User guide
- Python API is under-developing.
+ You can learn about the working principles of the project, the C++ interface, and code examples [here](docs/Manual/Tutorial_ch.md). You can also learn about the model converter [here](docs/Manual/Converter_ch.md).
-- [How to Contribute]()
+- [How to Contribute](docs/Manual/Contribution_ch.md)
We appreciate your contributions!
+
## Ask Questions
You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Anakin/issues).
diff --git a/benchmark/README.md b/benchmark/README.md
index 5dcf61d93..94f57930f 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -1,173 +1,42 @@
# Benchmark
-## Machine:
-
-This time, we only provide benchmark on GPU. In the near future, we will add benchmark on ARM and CPU.
-
-> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz`
-> GPU: `Tesla P4`
-> cuDNN: `v7`
-
-## Counterpart of anakin :
-The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** , The models which TensorRT 3 doesn't support we use the custom plugins to support.
-
## Benchmark Model
-The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`.
- You can use pretrained caffe model or the model trained by youself.
-
> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](#)
+### GPU
-- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
-- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)*
-- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
-- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
-- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
-- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
-- [RNN](#7) *not support yet*
-
-We tested them on single-GPU with single-thread.
-
-### VGG16
-
-- Latency (`ms`) of different batch
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 8.8690 | 8.2815
- 2 | 15.5344 | 13.9116
- 4 | 26.6000 | 21.8747
- 8 | 49.8279 | 40.4076
- 32 | 188.6270 | 163.7660
-
-- GPU Memory Used (`MB`)
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 963 | 997
- 2 | 965 | 1039
- 4 | 991 | 1115
- 8 | 1067 | 1269
- 32 | 1715 | 2193
-
-
-### Yolo
-
-- Latency (`ms`) of different batch
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 16.4596| 15.2124
- 2 | 26.6347| 25.0442
- 4 | 43.3695| 43.5017
- 8 | 80.9139 | 80.9880
- 32 | 293.8080| 310.8810
-
-- GPU Memory Used (`MB`)
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 1569 | 1775
- 2 | 1649 | 1815
- 4 | 1709 | 1887
- 8 | 1731 | 2031
- 32 | 2253 | 2907
-
-### Resnet50
-
-- Latency (`ms`) of different batch
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 4.2459 | 4.1061
- 2 | 6.2627 | 6.5159
- 4 | 10.1277 | 11.3327
- 8 | 17.8209 | 20.6680
- 32 | 65.8582 | 77.8858
-
-- GPU Memory Used (`MB`)
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 531 | 503
- 2 | 543 | 517
- 4 | 583 | 541
- 8 | 611 | 589
- 32 | 809 | 879
-
-### Resnet101
-
-- Latency (`ms`) of different batch
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 7.5562 | 7.0837
- 2 | 11.6023 | 11.4079
- 4 | 18.3650 | 20.0493
- 8 | 32.7632 | 36.0648
- 32 | 123.2550 | 135.4880
-
-- GPU Memory Used (`MB)`
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 701 | 683
- 2 | 713 | 697
- 4 | 793 | 721
- 8 | 819 | 769
- 32 | 1043 | 1059
-
-
-### MobileNet V1
-
-- Latency (`ms`) of different batch
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 45.5156 | 1.3947
- 2 | 46.5585 | 2.5483
- 4 | 48.4242 | 4.3404
- 8 | 52.7957 | 8.1513
- 32 | 83.2519 | 31.3178
-
-- GPU Memory Used (`MB`)
-
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 329 | 283
- 2 | 345 | 289
- 4 | 371 | 299
- 8 | 393 | 319
- 32 | 531 | 433
-
-### MobileNet V2
-
-- Latency (`ms`) of different batch
+The following convolutional neural networks are tested with both `Anakin` and `TensorRT 3` on GPU.
+ You can use a pretrained Caffe model or a model trained by yourself.
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 65.6861 | 2.9842
- 2 | 66.6814 | 4.7472
- 4 | 69.7114 | 7.4163
- 8 | 76.1092 | 12.8779
- 32 | 124.9810 | 47.2142
+- [Vgg16]() *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
+- [Yolo]() *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)*
+- [Resnet50]() *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Resnet101]() *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Mobilenet v1]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [RNN]() *not support yet*
-- GPU Memory Used (`MB`)
+### CPU
- BatchSize | TensorRT | Anakin
- :---: | :---: | :---: |
- 1 | 341 | 293
- 2 | 353 | 301
- 4 | 385 | 319
- 8 | 421 | 351
- 32 | 637 | 551
+The following models are tested with `Anakin`, `Tensorflow` and `PaddlePaddle/Fluid` on CPU.
+ You can use a pretrained model or a model trained by yourself.
+- [Language model]() *fluid model can be found [here->](https://github.com/PaddlePaddle/models/tree/develop/fluid/language_model)*
+- [Chinese_ner]() *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/chinese_ner)*
+- [text_classification]() *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/text_classification)*
-### RNN
+### ARM
-The benchmark of rnn network will be added later.
+The following convolutional neural networks are tested with `Anakin`, `ncnn` and `TFlite` on ARM.
+ You can use a pretrained model or a model trained by yourself.
-## How to run those Benchmark models?
+- [Mobilenet v1]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2]() *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [mobilenet-ssd]() *caffe model can be found [here->](https://github.com/chuanqi305/MobileNet-SSD)*
-> Please refer to [Instructions](CNN/README.md)
+## Test Results
+The detailed test results can be found at the links below.
+- [GPU](./README_GPU.md)
+- [CPU](./README_CPU.md)
+- [ARM](./README_ARM.md)
diff --git a/benchmark/README_ARM.md b/benchmark/README_ARM.md
new file mode 100644
index 000000000..368706570
--- /dev/null
+++ b/benchmark/README_ARM.md
@@ -0,0 +1,66 @@
+# Benchmark
+
+## Machine:
+
++ Compile environment: Android NDK cross compile, gcc 4.9, NEON enabled
++ ABI: armeabi-v7a with neon -mfloat-abi=softfp
++ Testing platform
+ - honor v9 (root): Kirin960, 4 big cores at 2.36GHz, 4 little cores at 1.8GHz
+ - nubia z17: Qualcomm835, 4 big cores at 2.36GHz, 4 little cores at 1.9GHz
+ - 360 N5: Qualcomm653, 4 big cores at 1.8GHz, 4 little cores at 1.4GHz
++ Timing: 10 warmup runs, then the average of 10 runs
++ ncnn: cloned from the GitHub master branch at commit 307a77f04be29875f40d337cfff6df747df09de6 (msg: convert LogisticRegressionOutput)
++ TFlite: cloned from the GitHub master branch at commit 65c05bc2ac19f51f7027e66350bc71652662125c (msg: Removed unneeded file copy that was causing failure in Pi builds)
+
+## Counterpart of Anakin
+
+The counterparts of **`Anakin`** are **`ncnn`** and **`TFlite`**.
+
+## Benchmark Model
+
+> Please note that you should transform the Caffe model (or others) into an Anakin model with the help of the [`external converter ->`](../docs/Manual/Converter_en.md)
+
+- [Mobilenet v1](#11) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2](#22) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [mobilenet-ssd](#33) *caffe model can be found [here->](https://github.com/chuanqi305/MobileNet-SSD)*
+
+We tested them on ARM with multiple threads and batch size 1.
+
+### mobilenetv1
+
+- Latency (`ms`) of different thread numbers
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |Kirin960|107.7|61.1|38.2 |152.8 |85.2 |51.9 |152.6 |nan|nan|
+ |Qualcomm835|105.7 |63.1 |~~46.8 ~~|152.7 |87.0 |~~92.7 ~~|146.9 |nan|nan|
+ |Qualcomm653|120.3 |64.2 |46.6 |202.5 |117.6 |84.8 |158.6 |nan|nan|
+
+### mobilenetv2
+
+- Latency (`ms`) of different thread numbers
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |Kirin960|93.1 |53.9 |34.8 |144.4 |84.3 |55.3 |100.6 |nan|nan|
+ |Qualcomm835|93.0 |55.6 |41.1 |139.1 |88.4 |58.1 |95.2 |nan|nan|
+ |Qualcomm653|106.6 |64.2 |48.0 |199.9 |125.1 |98.9 |108.5 |nan|nan|
+
+### mobilenet-ssd
+
+- Latency (`ms`) of different thread numbers
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |Kirin960|213.9 |120.5 |74.5 |307.9 |166.5 |104.2 |nan|nan|nan|
+ |Qualcomm835|213.0 |125.7 |~~98.4 ~~|292.9 |177.9 |~~167.8 ~~|nan|nan|nan|
+ |Qualcomm653|236.0 |129.6 |96.0 |377.7 |228.9 |165.0 |nan|nan|nan
+
+## How to run those Benchmark models?
+
+1. First, parse the Caffe model with the [External Converter](../docs/Manual/Converter_en.md).
+2. Second, `adb push` the Anakin model and the `benchmark_arm` binary to the testing phone.
+3. Then, switch to the `/data/local/tmp/` directory on the phone and run `./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1`.
+4. Finally, the model latency summary will be displayed on the screen.
+5. You can see the meaning of each parameter by running `./benchmark_arm`.
+
diff --git a/benchmark/README_CPU.md b/benchmark/README_CPU.md
new file mode 100644
index 000000000..6113e2e2c
--- /dev/null
+++ b/benchmark/README_CPU.md
@@ -0,0 +1,281 @@
+# Benchmark
+
+## Machine:
+
+This page provides the CPU benchmarks; see [README_GPU.md](./README_GPU.md) and [README_ARM.md](./README_ARM.md) for the GPU and ARM results.
+
+> System: `CentOS 7 in Docker`, for benchmark between Anakin and Tensorflow
+> System: `CentOS 6.3`, for benchmark between Anakin and Paddle
+
+## Counterpart of Anakin:
+
+The counterpart of **`Anakin`** is `Tensorflow 1.8.0`, installed via Anaconda 4.5.4 and run with Python 3.6.
+
+## Benchmark Model
+
+ You can use a pretrained model or a model trained by yourself.
+
+> Please note that you should transform the Fluid model (or others) into an Anakin model with the help of the [`external converter ->`](../docs/Manual/Converter_en.md)
+
+- [Language model](#1) *fluid model can be found [here->](https://github.com/PaddlePaddle/models/tree/develop/fluid/language_model)*
+- [Chinese_ner](#4) *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/chinese_ner)*
+- [text_classification](#7) *fluid model can be found [here->](https://github.com/PaddlePaddle/models/blob/develop/fluid/text_classification)*
+
+We tested them on a single CPU with different numbers of threads.
+
+1. **`Anakin`** VS **`Tensorflow`**
+
+### language model in i7-7700
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 5.64 | 2.44
+ 2 | 8.29 | 4.44
+ 4 | 14.23 | 9.91
+ 6 | 19.83 | 15.51
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 3459 | 8536
+ 2 | 4772 | 9399
+ 4 | 5498 | 8418
+ 6 | 5764 | 8070
+
+### language model in E5-2620 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 6.31 | 2.84
+ 2 | 7.94 | 2.678
+ 4 | 8.66 | 4.32
+ 6 | 12.33 | 7.12
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 2890 | 7257
+ 2 | 4726 | 15439
+ 4 | 8659 | 18351
+ 6 | 9414 | 17461
+
+### language model in E5-2650 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 3.69 | 2.84
+ 2 | 4.62 | 2.85
+ 4 | 7.78 | 3.48
+ 6 | 13.54 | 4.79
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 4456 | 7300
+ 2 | 7522 | 14556
+ 4 | 9580 | 22086
+ 6 | 8664 | 23938
+
+### text_classfication model in i7-7700
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 1.25 | 0.32
+ 2 | 1.87 | 0.33
+ 4 | 2.01 | 0.35
+ 6 | 2.81 | 0.58
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 12797 | 53506
+ 2 | 17933 | 95898
+ 4 | 31965 | 148427
+ 6 | 31784 | 118684
+
+### text_classfication in E5-2620 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 3.89 | 0.58
+ 2 | 3.77 | 0.61
+ 4 | 3.05 | 0.62
+ 6 | 3.84 | 0.66
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 4281 | 28192
+ 2 | 8804 | 49840
+ 4 | 19949 | 89710
+ 6 | 24798 | 116975
+
+### text_classfication in E5-2650 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 2.26 | 0.67
+ 2 | 2.34 | 0.7
+ 4 | 2.25 | 0.72
+ 6 | 2.47 | 0.73
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 6337 | 24636
+ 2 | 12266 | 45368
+ 4 | 24869 | 81952
+ 6 | 34872 | 109993
+
+### chinese_ner model in i7-7700
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 1.96 | 0.094
+ 2 | 2.59 | 0.098
+ 4 | 3.74 | 0.1
+ 6 | 3.95 | 0.13
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 8747 | 156564
+ 2 | 13293 | 208484
+ 4 | 18294 | 114348
+ 6 | 25338 | 66480
+
+### chinese_ner in E5-2620 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 5.44 | 0.13
+ 2 | 5.45 | 0.14
+ 4 | 4.84 | 0.15
+ 6 | 5.18 | 0.16
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 4281 | 93527
+ 2 | 8804 | 127232
+ 4 | 19949 | 118649
+ 6 | 24798 | 99553
+
+### chinese_ner in E5-2650 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 3.61 | 0.16
+ 2 | 3.78 | 0.16
+ 4 | 3.74 | 0.17
+ 6 | 3.78 | 0.16
+
+- Throughput (`words/s`)
+
+ ThreadNum | Tensorflow | Anakin
+ :---: | :---: | :---: |
+ 1 | 4669 | 79225
+ 2 | 8953 | 115761
+ 4 | 18074 | 118696
+ 6 | 26607 | 102044
+
+2. **`Anakin`** VS **`PaddlePaddle/Fluid`**
+We use a private dataset and a different QPS metric in this benchmark.
+### language model in E5-2650 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Fluid | Anakin
+ :---: | :---: | :---: |
+ 1 | 42.7418 | 1.93589
+ 2 | 42.7418 | 2.49537
+ 6 | 42.7734 | 3.14332
+ 10 | 43.0721 | 4.55329
+ 12 | 42.8501 | 5.09893
+
+- Throughput (`sentence/s`)
+
+ ThreadNum | Fluid | Anakin
+ :---: | :---: | :---: |
+ 1 | 23 | 504
+ 2 | 46 | 762
+ 6 | 134 | 1393
+ 10 | 218 | 1556
+ 12 | 260 | 1541
+
+### Chinese_ner model in E5-2650 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Fluid | Anakin
+ :---: | :---: | :---: |
+ 1 | 0.380475 | 0.17034
+ 4 | 0.380475 | 0.171143
+ 6 | 0.380475 | 0.172688
+ 10 | 0.380475 | 0.173269
+ 12 | 0.380475 | 0.17668
+
+- Throughput (`sentence/s`)
+
+ ThreadNum | Fluid | Anakin
+ :---: | :---: | :---: |
+ 1 | 7844 | 5822
+ 4 | 7844 | 11377
+ 6 | 7844 | 29725
+ 10 | 7844 | 41238
+ 12 | 7844 | 42790
+
+### text_classfication model in E5-2650 v4
+
+- Latency (`ms`) of one batch
+
+ ThreadNum | Fluid | Anakin
+ :---: | :---: | :---: |
+ 1 | 1.48578 | 1.10088
+ 4 | 1.54025 | 1.11258
+ 6 | 1.68529 | 1.1257
+ 10 | 1.9817 | 1.13267
+ 12 | 2.21864 | 1.1429
+
+- Throughput (`sentence/s`)
+
+ ThreadNum | Fluid | Anakin
+ :---: | :---: | :---: |
+ 1 | 673 | 901
+ 4 | 1289 | 1665
+ 6 | 3458 | 4449
+ 10 | 4875 | 6183
+ 12 | 5265 | 6188
+
+## How to run those Benchmark models?
+
+> 1. You can just run `sh benchmark_tensorflow.sh` and `sh benchmark_anakin.sh`.
+> 2. Get a Caffe or Fluid model, convert it to an Anakin model, and use the `net_test_***` binaries to test your model.
+
+
diff --git a/benchmark/README_GPU.md b/benchmark/README_GPU.md
new file mode 100644
index 000000000..04326535a
--- /dev/null
+++ b/benchmark/README_GPU.md
@@ -0,0 +1,176 @@
+# Benchmark
+
+## Machine:
+
+> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz`
+> GPU: `Tesla P4`
+> cuDNN: `v7`
+
+
+## Counterpart of Anakin:
+
+The counterpart of **`Anakin`** is the acknowledged high-performance inference engine **`NVIDIA TensorRT 3`**. For models that TensorRT 3 doesn't support, we use custom plugins.
+
+## Benchmark Model
+
+The following convolutional neural networks are tested with both `Anakin` and `TensorRT 3`.
+ You can use a pretrained Caffe model or a model trained by yourself.
+
+> Please note that you should transform the Caffe model (or others) into an Anakin model with the help of the [`external converter ->`](../docs/Manual/Converter_en.md)
+
+
+- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
+- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)*
+- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [RNN](#7) *not support yet*
+
+We tested them on a single GPU with a single thread.
+
+### VGG16
+
+- Latency (`ms`) of different batch
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 8.85176 | 8.15362
+ 2 | 15.6517 | 13.8716
+ 4 | 26.5303 | 21.8478
+ 8 | 48.2286 | 40.496
+ 32 | 183.994 | 163.035
+
+- GPU Memory Used (`MB`)
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 887 | 648
+ 2 | 965 | 733
+ 4 | 991 | 810
+ 8 | 1067 | 911
+ 32 | 1715 | 1325
+
+
+### Yolo
+
+- Latency (`ms`) of different batch
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 16.4623| 15.3214
+ 2 | 26.7082| 25.0305
+ 4 | 43.2129| 43.4758
+ 8 | 80.0053 | 80.7645
+ 32 | 283.352| 311.152
+
+- GPU Memory Used (`MB`)
+
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 1226 | 1192
+ 2 | 1326 | 1269
+ 4 | 1435 | 1356
+ 8 | 1563 | 1434
+ 32 | 2150 | 1633
+
+### Resnet50
+
+- Latency (`ms`) of different batch
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 4.26834 | 3.25853
+ 2 | 6.2811 | 6.12156
+ 4 | 10.1183 | 10.9219
+ 8 | 18.1395 | 20.323
+ 32 | 66.4728 | 83.9934
+
+- GPU Memory Used (`MB`)
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 932 | 272
+ 2 | 936 | 318
+ 4 | 720 | 376
+ 8 | 697 | 480
+ 32 | 842 | 835
+
+### Resnet101
+
+- Latency (`ms`) of different batch
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 7.58234 | 5.66457
+ 2 | 11.6014 | 10.9213
+ 4 | 18.3298 | 19.3987
+ 8 | 32.6523 | 37.5575
+ 32 | 123.114 | 149.089
+
+- GPU Memory Used (`MB`)
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 1020 | 420
+ 2 | 961 | 467
+ 4 | 943 | 503
+ 8 | 885 | 606
+ 32 | 1048 | 1077
+
+### MobileNet V1
+
+- Latency (`ms`) of different batch
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 45.2189 | 1.39566
+ 2 | 46.4538 | 2.50698
+ 4 | 47.8918 | 4.38727
+ 8 | 52.3636 | 8.21416
+ 32 | 83.0503 | 31.33
+
+- GPU Memory Used (`MB`)
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 516 | 176
+ 2 | 524 | 166
+ 4 | 497 | 165
+ 8 | 508 | 239
+ 32 | 628 | 388
+
+### MobileNet V2
+
+- Latency (`ms`) of different batch
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 65.4277 | 1.80542
+ 2 | 66.2048 | 3.85568
+ 4 | 68.8045 | 6.80921
+ 8 | 75.64 | 12.6038
+ 32 | 124.09 | 47.6079
+
+- GPU Memory Used (`MB`)
+
+ BatchSize | TensorRT | Anakin
+ :---: | :---: | :---: |
+ 1 | 341 | 293
+ 2 | 353 | 301
+ 4 | 385 | 319
+ 8 | 421 | 351
+ 32 | 637 | 551
+
+## How to run those Benchmark models?
+
+> 1. At first, you should parse the Caffe model with the [`external converter ->`](../docs/Manual/Converter_en.md).
+> 2. Switch to the *source_root/benchmark/CNN* directory. Use `mkdir ./models` to create the `./models` directory and put the Anakin models into it.
+> 3. Run `sh run.sh`; log files for each batch size will be written to the `logs` directory, and finally the model latency summary will be displayed on the screen.
+> 4. If you want more detailed per-op timing, set `ENABLE_OP_TIMER` to `YES` in CMakeLists.txt, then recompile and run. You will find the detailed information in the model log files.
+
+
+
+
+
diff --git a/benchmark/RNN/README.md b/benchmark/RNN/README.md
new file mode 100644
index 000000000..0232d7d22
--- /dev/null
+++ b/benchmark/RNN/README.md
@@ -0,0 +1,10 @@
+# RNN Benchmark
+
+
+## 1. How to run
+
+There are two ways to run the Anakin RNN benchmarks:
+
+> 1. You can just run `sh benchmark_tensorflow.sh` and `sh benchmark_anakin.sh`.
+> 2. Get a Caffe or Fluid model, convert it to an Anakin model, and use the `net_test_***` binaries to test your model.
+
diff --git a/benchmark/RNN/Tokenizer.py b/benchmark/RNN/Tokenizer.py
new file mode 100644
index 000000000..cceac5310
--- /dev/null
+++ b/benchmark/RNN/Tokenizer.py
@@ -0,0 +1,384 @@
+# -*- coding: utf-8 -*-
+"""Utilities for text input preprocessing.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import string
+import sys
+import warnings
+from collections import OrderedDict
+from hashlib import md5
+
+import numpy as np
+from six.moves import range
+from six.moves import zip
+
+if sys.version_info < (3,):
+ maketrans = string.maketrans
+else:
+ maketrans = str.maketrans
+
+
+def text_to_word_sequence(text,
+ filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+ lower=True, split=" "):
+ """Converts a text to a sequence of words (or tokens).
+
+ # Arguments
+ text: Input text (string).
+ filters: list (or concatenation) of characters to filter out, such as
+ punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
+ includes basic punctuation, tabs, and newlines.
+ lower: boolean. Whether to convert the input to lowercase.
+ split: str. Separator for word splitting.
+
+ # Returns
+ A list of words (or tokens).
+ """
+ if lower:
+ text = text.lower()
+
+ if sys.version_info < (3,):
+ if isinstance(text, unicode):
+ translate_map = dict((ord(c), unicode(split)) for c in filters)
+ text = text.translate(translate_map)
+ elif len(split) == 1:
+ translate_map = maketrans(filters, split * len(filters))
+ text = text.translate(translate_map)
+ else:
+ for c in filters:
+ text = text.replace(c, split)
+ else:
+ translate_dict = dict((c, split) for c in filters)
+ translate_map = maketrans(translate_dict)
+ text = text.translate(translate_map)
+
+ seq = text.split(split)
+ return [i for i in seq if i]
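+# Illustrative example with the default filters/split:
+#   text_to_word_sequence('Hello, world! Hello.') -> ['hello', 'world', 'hello']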
+
+
+def one_hot(text, n,
+ filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+ lower=True,
+ split=' '):
+ """One-hot encodes a text into a list of word indexes of size n.
+
+ This is a wrapper to the `hashing_trick` function using `hash` as the
+ hashing function; unicity of word to index mapping non-guaranteed.
+
+ # Arguments
+ text: Input text (string).
+ n: int. Size of vocabulary.
+ filters: list (or concatenation) of characters to filter out, such as
+ punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
+ includes basic punctuation, tabs, and newlines.
+ lower: boolean. Whether to set the text to lowercase.
+ split: str. Separator for word splitting.
+
+ # Returns
+ List of integers in [1, n]. Each integer encodes a word
+ (unicity non-guaranteed).
+ """
+ return hashing_trick(text, n,
+ hash_function=hash,
+ filters=filters,
+ lower=lower,
+ split=split)
+
+
+def hashing_trick(text, n,
+ hash_function=None,
+ filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+ lower=True,
+ split=' '):
+ """Converts a text to a sequence of indexes in a fixed-size hashing space.
+
+ # Arguments
+ text: Input text (string).
+ n: Dimension of the hashing space.
+ hash_function: defaults to python `hash` function, can be 'md5' or
+ any function that takes in input a string and returns a int.
+ Note that 'hash' is not a stable hashing function, so
+ it is not consistent across different runs, while 'md5'
+ is a stable hashing function.
+ filters: list (or concatenation) of characters to filter out, such as
+ punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
+ includes basic punctuation, tabs, and newlines.
+ lower: boolean. Whether to set the text to lowercase.
+ split: str. Separator for word splitting.
+
+ # Returns
+ A list of integer word indices (unicity non-guaranteed).
+
+ `0` is a reserved index that won't be assigned to any word.
+
+ Two or more words may be assigned to the same index, due to possible
+ collisions by the hashing function.
+ The [probability](
+ https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
+ of a collision is in relation to the dimension of the hashing space and
+ the number of distinct objects.
+ """
+ if hash_function is None:
+ hash_function = hash
+ elif hash_function == 'md5':
+ hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)
+
+ seq = text_to_word_sequence(text,
+ filters=filters,
+ lower=lower,
+ split=split)
+ return [(hash_function(w) % (n - 1) + 1) for w in seq]
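+# Illustrative example: with the stable 'md5' hash the mapping is
+# reproducible across runs, e.g.
+#   hashing_trick('the cat sat on the mat', 50, hash_function='md5')
+# returns one index in [1, 49] per word; distinct words may collide.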
+
+
+class Tokenizer(object):
+ """Text tokenization utility class.
+
+ This class allows to vectorize a text corpus, by turning each
+ text into either a sequence of integers (each integer being the index
+ of a token in a dictionary) or into a vector where the coefficient
+ for each token could be binary, based on word count, based on tf-idf...
+
+ # Arguments
+ num_words: the maximum number of words to keep, based
+ on word frequency. Only the most common `num_words` words will
+ be kept.
+ filters: a string where each element is a character that will be
+ filtered from the texts. The default is all punctuation, plus
+ tabs and line breaks, minus the `'` character.
+ lower: boolean. Whether to convert the texts to lowercase.
+ split: str. Separator for word splitting.
+ char_level: if True, every character will be treated as a token.
+ oov_token: if given, it will be added to word_index and used to
+ replace out-of-vocabulary words during text_to_sequence calls
+
+ By default, all punctuation is removed, turning the texts into
+ space-separated sequences of words
+ (words maybe include the `'` character). These sequences are then
+ split into lists of tokens. They will then be indexed or vectorized.
+
+ `0` is a reserved index that won't be assigned to any word.
+ """
+
+ def __init__(self, num_words=None,
+ filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+ lower=True,
+ split=' ',
+ char_level=False,
+ oov_token=None,
+ **kwargs):
+ # Legacy support
+ if 'nb_words' in kwargs:
+ warnings.warn('The `nb_words` argument in `Tokenizer` '
+ 'has been renamed `num_words`.')
+ num_words = kwargs.pop('nb_words')
+ if kwargs:
+ raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
+ self.word_counts = OrderedDict()
+ self.word_docs = {}
+ self.filters = filters
+ self.split = split
+ self.lower = lower
+ self.num_words = num_words
+ self.document_count = 0
+ self.char_level = char_level
+ self.oov_token = oov_token
+ self.index_docs = {}
+
+ def fit_on_texts(self, texts):
+ """Updates internal vocabulary based on a list of texts.
+
+ In the case where texts contains lists,
+ we assume each entry of the lists to be a token.
+
+ Required before using `texts_to_sequences` or `texts_to_matrix`.
+
+ # Arguments
+ texts: can be a list of strings,
+ a generator of strings (for memory-efficiency),
+ or a list of list of strings.
+ """
+ for text in texts:
+ self.document_count += 1
+ if self.char_level or isinstance(text, list):
+ seq = text
+ else:
+ seq = text_to_word_sequence(text,
+ self.filters,
+ self.lower,
+ self.split)
+ for w in seq:
+ if w in self.word_counts:
+ self.word_counts[w] += 1
+ else:
+ self.word_counts[w] = 1
+ for w in set(seq):
+ if w in self.word_docs:
+ self.word_docs[w] += 1
+ else:
+ self.word_docs[w] = 1
+
+ wcounts = list(self.word_counts.items())
+ wcounts.sort(key=lambda x: x[1], reverse=True)
+ sorted_voc = [wc[0] for wc in wcounts]
+ # note that index 0 is reserved, never assigned to an existing word
+ self.word_index = dict(
+ list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
+
+ if self.oov_token is not None:
+ i = self.word_index.get(self.oov_token)
+ if i is None:
+ self.word_index[self.oov_token] = len(self.word_index) + 1
+
+ for w, c in list(self.word_docs.items()):
+ self.index_docs[self.word_index[w]] = c
+ # print(self.word_index)
+ # print(self.index_docs)
+
+ def fit_on_sequences(self, sequences):
+ """Updates internal vocabulary based on a list of sequences.
+
+ Required before using `sequences_to_matrix`
+ (if `fit_on_texts` was never called).
+
+ # Arguments
+ sequences: A list of sequence.
+ A "sequence" is a list of integer word indices.
+ """
+ self.document_count += len(sequences)
+ for seq in sequences:
+ seq = set(seq)
+ for i in seq:
+ if i not in self.index_docs:
+ self.index_docs[i] = 1
+ else:
+ self.index_docs[i] += 1
+
+ def texts_to_sequences(self, texts):
+ """Transforms each text in texts in a sequence of integers.
+
+ Only top "num_words" most frequent words will be taken into account.
+ Only words known by the tokenizer will be taken into account.
+
+ # Arguments
+ texts: A list of texts (strings).
+
+ # Returns
+ A list of sequences.
+ """
+ res = []
+ for vect in self.texts_to_sequences_generator(texts):
+ res.append(vect)
+ return res
+
+ def texts_to_sequences_generator(self, texts):
+ """Transforms each text in `texts` in a sequence of integers.
+
+ Each item in texts can also be a list,
+ in which case we assume each item of that list to be a token.
+
+ Only top "num_words" most frequent words will be taken into account.
+ Only words known by the tokenizer will be taken into account.
+
+ # Arguments
+ texts: A list of texts (strings).
+
+ # Yields
+ Yields individual sequences.
+ """
+ num_words = self.num_words
+ for text in texts:
+ if self.char_level or isinstance(text, list):
+ seq = text
+ else:
+ seq = text_to_word_sequence(text,
+ self.filters,
+ self.lower,
+ self.split)
+ vect = []
+ # print(self.word_index)
+ for w in seq:
+ i = self.word_index.get(w)
+ if i is None:
+ continue
+ if num_words and i >= num_words:
+ if self.oov_token is None:
+ continue
+ else:
+ vect.append(num_words)
+ else:
+ vect.append(i)
+ yield vect
+
+ def texts_to_matrix(self, texts, mode='binary'):
+ """Convert a list of texts to a Numpy matrix.
+
+ # Arguments
+ texts: list of strings.
+ mode: one of "binary", "count", "tfidf", "freq".
+
+ # Returns
+ A Numpy matrix.
+ """
+ sequences = self.texts_to_sequences(texts)
+ return self.sequences_to_matrix(sequences, mode=mode)
+
+ def sequences_to_matrix(self, sequences, mode='binary'):
+ """Converts a list of sequences into a Numpy matrix.
+
+ # Arguments
+ sequences: list of sequences
+ (a sequence is a list of integer word indices).
+ mode: one of "binary", "count", "tfidf", "freq"
+
+ # Returns
+ A Numpy matrix.
+
+ # Raises
+ ValueError: In case of invalid `mode` argument,
+ or if the Tokenizer requires to be fit to sample data.
+ """
+ if not self.num_words:
+ if self.word_index:
+ num_words = len(self.word_index) + 1
+ else:
+ raise ValueError('Specify a dimension (num_words argument), '
+ 'or fit on some text data first.')
+ else:
+ num_words = self.num_words
+
+ if mode == 'tfidf' and not self.document_count:
+ raise ValueError('Fit the Tokenizer on some data '
+ 'before using tfidf mode.')
+
+ x = np.zeros((len(sequences), num_words))
+ for i, seq in enumerate(sequences):
+ if not seq:
+ continue
+ counts = {}
+ for j in seq:
+ if j >= num_words:
+ continue
+ if j not in counts:
+ counts[j] = 1.
+ else:
+ counts[j] += 1
+ for j, c in list(counts.items()):
+ if mode == 'count':
+ x[i][j] = c
+ elif mode == 'freq':
+ x[i][j] = c / len(seq)
+ elif mode == 'binary':
+ x[i][j] = 1
+ elif mode == 'tfidf':
+ # Use weighting scheme 2 in
+ # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
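+ # i.e. x[i][j] = (1 + ln(c)) * ln(1 + N_documents / (1 + df_j)),
+ # where c is the in-text count and df_j the document frequency of j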
+ tf = 1 + np.log(c)
+ idf = np.log(1 + self.document_count /
+ (1 + self.index_docs.get(j, 0)))
+ x[i][j] = tf * idf
+ else:
+ raise ValueError('Unknown vectorization mode:', mode)
+ return x
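+
+# Minimal usage sketch (illustrative):
+#   t = Tokenizer(num_words=100)
+#   t.fit_on_texts(['the cat sat', 'the dog ran'])
+#   t.texts_to_sequences(['the cat ran'])  # -> [[1, 2, 5]]: 'the' is most
+#   frequent so it gets index 1; ties keep first-seen order.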
diff --git a/benchmark/RNN/benchmark_anakin.sh b/benchmark/RNN/benchmark_anakin.sh
new file mode 100755
index 000000000..51a8bc107
--- /dev/null
+++ b/benchmark/RNN/benchmark_anakin.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+set -x
+sdir=$(cd `dirname $0`; pwd)
+
+sh $sdir/prepare.sh
+
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 1 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt 1
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 2 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt 2
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 4 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt 4
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 6 $sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt 6
+
+for i in {1,2,4,6} ;do
+$sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/language_model/ $sdir/data/ptb.valid_tokenlize.txt $i
+done
+
+for i in {1,2,4,6} ;do
+$sdir/../../output/unit_test/net_exec_test_chinese_ner $sdir/model/chinese_ner_model/ $sdir/data/ner_data.txt $i 1
+done
+
+for i in {1,2,4,6} ;do
+$sdir/../../output/unit_test/net_exec_x86_oneinput $sdir/model/text_classfication/ $sdir/data/ptb.valid_tokenlize.txt $i
+done
\ No newline at end of file
diff --git a/benchmark/RNN/benchmark_tensorflow.sh b/benchmark/RNN/benchmark_tensorflow.sh
new file mode 100755
index 000000000..0d874dcf0
--- /dev/null
+++ b/benchmark/RNN/benchmark_tensorflow.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+set -x
+
+sdir=$(cd `dirname $0`; pwd)
+
+sh $sdir/prepare.sh
+
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 1 python $sdir/tensorflow_language_model.py 1
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 2 python $sdir/tensorflow_language_model.py 2
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 4 python $sdir/tensorflow_language_model.py 4
+#sh $sdir/sh_base/cpu_benchmark_base_some_thread.sh 6 python $sdir/tensorflow_language_model.py 6
+
+for i in {1,2,4,6};do
+python $sdir/tensorflow_language_model.py --process_num=$i
+done
+
+for i in {1,2,4,6};do
+python $sdir/tensorflow_chinese_ner.py --process_num=$i
+done
+
+for i in {1,2,4,6};do
+python $sdir/tensorflow_text_classfication.py --process_num=$i
+done
\ No newline at end of file
diff --git a/benchmark/RNN/prepare.sh b/benchmark/RNN/prepare.sh
new file mode 100755
index 000000000..7762fff96
--- /dev/null
+++ b/benchmark/RNN/prepare.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+sdir=$(cd `dirname $0`; pwd)
+
+if [ ! -e $sdir/data/ptb.valid.txt ]; then
+echo "can not find language_data download now"
+wget -P $sdir/data/ http://ojf1xbmzo.bkt.clouddn.com/ptb.valid.txt
+fi
+
+if [ ! -e $sdir/data/ner_data.txt ]; then
+echo "can not find language_data download now"
+wget -P $sdir/data/ https://raw.githubusercontent.com/PaddlePaddle/models/develop/fluid/chinese_ner/data/test_files/test_part_1
+for n in $(seq 30); do cat $sdir/data/test_part_1 >> $sdir/data/ner_data.txt; done
+rm $sdir/data/test_part_1
+fi
+
+if [ ! -e $sdir/data/ptb.valid_tokenlize.txt ]; then
+python $sdir/read_ptb_data.py
+fi
+
+
diff --git a/benchmark/RNN/read_ptb_data.py b/benchmark/RNN/read_ptb_data.py
new file mode 100644
index 000000000..f2c4ad92f
--- /dev/null
+++ b/benchmark/RNN/read_ptb_data.py
@@ -0,0 +1,36 @@
+from Tokenizer import Tokenizer
+# from keras.preprocessing.text import Tokenizer
+import os
+import sys
+class PTB_Data_Reader():
+
+ def read(self):
+ # print('!',sys.argv[0])
+ # print(os.path.dirname(__file__)+'/data/ptb.valid.txt')
+ file=open(os.path.dirname(__file__)+'/data/ptb.valid.txt')
+ lines=file.readlines()
+ tokenizer=Tokenizer(9999,oov_token=1)
+ tokenizer.fit_on_texts(lines)
+ self.seqs=tokenizer.texts_to_sequences(lines)
+ return self.seqs
+
+ def save_to(self):
+ save_file=open(os.path.dirname(__file__)+'/data/ptb.valid_tokenlize.txt','w')
+ for line in self.seqs:
+ line_str=''.join(str(i)+' ' for i in line)
+ line_str=line_str[:-1]
+ save_file.write(line_str+'\n')
+
+class NER_Data_Reader():
+ def read(self):
+ # print(os.path.dirname(__file__)+'/data/ptb.valid.txt')
+ file=open(os.path.dirname(__file__)+'/data/ner_data.txt')
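+ # assumed line format: ';'-separated fields, where field 1 holds the
+ # space-separated word ids and field 3 the corresponding mention ids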
+ self.seqs=[[[int(i) for i in line.split(';')[1].split(' ')],[int(i) for i in line.split(';')[3].split(' ')]] for line in file.readlines()]
+
+ return self.seqs
+
+if __name__ == '__main__':
+ read=PTB_Data_Reader()
+ read.read()
+ read.save_to()
+
diff --git a/benchmark/RNN/sh_base/cpu_benchmark_base_one_socket.sh b/benchmark/RNN/sh_base/cpu_benchmark_base_one_socket.sh
new file mode 100755
index 000000000..49beef068
--- /dev/null
+++ b/benchmark/RNN/sh_base/cpu_benchmark_base_one_socket.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e
+core_per_socket=`lscpu | grep "Core(s) per socket" | awk -F ':' '{print $2}' | sed 's/^ *\| *$//g'`
+core_num=$core_per_socket
+
+echo $core_num
+core_idx=$[$core_num-1]
+echo $core_idx
+core_range='0-'${core_idx}
+
+echo ${core_range}
+
+unset OMP_NUM_THREADS
+export OMP_NUM_THREADS=${core_num}
+unset MKL_NUM_THREADS
+export MKL_NUM_THREADS=${core_num}
+
+taskset -c ${core_range} numactl -l $*
\ No newline at end of file
diff --git a/benchmark/RNN/sh_base/cpu_benchmark_base_some_thread.sh b/benchmark/RNN/sh_base/cpu_benchmark_base_some_thread.sh
new file mode 100755
index 000000000..9b6d75910
--- /dev/null
+++ b/benchmark/RNN/sh_base/cpu_benchmark_base_some_thread.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+set -e
+set -x
+core_num=$1
+shift
+
+
+
+core_range='1-'$core_num
+
+
+echo ${core_range}
+
+unset OMP_NUM_THREADS
+export OMP_NUM_THREADS=${core_num}
+unset MKL_NUM_THREADS
+export MKL_NUM_THREADS=${core_num}
+
+#taskset -c ${core_range} numactl -l $*
+taskset -c ${core_range} $*
diff --git a/benchmark/RNN/tensorflow_c_benchmark/benchmark_tensorflow.sh b/benchmark/RNN/tensorflow_c_benchmark/benchmark_tensorflow.sh
new file mode 100755
index 000000000..1631f9fd7
--- /dev/null
+++ b/benchmark/RNN/tensorflow_c_benchmark/benchmark_tensorflow.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+set -x
+
+sdir=$(cd `dirname $0`; pwd)
+
+
+for i in {1,2,4,6};do
+bazel run //tensorflow/cc:example_model /root/tf_mount/RNN/model/language_model_tf/all.pb /root/tf_mount/RNN/data/ptb.valid_tokenlize.txt $i
+done
+
+for i in {1,2,4,6};do
+bazel run //tensorflow/cc:example_model /root/tf_mount/RNN/model/text_classfi_model_tf/all.pb /root/tf_mount/RNN/data/ptb.valid_tokenlize.txt $i
+done
diff --git a/benchmark/RNN/tensorflow_c_benchmark/example_model.cc b/benchmark/RNN/tensorflow_c_benchmark/example_model.cc
new file mode 100644
index 000000000..291f89e33
--- /dev/null
+++ b/benchmark/RNN/tensorflow_c_benchmark/example_model.cc
@@ -0,0 +1,295 @@
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/platform/env.h"
+#include "vector"
+#include <fstream>
+#include <iostream>
+#include "sys/time.h"
+#define DEFINE_GLOBAL(type, var, value) \
+ type (GLB_##var) = (value)
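+// each DEFINE_GLOBAL(type, var, value) expands to a global "type GLB_var = value"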
+DEFINE_GLOBAL(int, run_threads, 1);
+volatile DEFINE_GLOBAL(int, batch_size, 1);
+volatile DEFINE_GLOBAL(int, max_word_len, 0);
+volatile DEFINE_GLOBAL(int, word_count, 0);
+DEFINE_GLOBAL(std::string, model_dir, "");
+DEFINE_GLOBAL(std::string, input_file, "");
+DEFINE_GLOBAL(std::string, split_word, "\t");
+DEFINE_GLOBAL(std::string, output_name, "");
+DEFINE_GLOBAL(std::string, run_mode, "instance");
+DEFINE_GLOBAL(int, split_index, 0);
+
+using namespace tensorflow;
+int read_file(std::vector<float>& results, const char* file_name) {
+
+ std::ifstream infile(file_name);
+
+ if (!infile.good()) {
+ std::cout << "Cannot open " << std::endl;
+ return false;
+ }
+
+ LOG(INFO) << "found filename: " << file_name;
+ std::string line;
+
+ while (std::getline(infile, line)) {
+ results.push_back((float)atof(line.c_str()));
+ }
+
+ return 0;
+}
+void SplitString(const std::string& s,
+ std::vector<std::string>& v, const std::string& c) {
+ std::string::size_type pos1, pos2;
+ pos2 = s.find(c);
+ pos1 = 0;
+
+ while (std::string::npos != pos2) {
+ v.push_back(s.substr(pos1, pos2 - pos1));
+
+ pos1 = pos2 + c.size();
+ pos2 = s.find(c, pos1);
+ }
+
+ if (pos1 != s.length()) {
+ v.push_back(s.substr(pos1));
+ }
+}
+
+int split_word_from_file(
+ std::vector<std::vector<float> >& word_idx,
+ const std::string input_file_path,
+ const std::string split_token,
+ const std::string inner_split_token,
+ const int col_select) {
+
+ std::ifstream infile(input_file_path.c_str());
+
+ if (!infile.good()) {
+ std::cout << "Cannot open " << std::endl;
+ return 1;
+ }
+
+ LOG(INFO) << "found filename: " << input_file_path;
+ std::string line;
+ std::vector<std::string> split_v;
+ std::vector<std::string> split_w;
+ int word_count = 0;
+
+ while (std::getline(infile, line)) {
+ split_v.clear();
+ SplitString(line, split_v, split_token);
+ CHECK_GE(split_v.size(), col_select + 1) << " file needs ';'-separated columns";
+        std::vector<float> word;
+        std::vector<float> mention;
+ split_w.clear();
+ SplitString(split_v[col_select], split_w, inner_split_token);
+
+ for (auto w : split_w) {
+ word.push_back(atof(w.c_str()));
+ word_count++;
+ // printf("%d,",atoi(w.c_str()));
+ }
+
+ // printf("\n");
+ // exit(0);
+ word_idx.push_back(word);
+ }
+
+ GLB_word_count = word_count;
+ return 0;
+}
+
+int get_batch_data_offset(
+    std::vector<float>& out_data,
+    const std::vector<std::vector<float> >& seq_data,
+    std::vector<int>& seq_offset,
+ const int start_idx,
+ const int batch_num) {
+
+ seq_offset.clear();
+ out_data.clear();
+ seq_offset.push_back(0);
+ int len = 0;
+
+ for (int i = 0; i < batch_num; ++i) {
+ for (auto d : seq_data[i + start_idx]) {
+ len += 1;
+ out_data.push_back(d);
+ // printf("%.0f, ",d);
+ }
+
+ // printf("\n");
+ seq_offset.push_back(len);
+ }
+
+ return len;
+}
+std::vector<std::vector<float> > get_input_data() {
+    std::vector<std::vector<float> > word_idx;
+
+ if (split_word_from_file(word_idx, GLB_input_file, GLB_split_word, " ", GLB_split_index)) {
+ LOG(ERROR) << " NOT FOUND " << GLB_input_file;
+ exit(-1);
+ }
+
+ return word_idx;
+};
+void sess_thread(std::vector<tensorflow::Tensor*>* tensor_vec) {
+ SessionOptions opts;
+ opts.config.set_intra_op_parallelism_threads(1);
+ opts.config.set_inter_op_parallelism_threads(1);
+ opts.config.set_use_per_session_threads(true);
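+    // each benchmark thread owns a private single-threaded session, so overall
+    // throughput scales only with the number of std::thread workers in main()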
+ Session* session;
+ Status status = NewSession(opts, &session);
+
+ if (!status.ok()) {
+ std::cerr << status.ToString() << std::endl;
+        return;
+ } else {
+ std::cout << "Session created successfully" << std::endl;
+ }
+
+ // Load the protobuf graph
+ GraphDef graph_def;
+ std::string graph_path = GLB_model_dir;//argv[1];
+ status = ReadBinaryProto(Env::Default(), graph_path, &graph_def);
+
+ if (!status.ok()) {
+ std::cerr << status.ToString() << std::endl;
+        return;
+ } else {
+ std::cout << "Load graph protobuf successfully" << std::endl;
+ }
+
+ // Add the graph to the session
+ status = session->Create(graph_def);
+
+ if (!status.ok()) {
+ std::cerr << status.ToString() << std::endl;
+        return;
+ } else {
+ std::cout << "Add graph to session successfully" << std::endl;
+ }
+
+ {
+ //warm up
+        std::vector<std::pair<std::string, tensorflow::Tensor> > inputs = {
+            { "x_input", *(*tensor_vec)[0] },
+        };
+        std::vector<tensorflow::Tensor> outputs;
+        status = session->Run(inputs, {"Softmax"}, {}, &outputs);
+
+        if (!status.ok()) {
+            std::cerr << status.ToString() << std::endl;
+            return;
+        }
+
+
+ }
+
+ std::cout << "thread ready to run " << std::endl;
+ struct timeval time_start, time_end;
+
+ gettimeofday(&time_start, nullptr);
+ {
+ for (int i = 0; i < tensor_vec->size(); i++) {
+            std::vector<std::pair<std::string, tensorflow::Tensor> > inputs = {
+                { "x_input", *(*tensor_vec)[i] },
+            };
+            std::vector<tensorflow::Tensor> outputs;
+            status = session->Run(inputs, {"Softmax"}, {}, &outputs);
+
+            if (!status.ok()) {
+                std::cerr << status.ToString() << std::endl;
+                return;
+            }
+ }
+
+
+ }
+ gettimeofday(&time_end, nullptr);
+
+ float use_ms = (time_end.tv_sec - time_start.tv_sec) * 1000.f + (time_end.tv_usec -
+ time_start.tv_usec) / 1000.f;
+ std::cout << "thread summary : " << "usetime = " << use_ms << " ms," << "word_sum = " <<
+ GLB_word_count << ",delay = " << (use_ms / tensor_vec->size()) << ", QPS = " <<
+ (GLB_word_count / use_ms * 1000) << std::endl;
+
+ session->Close();
+}
+/**
+ * @brief multi-threaded TensorFlow RNN inference benchmark
+ * @details loads a frozen graph and runs every input sequence through one
+ *          session per thread, printing per-thread latency and QPS
+ *
+ * @param argv[1] graph protobuf
+ * @param argv[2] input data file
+ * @param argv[3] optional number of threads, default 1
+ *
+ * @return 0 on success
+ */
+int main(int argc, char* argv[]) {
+ if (argc < 3) {
+        LOG(INFO) << "Example of Usage:\n \
+        ./example_model\n \
+        graph_protobuf_file\n input_file\n [thread_num]\n";
+ exit(0);
+ } else if (argc >= 3) {
+ GLB_model_dir = std::string(argv[1]);
+ GLB_input_file = std::string(argv[2]);
+ }
+
+ if (argc >= 4) {
+ GLB_run_threads = atoi(argv[3]);
+ }
+
+ // Initialize a tensorflow session
+
+    std::vector<std::vector<float> > word_idx;
+    word_idx = get_input_data();
+    std::vector<tensorflow::Tensor*> tensor_vec;
+
+ for (int i = 0; i < word_idx.size(); i++) {
+        tensorflow::Tensor* t_tensor_p = new Tensor(DT_INT32,
+                TensorShape({1, static_cast<long long>(word_idx[i].size())}));
+        auto input_tensor_mapped = t_tensor_p->tensor<int32, 2>();
+
+ for (int j = 0; j < word_idx[i].size(); j++) {
+ input_tensor_mapped(0, j) = word_idx[i][j];
+
+ }
+
+ tensor_vec.push_back(t_tensor_p);
+ }
+
+ std::cout << "get word success!" << std::endl;
+    std::cout << "first data = " << tensor_vec[0]->tensor<int32, 2>()(0, 0) << std::endl;
+    // Setup worker threads: each one feeds every prepared tensor
+    // through its own session (see sess_thread above).
+    std::vector<std::unique_ptr<std::thread> > threads;
+ int thread_num = GLB_run_threads;
+
+ for (int i = 0; i < thread_num; ++i) {
+ threads.emplace_back(
+ new std::thread(&sess_thread, &tensor_vec));
+ }
+
+ for (int i = 0; i < thread_num; ++i) {
+ threads[i]->join();
+ }
+
+    // per-thread timing and QPS were already printed by sess_thread()
+
+ return 0;
+}
diff --git a/benchmark/RNN/tensorflow_chinese_ner.py b/benchmark/RNN/tensorflow_chinese_ner.py
new file mode 100644
index 000000000..e939290df
--- /dev/null
+++ b/benchmark/RNN/tensorflow_chinese_ner.py
@@ -0,0 +1,167 @@
+
+# coding: utf-8
+
+# In[1]:
+
+
+import tensorflow as tf
+import numpy as np
+import time
+import timeit
+
+# In[2]:
+
+def language_run(data_set):
+ word_voc_size=1942562
+ mention_voc_size=57
+ word_hidden_size=32
+ mention_hidden_size=20
+ gru_hidden_size=36
+
+ fc1_hidden_size=49
+
+
+ batch_size=1
+ tf.device('/cpu:0')
+
+
+ # In[3]:
+
+
+ x_input = tf.placeholder(
+ tf.int32, [1,None], name="x_input")
+ x_input_len = tf.placeholder(
+ tf.int32, [None],name="x_input_len")
+ mention_input = tf.placeholder(
+ tf.int32, [1,None], name="mention_input")
+
+ # In[4]:
+
+
+ embedding_table_word_r = tf.get_variable('emb_w_r', [word_voc_size, word_hidden_size], dtype=tf.float32)
+ embedding_out_r=tf.nn.embedding_lookup(embedding_table_word_r, x_input)
+
+ embedding_table_mention_r = tf.get_variable('emb_m_r', [mention_voc_size, mention_hidden_size], dtype=tf.float32)
+ embedding_mention_out_r=tf.nn.embedding_lookup(embedding_table_mention_r, mention_input)
+ ##
+ embedding_table_word_l = tf.get_variable('emb_w_l', [word_voc_size, word_hidden_size], dtype=tf.float32)
+ embedding_out_l=tf.nn.embedding_lookup(embedding_table_word_l, x_input)
+
+ embedding_table_mention_l = tf.get_variable('emb_m_l', [mention_voc_size, mention_hidden_size], dtype=tf.float32)
+ embedding_mention_out_l=tf.nn.embedding_lookup(embedding_table_mention_l, mention_input)
+
+ emb_r=tf.concat([embedding_out_r,embedding_mention_out_r],axis=-1)
+ emb_l=tf.concat([embedding_out_l,embedding_mention_out_l],axis=-1)
+ # In[5]:
+ with tf.variable_scope('forward'):
+ gru_cell_r = tf.contrib.rnn.GRUCell(gru_hidden_size)
+ gru_init_state_r = gru_cell_r.zero_state(batch_size, dtype=tf.float32)
+ gru_out_r, _ = tf.nn.dynamic_rnn(gru_cell_r, emb_r, initial_state=gru_init_state_r)
+
+ with tf.variable_scope('backward'):
+ gru_cell_l = tf.contrib.rnn.GRUCell(gru_hidden_size)
+ gru_init_state_l = gru_cell_l.zero_state(batch_size, dtype=tf.float32)
+ gru_out_l, _ = tf.nn.dynamic_rnn(gru_cell_l, emb_l, initial_state=gru_init_state_l)
+
+ bi_gru_out=tf.concat([gru_out_l,gru_out_r],axis=-1)
+
+ # In[6]:
+
+
+ fc_weights = tf.get_variable(
+ 'fc_weights', [ gru_hidden_size*2,fc1_hidden_size],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+ fc_bias = tf.get_variable(
+ 'fc_bias', [fc1_hidden_size],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.0, dtype=tf.float32),
+ dtype=tf.float32)
+ bi_gru_out=tf.squeeze(bi_gru_out,[0])
+ fc1_out=tf.matmul(bi_gru_out,fc_weights) + fc_bias
+
+
+ # In[7]:
+ crf_weights = tf.get_variable(
+ 'crf_weights', [ fc1_hidden_size,fc1_hidden_size],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ fc1_out=tf.reshape(fc1_out,[batch_size,-1,fc1_hidden_size])
+ crf_out,_=tf.contrib.crf.crf_decode(fc1_out,crf_weights,x_input_len)
+
+
+
+
+
+ # In[8]:
+
+ init = tf.global_variables_initializer()
+ sess = tf.Session()
+ sess.run(init)
+
+ # In[9]:
+
+
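+    # timing decorator: prints total wall time and average per-line delay for one pass over the data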
+ def clock(func):
+ def clocked(*args):
+ t0 = timeit.default_timer()
+ result = func(*args)
+ elapsed = timeit.default_timer() - t0
+            name = func.__name__
+            print('[%0.8fs] %s(...) -> %r' % (elapsed, name, result))
+            lines = len(args[0])
+            print('Delay = ' + str(elapsed*1000/lines) + 'ms')
+ return result
+ return clocked
+
+
+ # In[10]:
+
+
+ @clock
+ def benchmark(data_set):
+ for one_batch in data_set:
+            word_vec, mention_vec = one_batch[0], one_batch[1]
+            sess.run([crf_out],
+                     {x_input: np.array(word_vec).reshape(1, len(word_vec)),
+                      mention_input: np.array(mention_vec).reshape(1, len(mention_vec)),
+                      x_input_len: [len(word_vec)]})
+
+ # tf.train.write_graph(sess.graph.as_graph_def(), 'model/language_model_tf/', 'graph.pb', as_text=False)
+ # saver=tf.train.Saver()
+ # saver.save(sess, "model/chinese_ner_model_tf/")
+ # exit()
+
+ benchmark(data_set)
+if __name__=='__main__':
+ import getopt
+ import sys
+ proc_num=1
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "process_num="])
+        for key, arg in opts:
+            if key in ('-h', '--help'):
+                print('usage: --process_num=k, default=1')
+            if key in ('--process_num',):
+                proc_num = int(arg)
+        print(opts)
+ except getopt.GetoptError:
+ pass
+
+ from read_ptb_data import NER_Data_Reader
+ data_set=NER_Data_Reader().read()
+ word_sum=sum(len(i[0]) for i in data_set)
+ from multiprocessing import Process
+    procs = []
+    t0 = timeit.default_timer()
+    for i in range(proc_num):
+        t = Process(target=language_run, args=(data_set,))
+        t.start()
+        procs.append(t)
+
+    for t in procs:
+        t.join()
+ elapsed = timeit.default_timer() - t0
+    print(__file__, 'process =', proc_num, ', QPS =', len(data_set)/elapsed*proc_num, 'lines/second,', word_sum/elapsed*proc_num, 'words/second')
\ No newline at end of file
diff --git a/benchmark/RNN/tensorflow_language_model.py b/benchmark/RNN/tensorflow_language_model.py
new file mode 100644
index 000000000..31b4997fb
--- /dev/null
+++ b/benchmark/RNN/tensorflow_language_model.py
@@ -0,0 +1,136 @@
+
+# coding: utf-8
+
+# In[1]:
+
+
+import tensorflow as tf
+import numpy as np
+import time
+import timeit
+
+# In[2]:
+
+def language_run(data_set):
+ voc_size=10001
+ hidden_size=200
+ batch_size=1
+ tf.device('/cpu:0')
+
+
+ # In[3]:
+
+
+ x_input = tf.placeholder(
+ tf.int32, [1,None], name="x_input")
+ # x_input_len = tf.placeholder(
+ # tf.int32, name="x_input_len")
+
+
+ # In[4]:
+
+
+ embedding_table = tf.get_variable('emb', [voc_size, hidden_size], dtype=tf.float32)
+ embedding_out=tf.nn.embedding_lookup(embedding_table, x_input)
+
+
+ # In[5]:
+
+
+ gru_cell = tf.contrib.rnn.GRUCell(hidden_size)
+ gru_init_state=gru_cell.zero_state(batch_size, dtype=tf.float32)
+ gru_out,_=tf.nn.dynamic_rnn(gru_cell,embedding_out,initial_state=gru_init_state)
+
+
+ # In[6]:
+
+
+ fc_weights = tf.get_variable(
+ 'fc_weights', [ hidden_size,voc_size],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+ fc_bias = tf.get_variable(
+ 'fc_bias', [voc_size],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.0, dtype=tf.float32),
+ dtype=tf.float32)
+ gru_out=tf.squeeze(gru_out,[0])
+ fc_out=tf.matmul(gru_out,fc_weights) + fc_bias
+
+
+ # In[7]:
+
+
+ softmax=tf.nn.softmax(fc_out)
+
+
+ # In[8]:
+
+ init = tf.global_variables_initializer()
+ sess = tf.Session()
+ sess.run(init)
+
+ # In[9]:
+
+
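+    # timing decorator: prints total wall time and average per-line delay for one pass over the data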
+ def clock(func):
+ def clocked(*args):
+ t0 = timeit.default_timer()
+ result = func(*args)
+ elapsed = timeit.default_timer() - t0
+            name = func.__name__
+            print('[%0.8fs] %s(...) -> %r' % (elapsed, name, result))
+            lines = len(args[0])
+            print('Delay = ' + str(elapsed*1000/lines) + 'ms')
+ return result
+ return clocked
+
+
+ # In[10]:
+
+
+ @clock
+ def benchmark(data_set):
+ for one_batch in data_set:
+ sess.run([softmax],{x_input:np.array(one_batch).reshape(1,len(one_batch))})
+
+ # tf.train.write_graph(sess.graph.as_graph_def(), 'model/language_model_tf/', 'graph.pb', as_text=False)
+ # saver=tf.train.Saver()
+ # saver.save(sess, "model/language_model_tf/model.cpkt")
+ # exit()
+
+
+ benchmark(data_set)
+if __name__=='__main__':
+ import getopt
+ import sys
+ proc_num=1
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "process_num="])
+        for key, arg in opts:
+            if key in ('-h', '--help'):
+                print('usage: --process_num=k, default=1')
+            if key in ('--process_num',):
+                proc_num = int(arg)
+        print(opts)
+ except getopt.GetoptError:
+ pass
+
+ from read_ptb_data import PTB_Data_Reader
+ data_set=PTB_Data_Reader().read()
+ word_sum=sum(len(i) for i in data_set)
+ from multiprocessing import Process
+    procs = []
+    t0 = timeit.default_timer()
+    for i in range(proc_num):
+        t = Process(target=language_run, args=(data_set,))
+        t.start()
+        procs.append(t)
+
+    for t in procs:
+        t.join()
+ elapsed = timeit.default_timer() - t0
+    print(__file__, 'process =', proc_num, ', QPS =', len(data_set)/elapsed*proc_num, 'lines/second,', word_sum/elapsed*proc_num, 'words/second')
\ No newline at end of file
diff --git a/benchmark/RNN/tensorflow_text_classfication.py b/benchmark/RNN/tensorflow_text_classfication.py
new file mode 100644
index 000000000..c690085e0
--- /dev/null
+++ b/benchmark/RNN/tensorflow_text_classfication.py
@@ -0,0 +1,149 @@
+
+# coding: utf-8
+
+# In[1]:
+
+
+import tensorflow as tf
+import numpy as np
+import time
+import timeit
+
+# In[2]:
+
+def language_run(data_set):
+ voc_size=566227
+ hidden_size=128
+ hidden_size_after_lstm=96
+ hidden_size_after_fc=2
+ batch_size=1
+ tf.device('/cpu:0')
+
+
+ # In[3]:
+
+
+ x_input = tf.placeholder(
+ tf.int32, [1,None], name="x_input")
+
+
+ # In[4]:
+
+
+ embedding_table = tf.get_variable('emb', [voc_size, hidden_size], dtype=tf.float32)
+ embedding_out=tf.nn.embedding_lookup(embedding_table, x_input)
+
+
+ # In[5]:
+
+
+ lstm_cell = tf.contrib.rnn.LSTMCell(hidden_size)
+ # lstm_init_state=lstm_cell.zero_state(batch_size, dtype=tf.float32)
+ # lstm_out,_=tf.nn.dynamic_rnn(lstm_cell,embedding_out,initial_state=lstm_init_state)
+ (output_fw, output_bw), _=tf.nn.bidirectional_dynamic_rnn(lstm_cell,
+ lstm_cell, embedding_out,
+ dtype=tf.float32)
+
+ bi_lstm_out = tf.concat([output_fw, output_bw], axis=-1)
+
+ # In[6]:
+
+
+ fc_weights = tf.get_variable(
+ 'fc_weights', [ hidden_size*2,hidden_size_after_lstm],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+ fc_bias = tf.get_variable(
+ 'fc_bias', [hidden_size_after_lstm],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.0, dtype=tf.float32),
+ dtype=tf.float32)
+ bi_lstm_out=tf.squeeze(bi_lstm_out,[0])
+ fc1_out=tf.tanh(tf.matmul(bi_lstm_out,fc_weights) + fc_bias)
+
+ # In[7]:
+ fc2_weights = tf.get_variable(
+ 'fc2_weights', [ hidden_size_after_lstm,hidden_size_after_fc],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+ fc2_bias = tf.get_variable(
+ 'fc2_bias', [hidden_size_after_fc],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.0, dtype=tf.float32),
+ dtype=tf.float32)
+ fc2_out=tf.matmul(fc1_out,fc2_weights) + fc2_bias
+
+ softmax=tf.nn.softmax(fc2_out)
+
+
+ # In[8]:
+
+ init = tf.global_variables_initializer()
+ sess = tf.Session()
+ sess.run(init)
+
+ # In[9]:
+
+
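+    # timing decorator: prints total wall time and average per-line delay for one pass over the data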
+ def clock(func):
+ def clocked(*args):
+ t0 = timeit.default_timer()
+ result = func(*args)
+ elapsed = timeit.default_timer() - t0
+            name = func.__name__
+            print('[%0.8fs] %s(...) -> %r' % (elapsed, name, result))
+            lines = len(args[0])
+            print('Delay = ' + str(elapsed*1000/lines) + 'ms')
+ return result
+ return clocked
+
+
+ # In[10]:
+
+
+ @clock
+ def benchmark(data_set):
+ for one_batch in data_set:
+ sess.run([softmax],{x_input:np.array(one_batch).reshape(1,len(one_batch))})
+
+ # tf.train.write_graph(sess.graph.as_graph_def(), 'model/text_classfi_model_tf/', 'graph.pb', as_text=False)
+ # saver=tf.train.Saver()
+ # saver.save(sess, "model/text_classfi_model_tf/model.cpkt")
+ # exit()
+
+
+ benchmark(data_set)
+if __name__=='__main__':
+ import getopt
+ import sys
+ proc_num=1
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "process_num="])
+        for key, arg in opts:
+            if key in ('-h', '--help'):
+                print('usage: --process_num=k, default=1')
+            if key in ('--process_num',):
+                proc_num = int(arg)
+        print(opts)
+ except getopt.GetoptError:
+ pass
+
+ from read_ptb_data import PTB_Data_Reader
+ data_set=PTB_Data_Reader().read()
+ word_sum=sum(len(i) for i in data_set)
+ from multiprocessing import Process
+    procs = []
+    t0 = timeit.default_timer()
+    for i in range(proc_num):
+        t = Process(target=language_run, args=(data_set,))
+        t.start()
+        procs.append(t)
+
+    for t in procs:
+        t.join()
+ elapsed = timeit.default_timer() - t0
+    print(__file__, 'process =', proc_num, ', QPS =', len(data_set)/elapsed*proc_num, 'lines/second,', word_sum/elapsed*proc_num, 'words/second')
\ No newline at end of file
diff --git a/benchmark/arm_benchmark.md b/benchmark/arm_benchmark.md
new file mode 100644
index 000000000..3ab4feb48
--- /dev/null
+++ b/benchmark/arm_benchmark.md
@@ -0,0 +1,57 @@
+# Test environment and parameters:
++ Test models: MobileNet v1, MobileNet v2, MobileNet-SSD
++ Cross-compiled with the Android NDK, gcc 4.9, NEON enabled, ABI: armeabi-v7a with NEON, -mfloat-abi=softfp
++ Test platforms
+    - Honor V9 (rooted): Kirin 960, 4 big cores at 2.36GHz, 4 little cores at 1.8GHz
+    - Nubia Z17: Snapdragon 835, 4 big cores at 2.36GHz, 4 little cores at 1.9GHz
+    - 360 N5: Snapdragon 653, 4 big cores at 1.8GHz, 4 little cores at 1.4GHz
++ Multithreading: OpenMP
++ Timing: 10 warmup runs, then the average of 10 timed runs
++ ncnn version: GitHub master branch, commit ID 307a77f04be29875f40d337cfff6df747df09de6 (msg: convert LogisticRegressionOutput)
++ TFlite version: GitHub master branch, commit ID 65c05bc2ac19f51f7027e66350bc71652662125c (msg: Removed unneeded file copy that was causing failure in Pi builds)
+
+## Anakin
+
+This benchmark compares the performance of **`ncnn`**, **`TFlite`** and **`Anakin`**.
+
+## BenchMark model
+
+> Note: before benchmarking, convert the test models to Anakin models with the [External Converter](#10003).
+> For each model we run single-batch, multithreaded tests on ARM.
+
+- [Mobilenet v1](#11) *the caffe model can be downloaded [here](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2](#22) *the caffe model can be downloaded [here](https://github.com/shicai/MobileNet-Caffe)*
+- [mobilenet-ssd](#33) *the caffe model can be downloaded [here](https://github.com/chuanqi305/MobileNet-SSD)*
+
+### mobilenetv1
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |Kirin 960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan|
+ |Snapdragon 835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan|
+ |Snapdragon 653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan|
+
+### mobilenetv2
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |Kirin 960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan|
+ |Snapdragon 835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan|
+ |Snapdragon 653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan|
+
+### mobilenet-ssd
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |Kirin 960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan|
+ |Snapdragon 835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan|
+ |Snapdragon 653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan|
+
+## How to run those Benchmark models?
+
+1. First, convert the caffe models with the [External Converter](../docs/Manual/Converter_en.md).
+2. Then push the converted Anakin model and the compiled benchmark_arm binary to the test device with 'adb push'.
+3. On the device, run './benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' from the directory that contains the Anakin model.
+4. Finally, the model's running time is printed to the terminal.
+5. The number and meaning of the command's parameters can be seen by running './benchmark_arm' with no arguments; a sketch of the full flow follows.
+
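+A minimal sketch of steps 2-4, assuming the binary was built to `output/benchmark_arm` and the converted model is named `mobilenet_v1.anakin.bin` (adjust both names to your build):
+
+```bash
+# push the benchmark binary and the converted model to the device
+adb push ./output/benchmark_arm /data/local/tmp/
+adb push ./mobilenet_v1.anakin.bin /data/local/tmp/
+
+# run from the directory holding the model; run ./benchmark_arm with no
+# arguments to see the meaning of each parameter
+adb shell "cd /data/local/tmp && chmod +x benchmark_arm && ./benchmark_arm ./ mobilenet_v1.anakin.bin 1 10 10 1"
+```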
diff --git a/cmake/amd.cmake b/cmake/amd.cmake
new file mode 100644
index 000000000..1ebc7bf56
--- /dev/null
+++ b/cmake/amd.cmake
@@ -0,0 +1,53 @@
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+macro(amd_set_opencl_path)
+ if(NOT DEFINED OpenCL_INCLUDE_DIR)
+ set(OpenCL_INCLUDE_DIR "/opt/rocm/opencl/include")
+ endif()
+ if(NOT DEFINED OpenCL_LIBRARY)
+ set(OpenCL_LIBRARY "/opt/rocm/opencl/lib/x86_64/libOpenCL.so")
+ endif()
+
+ #FIND_PACKAGE(OpenCL REQUIRED)
+ #if(OpenCL_FOUND)
+ # message(STATUS "Found OpenCL in ${OpenCL_INCLUDE_DIRS}")
+ # message(STATUS "Found OpenCL lib in ${OpenCL_LIBRARIES}")
+ # include_directories(${OpenCL_INCLUDE_DIRS})
+ # LINK_LIBRARIES(${OpenCL_LIBRARIES})
+ #endif()
+endmacro()
+
+macro(amd_build_cl_file file_path dest_path)
+ FILE(GLOB CL_FILES ${file_path}/*.cl)
+ message(STATUS "found cl files: ${CL_FILES}")
+ foreach(src_file ${CL_FILES})
+ get_filename_component(src_file_name ${src_file} NAME)
+ message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}")
+        configure_file(${src_file} ${dest_path}/${src_file_name} COPYONLY)
+ endforeach()
+endmacro()
+
+
+macro(amd_build_cl_binary_file file_path dest_path)
+ FILE(GLOB CL_FILES ${file_path}/*.so)
+    message(STATUS "found cl binary files: ${CL_FILES}")
+ foreach(src_file ${CL_FILES})
+ get_filename_component(src_file_name ${src_file} NAME)
+ message(STATUS "copy ${src_file} to : ${dest_path}/${src_file_name}")
+        configure_file(${src_file} ${dest_path}/${src_file_name} COPYONLY)
+ endforeach()
+endmacro()
+
diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake
index 49d133c7f..ef4a0dbcf 100644
--- a/cmake/compiler_options.cmake
+++ b/cmake/compiler_options.cmake
@@ -1,10 +1,16 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
-# @file compiler_options.cmake
-# @auther cuichaowen
-# @date 2017-3-2
-# ----------------------------------------------------------------------------
-
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# ----------------------------------------------------------------------------
# section: set the compiler and linker options
@@ -15,7 +21,6 @@ set(ANAKIN_NVCC_FLAG "")
anakin_add_compile_option(-std=c++11)
anakin_add_compile_option(-fPIC)
anakin_add_compile_option(-ldl)
-anakin_add_compile_option(-mavx2)
if(NOT USE_ARM_PLACE)
anakin_add_compile_option(-lrt)
endif()
@@ -34,6 +39,9 @@ anakin_add_compile_option(-Wshadow)
anakin_add_compile_option(-fpermissive)
anakin_add_compile_option(-Wsign-promo)
anakin_add_compile_option(-fdiagnostics-show-option)
+if(USE_BM_PLACE)
+ anakin_add_compile_option(-lbmlib-asic)
+endif()
if(ENABLE_NOISY_WARNINGS)
anakin_add_compile_option(-Wcast-align)
@@ -47,8 +55,10 @@ else()
anakin_add_compile_option(-Wno-delete-non-virtual-dtor)
anakin_add_compile_option(-Wno-comment)
anakin_add_compile_option(-Wno-sign-compare)
- anakin_add_compile_option(-Wno-ignored-qualifiers)
- anakin_add_compile_option(-Wno-enum-compare)
+ anakin_add_compile_option(-Wno-write-strings)
+ anakin_add_compile_option(-Wno-ignored-qualifiers)
+ anakin_add_compile_option(-Wno-enum-compare)
+ anakin_add_compile_option(-Wno-missing-field-initializers)
endif()
if(CMAKE_BUILD_TYPE MATCHES Debug)
@@ -57,6 +67,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug)
anakin_add_compile_option(-gdwarf-2) # for old version gcc and gdb. see: http://stackoverflow.com/a/15051109/673852
else()
anakin_add_compile_option(-O3)
+# anakin_add_compile_option(-g)
anakin_add_compile_option(-DNDEBUG)
endif()
@@ -74,6 +85,10 @@ if(TARGET_IOS)
endif()
if(USE_X86_PLACE)
+# anakin_add_compile_option(-mavx2)
+# anakin_add_compile_option(-fopenmp)
+ anakin_add_compile_option(-fabi-version=6)
+ anakin_add_compile_option(-march=native)
anakin_add_compile_option(-Ofast)
anakin_add_compile_option(-ffast-math)
anakin_add_compile_option(-Wall)
@@ -101,6 +116,7 @@ if(USE_CUDA)
anakin_add_compile_option(-G NVCC)
anakin_add_compile_option(-g NVCC)
anakin_add_compile_option(-std=c++11 NVCC)
+ anakin_add_compile_option("--default-stream per-thread" NVCC)
anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1)
else()
anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
@@ -112,21 +128,3 @@ if(USE_CUDA)
# set default nvidia gpu arch
set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1")
endif()
-
-if(USE_BM)
- if(CMAKE_BUILD_TYPE MATCHES Debug)
- anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
- anakin_add_compile_option(-G NVCC)
- anakin_add_compile_option(-g NVCC)
- anakin_add_compile_option(-std=c++11 NVCC)
- anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) # suppress warning by architectures are deprecated (2.0,2.1)
- else()
- anakin_add_compile_option("-Xcompiler -fPIC" NVCC)
- anakin_add_compile_option(-O3 NVCC)
- anakin_add_compile_option(-std=c++11 NVCC)
- anakin_add_compile_option("--default-stream per-thread" NVCC)
- anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC)
- endif()
- # set default nvidia gpu arch
- set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1")
-endif()
diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in
index 0a8560593..860e77f58 100644
--- a/cmake/config/anakin_config.h.in
+++ b/cmake/config/anakin_config.h.in
@@ -1,16 +1,17 @@
-/**********************************************************
- * Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
- *
- * @file anakin_config.h.in
- * @brief file ak_config.h is autogenerated from config.h.in
- * during the cmake configuration of anakin.
- *
- * @auther cuichaowen
- * @version ANAKIN V @VERSION@
- * @date 2017-10-23
- *
- **********************************************************/
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
#ifndef _ANAKIN_CONFIGURATION_HEADER_GUARD_H_
#define _ANAKIN_CONFIGURATION_HEADER_GUARD_H_
@@ -26,7 +27,6 @@
// build options
#cmakedefine ENABLE_DEBUG
-// boost
#cmakedefine USE_BOOST
#cmakedefine USE_CUBLAS
@@ -35,8 +35,6 @@
#cmakedefine USE_CUDA
-#cmakedefine USE_BM
-
#cmakedefine USE_CUDNN
#cmakedefine USE_PYTHON
@@ -49,8 +47,11 @@
#cmakedefine USE_OPENMP
+#cmakedefine USE_LOGGER
+
#cmakedefine USE_GFLAGS
+
// plantform to use
#cmakedefine USE_GPU_PLACE
@@ -58,7 +59,9 @@
#cmakedefine USE_ARM_PLACE
-#cmakedefine TARGET_ANDRIOD
+#cmakedefine USE_BM_PLACE
+
+#cmakedefine TARGET_ANDROID
#cmakedefine TARGET_IOS
@@ -66,6 +69,15 @@
#cmakedefine NVIDIA_GPU
+#cmakedefine AMD_GPU
+
+#cmakedefine ENABLE_STACKTRACES
+
+#cmakedefine SUPPORT_PTHREADS
+
+// build AOT lite for device
+#cmakedefine BUILD_WITH_LITE
+
#if defined(ANDROID) || defined(__ANDROID__)
#define PLATFORM_ANDROID
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index da5540a91..4e2cd4815 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -1,10 +1,16 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
-# @file cuda.cmake
-# @auther cuichaowen
-# @date 2017-10-23
-# ----------------------------------------------------------------------------
-
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# ----------------------------------------------------------------------------
# section: Set nvcc arch info.
# ----------------------------------------------------------------------------
@@ -137,6 +143,9 @@ macro(anakin_find_cuda)
if(USE_CURAND)
list(APPEND ANAKIN_LINKER_LIBS ${CUDA_curand_LIBRARY})
endif()
+ if(BUILD_RPC)
+ list(APPEND ANAKIN_LINKER_LIBS ${CUDA_INCLUDE_DIRS}/../lib64/stubs/libnvidia-ml.so)
+ endif()
list(APPEND ANAKIN_LINKER_LIBS ${CUDA_CUDART_LIBRARY})
else()
message(FATAL_ERROR "Cuda SHARED lib Could not found !")
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index de9b90531..b8a1358bb 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -14,6 +14,11 @@
# limitations under the License.
#===============================================================================
+anakin_find_mklml()
+if(MKLML_FOUND)
+ return()
+endif()
+
# download mklml package is only for iomp so far
include(ExternalProject)
@@ -59,5 +64,12 @@ list(APPEND ANAKIN_SABER_DEPENDENCIES mklml)
list(APPEND ANAKIN_LINKER_LIBS ${MKLML_LIB};${MKLML_IOMP_LIB})
+#set(OPENMP_FLAGS "-fopenmp")
+##set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+#set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+
+
# iomp5 must be installed
-install(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
+#install(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake
index 796b33c94..8d1bd276b 100644
--- a/cmake/find_modules.cmake
+++ b/cmake/find_modules.cmake
@@ -1,9 +1,16 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
-# @file find_modules.cmake
-# @auther cuichaowen
-# @date 2016-11-9
-# ----------------------------------------------------------------------------
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
#anakin cmake module
set(CMAKE_MODULE_PATH "${ANAKIN_ROOT}/cmake")
@@ -11,12 +18,16 @@ set(CMAKE_MODULE_PATH "${ANAKIN_ROOT}/cmake")
set(ANAKIN_LINKER_LIBS "")
if(UNIX)
- find_library(RTLIB rt)
- if(RTLIB)
- list(APPEND ANAKIN_LINKER_LIBS ${RTLIB})
- else()
- message(SEND_ERROR "Could not found -lrt !")
- endif()
+    if(USE_ARM_PLACE)
+        # librt is neither needed nor available on ARM/Android and macOS
+    elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    else()
+ find_library(RTLIB rt)
+ if(RTLIB)
+ list(APPEND ANAKIN_LINKER_LIBS ${RTLIB})
+ else()
+            message(SEND_ERROR "Could not find -lrt !")
+ endif()
+ endif()
find_library(DLLIB dl)
if(DLLIB)
@@ -28,30 +39,38 @@ endif()
#find opencv version >= 2.4.3
macro(anakin_find_opencv)
- if(BUILD_SHARED OR TRUE) # temporary not support static link opencv.
- #set(CMAKE_FIND_ROOT_PATH ${ANAKIN_ROOT}/third-party/opencv243/lib)
- find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
- if(NOT OpenCV_FOUND)
- find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
- endif()
- if(OpenCV_FOUND)
- message(STATUS "Found opencv: ${OpenCV_INCLUDE_DIRS}")
- include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
- list(APPEND ANAKIN_LINKER_LIBS ${OpenCV_LIBS})
- else()
- message(SEND_ERROR "Could not found opencv !")
- endif()
- else() # BUILD_STATIC
- list(APPEND OPENCV_STATIC_LIBS libopencv_core.a
- libopencv_highgui.a
- libopencv_imgproc.a
- libopencv_contrib.a)
- foreach(CV_LIB ${OPENCV_STATIC_LIBS})
- set(__CV_LIB_FULL_PATH "${ANAKIN_ROOT}/third-party/opencv243/lib/${CV_LIB}")
- #message(STATUS ${__CV_LIB_FULL_PATH})
- list(APPEND ANAKIN_LINKER_LIBS ${__CV_LIB_FULL_PATH})
- endforeach()
- unset(__CV_LIB_FULL_PATH)
+
+ if(USE_ARM_PLACE AND TARGET_ANDROID)
+ include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
+ LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
+
+ else()
+
+ if(BUILD_SHARED) # temporary not support static link opencv.
+ find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
+ if(NOT OpenCV_FOUND)
+ find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
+ endif()
+ if(OpenCV_FOUND)
+ message(STATUS "Found opencv: ${OpenCV_INCLUDE_DIRS}")
+ include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
+ list(APPEND ANAKIN_LINKER_LIBS ${OpenCV_LIBS})
+
+ else()
+                message(SEND_ERROR "Could not find opencv !")
+ endif()
+ else() # BUILD_STATIC
+            set(OPENCV_LIB_PATH "" CACHE PATH "Path to the OpenCV static libraries")
+            list(APPEND OPENCV_STATIC_LIBS ${OPENCV_LIB_PATH}/libopencv_core.a
+                                           ${OPENCV_LIB_PATH}/libopencv_highgui.a
+                                           ${OPENCV_LIB_PATH}/libopencv_imgproc.a
+                                           ${OPENCV_LIB_PATH}/libopencv_contrib.a)
+ foreach(CV_LIB ${OPENCV_STATIC_LIBS})
+ list(APPEND ANAKIN_LINKER_LIBS ${CV_LIB})
+ endforeach()
+ unset(__CV_LIB_FULL_PATH)
+ endif()
+
endif()
endmacro()
@@ -60,8 +79,8 @@ macro(anakin_find_opencl)
set(OCL_ROOT "" CACHE PATH "openCL root dir.")
find_path(OCL_INCLUDE_DIR NAMES CL/cl.h PATHS ${OCL_ROOT}/include $ENV{OCL_ROOT}/include)
+ find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS ${OCL_ROOT} ${OCL_ROOT}/lib/x86_64 $ENV{OCL_ROOT}/lib $ENV{OCL_ROOT}/lib/x86_64)
- find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS ${OCL_ROOT})
if(OCL_INCLUDE_DIR AND OCL_LIBRARIES)
set(OCL_FOUND TRUE)
message(STATUS "Found opencl: ${OCL_INCLUDE_DIR}")
@@ -259,32 +278,100 @@ macro(anakin_find_mklml)
list(APPEND MKLML_LIBRARIES ${MKLML_ROOT}/lib/libiomp5.so)
list(APPEND MKLML_LIBRARIES ${MKLML_ROOT}/lib/libmklml_intel.so)
list(APPEND ANAKIN_LINKER_LIBS ${MKLML_LIBRARIES})
- else()
- message(FATAL_ERROR "NOT FOUND MKLML")
+# else()
+# message(FATAL_ERROR "NOT FOUND MKLML")
endif()
endmacro()
macro(anakin_find_protobuf)
- list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES})
- find_package(Protobuf REQUIRED)
- if(PROTOBUF_FOUND)
- message(STATUS "Found protobuf in ${PROTOBUF_INCLUDE_DIR}")
- include_directories(${PROTOBUF_INCLUDE_DIR})
- list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES})
- endif()
+ if(USE_ARM_PLACE)
+        set(ARM_PROTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
+        include_directories(${ARM_PROTO_ROOT}/include)
+        set(PROTOBUF_LIBRARIES "")
+        #if(BUILD_SHARED)
+        #    list(APPEND ANAKIN_LINKER_LIBS ${ARM_PROTO_ROOT}/lib/libprotobuf.so)
+        #else()
+        list(APPEND ANAKIN_LINKER_LIBS ${ARM_PROTO_ROOT}/lib/libprotobuf.a)
+ #endif()
+ find_library( # Sets the name of the path variable.
+ log-lib
+
+ # Specifies the name of the NDK library that
+ # you want CMake to locate.
+ log )
+ list(APPEND ANAKIN_LINKER_LIBS ${log-lib})
+ else()
+ find_program(PROTOBUF_PROTOC_EXECUTABLE protoc)
+ if(PROTOBUF_PROTOC_EXECUTABLE)
+ find_package(Protobuf REQUIRED)
+ message(STATUS "Found protobuf in ${PROTOBUF_INCLUDE_DIR}")
+ include_directories(${PROTOBUF_INCLUDE_DIR})
+ list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES})
+ else()
+ set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
+ if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
+ find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+ find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
+ if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
+ message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
+ set(PROTOBUF_LIBRARIES ${PROTOBUF_LIBRARY} ${PROTOBUF_LITE_LIBRARY} ${PROTOBUF_PROTOC_LIBRARY})
+ list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARIES})
+ include_directories(${PROTOBUF_INCLUDE_DIR})
+ else()
+ message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
+ endif()
+ endif()
+ endif()
+ endif()
endmacro()
+macro(anakin_find_baidu_rpc)
+ set(BAIDU_RPC_ROOT "/usr/local/" CACHE PATH "baidu rpc root dir")
+ find_path(RPC_INCLUDE_DIR server.h PATHS ${BAIDU_RPC_ROOT}/include/brpc/ $ENV{BAIDU_RPC_ROOT}/include/brpc/)
+    find_library(RPC_LIBRARY NAMES libbrpc.so
+                 PATHS ${BAIDU_RPC_ROOT}/lib $ENV{BAIDU_RPC_ROOT}/lib
+                 DOC "library path for baidu rpc.")
+ if(RPC_INCLUDE_DIR AND RPC_LIBRARY)
+ include_directories(${BAIDU_RPC_ROOT}/include)
+ list(APPEND ANAKIN_LINKER_LIBS ${RPC_LIBRARY})
+ else()
+        message(SEND_ERROR "Could not find baidu-rpc !")
+ endif()
+endmacro()
macro(anakin_find_openmp)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND OR OpenMP_CXX_FOUND)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
message(STATUS "Found openmp in ${OPENMP_INCLUDE_DIR}")
- message(STATUS " |-- openmp c flags: ${OpenMP_C_FLAGS}")
- message(STATUS " |-- openmp cxx flags: ${OpenMP_CXX_FLAGS}")
- message(STATUS " `-- openmp link flags: ${OpenMP_EXE_LINKER_FLAGS}")
- include_directories(${OPENMP_INCLUDE_DIR})
- list(APPEND ANAKIN_LINKER_LIBS ${OPENMP_LIBRARIES})
+        message(STATUS " |--openmp c flags: ${OpenMP_C_FLAGS}")
+        message(STATUS " |--openmp cxx flags: ${OpenMP_CXX_FLAGS}")
+        message(STATUS " `--openmp linker flags: ${OpenMP_EXE_LINKER_FLAGS}")
else()
message(FATAL_ERROR "Could not found openmp !")
endif()
endmacro()
+
+macro(anakin_find_bmlib)
+ find_path(BM_ROOT include/bmlib/bmlib_runtime.h /usr/local/include/bm/ $ENV{BM_ROOT}/)
+ if(BM_ROOT)
+ set(BM_FOUND TRUE)
+ endif()
+ if(BM_FOUND)
+ message(STATUS " Found bm_lib in ${BM_ROOT}")
+ anakin_fetch_include_recursively(${BM_ROOT}/include)
+ set(BM_LIBRARIES "")
+# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/bmlib.a)
+# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/cmodel.a)
+# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/common.a)
+# list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/cmodel/fwcore.a)
+ list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/device/libbmlib-asic.so)
+ list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES})
+ else()
+        message(FATAL_ERROR "Could not find bm_lib")
+ endif()
+endmacro()
diff --git a/cmake/gather.cmake b/cmake/gather.cmake
index 5017efff7..e6aafc9f3 100644
--- a/cmake/gather.cmake
+++ b/cmake/gather.cmake
@@ -1,9 +1,16 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
-# @file gather_libs.cmake
-# @auther cuichaowen
-# @date 2017-10-24
-# ----------------------------------------------------------------------------
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# find cudnn default cudnn 5
if(USE_CUDNN)
@@ -17,16 +24,22 @@ if(USE_CUDA)
anakin_find_cuda()
endif()
-if(USE_BM)
- #set other cuda path
- #set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_PATH})
- #anakin_find_cuda()
+if(USE_BM_PLACE)
+ anakin_find_bmlib()
endif()
+# set amd opencl path
+if(AMD_GPU)
+ amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${CMAKE_BINARY_DIR}/cl/amd")
+ amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${CMAKE_BINARY_DIR}/cl/amd")
+ amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${PROJECT_SOURCE_DIR}/output/unit_test")
+ amd_build_cl_binary_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/lib" "${PROJECT_SOURCE_DIR}/output/unit_test")
+ amd_build_cl_file("${CMAKE_SOURCE_DIR}/test/saber/amd" "${PROJECT_SOURCE_DIR}/output/unit_test")
+endif()
# find opencl
if(USE_OPENCL)
- anakin_generate_kernel(${ANAKIN_ROOT})
+ #anakin_generate_kernel(${ANAKIN_ROOT})
anakin_find_opencl()
endif()
@@ -49,6 +62,10 @@ if(USE_PROTOBUF)
anakin_protos_processing()
endif()
+if(BUILD_RPC)
+ anakin_find_baidu_rpc()
+endif()
+
if (USE_GFLAGS)
anakin_find_gflags()
endif()
@@ -71,9 +88,11 @@ endif()
if(DISABLE_ALL_WARNINGS)
anakin_disable_warnings(CMAKE_CXX_FLAGS)
endif()
-
+if(USE_OPENMP)
+ anakin_find_openmp()
+endif()
if(USE_ARM_PLACE)
- if(TARGET_ANDRIOD)
+ if(TARGET_ANDROID)
if(USE_OPENMP)
anakin_find_openmp()
endif()
diff --git a/cmake/ios/iosxc.toolchain.cmake b/cmake/ios/iosxc.toolchain.cmake
new file mode 100644
index 000000000..bcfd76937
--- /dev/null
+++ b/cmake/ios/iosxc.toolchain.cmake
@@ -0,0 +1,39 @@
+# Standard settings
+# set(UNIX True)
+# set(Darwin True)
+# set(IOS True)
+set (CMAKE_SYSTEM_NAME Darwin)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (APPLE True)
+set (IOS True)
+
+# suppress -rdynamic
+# set(CMAKE_SYSTEM_NAME Generic)
+
+set(CMAKE_C_COMPILER arm-apple-darwin11-clang)
+set(CMAKE_CXX_COMPILER arm-apple-darwin11-clang++)
+
+set(_CMAKE_TOOLCHAIN_PREFIX arm-apple-darwin11-)
+
+set(CMAKE_IOS_SDK_ROOT ${IOS_SDK_PATH})
+
+# Set the sysroot default to the most recent SDK
+set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+# set the architecture for iOS
+# set(IOS_ARCH arm64)
+set(IOS_ARCH armv7;arm64)
+
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
+
+# Set the find root to the iOS developer roots and to user defined paths
+set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root")
+
+# searching for frameworks only
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+ ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
+)
diff --git a/cmake/msg_color.cmake b/cmake/msg_color.cmake
index 3bf6da6b9..18fc4cf5e 100644
--- a/cmake/msg_color.cmake
+++ b/cmake/msg_color.cmake
@@ -1,9 +1,16 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
-# @file msg_color.cmake
-# @auther cuichaowen
-# @date 2016-11-8
-# ----------------------------------------------------------------------------
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# ----------------------------------------------------------------------------
# section: help to get colorful cmake message.
diff --git a/cmake/statistic.cmake b/cmake/statistic.cmake
index 65a9f7964..86138122a 100644
--- a/cmake/statistic.cmake
+++ b/cmake/statistic.cmake
@@ -1,10 +1,16 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
-# @file statistic.cmake
-# @auther cuichaowen
-# @date 2017-4-20
-# ----------------------------------------------------------------------------
-
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# ----------------------------------------------------------------------------
# section: prints the statistic of configuration of anakin.
@@ -113,7 +119,7 @@ function(anakin_print_statistic)
elseif(USE_ARM_PLACE)
message(STATUS " USE_ARM_PLACE : ${USE_ARM_PLACE}")
if(TARGET_ANDROID)
- message(STATUS " `--Target Andriod : ${TARGET_ANDROID}")
+ message(STATUS " `--Target Android : ${TARGET_ANDROID}")
else()
message(STATUS " `--Target IOS : ${TARGET_IOS}")
endif()
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 5daa82913..1804343d7 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -1,9 +1,16 @@
-# ----------------------------------------------------------------------------
-# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved
-# @file utils.cmake
-# @auther cuichaowen
-# @date 2016-11-8
-# ----------------------------------------------------------------------------
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# ----------------------------------------------------------------------------
# section: help to search src and include files
@@ -24,9 +31,9 @@ function(anakin_fetch_files_with_suffix search_dir suffix outputs)
endforeach()
set(${outputs} ${${outputs}} ${abs_dir} PARENT_SCOPE)
else()
- #message(WARNING "anakin_fetch_files_recursively ${BoldRed}failed${ColourReset}:\n"
- # "real_dir:${BoldYellow}${search_dir}${ColourReset}\n"
- # "suffix:*.${BoldYellow}${suffix}${ColourReset} \n")
+ #message(WARNING "anakin_fetch_files_recursively ${BoldRed}failed${ColourReset}:\n"
+ # "real_dir:${BoldYellow}${search_dir}${ColourReset}\n"
+ # "suffix:*.${BoldYellow}${suffix}${ColourReset} \n")
endif()
endfunction()
@@ -39,7 +46,7 @@ endfunction()
# recursively fetch include dir
function(anakin_fetch_include_recursively root_dir)
if (IS_DIRECTORY ${root_dir})
- #message(STATUS "include dir: " ${Magenta}${root_dir}${ColourReset})
+ #message(STATUS "include dir: " ${Magenta}${root_dir}${ColourReset})
include_directories(${root_dir})
endif()
@@ -51,6 +58,14 @@ function(anakin_fetch_include_recursively root_dir)
endforeach()
endfunction()
+# judge whether the host cpu advertises avx in /proc/cpuinfo
+function(anakin_judge_avx outputs)
+    exec_program("cat /proc/cpuinfo | grep flags | uniq"
+                 OUTPUT_VARIABLE OUTPUT
+                 RETURN_VALUE VALUE)
+    message("anakin_judge_avx cpuinfo flags: ${OUTPUT}")
+    set(${outputs} ${${outputs}} PARENT_SCOPE)
+endfunction()
# ----------------------------------------------------------------------------
# section: help to detect the compiler options
# ----------------------------------------------------------------------------
@@ -129,19 +144,19 @@ macro(anakin_check_compiler_flag LANG FLAG RESULT)
endmacro()
macro(anakin_check_flag_support lang flag varname)
- if("_${lang}_" MATCHES "_CXX_")
- set(_lang CXX)
+ if("_${lang}_" MATCHES "_CXX_")
+ set(_lang CXX)
elseif("_${lang}_" MATCHES "_CU_")
set(_lang NVCC)
- else()
- set(_lang ${lang})
- endif()
+ else()
+ set(_lang ${lang})
+ endif()
- string(TOUPPER "${flag}" ${varname})
- string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}")
- string(REGEX REPLACE " --|-|=| |\\." "_" ${varname} "${${varname}}")
+ string(TOUPPER "${flag}" ${varname})
+ string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}")
+ string(REGEX REPLACE " --|-|=| |\\." "_" ${varname} "${${varname}}")
- anakin_check_compiler_flag("${_lang}" "${ARGN} ${flag}" ${${varname}})
+ anakin_check_compiler_flag("${_lang}" "${ARGN} ${flag}" ${${varname}})
endmacro()
macro(anakin_add_compile_option option)
@@ -224,31 +239,26 @@ endfunction()
# ----------------------------------------------------------------------------
# section: generate the protobuf .h and .cpp files.
# ----------------------------------------------------------------------------
-function(anakin_protos_processing)
- set(PROTO_SRC_PATH ${ANAKIN_MODEL_PARSER}/proto)
- set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/)
-
- anakin_fetch_files_with_suffix(${PROTO_SRC_PATH} "proto" PROTO_SRC_FILES)
- foreach(__file ${PROTO_SRC_FILES})
- exec_program(protoc ${__working_dir} ARGS " -I=${PROTO_SRC_PATH} --cpp_out=. ${__file}"
- OUTPUT_VARIABLE OUTPUT
- RETURN_VALUE VALUE)
- if(NOT VALUE)
- anakin_fetch_files_with_suffix(${__working_dir} "h" PROTO_GENERATE_H)
- # get *.cpp or *.cc
- anakin_fetch_files_with_suffix(${__working_dir} "c*" PROTO_GENERATE_C)
- foreach(__include_file ${PROTO_GENERATE_H})
- exec_program(mv ARGS ${__include_file} ${PROTO_SRC_PATH}
- OUTPUT_VARIABLE __out
- RETURN_VALUE __value)
- endforeach()
- foreach(__src_file ${PROTO_GENERATE_C})
- if(POLICY CMP0007)
- cmake_policy(PUSH)
- cmake_policy(SET CMP0007 NEW)
- endif()
- string(REPLACE "." ";" SRC_LIST ${__src_file})
- list(GET SRC_LIST -1 __src_file_name_suffix)
+function(anakin_gen_pb proto_src_path)
+ set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/)
+ foreach(__proto_file ${ARGN})
+ exec_program(${PROTOBUF_PROTOC_EXECUTABLE} ${__working_dir} ARGS " -I=${proto_src_path} --cpp_out=. ${__proto_file}"
+ OUTPUT_VARIABLE OUTPUT RETURN_VALUE VALUE)
+ if(NOT VALUE)
+ anakin_fetch_files_with_suffix(${__working_dir} "h" PROTO_GENERATE_H)
+ # get *.cpp or *.cc
+ anakin_fetch_files_with_suffix(${__working_dir} "c*" PROTO_GENERATE_C)
+ foreach(__include_file ${PROTO_GENERATE_H})
+ exec_program(mv ARGS ${__include_file} ${proto_src_path}
+ OUTPUT_VARIABLE __out RETURN_VALUE __value)
+ endforeach()
+ foreach(__src_file ${PROTO_GENERATE_C})
+ if(POLICY CMP0007)
+ cmake_policy(PUSH)
+ cmake_policy(SET CMP0007 NEW)
+ endif()
+ string(REPLACE "." ";" SRC_LIST ${__src_file})
+ list(GET SRC_LIST -1 __src_file_name_suffix)
list(GET SRC_LIST -3 __src_file_name)
string(REPLACE "/" ";" SRC_LIST_PATH ${__src_file_name})
@@ -259,18 +269,31 @@ function(anakin_protos_processing)
else()
set(__full_src_filename "${__pure_src_file_name}.pb.cc")
endif()
- #message(STATUS " first ---> ${__working_dir}${__full_src_filename} ${ANAKIN_ROOT}/src/${__pure_src_file_name}.pb.cpp")
- exec_program(mv ARGS " ${__working_dir}${__full_src_filename} ${PROTO_SRC_PATH}/${__pure_src_file_name}.pb.cpp"
+ exec_program(mv ARGS " ${__working_dir}${__full_src_filename} ${proto_src_path}/${__pure_src_file_name}.pb.cpp"
OUTPUT_VARIABLE __out
RETURN_VALUE __value)
if(POLICY CMP0007)
cmake_policy(POP)
endif()
- endforeach()
- else()
- message(FATAL_ERROR "anakin_protos_processing : ${__file} \n error msg: ${OUTPUT}")
- endif()
- endforeach()
+ endforeach()
+ else()
+            message(FATAL_ERROR "anakin_gen_pb: ${__proto_file} \n error msg: ${OUTPUT}")
+ endif()
+ endforeach()
+endfunction()
+
+function(anakin_protos_processing)
+ set(PROTO_SRC_PATH ${ANAKIN_MODEL_PARSER}/proto)
+ set(SERVICE_API_SRC_PATH ${ANAKIN_SERVICE}/api)
+
+ set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/)
+
+ anakin_fetch_files_with_suffix(${PROTO_SRC_PATH} "proto" PROTO_SRC_FILES)
+ anakin_fetch_files_with_suffix(${SERVICE_API_SRC_PATH} "proto" SERVICE_API_PROTO_SRC_FILES)
+ anakin_gen_pb(${PROTO_SRC_PATH} ${PROTO_SRC_FILES})
+ if(BUILD_RPC)
+ anakin_gen_pb(${SERVICE_API_SRC_PATH} ${SERVICE_API_PROTO_SRC_FILES})
+ endif()
endfunction()
# ----------------------------------------------------------------------------
diff --git a/docker/AMD/centos/centos7-rocm-opencl/Dockerfile b/docker/AMD/centos/centos7-rocm-opencl/Dockerfile
new file mode 100755
index 000000000..f0c65b7cc
--- /dev/null
+++ b/docker/AMD/centos/centos7-rocm-opencl/Dockerfile
@@ -0,0 +1,42 @@
+
+FROM centos:7.4.1708
+
+# anakin centos AMD GPU (ROCm OpenCL) env
+RUN yum -y install vim wget git make glibc-devel libstdc++-devel epel-release gcc gcc-c++ libstdc++ && rm -rf /var/cache/yum/*
+
+RUN yum -y install python-pip && rm -rf /var/cache/yum/*
+
+RUN pip install --no-cache-dir --upgrade pip && \
+ pip install --no-cache-dir \
+ flask numpy pyyaml scipy pandas
+
+# set env
+ENV LIBRARY_PATH /usr/lib64:$LIBRARY_PATH
+
+# install cmake
+RUN wget https://cmake.org/files/v3.2/cmake-3.2.0.tar.gz && tar xzf cmake-3.2.0.tar.gz && \
+ cd cmake-3.2.0 && ./bootstrap && \
+ make -j4 && make install && cd .. && rm -f cmake-3.2.0.tar.gz
+
+# install protobuf
+RUN wget --no-check-certificate https://mirror.sobukus.de/files/src/protobuf/protobuf-cpp-3.4.0.tar.gz \
+ && tar -xvf protobuf-cpp-3.4.0.tar.gz \
+ && cd protobuf-3.4.0 && ./configure \
+ && make -j4 && make install && cd .. \
+ && rm -f protobuf-cpp-3.4.0.tar.gz
+
+RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo \
+ && echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo \
+ && echo "baseurl=http://repo.radeon.com/rocm/yum/rpm" >> /etc/yum.repos.d/rocm.repo \
+ && echo "enabled=1" >> /etc/yum.repos.d/rocm.repo \
+ && echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo
+
+RUN yum -y install rocm-opencl rocm-opencl-devel && rm -rf /var/cache/yum/*
+
+# set env
+ENV LIBRARY_PATH /opt/rocm/lib:/opt/rocm/opencl/lib/x86_64:$LIBRARY_PATH
+ENV OCL_ROOT /opt/rocm/opencl
+ENV PATH /opt/rocm/bin:/opt/rocm/opencl/bin/x86_64:$PATH
+
+RUN git clone --branch AMD --recursive "https://github.com/PaddlePaddle/Anakin.git" /root/Anakin && cd /root/Anakin/tools/ && ./amd_gpu_build.sh && cd -
+
diff --git a/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile b/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile
index ab81b4ac8..3bb0ffb8c 100644
--- a/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile
+++ b/docker/NVIDIA/ubuntu/ubuntu16.04-cuda8-cudnn7/Dockerfile
@@ -43,4 +43,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
ENV CUDNN_ROOT=/usr/local/cuda/include
# build and install anakin
-RUN git clone --branch developing --recursive https://github.com/PaddlePaddle/Anakin.git
+#RUN git clone --branch developing --recursive https://github.com/PaddlePaddle/Anakin.git
diff --git a/docker/README.md b/docker/README.md
index fcc1511f9..5ea351395 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -37,6 +37,22 @@ $chmod +x ./anakin_docker_build_and_run.sh
$./anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Run
```
+### AMD Docker
+#### Build Image
+```bash
+$/usr/bash anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Build
+or
+$chmod +x ./anakin_docker_build_and_run.sh
+$./anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Build
+```
+
+#### Run docker
+```bash
+$/usr/bash anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Run
+or
+$chmod +x ./anakin_docker_build_and_run.sh
+$./anakin_docker_build_and_run.sh -p AMD-GPU -o Centos -m Run
+```
### X86 Docker
> Not support yet
diff --git a/docker/README_cn.md b/docker/README_cn.md
new file mode 100644
index 000000000..6d5ed994a
--- /dev/null
+++ b/docker/README_cn.md
@@ -0,0 +1,46 @@
+# Anakin 2.0 And Docker
+---
+
+## 依赖软件
+
++ 你的操作系统上应该已经安装了docker.
++ 如果你要在docker中使用`NVIDIA GPU` 还需要安装[nvidia-docker2](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-(version-2.0))
+
+## 使用方法
+
+推荐使用 `anakin_docker_build_and_run.sh` 脚本来构建和运行docker镜像,脚本的使用方法如下
+
+```bash
+Usage: anakin_docker_build_and_run.sh -p -o -m
+
+选项:
+
+ -p 硬件的运行环境 [ NVIDIA-GPU / AMD-GPU / X86-ONLY / ARM ]
+ -o 主机的操作系统类型 [ Centos / Ubuntu ]
+ -m 脚本的执行模式[ Build / Run / All] 默认模式是 build and run
+```
+
+### GPU Docker
+#### 构建镜像
+```bash
+/usr/bash anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Build
+或者
+chmod +x ./anakin_docker_build_and_run.sh
+./anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Build
+```
+
+#### 运行 docker容器
+```bash
+/usr/bash anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Run
+或者
+chmod +x ./anakin_docker_build_and_run.sh
+./anakin_docker_build_and_run.sh -p NVIDIA-GPU -o Centos -m Run
+```
+
+### X86 Docker
+
+> Not supported yet
+
+### ARM Docker
+
+> Not supported yet
diff --git a/docker/anakin_docker_build_and_run.sh b/docker/anakin_docker_build_and_run.sh
index 97802a4e8..1eb989ceb 100755
--- a/docker/anakin_docker_build_and_run.sh
+++ b/docker/anakin_docker_build_and_run.sh
@@ -14,7 +14,7 @@ help_anakin_docker_run() {
echo ""
echo "Options:"
echo ""
- echo " -p Hardware Place where docker will running [ NVIDIA-GPU / AMD_GPU / X86-ONLY / ARM ] "
+ echo " -p Hardware Place where docker will running [ NVIDIA-GPU / AMD-GPU / X86-ONLY / ARM ] "
echo " -o Operating system docker will reside on [ Centos / Ubuntu ] "
echo " -m Script exe mode [ Build / Run ] default mode is build and run"
exit 1
@@ -56,7 +56,7 @@ building_and_run_nvidia_gpu_docker() {
if [ ! $MODE = "Run" ]; then
echo "Building nvidia docker ... [ docker_image_name: anakin image_tag: $tag ]"
sudo docker build --network=host -t anakin:$tag"-base" . -f $DockerfilePath
- sudo docker run --network=host -it anakin:$tag"-base" Anakin/tools/gpu_build.sh
+ sudo docker run --network=host -it anakin:$tag"-base" Anakin/tools/nv_gpu_build.sh
container_id=$(sudo docker ps -l | sed -n 2p | awk '{print $1}')
sudo docker commit $container_id anakin:$tag
else
@@ -67,9 +67,19 @@ building_and_run_nvidia_gpu_docker() {
# buiding and running docker for amd gpu
building_and_run_amd_gpu_docker() {
- echo "not support yet"
- read
- exit 1
+ if [ ! $# -eq 2 ]; then
+ exit 1
+ fi
+ DockerfilePath=$1
+ MODE=$2
+ tag="$(echo $DockerfilePath | awk -F/ '{print tolower($(NF-3) "_" $(NF-1))}')"
+ if [ ! $MODE = "Run" ]; then
+ echo "Building amd docker ... [ docker_image_name: anakin image_tag: $tag ]"
+ sudo docker build --network=host -t anakin:$tag . -f $DockerfilePath
+ else
+ echo "Running amd docker ... [ docker_image_name: anakin image_tag: $tag ]"
+ sudo docker run -it --device=/dev/kfd --device=/dev/dri --group-add video anakin:$tag /bin/bash
+ fi
}
# building and running docker for x86
@@ -91,7 +101,7 @@ dispatch_docker_path() {
# declare associative map from place to relative path
declare -A PLACE2PATH
PLACE2PATH["NVIDIA-GPU"]=NVIDIA
- PLACE2PATH["AMD_GPU"]=AMD
+ PLACE2PATH["AMD-GPU"]=AMD
PLACE2PATH["X86-ONLY"]=X86
PLACE2PATH["ARM"]=ARM
# declare associative map from os to relative path
@@ -155,7 +165,7 @@ dispatch_docker_path $place $os
if [ $place = "NVIDIA-GPU" ]; then
building_and_run_nvidia_gpu_docker $SupportDockerFilePath $mode
-elif [ $place = "AMD_GPU" ]; then
+elif [ $place = "AMD-GPU" ]; then
building_and_run_amd_gpu_docker $SupportDockerFilePath $mode
elif [ $place = "X86-ONLY" ]; then
building_and_run_x86_docker $SupportDockerFilePath $mode
diff --git a/docs/Manual/C++APIs_ch.md b/docs/Manual/C++APIs_ch.md
new file mode 100644
index 000000000..e0dc81d71
--- /dev/null
+++ b/docs/Manual/C++APIs_ch.md
@@ -0,0 +1,624 @@
+# C++ APIs ##
+
+本教程将会介绍Anakin的一些基本的API及如何调用这些API。
+
+主要内容如下:
+
+- [Anakin APIs](#api)
+- [示例代码](#example)
+
+## Anakin APIs ###
+### Tensor ####
+
+`Tensor`提供基础的数据操作和管理,为ops提供统一的数据接口。`Tensor`包含以下几个属性:
+
+- Buffer
+ 数据存储区
+- Shape
+ 数据的维度信息
+- Event
+ 用于异步计算的同步
+
+ `Tensor` 类包含三个`Shape`对象, 分别是`_shape`, `_valid_shape`和 `_offset`。 `_shape`为`tensor`真正空间信息,`_valid_shape`表示当前`tensor`使用的空间信息, `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`的不同维度分别与数学中的向量、矩阵等相对应,如下表所示。
+
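+下面用一个简短示例说明这三个`Shape`的关系(仅为示意,接口与后文 set_shape 一节一致):
+
+```c++
+Shape shape(1, 1, 8, 8);        // _shape: tensor 真正的数据空间,共 64 个元素
+Shape valid_shape(1, 1, 4, 4);  // _valid_shape: 当前实际使用的 4x4 子区域
+Shape offset(0, 0, 2, 2);       // _offset: 子区域相对真正数据空间的起始位置
+
+Tensor<X86, AK_FLOAT, NCHW> mytensor(shape);
+// 三者的 LayOutType 必须一致(这里均为 NCHW);set_shape 只改形状信息,不重新分配内存
+mytensor.set_shape(valid_shape, shape, offset);
+```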
+
+Dimensions | Math entity
+ :----: | :----:
+1 | vector
+2 | matrix
+3 | 3-tensor
+n | n-tensor
+
+#### 声明tensor对象
+
+`Tensor`接受三个模板参数:
+
+
+```c++
+ template<typename TargetType, DataType datatype, typename LayOutType = NCHW>
+ class Tensor .../* Inherit other class */{
+  //some implements
+  ...
+ };
+```
+
+TargetType是平台类型,如X86,GPU等等,在Anakin内部有相应的标识与之对应;datatype是普通的数据类型,在Anakin内部也有相应的标志与之对应;[LayOutType](#layout)是数据分布类型,如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下:
+
+1. TargetType
+
+ Anakin TargetType | platform
+ :----: | :----:|
+ NV | NVIDIA GPU
+ ARM | ARM
+ AMD | AMD GPU
+ X86 | X86
+ NVHX86 | NVIDIA GPU with Pinned Memory
+
+2. DataType
+
+Anakin DataType | C++ | Description
+:---: | :---: | :---: |
+AK_HALF | short | fp16
+AK_FLOAT | float | fp32
+AK_DOUBLE | double | fp64
+AK_INT8 | char | int8
+AK_INT16 | short | int16
+AK_INT32 | int | int32
+AK_INT64 | long | int64
+AK_UINT8 | unsigned char | uint8
+AK_UINT16 | unsigned short | uint16
+AK_UINT32 | unsigned int | uint32
+AK_STRING | std::string | /
+AK_BOOL | bool | /
+AK_SHAPE | / | Anakin Shape
+AK_TENSOR | / | Anakin Tensor
+
+
+3. LayOutType
+
+Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support
+:---: | :---: | :---: | :---: |
+W | 1-D | YES | NO
+HW | 2-D | YES | NO
+WH | 2-D | YES | NO
+NW | 2-D | YES | YES
+NHW | 3-D | YES |YES
+NCHW ( default ) | 4-D | YES | YES
+NHWC | 4-D | YES | NO
+NCHW_C4 | 5-D | YES | YES
+
+
+理论上,Anakin支持申明1维以上的tensor,但是对于Anakin中的Op来说,只支持NW、NHW、NCHW、NCHW_C4这四种LayOut,其中NCHW是默认的LayOutType,NCHW_C4是专门针对于int8这种数据类型的。
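+
+下面给出一个简单示意,展示 Op 支持的几种 LayOut 对应的 tensor 声明(仅为示意,声明方式与前文一致):
+
+```c++
+Shape shape_nw(N, W);         // 2-D
+Shape shape_nchw(N, C, H, W); // 4-D
+
+Tensor<X86, AK_FLOAT, NW> t0(shape_nw);     // Op 支持的 2-D LayOut
+Tensor<X86, AK_FLOAT, NCHW> t1(shape_nchw); // 默认 LayOut
+// NCHW_C4 为 5-D LayOut,专用于 int8 数据,声明方式类似
+```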
+
+
+例子
+
+> 下面的代码将展示如何使用tensor, 我们建议先看看这些示例。
+
+> 要想获得更多关于tensor的信息, 请参考 *source_path/core/tensor.h*
+
+> 1. 使用shape对象初始化tensor
+``` c++
+ //create a null tensor. A null tensor holds for nothing.
+ //tensor's buffer is resident at CPU and its datatype is AK_FLOAT.
+ //tensor's Layout is NCHW(default)
+ Tensor<X86, AK_FLOAT> mytensor;
+
+ //1. using shape object to create a tensor.
+ Shape shape1(NUM); //1-D shape. NUM is the size of the dimension.
+ Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
+
+ // A 4-D shape
+ Shape shape2(N, C, H, W); // batch x channel x height x width
+```
+
+>`注意:Shape的维度必须和tensor的`[LayoutType](#layout)`相同,比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW,否则会出错。如下列代码所示`
+
+
+```c++
+ // A 4-D tensor.
+ Tensor<X86, AK_FLOAT, NCHW> mytensor2(shape2); //right
+
+ //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+ Tensor<NV, AK_INT8, NCHW> mytensor3(shape2); //right
+
+ Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimension must be equal to tensor's Layout.
+ Tensor<NV, AK_FLOAT, NW> mytensor5(shape2); //wrong!!!!
+
+```
+
+> 2. 使用现有的数据和shape初始化tensor
+
+```c++
+
+ /**
+ * A constructor of Tensor.
+ * data_ptr is a pointer to any data type of data
+ * TargetType is type of a platform [Anakin TargetType]
+ * id : device id
+ * shape: an Anakin shape
+ */
+ Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
+
+ //using existing data feed to a tensor
+ Tensor<X86, AK_FLOAT, NCHW> mytensor(data_ptr, target, device_id, shape); //shape must have dimension (N, C, H, W).
+
+```
+
+> 3. 使用tensor初始化tensor
+
+```c++
+ Tensor<NV, AK_FLOAT, NCHW> tensor(exist_tensor);
+```
+
+
+> 提示: 你可以用 `typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4d_X86;` 方便定义tensor
+
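+例如(示意):
+
+```c++
+typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4d_X86;
+
+Shape shape(N, C, H, W);
+Tensor4d_X86 my_host_tensor(shape); // 等价于 Tensor<X86, AK_FLOAT, NCHW>
+```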
+
+#### 填充tensor数据区
+
+
+填充数据区得看你申明tensor的方式, 下面展示了如何填充tensor的数据区。
+
+```c++
+首先来看看tensor的四种声明方式:
+
+1. Tensor<X86, AK_FLOAT> mytensor;
+2. Tensor<X86, AK_FLOAT, W> mytensor1(shape1);
+3. Tensor<X86, AK_FLOAT, NCHW> mytensor(data_ptr, target, device_id, shape);
+4. Tensor<NV, AK_FLOAT, NCHW> tensor(exist_tensor);
+
+
+相关的声明方式的数据填充方法如下:
+
+1:声明一个空的tensor,此时没有为其分配内存,所以,我们需要手动的为其分配内存。
+
+ //param shape
+ mytensor.re_alloc(Shape shape);
+
+ //Get writable pointer to mytensor.
+ //param index (int): where you start to write.
+ //Dtype is your data type such as int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+2: 这种声明方式会自动分配内存
+
+ //Get writable pointer to mytensor.
+ //param index (int): where you start to write.
+ //Dtype is your data type such as int, float or double.
+ Dtype *p = mytensor1.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+
+3:在该种声明方式中,我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存,得依情况而定。如果data_ptr和申明的
+tensor都在同一个目标平台上,那么该tensor就会与data_ptr共享内存空间;相反,如果它们不在同一个平台上(如data_ptr在X86上,而
+tensor在GPU上),那么此时tensor就会开辟一个新的内存空间,并将data_ptr所指向的数据拷贝到tensor的buffer中。
+
+ //Get writable pointer to mytensor.
+ //param index (int): where you start to write.
+ //Dtype is your data type such as int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+4:该种方式仍不需要手动分配内存
+
+ //Get writable pointer to mytensor.
+ //param index (int): where you start to write.
+ //Dtype is your data type such as int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+
+另外,你还可以获取一个tensor的可读指针,示例如下:
+ //Get read-only pointer to mytensor.
+ //param index (int): where you start to read.
+ //Dtype is your data type such as int, float or double.
+ Dtype *p = mytensor.data(index/*=0*/);
+ //do something ...
+```
+
+如果想更详细的了解tensor,请查阅*source_path/saber/core/tensor.h*
+
+#### 获取tensor的shape
+
+```c++
+//some declarations
+// ...
+Shape shape = mytensor.shape();
+
+//Get the first dimension size of tensor, if it has.
+int d1 = shape[0];
+
+//Get the second dimension size of tensor, if it has.
+int d2 = shape[1];
+
+...
+
+//Get the n-th dimension size of tensor, if it has.
+int dn = shape[n-1];
+
+
+//Get a tensor's dimension
+int dims = mytensor.dims();
+
+//Get the size of tensor.
+//size = d1 x d2 x ... x dn.
+int size = mytensor.size();
+
+//Get the size of tensor at interval [Di, Dj)
+// from i-th dimension to j-th dimension, but not including the j-th dimension.
+// which means d(i) x d(i+1) x ... x d(j-1)
+int part_size = mytensor.count(start, end);
+```
+
+#### 设置tensor的shape
+
+我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义
+
+
+```c++
+/**
+ * \brief set a tensor's shape
+ * \param valid_shape [a Shape object]
+ * \param shape [a Shape object]
+ * \param offset [a Shape object]
+ * \return the status of this operation, indicating whether it succeeded or not.
+ */
+SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value));
+```
+
+这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同,如果不同就会出错,返回SaberInvalidValue。 如果相同,那么将成功设置tensor的shape。
+
+```c++
+
+// some declarations
+// ...
+//valid_shape, shape , offset are Shape object;
+//All these Shape object's LayOutType must be equal to mytensor's.
+mytensor.set_shape(valid_shape, shape, offset);
+
+```
+
+#### 重置 tensor的shape
+
+```c++
+//some declarations
+Shape shape, valid_shape, offset;
+
+//do some initializations
+...
+mytensor.reshape(valid_shape, shape, offset);
+```
+
+注意: Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同
+
+
+### Graph ###
+
+`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。
+
+#### 图的声明
+
+与`Tensor`一样,graph也接受三个模板参数。
+
+```c++
+
+template<typename TargetType, DataType Dtype, Precision Ptype>
+class Graph ... /* inherit other class*/{
+
+ //some implements
+ ...
+
+};
+```
+
+前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们再介绍它。
+
+
+```c++
+
+//Create an empty graph object.
+Graph<NV, AK_FLOAT, Precision::FP32> graph;
+
+//Create a pointer to an empty graph.
+Graph<NV, AK_FLOAT, Precision::FP32>* graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+
+//Create a pointer to an empty graph.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+
+```
+
+#### 加载 Anakin 模型
+
+```c++
+//some declarations
+...
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+std::string model_path = "the/path/to/where/your/models/are";
+const char *model_path1 = "the/path/to/where/your/models/are";
+
+//Loading Anakin model to generate a compute graph.
+auto status = graph->load(model_path);
+
+//Or this way.
+auto status = graph->load(model_path1);
+//Check whether load operation success.
+if(!status){
+ std::cout << "error" << std::endl;
+ //do something...
+}
+
+```
+
+#### 优化计算图
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//According to the ops of loaded graph, optimize compute graph.
+graph->Optimize();
+
+```
+
+> 注意: 第一次加载原始图,必须要优化。
+
+#### 保存模型
+
+你可以在任何时候保存模型, 特别的, 你可以保存一个优化的模型,这样,下次再加载模型时,就不必进行优化操作。
+
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+// save a model
+//save_model_path: the path to where your model is.
+auto status = graph->save(save_model_path);
+
+//Checking
+if(!status){
+ std::cout << "error" << std::endl;
+ //do something...
+}
+```
+
+#### 重新设置计算图里的tensor的shape
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+std::vector<int> shape{10, 256, 256, 10};
+//input_name : std::string.
+//Reshape a tensor named input_name.
+graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object.
+```
+
+#### 设置 batch size
+
+`Graph` 支持重新设置batch size的大小。
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//input_name : std::string.
+//Reset a tensor named input_name.
+int new_batch_size = 4;
+graph->ResetBatchSize(input_name, new_batch_size);
+```
+
+### Net ###
+
+
+`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出。
+#### Creating a graph executor
+
+`Net`接受四个模板参数。
+
+
+```c++
+template<typename TargetType, DataType Dtype, Precision PType, OpRunType RunType = OpRunType::ASYNC>
+class Net{
+ //some implements
+ ...
+
+};
+```
+由于有些Op可能支持多种精度,我们可以通过Precision来指定。OpRunType表示同步或异步类型,异步是默认类型。OpRunType::SYNC表示同步,在GPU上只有单个流;OpRunType::ASYNC表示异步,在GPU上有多个流并以异步方式执行。实际上,Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*.
+
+
+1. Precision
+
+Precision | Op support
+:---: | :---:
+Precision::INT4 | NO
+Precision::INT8 | NO
+Precision::FP16 | NO
+Precision::FP32 | YES
+Precision::FP64 | NO
+
+现在Op的精度只支持FP32, 但在将来我们会支持剩下的Precision.
+
+
+
+2. OpRunType
+
+OpRunType | Sync/Async | Description
+:---: | :---: | :---:
+OpRunType::SYNC | Synchronization | single-stream on GPU
+OpRunType::ASYNC | Asynchronization | multi-stream on GPU
+
+用graph对象创建一个执行器。
+```c++
+//some declarations
+...
+//Create a pointer to a graph.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+//do something...
+...
+
+//create an executor
+Net<NV, AK_FLOAT, Precision::FP32, OpRunType::SYNC> executor(*graph);
+
+```
+
+#### 获取输入输出tensor
+
+
+获取输入输出tensor,并填充输入tensor的buffer。如果想要获取输入和输出tensor,那么必须指定输入的名字,如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外,如果想知道input_i对应哪个输入,你需要去dash board查看,如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码
+
+```c++
+//some declarations
+...
+
+//create an executor
+//TargetType is NV [NVIDIA GPU]
+Net<NV, AK_FLOAT, Precision::FP32, OpRunType::SYNC> executor(*graph);
+
+//Get the first input tensor.
+//The following tensors(tensor_in0, tensor_in1 ...) are resident at GPU.
+//Note: Member function get_in returns a pointer to tensor.
+Tensor<NV, AK_FLOAT, NCHW>* tensor_in0 = executor.get_in("input_0");
+
+//If you have multiple input tensors
+//You just type this code below.
+Tensor<NV, AK_FLOAT, NCHW>* tensor_in1 = executor.get_in("input_1");
+...
+auto tensor_inn = executor.get_in("input_n");
+```
+
+当得到输入tensor之后,就可以填充它的数据区了。
+
+```c++
+//This tensor is resident at GPU.
+auto tensor_d_in = executor.get_in("input_0");
+
+//To feed the device tensor above, we first fill a host tensor, and then copy it to the device one.
+
+//using Tensor4d = Tensor<X86, AK_FLOAT, NCHW>;
+Tensor4d tensor_h_in; //host tensor;
+//Tensor<X86, AK_FLOAT, NCHW> tensor_h_in;
+
+//Allocate memory for host tensor.
+tensor_h_in.re_alloc(tensor_d_in->valid_shape());
+//Get a writable pointer to tensor.
+float *h_data = tensor_h_in.mutable_data();
+
+//Feed your tensor.
+/** example
+for(int i = 0; i < tensor_h_in.size(); i++){
+ h_data[i] = 1.0f;
+}
+*/
+//Copy host tensor's data to device tensor.
+tensor_d_in->copy_from(tensor_h_in);
+
+// And then
+```
+
+
+类似的,我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是, 我们需要指定输出tensor结点的名字,这个可以从dash board中看到,请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor:
+```c++
+//Note: this tensor is resident at GPU.
+Tensor<NV, AK_FLOAT, NCHW>* tensor_out_d = executor.get_out("pred_out");
+
+```
+
+
+#### Executing graph
+
+
+当一切准备就绪后,我们就可以执行真正的计算了!
+```c++
+executor.prediction();
+```
+
+## 示例代码 ##
+
+下面的例子展示了如何调用Anakin。
+
+在这儿之前, 请确保你已经有了Anakin模型。如果还没有,那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。
+
+### Single-thread
+
+单线程例子在 *`source_root/test/framework/net/net_exec_test.cpp`*
+
+```c++
+
+std::string model_path = "your_Anakin_models/xxxxx.anakin.bin";
+// Create an empty graph object.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+// Load Anakin model.
+auto status = graph->load(model_path);
+if(!status ) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+}
+// Reshape
+graph->Reshape("input_0", {10, 384, 960, 10});
+// You must optimize graph for the first time.
+graph->Optimize();
+// Create an executor.
+Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph);
+
+//Get your input tensors through some specific string such as "input_0", "input_1", and
+//so on.
+//And then, feed the input tensor.
+//If you don't know which input these specific strings ("input_0", "input_1") correspond to, you can launch the dash board to find out.
+auto d_tensor_in_p = net_executer.get_in("input_0");
+Tensor4d<X86, AK_FLOAT> h_tensor_in;
+auto valid_shape_in = d_tensor_in_p->valid_shape();
+for (int i = 0; i < valid_shape_in.size(); i++) {
+    LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
+}
+h_tensor_in.re_alloc(valid_shape_in);
+float* h_data = h_tensor_in.mutable_data();
+for (int i = 0; i < h_tensor_in.size(); i++) {
+    h_data[i] = 1.0f;
+}
+d_tensor_in_p->copy_from(h_tensor_in);
+
+//Do inference.
+net_executer.prediction();
+
+//Get result tensor through the name of output node.
+//And also, you need to see the dash board again to find out how many output nodes there are and remember their names.
+
+//For example, you've got an output node named obj_pred_out
+//Then, you can get an output tensor.
+auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor.
+auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor.
+//......
+// do something else ...
+//...
+//save model.
+//You might not optimize the graph when you load the saved model again.
+std::string save_model_path = model_path + std::string(".saved");
+status = graph->save(save_model_path);
+if (!status) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+}
+
+```
diff --git a/docs/Manual/Contribution_ch.md b/docs/Manual/Contribution_ch.md
new file mode 100644
index 000000000..438d207b8
--- /dev/null
+++ b/docs/Manual/Contribution_ch.md
@@ -0,0 +1,178 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+
+## Contributor License Agreements
+
+在您的代码合入之前请签署个人或者公司的Contributor License Agreement(CLA)。
+
+- 如果您个人是原始代码的拥有者,并拥有代码的知识产权,您需要签署[个人CLA](https://gist.github.com/tanzhongyibidu/6605bdef5f7bb03b9084dd8fed027037)
+- 如果原始代码属于公司,并且公司同意提交代码到我们的仓储,那您需要签署[公司CLA](https://gist.github.com/tanzhongyibidu/709c675c1e79804e3e871f8c1e62292d)
+
+请您选择合适的CLA并仔细阅读,在您签署CLA后方可将代码合入。
+
+## 添加License
+
+在新提交的代码中包含license:
+
+- c++代码头文件
+
+```c++
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+```
+
+- python代码
+
+```python
+# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+```
+
+## 代码要求
+
+- 代码注释请遵守[Doxygen](http://www.stack.nl/~dimitri/doxygen/)的样式
+- 所有代码必须具有单元测试
+- 通过所有单元测试
+- 请遵守提交代码的一些约定
+
+以下教程将指导您提交代码
+
+## Fork
+首先跳转到[Anakin](https://github.com/PaddlePaddle/Anakin)的github首页,然后点击`Fork`, 生成自己目录下的仓库
+
+## 克隆(clone)
+
+将远程仓库clone到本地:
+
+```bash
+git clone YOUR_REPOSITORY_URL
+cd Anakin
+```
+
+## 创建本地分支
+Anakin目前使用[Git流分支模型](https://nvie.com/posts/a-successful-git-branching-model/)进行开发, 测试和维护。
+所有的feature和bug fix的开发工作都应该在一个新的分支上完成,根据需要从现有分支上创建新分支。
+使用`git checkout -b`创建并切换到新分支
+```bash
+git checkout -b YOUR_NEW_BRANCH
+```
+
+## 开始开发
+
+编写代码
+
+
+## 构建和测试
+
+详细请参考[Docker installation guide](docker/README.md) 和 [build from source guide](docs/Manual/INSTALL_en.md)。
+
+
+## 提交(commit)
+
+提交代码时,请认真写好提交说明,这样其他人就可以清楚的知道这次提交做了哪些改变:
+```bash
+git commit -m 'description'
+```
+
+## 保持本地仓库最新
+
+在发起Pull Request之前,需要与原始仓库同步。
+
+如果还没添加原仓库,请先添加源,可通过`git remote -v`查看是否添加源:
+```bash
+git remote -v
+origin .... (fetch)
+origin .... (push)
+```
+如果只出现origin,说明还未添加源,可通过如下命令添加源:
+```bash
+git remote add upstream ORIGIN_REPOSITORY_URL
+```
+获取 upstream 的最新代码并更新当前分支
+```bash
+git fetch upstream
+git pull upstream BRANCH_NAME
+```
+## Push到远程仓库
+
+将本地的修改push到远程仓库上
+```bash
+git push origin BRANCH_NAME
+```
+
+## 提交Pull Request
+
+切换到所建分支,然后点击`New pull request`。
+
+![](pics/contri1.JPG)
+
+选择目标分支:
+
+![](pics/contri2.JPG)
+
+接下来等待review。
+
+## 删除远程分支
+在PR被merge进主仓库后,可以在PR的界面删除远程仓库的分支。
+也可以通过以下命令删除远程分支:
+```bash
+git push origin :YOUR_NEW_BRANCH
+```
+
+## 删除本地分支
+
+最后,删除本地分支。
+```bash
+#切换到其他分支
+git checkout OTHER_BRANCH
+
+#删除YOUR_NEW_BRANCH分支
+git branch -D YOUR_NEW_BRANCH
+```
+
+至此,我们就完成了一次代码贡献的过程。
+
+## 提交代码的一些约定
+
+为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定:
+
+1. 提交Pull Request前:
+- 注意commit的数量
+
+ - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。
+
+ - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](https://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)
+
+- 注意每个commit的名称:应能反映当前commit的内容,不能太随意。
+
+2. 如果解决了某个Issue的问题,请在该Pull Request的第一个评论框中加上:`fix #issue_number`,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+在回复评审人意见时,请您遵守以下约定:
+1. 评审人的每个意见都必须回复
+ - 对评审意见同意且按其修改完的,给个简单的Done即可
+ - 对评审意见不同意的,请给出您自己的反驳理由。
+2. 如果评审意见比较多
+ - 请给出总体的修改情况。
+ - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。
diff --git a/docs/Manual/Converter_ch.md b/docs/Manual/Converter_ch.md
index d137ba24a..56ca582b2 100644
--- a/docs/Manual/Converter_ch.md
+++ b/docs/Manual/Converter_ch.md
@@ -1,77 +1,73 @@
-# External Converter
+# 模型转换指南
-This guide will show you how to convert your models to Anakin models.
+Anakin 支持不同框架的模型预测。但由于格式的差别,Anakin 需要您预先转换模型。本文档介绍如何转换模型。
-## Introduction
+## 简介
-Before using Anakin, you must convert your models to Anakin ones. If you don't, Anakin won't work properly.
+Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型,模型包含网络结构(model 或 prototxt)和权重参数(param 或 caffemodel)。
-## Requirements
+模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。
+
+您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。
+
+
+## 系统要求
- python 2.7+
- pyyaml
- flask
+- protobuf 3.5+
-## Downloading Converter Source
-```bash
-git clone https://xxxxxxxxx
-```
+## 用法
+
+### 1、环境
+转换器所需的依赖标注于 *系统要求* 一节。
-## Usage
+### 2、配置
+您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。
-### 1. Configuration
-Configure your *config.yaml* file. Find example *config.yaml* file in the `converter source` directory. The example below explains how to configure your config.yaml file.
-#### Caffe Case
+#### config.yaml
```bash
OPTIONS:
- Framework: CAFFE # select a target dl-framework you want parsing
- SavePath: ./output
- ResultName: googlenet # the name you want when saving the parsed model
+ Framework: CAFFE # 依框架类型填写 CAFFE 或 FLUID
+ SavePath: ./output # 转换结束后模型的保存位置
+ ResultName: googlenet # 输出模型的名字
Config:
- LaunchBoard: ON # should be on if you want to launch graph board
+ LaunchBoard: ON # 是否生成网络结构预览页面
Server:
ip: 0.0.0.0
- port: 8888
- OptimizedGraph: # only enable(set enable(ON) and path) when you have optimized graph model.
- enable: ON
+ port: 8888 # 从一个可用端口访问预览页面
+ OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项
+ enable: OFF
path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
LOGGER:
- LogToPath: ./log/ # the path where log
- WithColor: ON # colorful log message
+ LogToPath: ./log/ # 生成日志的路径
+ WithColor: ON
TARGET:
CAFFE:
- # path to proto files
+ # 当 Framework 为 CAFFE 时需填写
ProtoPaths:
- /path/to/caffe/src/caffe/proto/caffe.proto
PrototxtPath: /path/to/your/googlenet.prototxt
ModelPath: /path/to/your/googlenet.caffemodel
-
- # not support yet
- PADDLE:
- # path to proto files
- ProtoPath:
- - /path/to/proto_0
- - /path/to/proto_1
- - /path/to/proto_n
- PrototxtPath: /path/to/prototxt
- ModelPath: /path/to/model
- # ...
-```
-
-### 2. Converting
-After finishing configuration , you just need to call python script ```python converter.py``` to complete transfromation.
-
-### 3. Launching dash board
-Anakin external converter will be launched on site http://0.0.0.0:8888 (configurable).
-Then open you browser and search http://0.0.0.0:8888, amazing things will happen!
-
-> if you set ip to 0.0.0.0 in remote server, you need to open local browser and search the server real ip:port, not the 0.0.0.0.
+ FLUID:
+ # 当 Framework 为 FLUID 时需填写
+ Debug: NULL
+ ProtoPaths:
+ - /
+ PrototxtPath: /path/to/fluid/inference_model
+ ModelPath: /path/to/fluid/inference_model
+ # ...
+```
-### 4. Note
+### 3、转换
+在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。
-> 1.We support caffe so far
+### 4、预览
+最后一步,就是在浏览器中查看令人振奋的转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。
+> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。
diff --git a/docs/Manual/Converter_en.md b/docs/Manual/Converter_en.md
index d137ba24a..4262726ba 100644
--- a/docs/Manual/Converter_en.md
+++ b/docs/Manual/Converter_en.md
@@ -16,7 +16,7 @@ Before using Anakin, you must convert your models to Anakin ones. If you don't,
```bash
git clone https://xxxxxxxxx
-```
+```
## Usage
@@ -47,9 +47,8 @@ TARGET:
- /path/to/caffe/src/caffe/proto/caffe.proto
PrototxtPath: /path/to/your/googlenet.prototxt
ModelPath: /path/to/your/googlenet.caffemodel
-
- # not support yet
- PADDLE:
+
+ FLUID:
# path to proto files
ProtoPath:
- /path/to/proto_0
@@ -57,10 +56,10 @@ TARGET:
- /path/to/proto_n
PrototxtPath: /path/to/prototxt
ModelPath: /path/to/model
- # ...
+ # ...
```
-### 2. Converting
+### 2. Converting
After finishing configuration , you just need to call python script ```python converter.py``` to complete transfromation.
### 3. Launching dash board
@@ -73,5 +72,3 @@ Then open you browser and search http://0.0.0.0:8888, amazing things will happen
### 4. Note
> 1.We support caffe so far
-
-
diff --git a/docs/Manual/INSTALL_ch.md b/docs/Manual/INSTALL_ch.md
index 833976936..212e07c7e 100644
--- a/docs/Manual/INSTALL_ch.md
+++ b/docs/Manual/INSTALL_ch.md
@@ -6,7 +6,7 @@
* [在CentOS上安装 Anakin]()
* [在Ubuntu上安装 Anakin]()
-* [在ARM上安装 Anakin]()
+* [在ARM上安装 Anakin](run_on_arm_ch.md)
* [验证安装]()
@@ -63,10 +63,11 @@
### 在ARM上安装 Anakin ###
-暂时还不支持
+请参考[ARM安装文档](run_on_arm_ch.md)
### 验证安装 ###
-we are coming soon...
+
+安装完成后,如果没有报错信息,你可以通过运行 `output/unit_test`路径下的单测示例验证是否编译成功。
diff --git a/docs/Manual/INSTALL_en.md b/docs/Manual/INSTALL_en.md
index 506e80b80..c02401473 100644
--- a/docs/Manual/INSTALL_en.md
+++ b/docs/Manual/INSTALL_en.md
@@ -66,8 +66,93 @@ Not support yet.
#### 4. Building Anakin with AMD GPU Support ####
-Coming soon..
+ For more details of ROCm, please see [RadeonOpenCompute/ROCm](https://github.com/RadeonOpenCompute/ROCm)
+
+- 4.1. Setup Environment
+
+ - 4.1.1 Update OS (Optional, if your OS can be updated)
+ >$sudo yum update
+
+ - 4.1.2 Add ROCM repo
+ Create a /etc/yum.repos.d/rocm.repo file with the following contents:
+ ```bash
+ [ROCm]
+ name=ROCm
+ baseurl=http://repo.radeon.com/rocm/yum/rpm
+ enabled=1
+ gpgcheck=0
+ ```
+
+ - 4.1.3 Install ROCK-DKMS
+ Please check your kernel version before installing ROCk-DKMS and make sure the result matches your installed kernel-related packages, such as kernel-headers and kernel-devel
+ >$ uname -r
+
+ - 4.1.3.1 For kernel ver 3.10.0-`693` (Option 1)
+ Download kernel-devel-3.10.0-693.el7.x86_64.rpm and kernel-headers-3.10.0-693.el7.x86_64.rpm
+ >$sudo yum install kernel-devel-3.10.0-693.el7.x86_64.rpm kernel-headers-3.10.0-693.el7.x86_64.rpm
+
+ - 4.1.3.2 For kernel ver 3.10.0-`862` (Option 2)
+ >$ sudo yum install kernel-devel kernel-headers
+
+ - 4.1.3.3 Install ROCk-DKMS
+ >$ sudo yum install epel-release
+ >$ sudo yum install dkms
+ >$ sudo yum install rock-dkms
+
+ Use the commands below to check whether amdgpu was installed successfully.
+ >$ sudo dkms status
+ >$ 'amdgpu, 1.8-151.el7, ..., x86_64: installed (original_module exists)'
+
+ - 4.1.3.4
+ Reboot your device.
+
+ ** If you are using docker, then steps 4.1.4 to 4.1.8 are not required **
+
+ - 4.1.4 Install ROCm-OpenCL
+ >$sudo yum install rocm-opencl rocm-opencl-devel rocm-smi rocminfo
+
+ - 4.1.5 Add user to the video (or wheel) group
+ >$sudo usermod -a -G video $LOGNAME
+
+ - 4.1.6 Setting Environment variables
+ ```bash
+ echo 'export PATH=/opt/rocm/bin:/opt/rocm/opencl/bin/x86_64:$PATH' >> $HOME/.bashrc
+ echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc
+ echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/lib64' >>$HOME/.bashrc
+ source ~/.bashrc
+ ```
+ Check
+ >$ clinfo
+
+ - 4.1.7 protobuf 3.4.0
+ Download source from https://github.com/google/protobuf/releases/tag/v3.4.0
+ >tar -zxvf protobuf-cpp-3.4.0.tar.gz
+ >$ cd protobuf-3.4.0
+ >$ ./configure
+ >$ make
+ >$ make install
+
+ Check
+ >$ protoc --version
+ If you hit any problems with the protobuf installation, please see [here](https://github.com/google/protobuf/blob/master/src/README.md)
+
+ - 4.1.8 cmake 3.2.0
+ Download source from https://cmake.org/files/v3.2/cmake-3.2.0.tar.gz
+ >tar -zxvf cmake-3.2.0.tar.gz
+ >$ cd cmake-3.2.0
+ >$ ./bootstrap
+ >$ make -j4
+ >$ make install
+
+- 4.2. Compile Anakin
+ >$ git clone xxx
+ >$ cd anakin
+ >$ ./tools/amd_gpu_build.sh
+
+- 4.3. Run Benchmark
+ >$ cd output/unit_test
+ >$ benchmark ../../benchmark/CNN/models/ vgg16.anakin.bin 1 2 100
### Installing on Ubuntu ###
@@ -76,8 +161,10 @@ Coming soon..
### Installing on ARM ###
-Coming soon..
+Please refer to [run on arm](run_on_arm_en.md)
### Verifying installation ###
+If build successfully, the libs will be in the directory `output/`, and you can run unit test in `output/unit_test` to verify your installation.
+
diff --git a/docs/Manual/addCustomDevice.md b/docs/Manual/addCustomDevice.md
new file mode 100644
index 000000000..0c8c7fd6f
--- /dev/null
+++ b/docs/Manual/addCustomDevice.md
@@ -0,0 +1,459 @@
+# 如何支持一个新的设备
+
+## 概览
+
+添加一个新的设备需要以下3个步骤:
+
+* [在`CMakeList`中添加设备的支持](#0001)
+* [在`saber`中添加设备的实现](#0002)
+* [在`framework`中添加设备的具体化或实例化](#0003)
+
+假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。
+
+## 在`CMakeList`中添加设备的支持 ##
+
+* 修改根目录`CMakeList.txt`
+```cmake
+#select the platform to build
+anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO)
+anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO)
+anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
+anakin_option(USE_TNEW_PLACE "Select the build mode for TNEW place." YES)
+```
+
+* 修改`saber/CMakeList.txt`
+
+根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+endif()
+```
+
+* 修改`test/CMakeList.txt`
+
+新增设备的单测文件放在`test/saber/tnew`目录下,修改`test`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+ anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+```
+
+* 修改`cmake/anakin_config.h.in`
+```c++
+// platform to use
+#cmakedefine USE_GPU_PLACE
+
+#cmakedefine USE_X86_PLACE
+
+#cmakedefine USE_ARM_PLACE
+
+#cmakedefine USE_TNEW_PLACE
+```
+
+* 其他依赖和编译选项
+修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake`
+
+
+## 在`saber`中添加设备的实现 ##
+`saber`是`Anakin`的基础计算库,对外提供设备无关的统一的API,设备相关的实现都会封装到`TargetWrapper`中。
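+
+也就是说,上层代码只通过`TargetWrapper`的统一静态接口访问设备能力。下面是一段示意代码(`__xxx_target`为占位,接口与后文`target_wrapper.h`的声明一致):
+
+```c++
+typedef TargetWrapper<TNEW, __xxx_target> API;
+
+void* ptr = nullptr;
+API::mem_alloc(&ptr, 1024);  // 具体行为由 TNEW 的特化版本决定
+API::mem_set(ptr, 0, 1024);
+API::mem_free(ptr);
+```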
+
+### 在`saber/saber_types.h`中添加设备
+
+```c++
+enum TargetTypeEnum {
+ eINVALID = -1,
+ eNV = 1,
+ eAMD = 2,
+ eARM = 3,
+ eX86 = 4,
+ eNVHX86 = 5,
+ eTNEW = 6
+};
+
+typedef TargetType NV;
+typedef TargetType ARM;
+typedef TargetType AMD;
+typedef TargetType X86;
+typedef TargetType TNEW;
+
+```
+
+### 在`saber/core`中添加设备的实现
+
+1. 在`target_traits.h`中添加新设备
+
+* 增加设备类型
+```c++
+struct __cuda_device{};
+struct __arm_device{};
+struct __amd_device{};
+struct __x86_device{};
+struct __tnew_device{};
+```
+
+* `TargetTypeTraits`模板具体化
+```c++
+template <>
+struct TargetTypeTraits<TNEW> {
+ typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择
+ typedef __tnew_device target_type;
+};
+```
+
+2. 在`data_traits.h`中特化`DataTrait`模板类
+
+如果设备需要特殊的数据类型,则特化出设备的`DataTrait`类的实现,例如opencl数据类型的实现如下:
+```c++
+#ifdef USE_OPENCL
+struct ClMem{
+ ClMem(){
+ dmem = nullptr;
+ offset = 0;
+ }
+
+ ClMem(cl_mem* mem_in, int offset_in = 0) {
+ dmem = mem_in;
+ offset = offset_in;
+ }
+
+ ClMem(ClMem& right) {
+ dmem = right.dmem;
+ offset = right.offset;
+ }
+
+ ClMem& operator=(ClMem& right) {
+ this->dmem = right.dmem;
+ this->offset = right.offset;
+ return *this;
+ }
+
+ ClMem& operator+(int offset_in) {
+ this->offset += offset_in;
+ return *this;
+ }
+
+ int offset{0};
+ cl_mem* dmem;
+};
+
+template <>
+struct DataTrait<AMD, AK_FLOAT> {
+ typedef ClMem Dtype;
+ typedef float dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_DOUBLE> {
+ typedef ClMem Dtype;
+ typedef double dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_INT8> {
+ typedef ClMem Dtype;
+ typedef char dtype;
+};
+#endif //use_opencl
+```
+
+3. 在`target_wrapper.h`中特化`TargetWrapper`模板类
+
+特化`TargetWrapper`模板类,在`target_wrapper.h`中声明函数,具体如下:
+```c++
+template <>
+struct TargetWrapper<TNEW, __xxx_target> { //根据TNEW的具体类型修改__xxx_target,__host_target或者__device_target
+
+ typedef xxx_event event_t; //根据设备实现xxx_event
+ typedef xxx_stream stream_t; //根据设备实现xxx_stream
+
+ static void get_device_count(int& count);
+
+ static void set_device(int id);
+
+ //We should add strategy to avoid malloc directly
+ static void mem_alloc(void** ptr, size_t n);
+
+ static void mem_free(void* ptr);
+
+ static void mem_set(void* ptr, int value, size_t n);
+
+ static void create_event(event_t& event, bool flag = false);
+
+ static void create_stream(stream_t& stream);
+
+ static void create_stream_with_flag(stream_t& stream, unsigned int flag);
+
+ static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority);
+
+ static void destroy_stream(stream_t& stream);
+
+ static void destroy_event(event_t& event);
+
+ static void record_event(event_t& event, stream_t stream);
+
+ static void query_event(event_t& event);
+
+ static void sync_event(event_t& event);
+
+ static void sync_stream(event_t& event, stream_t& stream);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __DtoD);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __DtoD);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __HtoD);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __HtoD);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __DtoH);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __DtoH);
+
+ static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+ int src_dev, size_t count);
+
+ static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+ int src_dev, size_t count, stream_t& stream);
+
+ static int get_device_id();
+};
+
+```
+
+4. 在`impl/`目录下添加设备目录和实现
+
+在`saber/core/impl`目录下添加设备目录`tnew`。
+* 实现`TargetWrapper`结构体中各函数的定义。
+如果`TargetWrapper`的实现与默认的模板类一致,则不用特化出该类。
+
+```c++
+typedef TargetWrapper<TNEW, __xxx_target> TNEW_API;
+void TNEW_API::get_device_count(int &count) {
+ // add implementation
+}
+
+void TNEW_API::set_device(int id){
+ // add implementation
+}
+
+void TNEW_API::mem_alloc(void** ptr, size_t n){
+ // add implementation
+}
+
+void TNEW_API::mem_free(void* ptr){
+ if(ptr != nullptr){
+ // add implementation
+ }
+}
+...
+
+```
+
+* 特化实现`device.h`中的`Device`
+
+```c++
+template <>
+void Device<TNEW>::create_stream() {
+ // add implementation
+}
+
+template <>
+void Device<TNEW>::get_info() {
+
+ // add implementation
+}
+
+```
+
+### 在`saber/funcs`中实现设备相关的op
+
+参考[如何增加新的Operator](addCustomOp.md)
+
+
+## 在`framework`中添加设备的具体化或实例化 ##
+
+### `framework/core`
+
+* `net.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_func.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class OperatorFunc<TNEW, AK_FLOAT, Precision::FP32>;
+#endif
+```
+
+* `worker.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_attr.cpp`中添加实例化
+
+```c++
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP32>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP16>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::INT8>(const std::string& op_name);
+```
+
+* `parameter.h`中添加设备的实现
+
+```c++
+#ifdef USE_TNEW_PLACE
+template<typename Ttype, DataType Dtype>
+class PBlock {
+public:
+    typedef Tensor4d<typename target_host<Ttype>::type, Dtype> type;
+
+    PBlock() {
+        _inner_tensor = std::make_shared<type>();
+ }
+ ...
+}
+#endif //TNEW
+```
+
+* `type_traits_extend.h`中添加设备的实现
+
+```c++
+template<>
+struct target_host<saber::TNEW> {
+ typedef saber::X86 type; //根据TNEW选择正确的host type
+};
+```
+
+### `framework/graph`
+
+* `graph.cpp`中添加实例化
+
+```c++
+ #ifdef USE_TNEW_PLACE
+ template class Graph<TNEW, AK_FLOAT, Precision::FP32>;
+ template class Graph<TNEW, AK_FLOAT, Precision::FP16>;
+ template class Graph<TNEW, AK_FLOAT, Precision::INT8>;
+ #endif
+```
+
+### `framework/model_parser`
+
+* `parser.cpp`中添加实例化
+
+```c++
+ #ifdef USE_TNEW_PLACE
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ const char* model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ const char* model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ const char* model_path);
+
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ std::string& model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ std::string& model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ std::string& model_path);
+
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ std::string& model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ std::string& model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ std::string& model_path);
+
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ const char* model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ const char* model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ const char* model_path);
+ #endif
+```
+
+* `model_io.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP32>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP16>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::INT8>;
+#endif
+```
+
+### `framework/operators`
+
+为`framework/operators`目录下所有op添加实例化或具体化
+以`activation.cpp`为例,实例化如下:
+
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template class ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>;
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+如果TNEW设备函数的实现与现有模板实现不一致,可以特化实现如下(以init()为例):
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template <>
+Status ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>::Init(OpContext<TNEW> &ctx,\
+    const std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& ins, \
+    std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& outs) {
+ SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式
+ return Status::OK();
+}
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册
+
+```c++
+#ifdef USE_TNEW_PLACE
+.__alias__("activation")
+#endif
+```
+
+## 注意事项
+不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现
\ No newline at end of file
diff --git a/docs/Manual/addCustomOp.md b/docs/Manual/addCustomOp.md
new file mode 100644
index 000000000..f2783eb9f
--- /dev/null
+++ b/docs/Manual/addCustomOp.md
@@ -0,0 +1,405 @@
+# 如何增加新的Operator
+
+## 基本概念
+
+简单介绍下几个同Operator相关的基本概念,详情请参考设计文档。
+
+```framework```: 上层的逻辑代码,负责从parser中获取参数及weights,添加op时主要修改framework/operator目录下的内容。
+
+```saber```: 底层的实现代码,Anakin通过saber封装了不同的backends,不同的实现(impl)分别特化出自己的实现,外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中,增加op主要修改saber/funcs下的内容。
+
+saber的文件结构:
+* saber/funcs下的是各个funcs的外部接口,这一层的op与具体的设备实现无关,只与各op完成的功能有关。由于跟实现(impl)无关,本层文件名均不带impl。
+* saber/funcs/impl下是各个op的impl声明,特定设备需要完成该层声明的特化版本,如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本,saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关,均带有```impl_```前缀。
+* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件,添加cuda的kernel需要在该文件目录下添加。
+* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。
+
+### 涉及到的基类及各个类之前的关系
+
+简单介绍相关的基类
+
+* ```anakin::Operator```: framework的operator基类,位于framework/core/operator/operator.h
+
+* ```anakin::saber::BaseFunc```: saber对外的op接口基类,提供统一的对外接口,位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape,并通过```tensor```的```set_shape```接口(只设置shape,不分配空间)设置到output中。```operator()```接口为各个op的计算接口。
+
+* ```ankain::saber::ImplBase```: saber设备实现的op的接口,所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类,一类以```vender_```为前缀,带有```vender_```代码意为使用第三方库来实现该op,如cudnn的conv,或mkl的conv等等,这类op的性能我们难以调优,因此单独列为一类。另一类是带有源码的saber实现,这些实现都带有```saber_```为前缀,此类实现带有源码,能够通过后续优化不断提升性能,实现起名时需要注意这一点。
+
+## 添加operator
+
+添加一个新的op需要以下几步:
+
+1. 添加saber的param
+2. 定义saber的Operator类
+3. 定义新的impl声明
+3. 完成新的impl实现
+4. 增加framework的实现或特化
+
+接下来就针对这几步,以一个简单例子为例介绍实现。
+
+例如我们要添加新的Mul op。给出计算公式如下:$$Out = \alpha \cdot X * Y$$
+
+### 为operator增加param
+
+涉及到的文件:```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param,这一步可以跳过。
+这里```XXXParam```是一个```struct```。包含一个无参数的构造函数,含参数的构造函数,复制构造函数,```operator=()```及```operator==()```。
+```
+template <typename opTensor> // 能够获得target, datatype, layout
+struct MulParam{
+ MulParam()
+ : alpha(0)
+ {}
+ MulParam(float alpha_in)
+ : alpha(alpha_in)
+ {}
+ MulParam(const MulParam& right)
+ : alpha(right.alpha)
+ {}
+ MulParam &operator=(const MulParam &right) {
+ alpha = right.alpha;
+ return *this;
+ }
+ bool operator==(const MulParam &right) {
+ return alpha == right.alpha;
+ }
+ float alpha;
+};
+```
+
+### 定义Operator类
+涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类,这里需要修改输入的impl定义头文件。
+下面给出一个相对完整的定义结构供参考。
+```
+//不同的设备需要包含对应的operator实现.[详见](#impl)
+#ifdef NVIDIA_GPU
+#include "saber/funcs/impl/cuda/saber_mul.h"
+#include "saber/funcs/impl/cuda/vender_mul.h"
+#endif
+//如果一个设备现在还没有对应的operator实现,需要包含声明。[详见](#declare)
+#ifdef USE_X86_PLACE
+#include "saber/funcs/impl/impl_mul.h"
+#endif
+namespace anakin {
+namespace saber {
+template<typename TargetType,
+         DataType OpDtype,
+         DataType inDtype = AK_FLOAT,
+         DataType outDtype = AK_FLOAT,
+         typename LayOutType_op = NCHW,
+         typename LayOutType_in = NCHW,
+         typename LayOutType_out = NCHW>
+class Mul : public BaseFunc<
+    Tensor<TargetType, inDtype, LayOutType_in>,
+    Tensor<TargetType, outDtype, LayOutType_out>,
+    Tensor<TargetType, OpDtype, LayOutType_op>,
+    ImplBase, MulParam> {
+public:
+    using BaseFunc<
+        Tensor<TargetType, inDtype, LayOutType_in>,
+        Tensor<TargetType, outDtype, LayOutType_out>,
+        Tensor<TargetType, OpDtype, LayOutType_op>,
+        ImplBase, MulParam>::BaseFunc;
+    Mul() = default;
+    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
+    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
+    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
+    typedef MulParam<OpTensor> Param_t;
+    typedef std::vector<InDataTensor*> Input_v;
+    typedef std::vector<OutDataTensor*> Output_v;
+    typedef std::vector<Shape> Shape_v;
+
+ virtual SaberStatus compute_output_shape(const Input_v &input,
+ Output_v &output, Param_t ¶m) override {
+ //计算输出的shape,
+ Shape output_shape = (input[0]->valid_shape());
+ /* code */
+ return output[0]->set_shape(output_shape);
+ }
+ virtual SaberStatus init_impl(ImplEnum implenum) override {
+ // 不同设备均使用此init_impl, 此接口创建对应impl的实现。
+ switch (implenum) {
+ case VENDER_IMPL:
+ this->_impl.push_back(new VenderMul<TargetType, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>);
+ return SaberSuccess;
+ case SABER_IMPL:
+ this->_impl.push_back(new SaberMul<TargetType, OpDtype, inDtype, outDtype, LayOutType_op, LayOutType_in, LayOutType_out>);
+ return SaberSuccess;
+ default:
+ return SaberUnImplError;
+ }
+ }
+private:
+ virtual void pick_best_static() override {
+ if (true) // some condition?
+ this->_best_impl = this->_impl[0];
+ }
+ virtual void pick_best_specify(ImplEnum implenum) override {
+ this->_best_impl = this->_impl[0];
+ }
+};
+} // namespace saber
+} // namespace anakin
+```
+
+### 为operator增加新的impl声明
+
+涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明,特化版本放在对应的文件夹下,这里的声明就是给出所有设备的统一声明。下面给出一个参考。
+```
+#include "saber/funcs/impl/impl_macro.h"
+namespace anakin{
+namespace saber{
+DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字,第二个是对应param的名字
+}
+}
+```
+
+### 完成新的operator特定后端实现
+
+涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h```
+这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op,```saber```指的源码实现的op。这里以cuda的vender实现为例,简单介绍一下特化出的函数的几个基本接口。
+
+```
+// include 对应的声明
+#include "saber/funcs/impl/impl_mul.h"
+
+namespace anakin{
+namespace saber{
+template<typename TargetType,
+         DataType OpDtype,
+         DataType inDtype = AK_FLOAT,
+         DataType outDtype = AK_FLOAT,
+         typename LayOutType_op = NCHW,
+         typename LayOutType_in = NCHW,
+         typename LayOutType_out = NCHW>
+class VenderMul :
+    public ImplBase<
+        Tensor<TargetType, inDtype, LayOutType_in>,
+        Tensor<TargetType, outDtype, LayOutType_out>,
+        Tensor<TargetType, OpDtype, LayOutType_op>,
+        MulParam<Tensor<TargetType, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<TargetType, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<TargetType, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+ VenderMul(){}
+ ~VenderMul() {}
+
+ virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
+ std::vector<DataTensor_out*>& outputs,
+ MulParam<OpTensor>& param, Context<TargetType>& ctx) {
+ this->_ctx = ctx;
+ return create(inputs, outputs, param, ctx);
+ }
+
+ virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
+ std::vector<DataTensor_out*>& outputs,
+ MulParam<OpTensor>& param, Context<TargetType>& ctx) {
+ // set内部参数
+ }
+
+ virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+ std::vector<DataTensor_out*>& outputs,
+ MulParam<OpTensor>& param) {
+ // dispatch kernel.
+ }
+
+private:
+};
+}
+}
+```
+```init```和```create```的区别:```init```接口是第一次初始化op的时候进入的接口,此函数只在第一次初始化op时调用,这个接口一般放一些只需要执行一次的代码,如malloc或者create之类的函数。```create```函数除了第一次init执行外,在输入发生变化或者param发生变化时会再次触发,create一般放置set函数,设置内部变量,当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内,如果```create```函数执行了一些严重耗时的操作,这里会拖慢整个op的执行时间,需要慎重选择操作放置的位置。
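+
+下面用一段示意代码说明这一调用时序(仅为示意,输入输出与param按前文方式准备好):
+
+```
+Mul<NV, AK_FLOAT> mul;
+// init 只在第一次初始化时进入,适合放置 malloc/create 等一次性操作
+mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx);
+// 正常执行,内部调用 dispatch
+mul(input, output, param, ctx);
+
+// 输入 shape 或 param 发生变化时,框架会先触发 create 重新设置内部变量,再执行 dispatch
+param.alpha = 2.0f;
+mul(input, output, param, ctx);
+```
+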
+### 添加framework的特化
+
+涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。
+这里简单介绍下如果添加或修改framework内的operator
+
+```
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/mul.h" // 需要包对应的saber头文件
+namespace anakin {
+namespace ops {
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class MulHelper;
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class Mul : public Operator<Ttype, Dtype, Ptype> {
+public:
+ Mul() {}
+ /// forward impl
+ virtual void operator() (OpContext<Ttype> &ctx,
+ const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+ std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+ LOG(ERROR) << "Not Impl Yet Operator Mul<"
+ << type_id<typename DataTypeWarpper<Dtype>::type>().type_info() << ">";
+ }
+ friend class MulHelper<Ttype, Dtype, Ptype>;
+};
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class MulHelper : public OperatorHelper<Ttype, Dtype, Ptype> {
+public:
+ MulHelper() = default;
+ ~MulHelper();
+ Status InitParam() override;
+
+ Status Init(OpContext<Ttype> &ctx,
+ const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+ std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
+ Status InferShape(const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+ std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
+
+public:
+ saber::MulParam<Tensor4d<Ttype, Dtype> > _param_mul;
+ saber::Mul<Ttype, Dtype> _funcs_mul;
+};
+}
+} /* namespace anakin */
+```
+对应的```.cpp```文件如下:
+```
+#include "framework/operators/mul.h"
+
+namespace anakin {
+namespace ops {
+
+#ifdef USE_CUDA
+template<>
+void Mul<NV, AK_FLOAT, Precision::FP32>::operator()(
+ OpContext<NV>& ctx,
+ const std::vector<Tensor4dPtr<NV, AK_FLOAT> >& ins,
+ std::vector<Tensor4dPtr<NV, AK_FLOAT> >& outs) {
+ auto* impl =
+ static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper);
+ auto& param =
+ static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper)->_param_mul;
+ impl->_funcs_mul(ins, outs, param, ctx);
+}
+#endif
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::InitParam() {
+ auto alpha = GET_PARAMETER(float, alpha);
+ MulParam<Tensor4d<Ttype, Dtype> > param_mul(alpha);
+ _param_mul = param_mul;
+ return Status::OK();
+}
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::Init(OpContext<Ttype>& ctx,
+ const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+ std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+
+ SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx));
+ return Status::OK();
+}
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::InferShape(const
+ std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+ std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+ SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul));
+ return Status::OK();
+}
+
+#ifdef USE_CUDA
+template class MulHelper<NV, AK_FLOAT, Precision::FP32>;
+#endif
+#ifdef USE_ARM_PLACE
+template class MulHelper<ARM, AK_FLOAT, Precision::FP32>;
+#endif
+// register helper
+#ifdef USE_CUDA
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32);
+#endif
+#ifdef USE_ARM_PLACE
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);
+#endif
+//! register op
+ANAKIN_REGISTER_OP(Mul)
+.Doc("Mul operator")
+#ifdef USE_CUDA
+.__alias__("mul")
+#endif
+#ifdef USE_ARM_PLACE
+.__alias__("mul")
+#endif
+.num_in(1)
+.num_out(1)
+.Args("alpha", " alpha of Mul "); //注册
+
+} /* namespace ops */
+
+} /* namespace anakin */
+```
+
+## 实现单元测试
+涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp```
+在对应的test下需要添加新的单元测试
+
+```
+TEST(TestSaberFuncNV, test_saber_mul) {
+
+    // init tensors and some param.
+    float alpha = 1.0f;
+
+    // start Reshape & doInfer
+    Context<NV> ctx1(0, 1, 1);
+
+    // create param
+    MulParam<Tensor<NV, AK_FLOAT, NCHW> > param(alpha);
+
+    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> input;
+    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> output;
+
+    // create saber op
+    Mul<NV, AK_FLOAT> mul;
+
+ // compute output shape
+ mul.compute_output_shape(input, output, param);
+
+ // re_alloc output tensors memory based on output shape
+ output[0]->re_alloc(output[0]->shape());
+
+ // init saber op(calling init and create)
+ mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+ // call operator()
+ mul(input, output, param, ctx1);
+
+ // cuda specified, record events
+ cudaStream_t cuda_stream = ctx1.get_compute_stream();
+ output[0]->record_event(cuda_stream);
+ output[0]->sync();
+
+ // param changed
+ param.alpha = 2.0;
+ // auto calling saber op(create and dispatch)
+ mul(input, output, param, ctx1);
+
+ cudaDeviceSynchronize();
+ CUDA_CHECK(cudaPeekAtLastError());
+}
+
+int main(int argc, const char** argv){
+ anakin::saber::Env<NV>::env_init();
+
+ // initial logger
+ //logger::init(argv[0]);
+ InitTest();
+ RUN_ALL_TESTS(argv[0]);
+ return 0;
+}
+
+```
+## 调试及注意事项
+
+一个op需要有对外的op接口和内部实现。由于存在saber/funcs/impl的非特化版本声明,当有op在某种设备下没有对应实现时,也能够编译,但此时是没有任何实现的空实现,调用时不会产生任何计算结果,调试时需要特别留意这种情况。
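+
+调试时可以检查 saber 接口的返回状态来发现这类空实现(示意代码,假设未特化的默认声明返回 SaberUnImplError,与前文 init_impl 中的状态码一致):
+
+```
+Mul<X86, AK_FLOAT> mul;
+SaberStatus status = mul.init(input, output, param, SPECIFY, SABER_IMPL, ctx);
+if (status != SaberSuccess) {
+    // 该设备下可能只有空实现,请确认是否缺少对应的特化版本
+    LOG(ERROR) << "Mul is not implemented on this device yet!";
+}
+```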
diff --git a/docs/Manual/int8_design_ch.md b/docs/Manual/int8_design_ch.md
new file mode 100644
index 000000000..2444c8735
--- /dev/null
+++ b/docs/Manual/int8_design_ch.md
@@ -0,0 +1,17 @@
+
+# Int8设计文档
+
+## 计算流程
+
+![int8 计算流程](pics/int8_design.png)
+
+
+## saber完成的功能
+
+对于支持int8的op,接口需要完成的功能做如下规定:
+1、init/create部分完成外部变量的量化和应有的判断,weights和bias计算后,scale存回对应的tensor
+
+2、dispatch检查input,如果是int8,检查是否符号合适,如果是fp32,需要添加量化部分代码(静态量化在tensor中的scale里,动态量化需要实时计算,并存回原tensor的scale中)。检查output,如果是fp32,按照输出fp32的逻辑反量化回fp32的tensor,如果是int8,根据当前kernel的实现,选择输出s8或u8(对于带relu合并的输出u8,对于单独的conv输出s8)。
+
+3、输入是s8还是u8主要取决于kernel是否支持,跟设备相关。
+
+对于不支持int8的op,需要确认输入输出都是fp32,防止误调用。
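+
+下面给出一段对称量化/反量化的示意代码,帮助理解上述规定(仅为概念示意,并非 Anakin 的实际接口):
+
+```c++
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+// 按给定 scale 将 fp32 数据量化为 int8(对称量化,带饱和处理)
+void quantize_fp32_to_int8(const float* src, int8_t* dst, int n, float scale) {
+    for (int i = 0; i < n; ++i) {
+        float q = std::round(src[i] / scale);
+        q = std::max(-127.0f, std::min(127.0f, q));
+        dst[i] = static_cast<int8_t>(q);
+    }
+}
+
+// 按同一 scale 反量化回 fp32
+void dequantize_int8_to_fp32(const int8_t* src, float* dst, int n, float scale) {
+    for (int i = 0; i < n; ++i) {
+        dst[i] = src[i] * scale;
+    }
+}
+```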
diff --git a/docs/Manual/pics/contri1.JPG b/docs/Manual/pics/contri1.JPG
new file mode 100755
index 000000000..753f7c4e7
Binary files /dev/null and b/docs/Manual/pics/contri1.JPG differ
diff --git a/docs/Manual/pics/contri2.JPG b/docs/Manual/pics/contri2.JPG
new file mode 100755
index 000000000..e7880585e
Binary files /dev/null and b/docs/Manual/pics/contri2.JPG differ
diff --git a/docs/Manual/pics/int8_design.png b/docs/Manual/pics/int8_design.png
new file mode 100644
index 000000000..d6feafbd3
Binary files /dev/null and b/docs/Manual/pics/int8_design.png differ
diff --git a/docs/Manual/run_on_arm_ch.md b/docs/Manual/run_on_arm_ch.md
new file mode 100644
index 000000000..ebeb38f53
--- /dev/null
+++ b/docs/Manual/run_on_arm_ch.md
@@ -0,0 +1,151 @@
+## 源码编译 Anakin ##
+
+目前Anakin支持ARM Android平台,采用Android NDK交叉编译工具链,已在mac os和centos上编译和测试通过。
+
+### 安装概览 ###
+
+* [系统需求](#0001)
+* [安装第三方依赖](#0002)
+* [Anakin源码编译](#0003)
+* [验证安装](#0004)
+
+
+### 1. 系统需求 ###
+
+* 宿主机: linux, mac
+* cmake 3.8.2+
+* Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
+
+### 2. 安装第三方依赖 ###
+
+- 2.1 protobuf3.4.0
+ 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0)
+ - 2.1.1 为宿主机编译protobuf
+ ```bash
+ $ tar -xzf protobuf-3.4.0.tar.gz
+ $ cd protobuf-3.4.0
+ $ ./autogen.sh
+ $ ./configure
+ $ make
+ $ make check
+ $ make install
+ ```
+ 上述 $make install 执行后,可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下,
+ 如有问题,请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。
+ 然后将已经生成文件清除。
+ ```bash
+ $ make distclean
+ ```
+ - 2.1.2 交叉编译Android`armeabi-v7a`的protobuf,注意设置ANDROID_NDK的路径,以及ARCH_ABI、HOSTOSN的值,
+ ```bash
+
+ $ export ANDROID_NDK=your_ndk_path
+ $ ARCH_ABI="arm-linux-androideabi-4.9"
+ $ HOSTOSN="darwin-x86_64"
+ $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm
+ $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
+ $ export LDFLAGS="--sysroot=$SYSROOT"
+ $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
+ $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
+ $ export CPPFLAGS=""
+ $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
+ $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
+ $ export CCFLAGS="$CXXFLAGS"
+ $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
+ $ export CC="$CXX"
+ $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"
+ $ ./autogen.sh
+ $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"
+ $ make
+ ```
+
+ 编译生成 *.a 静态库,若希望编译*.so 动态链接库 ,请在./configure参数中改--disable-shared为--disable-static --enable-shared。
+ 生成文件在src/.libs/下,将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。
+ 在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。
+ ```cmake
+ set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
+ ```
+
+- 2.2 opencv 2.4.3+(optional)
+ Anakin只在examples示例中使用opencv
+ Android系统的opencv从[这里下载](https://opencv.org/releases.html)
+ 解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a`
+ 在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`,
+ 并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。
+ ```cmake
+ include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
+ LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
+ ```
+### 3. Anakin源码编译 ###
+
+#### 编译Android版本
+
+ 克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm)
+```bash
+ cd your_dir
+ git clone https://github.com/PaddlePaddle/Anakin.git
+ cd Anakin
+ git fetch origin arm
+ git checkout arm
+ ```
+ 修改`android_build.sh`
+- 修改NDK路径
+ ```bash
+ #modify "your_ndk_path" to your NDK path
+ export ANDROID_NDK=your_ndk_path
+ ```
+- Set the ARM processor architecture
+  For 32-bit ARM processors, set ANDROID_ABI to `armeabi-v7a with NEON`;
+  for 64-bit ARM processors, ANDROID_ABI can be set to either `armeabi-v7a with NEON` or `arm64-v8a`.
+  At present only `armeabi-v7a with NEON` is supported; `arm64-v8a` is still under development.
+ ```bash
+ -DANDROID_ABI="armeabi-v7a with NEON"
+ ```
+- Set the Android API level
+  Choose the API level to match your Android version, e.g. API Level 21 -> Android 5.0.1
+ ```bash
+ -DANDROID_NATIVE_API_LEVEL=21
+ ```
+
+- Choose a static or shared library
+  Set `BUILD_SHARED=NO` to build a static library;
+  set `BUILD_SHARED=YES` to build a shared library.
+ ```bash
+ -DBUILD_SHARED=NO
+ ```
+- OpenMP multi-threading support
+  Set `USE_OPENMP=YES` to enable OpenMP multi-threading.
+ ```bash
+ -DUSE_OPENMP=YES
+ ```
+
+- Build the unit tests
+  Set `BUILD_WITH_UNIT_TEST=YES` to build the unit-test binaries.
+ ```bash
+ -DBUILD_WITH_UNIT_TEST=YES
+ ```
+
+- Build the examples
+  Set `BUILD_EXAMPLES=YES` to build the example binaries.
+ ```bash
+ -DBUILD_EXAMPLES=YES
+ ```
+
+- Enable opencv
+  If you use opencv, set `USE_OPENCV=YES`.
+ ```bash
+ -DUSE_OPENCV=YES
+ ```
+
+- Build
+  Run the `android_build.sh` script to build Anakin automatically; a sketch of the cmake invocation it corresponds to follows this step.
+ ```bash
+ ./android_build.sh
+ ```
+
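+Taken together, the options above correspond roughly to a cmake invocation like the following (a sketch only: the authoritative option list and the actual android.toolchain.cmake path live in `android_build.sh`):
+```bash
+mkdir -p android_build && cd android_build
+cmake .. \
+    -DCMAKE_TOOLCHAIN_FILE=path/to/android.toolchain.cmake \
+    -DANDROID_NDK=$ANDROID_NDK \
+    -DANDROID_ABI="armeabi-v7a with NEON" \
+    -DANDROID_NATIVE_API_LEVEL=21 \
+    -DBUILD_SHARED=NO \
+    -DUSE_OPENMP=YES \
+    -DBUILD_WITH_UNIT_TEST=YES \
+    -DBUILD_EXAMPLES=YES \
+    -DUSE_OPENCV=YES
+make -j4
+```
+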
+### 4. Verifying the Installation ###
+  The compiled libraries are placed under `${Anakin_root}/output`;
+  the compiled unit-test binaries under `${Anakin_root}/output/unit_test`;
+  and the compiled example binaries under `${Anakin_root}/output/examples`.
+
+  On Android, enable the device's debug mode; the directory reachable via ADB is `data/local/tmp`. Push the test binaries, models, and data to that directory with ADB, then run the tests on the device.
diff --git a/docs/Manual/run_on_arm_en.md b/docs/Manual/run_on_arm_en.md
new file mode 100644
index 000000000..a726b7d82
--- /dev/null
+++ b/docs/Manual/run_on_arm_en.md
@@ -0,0 +1,127 @@
+## Build Anakin for ARM from source ##
+
+Anakin for ARM is cross-compiled with the Android NDK; we have successfully built and tested it on macOS and CentOS.
+
+### Installation overview ###
+
+* [system requirements](#0001)
+* [dependencies](#0002)
+* [build from source](#0003)
+* [verification](#0004)
+
+
+### 1. system requirements ###
+
+* Host machine: Linux or macOS
+* cmake 3.8.2+
+* Android NDK r14, download linux version from [here](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
+
+### 2. dependencies ###
+
+- 2.1 protobuf 3.4.0
+  Download the source from https://github.com/google/protobuf/releases/tag/v3.4.0
+ - 2.1.1 Build protobuf for host
+ ```bash
+ $ tar -xzf protobuf-3.4.0.tar.gz
+ $ cd protobuf-3.4.0
+ $ ./autogen.sh
+ $ ./configure
+ $ make
+ $ make check
+ $ make install
+ ```
+  For details, see [here](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)
+
+  - 2.1.2 Build protobuf for ARM `armeabi-v7a`. Set ANDROID_NDK to your NDK path, and adjust ARCH_ABI and HOSTOSN to your toolchain and host OS (this command sequence mirrors the Chinese version of this manual):
+    ```bash
+    $ export ANDROID_NDK=your_ndk_path
+    $ ARCH_ABI="arm-linux-androideabi-4.9"
+    $ HOSTOSN="darwin-x86_64"
+    $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm
+    $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
+    $ export LDFLAGS="--sysroot=$SYSROOT"
+    $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
+    $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
+    $ export CPPFLAGS=""
+    $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
+    $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
+    $ export CCFLAGS="$CXXFLAGS"
+    $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
+    $ export CC="$CXX"
+    $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"
+    $ ./autogen.sh
+    $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"
+    $ make
+    ```
+ Set your protobuf path [here](../../cmake/find_modules.cmake), search `anakin_find_protobuf`, and set `ARM_RPOTO_ROOT` to your path.
+ ```cmake
+ set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
+ ```
+
+- 2.2 opencv 2.4.3+ (optional)
+  We only use opencv in the examples.
+  For Android, visit the opencv [release page](https://opencv.org/releases.html), choose the Android pack and download it,
+  then copy the libs in `3rdparty/libs/armeabi-v7a` to `libs/armeabi-v7a`.
+  Set your opencv path [here](../../cmake/find_modules.cmake): search `anakin_find_opencv`,
+  and set `include_directories` and `LINK_DIRECTORIES` according to your path.
+ ```cmake
+ include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
+ LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
+ ```
+### 3. build from source ###
+
+#### build for Android
+
+ clone the [source code](https://github.com/PaddlePaddle/Anakin/tree/arm)
+```bash
+ cd your_dir
+ git clone https://github.com/PaddlePaddle/Anakin.git
+ cd Anakin
+ git fetch origin arm
+ git checkout arm
+ ```
+ edit `android_build.sh`:
+- Set the NDK path to yours
+ ```bash
+ #modify "your_ndk_path" to your NDK path
+ export ANDROID_NDK=your_ndk_path
+ ```
+- Set your ARM target platform
+
+  For a 32-bit ARM CPU with NEON, set ANDROID_ABI to `armeabi-v7a with NEON`;
+  for a 64-bit ARM CPU, either `arm64-v8a` or `armeabi-v7a with NEON` works.
+  For now only `armeabi-v7a with NEON` is supported; `arm64-v8a` is under development.
+ ```bash
+ -DANDROID_ABI="armeabi-v7a with NEON"
+ ```
+- Set Android API level
+  Choose your API level according to your Android system version,
+  e.g. API Level 21 -> Android 5.0.1
+ ```bash
+ -DANDROID_NATIVE_API_LEVEL=21
+ ```
+
+- build a static or shared lib
+  to build a static lib, set `BUILD_SHARED=NO`;
+  to build a shared lib, set `BUILD_SHARED=YES`
+ ```bash
+ -DBUILD_SHARED=NO
+ ```
+- OpenMP multi-threading
+  set `USE_OPENMP=YES` to enable OpenMP multi-threading
+ ```bash
+ -DUSE_OPENMP=YES
+ ```
+
+- build unit test
+ set `BUILD_WITH_UNIT_TEST=YES` to build unit tests
+ ```bash
+ -DBUILD_WITH_UNIT_TEST=YES
+ ```
+
+- build examples
+ set `BUILD_EXAMPLES=YES` to build detection and classification examples
+ ```bash
+ -DBUILD_EXAMPLES=YES
+ ```
+
+- use opencv in examples
+ set `USE_OPENCV=YES` to use opencv in examples
+ ```bash
+ -DUSE_OPENCV=YES
+ ```
+
+- build
+  run `android_build.sh` to build Anakin
+ ```bash
+ ./android_build.sh
+ ```
+
+### 4. Verification ###
+  The libs are in `${Anakin_root}/output`, the unit-test and benchmark binaries are in `${Anakin_root}/output/unit_test`,
+  and the examples are in `${Anakin_root}/output/examples`.
+  Enable `USB debug mode` on your Android device, use ADB to push the test files and model files to `data/local/tmp/your_dir`,
+  then run the tests on the device, for instance as sketched below.
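+  A minimal sketch (the binary name `classification` comes from the examples build; the model, label, and image file names are placeholders for your own files):
+  ```bash
+  adb shell mkdir -p /data/local/tmp/your_dir
+  adb push output/examples/classification /data/local/tmp/your_dir
+  adb push mobilenet_v1.anakin.bin /data/local/tmp/your_dir
+  adb push labels.txt /data/local/tmp/your_dir
+  adb push test.jpg /data/local/tmp/your_dir
+  adb shell "cd /data/local/tmp/your_dir && chmod +x classification && ./classification mobilenet_v1.anakin.bin labels.txt test.jpg"
+  ```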
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 000000000..b93757289
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,58 @@
+# temporary: pull in framework, model parser, and saber headers directly
+anakin_fetch_include_recursively(${ANAKIN_FRAMEWORK})
+anakin_fetch_include_recursively(${ANAKIN_MODEL_PARSER})
+anakin_fetch_include_recursively(${ANAKIN_SABER})
+
+if(NVIDIA_GPU)
+anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/cuda "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+
+if(AMD_GPU)
+anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/amd "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+
+if(USE_X86_PLACE)
+anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/x86 "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+
+if(USE_ARM_PLACE) #build examples for arm devices
+ anakin_fetch_files_with_suffix(${ANAKIN_EXAMPLES}/arm "cpp" ANAKIN_TEST_CASE_SRC)
+ if(USE_OPENMP)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+ endif()
+ if (USE_PROTOBUF)
+ find_library(log-lib log)
+ endif()
+endif()
+
+# file(REMOVE) does not expand wildcards, so collect the stale outputs with a glob first
+file(GLOB ANAKIN_OLD_EXAMPLES ${PROJECT_SOURCE_DIR}/output/examples/*)
+if(ANAKIN_OLD_EXAMPLES)
+    file(REMOVE ${ANAKIN_OLD_EXAMPLES})
+endif()
+
+# build test cases
+foreach(SRC_NAME ${ANAKIN_TEST_CASE_SRC})
+    #split the path on "/" and keep the last component (the file name)
+ string(REPLACE "/" ";" SEXY_LIST ${SRC_NAME})
+ list(GET SEXY_LIST -1 TEST_CASE_NAME)
+ #get the file name without suffix
+ string(REPLACE "." ";" SEXY_LIST ${TEST_CASE_NAME})
+ list(GET SEXY_LIST 0 TEST_CASE_NAME)
+ add_executable(${TEST_CASE_NAME} ${SRC_NAME})
+ if(BUILD_SHARED)
+ target_link_libraries(${TEST_CASE_NAME} ${anakin_lib_so} ${ANAKIN_LINKER_LIBS})
+ else()
+ target_link_libraries(${TEST_CASE_NAME} -Wl,--whole-archive ${anakin_lib_static} -Wl,--no-whole-archive ${ANAKIN_LINKER_LIBS})
+ endif()
+ if(USE_ARM_PLACE)
+ target_link_libraries(${TEST_CASE_NAME} ${log-lib})
+ endif()
+ if(USE_OPENCV)
+ if (USE_ARM_PLACE)
+ target_link_libraries(${TEST_CASE_NAME} -lopencv_core -lopencv_highgui -lopencv_imgproc
+ -ltbb -llibtiff -llibpng -llibjpeg -llibjasper -lIlmImf -lc -lz -llog -ldl)
+ else()
+ target_link_libraries(${TEST_CASE_NAME} -lopencv_core -lopencv_highgui -lopencv_imgproc)
+ endif()
+ endif()
+ set_target_properties(${TEST_CASE_NAME} PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY
+ ${PROJECT_SOURCE_DIR}/output/examples)
+endforeach()
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 000000000..160231de6
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,20 @@
+# hands-on examples
+
+## dependencies
+
+- opencv 2.4.3+ for image reading
+
+## NV GPU
+
+
+
+## ARM
+- refer to [run on arm](../docs/Manual/run_on_arm_en.md) to set your opencv path
+- Enable `USE_OPENCV` in [CMakeLists.txt](../CMakeLists.txt)
+- Enable building examples in [CMakeLists.txt](../CMakeLists.txt)
+
+### mobilenet_ssd detection
+
+
+### mobilenetv1 classification
+
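+The classification example takes a model, a label file, and an image, plus optional top-k, iteration, and thread counts (run on the device; the file names here are placeholders for your own exported model and data):
+
+```bash
+# usage: classification model_file label_file image_name [topk] [test_iter] [threads]
+./classification mobilenet_v1.anakin.bin labels.txt test.jpg 5 10 4
+```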
diff --git a/examples/amd/classification.cpp b/examples/amd/classification.cpp
new file mode 100644
index 000000000..1d64ead71
--- /dev/null
+++ b/examples/amd/classification.cpp
@@ -0,0 +1,238 @@
+#include "graph_base.h"
+#include "graph.h"
+#include "scheduler.h"
+#include "net.h"
+#include "worker.h"
+#include "tensor_op.h"
+#include "timer.h"
+#include "saber/utils.h"
+
+using namespace anakin::saber;
+using namespace anakin::graph;
+using namespace anakin;
+// template arguments below are an assumption: AMD device tensors with X86 host
+// tensors, AK_FLOAT data and NCHW layout; adjust them to your build if needed
+typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4hf;
+typedef Tensor<AMD, AK_FLOAT, NCHW> Tensor4df;
+
+void load_labels(std::string path, std::vector<std::string>& labels) {
+
+ FILE* fp = fopen(path.c_str(), "r");
+ if (fp == nullptr) {
+ LOG(FATAL) << "load label file failed";
+ }
+    char str[1024];
+    // loop on fgets() rather than feof() so the last line is not processed twice
+    while (fgets(str, 1024, fp) != nullptr) {
+        std::string str_s(str);
+
+ if (str_s.length() > 0) {
+ for (int i = 0; i < str_s.length(); i++) {
+ if (str_s[i] == ' ') {
+ std::string strr = str_s.substr(i, str_s.length() - i - 1);
+ labels.push_back(strr);
+ i = str_s.length();
+ }
+ }
+ }
+ }
+ fclose(fp);
+}
+
+void print_topk(const float* scores, const int size, const int topk, \
+        const std::vector<std::string>& labels) {
+
+    // pair each score with its class index so sorting keeps the labels attached
+    std::vector<std::pair<float, int>> vec;
+ vec.resize(size);
+ for (int i = 0; i < size; i++) {
+ vec[i] = std::make_pair(scores[i], i);
+ }
+
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+            std::greater<std::pair<float, int>>());
+
+ // print topk and score
+ for (int i = 0; i < topk; i++) {
+ float score = vec[i].first;
+ int index = vec[i].second;
+ LOG(INFO) << i <<": " << index << " " << labels[index] << " " << score;
+ }
+}
+
+#ifdef USE_OPENCV
+#include "opencv2/opencv.hpp"
+
+using namespace cv;
+
+void fill_tensor_with_cvmat(const Mat& img_in, Tensor4hf& tout, const int num, \
+ const int width, const int height, const float* mean, const float* scale) {
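+    // resize the image to the network input size, then convert OpenCV's interleaved
+    // BGR (HWC) data into the planar CHW float tensor, applying per-channel mean and scale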
+ cv::Mat im;
+ cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f);
+ float* ptr_data_in = tout.mutable_data();
+ int stride = width * height;
+ for (int i = 0; i < num; i++) {
+ float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width();
+ for (int r = 0; r < height; r++) {
+ for (int c = 0; c < width; c++) {
+                ptr_in[r * width + c] = (im.at<cv::Vec3b>(r, c)[0] - mean[0]) * scale[0];
+                ptr_in[stride + r * width + c] = (im.at<cv::Vec3b>(r, c)[1] - mean[1]) * scale[1];
+                ptr_in[2 * stride + r * width + c] = (im.at<cv::Vec3b>(r, c)[2] - mean[2]) * scale[2];
+ }
+ }
+ }
+}
+#endif
+
+void test_net(const std::string model_file_name, const std::string image_file_name, \
+ const std::vector& labels, const int topk, const int threads, \
+ const int test_iter) {
+
+ int batch_size = 1;
+
+ //! create runtime context
+ LOG(INFO) << "create runtime context";
+    std::shared_ptr<Context<AMD>> ctx1 = std::make_shared<Context<AMD>>(0, 0, 0);
+
+ //! load model
+ LOG(WARNING) << "load anakin model file from " << model_file_name << " ...";
+    Graph<AMD, AK_FLOAT, Precision::FP32> graph;
+ auto status = graph.load(model_file_name);
+ if (!status) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+ }
+
+ //! set batch size
+ graph.ResetBatchSize("input_0", batch_size);
+
+ //! optimize the graph
+ LOG(INFO) << "optimize the graph";
+ graph.Optimize();
+
+ //! get output name
+    std::vector<std::string>& vout_name = graph.get_outs();
+ LOG(INFO) << "output size: " << vout_name.size();
+
+ //! constructs the executer net
+ LOG(INFO) << "create net to execute";
+    Net<AMD, AK_FLOAT, Precision::FP32> net_executer(graph, ctx1, true);
+
+ //! get in
+ LOG(INFO) << "get input";
+ auto d_tensor_in_p = net_executer.get_in("input_0");
+ auto valid_shape_in = d_tensor_in_p->valid_shape();
+ for (int i = 0; i < valid_shape_in.size(); i++) {
+ LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
+ }
+ Tensor4hf thin(valid_shape_in);
+
+    LOG(INFO) << thin.width() << "x" << thin.height() << " size " << thin.valid_size();
+ //! feed input image to input tensor
+
+#ifdef USE_OPENCV
+ LOG(INFO) << "loading image " << image_file_name << " ...";
+ Mat img = imread(image_file_name, CV_LOAD_IMAGE_COLOR);
+ if (img.empty()) {
+ LOG(FATAL) << "opencv read image " << image_file_name << " failed";
+ }
+ //! set your mean value and scale value here
+ float mean_mb[3] = {103.94f, 116.78f, 123.68f};
+ float scale_mb[3] = {0.017f, 0.017f, 0.017f};
+ LOG(INFO) << thin.width() << "x" << thin.height();
+ fill_tensor_with_cvmat(img, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb);
+#else
+ fill_tensor_host_const(thin, 1.f);
+#endif
+
+ //! do inference
+    Context<AMD> ctx(0, 0, 0);
+    anakin::saber::SaberTimer<AMD> my_time;
+ LOG(INFO) << "run prediction ";
+
+ double to = 0;
+ double tmin = 1000000;
+ double tmax = 0;
+ my_time.start(ctx);
+    saber::SaberTimer<AMD> t1;
+ for (int i = 0; i < test_iter; i++) {
+ d_tensor_in_p->copy_from(thin);
+ t1.clear();
+ t1.start(ctx);
+ net_executer.prediction();
+ t1.end(ctx);
+ double tdiff = t1.get_average_ms();
+ if (tdiff > tmax) {
+ tmax = tdiff;
+ }
+ if (tdiff < tmin) {
+ tmin = tdiff;
+ }
+ to += tdiff;
+ }
+ my_time.end(ctx);
+
+
+    LOG(INFO) << model_file_name << " batch_size " << batch_size << \
+        " average time " << to / test_iter << " ms" << \
+        ", min time: " << tmin << " ms, max time: " << tmax << " ms";
+
+ //! get output
+ //! fixme get output
+    //std::vector<Tensor4df*> vout = net_executer.get_out_list();
+    std::vector<Tensor4df*> vout;
+ for (auto& it : vout_name) {
+ vout.push_back(net_executer.get_out(it));
+ }
+ Tensor4df* tensor_out_d = vout[0];
+ LOG(INFO) << "output size: " << vout.size();
+
+ Tensor4hf tensor_out;
+ tensor_out.re_alloc(tensor_out_d->shape());
+ tensor_out.copy_from(*tensor_out_d);
+#if 0 //print output tensor data
+    // tensor_out is a value here (not a pointer), so use '.' member access
+    LOG(INFO) << "extract data: size: " << tensor_out.valid_size() << \
+        ", width=" << tensor_out.width() << ", height=" << tensor_out.height();
+    const float* ptr_out = tensor_out.data();
+    for (int i = 0; i < tensor_out.valid_size(); i++) {
+        printf("%0.4f ", ptr_out[i]);
+        if ((i + 1) % 7 == 0) {
+            printf("\n");
+        }
+    }
+    printf("\n");
+#endif
+ print_topk(tensor_out.data(), tensor_out.valid_size(), topk, labels);
+}
+
+int main(int argc, char** argv){
+
+ LOG(INFO) << "initialized the device";
+    Env<AMD>::env_init();
+
+ if (argc < 4) {
+ LOG(ERROR) << "usage: " << argv[0] << ": model_file label_file image_name [topk] [test_iter] [threads]";
+ return -1;
+ }
+ char* model_file = argv[1];
+ char* label_file = argv[2];
+ char* image_path = argv[3];
+
+ std::vector labels;
+ load_labels(label_file, labels);
+
+ int topk = 5;
+ if (argc > 4) {
+ topk = atoi(argv[4]);
+ }
+
+ int test_iter = 10;
+ if (argc > 5) {
+ test_iter = atoi(argv[5]);
+ }
+
+ int threads = 1;
+ if (argc > 6) {
+ threads = atoi(argv[6]);
+ }
+
+ test_net(model_file, image_path, labels, topk, threads, test_iter);
+ return 0;
+}
+
diff --git a/examples/arm/classification.cpp b/examples/arm/classification.cpp
new file mode 100644
index 000000000..27c3ce45d
--- /dev/null
+++ b/examples/arm/classification.cpp
@@ -0,0 +1,234 @@
+#include "graph_base.h"
+#include "graph.h"
+#include "scheduler.h"
+#include "net.h"
+#include "worker.h"
+#include "tensor_op.h"
+#include "timer.h"
+
+using namespace anakin::saber;
+using namespace anakin::graph;
+using namespace anakin;
+// template arguments below are an assumption: ARM target tensors with AK_FLOAT
+// data and NCHW layout; adjust them to your build if needed
+typedef Tensor<ARM, AK_FLOAT, NCHW> Tensor4hf;
+
+
+void load_labels(std::string path, std::vector<std::string>& labels) {
+
+ FILE* fp = fopen(path.c_str(), "r");
+ if (fp == nullptr) {
+ LOG(FATAL) << "load label file failed";
+ }
+    char str[1024];
+    // loop on fgets() rather than feof() so the last line is not processed twice
+    while (fgets(str, 1024, fp) != nullptr) {
+        std::string str_s(str);
+
+ if (str_s.length() > 0) {
+ for (int i = 0; i < str_s.length(); i++) {
+ if (str_s[i] == ' ') {
+ std::string strr = str_s.substr(i, str_s.length() - i - 1);
+ labels.push_back(strr);
+ i = str_s.length();
+ }
+ }
+ }
+ }
+ fclose(fp);
+}
+
+void print_topk(const float* scores, const int size, const int topk, \
+        const std::vector<std::string>& labels) {
+
+    // pair each score with its class index so sorting keeps the labels attached
+    std::vector<std::pair<float, int>> vec;
+ vec.resize(size);
+ for (int i = 0; i < size; i++) {
+ vec[i] = std::make_pair(scores[i], i);
+ }
+
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+            std::greater<std::pair<float, int>>());
+
+ // print topk and score
+ for (int i = 0; i < topk; i++) {
+ float score = vec[i].first;
+ int index = vec[i].second;
+ LOG(INFO) << i <<": " << index << " " << labels[index] << " " << score;
+ }
+}
+
+#ifdef USE_OPENCV
+#include "opencv2/opencv.hpp"
+
+using namespace cv;
+
+void fill_tensor_with_cvmat(const Mat& img_in, Tensor4hf& tout, const int num, \
+ const int width, const int height, const float* mean, const float* scale) {
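+    // resize the image to the network input size, then convert OpenCV's interleaved
+    // BGR (HWC) data into the planar CHW float tensor, applying per-channel mean and scale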
+ cv::Mat im;
+ cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f);
+ float* ptr_data_in = tout.mutable_data();
+ int stride = width * height;
+ for (int i = 0; i < num; i++) {
+ float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width();
+ for (int r = 0; r < height; r++) {
+ for (int c = 0; c < width; c++) {
+                ptr_in[r * width + c] = (im.at<cv::Vec3b>(r, c)[0] - mean[0]) * scale[0];
+                ptr_in[stride + r * width + c] = (im.at<cv::Vec3b>(r, c)[1] - mean[1]) * scale[1];
+                ptr_in[2 * stride + r * width + c] = (im.at<cv::Vec3b>(r, c)[2] - mean[2]) * scale[2];
+ }
+ }
+ }
+}
+#endif
+
+void test_net(const std::string model_file_name, const std::string image_file_name, \
+ const std::vector& labels, const int topk, const int threads, \
+ const int test_iter) {
+
+ int batch_size = 1;
+
+ //! create runtime context
+ LOG(INFO) << "create runtime context";
+    std::shared_ptr<Context<ARM>> ctx1 = std::make_shared<Context<ARM>>();
+    ctx1->set_run_mode(SABER_POWER_HIGH, threads);
+    LOG(INFO) << omp_get_num_threads() << " threads are activated";
+
+ //! load model
+ LOG(WARNING) << "load anakin model file from " << model_file_name << " ...";
+    Graph<ARM, AK_FLOAT, Precision::FP32> graph;
+ auto status = graph.load(model_file_name);
+ if (!status) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+ }
+
+ //! set batch size
+ graph.ResetBatchSize("input_0", batch_size);
+
+ //! optimize the graph
+ LOG(INFO) << "optimize the graph";
+ graph.Optimize();
+
+ //! get output name
+    std::vector<std::string>& vout_name = graph.get_outs();
+ LOG(INFO) << "output size: " << vout_name.size();
+
+ //! constructs the executer net
+ LOG(INFO) << "create net to execute";
+    Net<ARM, AK_FLOAT, Precision::FP32> net_executer(graph, ctx1, true);
+
+ //! get in
+ LOG(INFO) << "get input";
+ auto d_tensor_in_p = net_executer.get_in("input_0");
+ auto valid_shape_in = d_tensor_in_p->valid_shape();
+ for (int i = 0; i < valid_shape_in.size(); i++) {
+ LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i];
+ }
+ Tensor4hf thin(valid_shape_in);
+
+ //! feed input image to input tensor
+#ifdef USE_OPENCV
+ LOG(INFO) << "loading image " << image_file_name << " ...";
+ Mat img = imread(image_file_name, CV_LOAD_IMAGE_COLOR);
+ if (img.empty()) {
+ LOG(FATAL) << "opencv read image " << image_file_name << " failed";
+ }
+ //! set your mean value and scale value here
+ float mean_mb[3] = {103.94f, 116.78f, 123.68f};
+ float scale_mb[3] = {0.017f, 0.017f, 0.017f};
+ fill_tensor_with_cvmat(img, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb);
+
+#else
+ fill_tensor_host_const(thin, 1.f);
+#endif
+
+ //! do inference
+    Context<ARM> ctx(0, 0, 0);
+    anakin::saber::SaberTimer<ARM> my_time;
+ LOG(INFO) << "run prediction ";
+
+ double to = 0;
+ double tmin = 1000000;
+ double tmax = 0;
+ my_time.start(ctx);
+    saber::SaberTimer<ARM> t1;
+ for (int i = 0; i < test_iter; i++) {
+ d_tensor_in_p->copy_from(thin);
+ t1.clear();
+ t1.start(ctx);
+ net_executer.prediction();
+ t1.end(ctx);
+ double tdiff = t1.get_average_ms();
+ if (tdiff > tmax) {
+ tmax = tdiff;
+ }
+ if (tdiff < tmin) {
+ tmin = tdiff;
+ }
+ to += tdiff;
+ }
+ my_time.end(ctx);
+
+
+    LOG(INFO) << model_file_name << " batch_size " << batch_size << \
+        " average time " << to / test_iter << " ms" << \
+        ", min time: " << tmin << " ms, max time: " << tmax << " ms";
+
+ //! get output
+ //! fixme get output
+    //std::vector<Tensor4hf*> vout = net_executer.get_out_list();
+    std::vector<Tensor4hf*> vout;
+ for (auto& it : vout_name) {
+ vout.push_back(net_executer.get_out(it));
+ }
+ Tensor4hf* tensor_out = vout[0];
+ LOG(INFO) << "output size: " << vout.size();
+
+#if 0 //print output tensor data
+ LOG(INFO) << "extract data: size: " << tensor_out->valid_size() << \
+ ", width=" << tensor_out->width() << ", height=" << tensor_out->height();
+ const float* ptr_out = tensor_out->data();
+ for (int i = 0; i < tensor_out->valid_size(); i++) {
+ printf("%0.4f ", ptr_out[i]);
+ if ((i + 1) % 7 == 0) {
+ printf("\n");
+ }
+ }
+ printf("\n");
+#endif
+ print_topk(tensor_out->data(), tensor_out->valid_size(), topk, labels);
+}
+
+int main(int argc, char** argv){
+
+ LOG(INFO) << "initialized the device";
+    Env<ARM>::env_init();
+
+ if (argc < 4) {
+ LOG(ERROR) << "usage: " << argv[0] << ": model_file label_file image_name [topk] [test_iter] [threads]";
+ return -1;
+ }
+ char* model_file = argv[1];
+ char* label_file = argv[2];
+ char* image_path = argv[3];
+
+ std::vector labels;
+ load_labels(label_file, labels);
+
+ int topk = 5;
+ if (argc > 4) {
+ topk = atoi(argv[4]);
+ }
+
+ int test_iter = 10;
+ if (argc > 5) {
+ test_iter = atoi(argv[5]);
+ }
+
+ int threads = 1;
+ if (argc > 6) {
+ threads = atoi(argv[6]);
+ }
+
+ test_net(model_file, image_path, labels, topk, threads, test_iter);
+ return 0;
+}
+
diff --git a/examples/arm/ssd_detection.cpp b/examples/arm/ssd_detection.cpp
new file mode 100644
index 000000000..50b02b396
--- /dev/null
+++ b/examples/arm/ssd_detection.cpp
@@ -0,0 +1,233 @@
+#include "graph_base.h"
+#include "graph.h"
+#include "scheduler.h"
+#include "net.h"
+#include "worker.h"
+#include "tensor_op.h"
+#include "timer.h"
+
+using namespace anakin::saber;
+using namespace anakin::graph;
+using namespace anakin;
+typedef Tensor