Skip to content

Commit 1d8dead

Browse files
Marius MeyerMellich
authored andcommitted
Initial commit
0 parents  commit 1d8dead

205 files changed

Lines changed: 18320 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
*/.DS_Store
2+
.DS_Store
3+
cmake-*
4+
.vscode
5+
*._*
6+
build-*
7+
.idea

.gitmodules

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[submodule "extern/cxxopts"]
2+
path = extern/cxxopts
3+
url = https://github.com/jarro2783/cxxopts.git
4+
[submodule "extern/hlslib"]
5+
path = extern/hlslib
6+
url = https://github.com/definelicht/hlslib.git
7+
[submodule "extern/googletest"]
8+
path = extern/googletest
9+
url = https://github.com/google/googletest.git

FFT/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
cmake-*
2+
.DS_Store
3+
build-*
4+
.idea

FFT/CHANGELOG

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Changelog
2+
3+
This file contains all changes made to the source code for each release.
4+
5+
## 1.0
6+
7+
#### Added:
8+
- Host code and OpenCL kernel from Intel FPGA SDK AOC examples
9+
- Execution result for the Bittware 520N board with brief performance model

FFT/CMakeLists.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Top-level build configuration of the FFT benchmark for Intel FPGAs.
# It declares the user-tunable cache options, generates parameters.h from its
# template and adds the device (kernel), host and test sub-directories.
cmake_minimum_required(VERSION 2.8.12)
project(fFFT)

# Benchmark version; substituted into parameters.h as a string.
set(VERSION 1.0)

# Host side defaults. Every cache entry can be overridden with -D<NAME>=<value>.
set(DEFAULT_REPETITIONS 10 CACHE STRING "Default number of repetitions")
set(DEFAULT_ITERATIONS 100 CACHE STRING "Default number of iterations that is done with a single kernel execution")
set(HOST_DATA_TYPE cl_float CACHE STRING "Data type used by the host code. Should match the data type of the used FFT")
set(DEFAULT_DEVICE -1 CACHE STRING "Index of the default device to use")
set(DEFAULT_PLATFORM -1 CACHE STRING "Index of the default platform to use")
set(FPGA_BOARD_NAME p520_hpc_sg280l CACHE STRING "Name of the target FPGA board")

# The flags are kept as one string in the cache; separate_arguments() turns
# them into a list so that every flag reaches aoc as its own argument.
set(AOC_FLAGS "-fpc -fp-relaxed" CACHE STRING "Used flags for the AOC compiler")
separate_arguments(AOC_FLAGS)

# Kernel related parameters.
set(FFT_KERNEL_NAME fft1d CACHE STRING "Name of the kernel that is used for calculation")
set(FETCH_KERNEL_NAME fetch CACHE STRING "Name of the kernel that is used to fetch data from global memory")
set(LOG_FFT_SIZE 12 CACHE STRING "Log2 of the used FFT size")
set(FFT_UNROLL 8 CACHE STRING "Amount of global memory unrolling of the kernel. Will be used by the host to calculate NDRange sizes")

# Inputs are addressed relative to this project (PROJECT_SOURCE_DIR) instead of
# the top of the whole build tree (CMAKE_SOURCE_DIR), so that they are still
# found when this directory is included from a parent project.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/../extern/hlslib/cmake)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)

# Generate parameters.h. The template only contains @VAR@ placeholders, so
# @ONLY keeps any "${...}" text that may be added later from being expanded.
# The output location is unchanged because the sub-directories refer to
# ${CMAKE_BINARY_DIR}/src/common/parameters.h.
configure_file(
        "${PROJECT_SOURCE_DIR}/src/common/parameters.h.in"
        "${CMAKE_BINARY_DIR}/src/common/parameters.h"
        @ONLY
)

# Make the generated header visible to all targets below.
include_directories(${CMAKE_BINARY_DIR}/src/common)

# Find module shipped with hlslib (see CMAKE_MODULE_PATH above).
find_package(IntelFPGAOpenCL REQUIRED)

add_subdirectory(src/device)
add_subdirectory(src/host)
add_subdirectory(tests)
36+

FFT/LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2020 pc2
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

FFT/README.md

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# FFT Benchmark for FPGA
2+
3+
This repository contains the FFT Benchmark for FPGA and its OpenCL kernels.
4+
Currently only the Intel FPGA SDK for OpenCL utility is supported.
5+
6+
It is based on the FFT benchmark of the [HPC Challenge Benchmark](https://icl.utk.edu/hpcc/) suite.
7+
The FFT1D reference implementation is used for the kernel code.
8+
9+
## Dependencies
10+
11+
The benchmark comes with the following requirements for building and running:
12+
13+
- CMake 2.8
14+
- GCC 4.9
15+
- Intel OpenCL FPGA SDK 19.3
16+
17+
It also contains submodules that will be automatically updated when running cmake:
18+
19+
- cxxopts: A header only library to parse command line parameters
20+
- googletest: A C++ test framework
21+
22+
## Build
23+
24+
CMake is used as the build system.
25+
The targets below can be used to build the benchmark and its kernels:
26+
27+
| Target | Description |
28+
| -------- | ---------------------------------------------- |
29+
| fFFT | Builds the host application |
30+
| Google_Tests_run| Compile the tests and its dependencies |
31+
32+
Moreover, there are additional targets to generate kernel reports and bitstreams.
33+
The provided kernel is optimized for Stratix 10 with 512bit LSUs.
34+
The kernel targets are:
35+
36+
| Target | Description |
37+
| -------- | ---------------------------------------------- |
38+
| fft1d_float_8 | Synthesizes the kernel (takes several hours!) |
39+
| fft1d_float_8_report | Create an HTML report for the kernel |
40+
| fft1d_float_8_emulate | Create an emulation kernel |
41+
42+
43+
You can build for example the host application by running
44+
45+
mkdir build && cd build
46+
cmake ..
47+
make fFFT
48+
49+
You will find all executables and kernel files in the `bin`
50+
folder of your build directory.
51+
You should always specify a target with make to reduce the build time!
52+
You might want to specify predefined parameters before build:
53+
54+
Name | Default | Description |
55+
---------------- |-------------|--------------------------------------|
56+
`DEFAULT_DEVICE` | -1 | Index of the default device (-1 = ask) |
57+
`DEFAULT_PLATFORM`| -1 | Index of the default platform (-1 = ask) |
58+
`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board |
59+
`DEFAULT_REPETITIONS`| 10 | Number of times the kernel will be executed |
60+
`DEFAULT_ITERATIONS`| 100 | Default number of iterations that is done with a single kernel execution|
61+
`LOG_FFT_SIZE` | 12 | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8|
62+
`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation |
63+
64+
Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root
65+
of the Intel FPGA SDK installation.
66+
67+
Additionally it is possible to set the used compiler and other build tools
68+
in the `CMakeCache.txt` located in the build directory after running cmake.
69+
70+
71+
72+
## Execution
73+
74+
For execution of the benchmark run:
75+
76+
./fFFT -f path_to_kernel.aocx
77+
78+
For more information on available input parameters run
79+
80+
./fFFT -h
81+
82+
To execute the unit and integration tests run
83+
84+
./Google_Tests_run
85+
86+
in the `bin` folder within the build directory.
87+
It will run an emulation of the kernel and execute some functionality tests.
88+
89+
## Output Interpretation
90+
91+
The benchmark will print the following two tables to standard output after execution:
92+
93+
res. error mach. eps
94+
2.67000e-01 1.19209e-07
95+
96+
avg best
97+
Time in s: 7.56801e-03 7.07241e-03
98+
GFLOPS: 3.24735e-02 3.47491e-02
99+
100+
The first table contains the maximum residual error of the calculation and the
101+
machine epsilon that was used to calculate the residual error.
102+
The benchmark will perform an FFT with the FPGA kernel on random input data.
103+
In a second step the resulting data will be used as input for an iFFT using a CPU
104+
reference implementation in double precision.
105+
The residual error is then calculated with:
106+
107+
![res=\frac{||x-x'||}{\epsilon*ld(n)}](https://latex.codecogs.com/gif.latex?res=\frac{||x-x'||}{\epsilon*ld(n)})
108+
109+
where `x` is the input data of the FFT, `x'` the resulting data from the iFFT, epsilon the machine epsilon and `n` the FFT size.
110+
111+
In the second table the measured execution times and calculated FLOPs are given.
112+
It gives the average and best for both.
113+
The time gives the averaged execution time for a single FFT in case of a batched execution (an execution with more than one iteration).
114+
They are also used to calculate the FLOPs.

FFT/performance/README.md

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Performance Evaluation
2+
3+
## Performance Model
4+
5+
FFT1d kernel modelled here can be found in the Intel OpenCL Design Samples.
6+
The design follows the radix 2<sup>2</sup> FFT architecture, which consists of the following:
7+
8+
1. ld(N) radix-2 butterflies
9+
2. trivial rotations at every even stage
10+
3. non-trivial rotations at every odd stage. This is the twiddle factor multiplication computed after the stage's butterfly.
11+
4. shuffling using shift registers
12+
13+
The FFT is fully pipelined and the FFT step is unrolled over all ld(N) stages.
14+
Hence the performance is limited by the global memory to feed the pipeline with data.
15+
We will focus on modeling the fetch kernel that is loading the data from memory.
16+
The kernel pipeline can be expressed with the following equation:
17+
18+
![t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f}](https://latex.codecogs.com/gif.latex?t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f})
19+
20+
where ![s_{FFT}](https://latex.codecogs.com/gif.latex?s_{FFT}) is the number of bytes needed to load from global memory for the FFT i.e. 4096 * 8B for a 4096 FFT with single precision complex values.
21+
![s_{bus}](https://latex.codecogs.com/gif.latex?s_{bus}) the bus width of the global memory in bytes.
22+
![f](https://latex.codecogs.com/gif.latex?f) is the kernel frequency.
23+
Moreover latency will be added to this operation for every DRAM page that has to be activated:
24+
25+
![t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\)](https://latex.codecogs.com/gif.latex?t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\))
26+
27+
where ![s_{page}](https://latex.codecogs.com/gif.latex?s_{page}) is the size of a DRAM page in bytes.
28+
![t_{RCD}](https://latex.codecogs.com/gif.latex?t_{RCD}) and ![t_{RP}](https://latex.codecogs.com/gif.latex?t_{RP}) are the
29+
row address to column address delay and the row precharge time.
30+
31+
So the total time for the memory accesses for the calculation of a single FFT is:
32+
33+
![t_{mem}=t_{mempipeline}+t_{memoverhead}](https://latex.codecogs.com/gif.latex?t_{mem}=t_{mempipeline}+t_{memoverhead})
34+
35+
This model does not consider latencies of the calculation pipeline or of the memory but it holds for batched calculations where these latencies are hidden.
36+
If memory interleaving is used, t_memoverhead is also hidden by the access to subsequent memory banks.
37+
38+
## Synthesis Results
39+
40+
The kernel was synthesized with the following configuration for the Bittware 520N board:
41+
42+
Name | Default | Description |
43+
---------------- |-------------|--------------------------------------|
44+
`DEFAULT_DEVICE` | -1 | Index of the default device (-1 = ask) |
45+
`DEFAULT_PLATFORM`| -1 | Index of the default platform (-1 = ask) |
46+
`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board |
47+
`DEFAULT_REPETITIONS`| 10 | Number of times the kernel will be executed |
48+
`DEFAULT_ITERATIONS`| 5000 | Default number of iterations that is done with a single kernel execution|
49+
`LOG_FFT_SIZE` | 12 | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8|
50+
`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation |
51+
52+
The used tool versions:
53+
54+
Tool | Version |
55+
---------------- |---------|
56+
Intel OpenCL SDK | 19.4.0 |
57+
BSP | 19.2.0 |
58+
GCC | 8.3.0 |
59+
60+
The resulting output:
61+
62+
-------------------------------------------------------------
63+
Implementation of the FFT benchmark proposed in the HPCC benchmark suite for FPGA.
64+
Version: 1.0
65+
-------------------------------------------------------------
66+
Summary:
67+
FFT Size: 4096
68+
Data Size: 5000 * FFT Size * sizeof(cl_float) = 8.19200e+07 Byte
69+
Repetitions: 10
70+
Kernel file: fft1d_float_8.aocx
71+
Device: p520_hpc_sg280l : BittWare Stratix 10 OpenCL platform (aclbitt_s10_pcie0)
72+
-------------------------------------------------------------
73+
Start benchmark using the given configuration.
74+
-------------------------------------------------------------
75+
res. error mach. eps
76+
3.17324e-01 1.19209e-07
77+
78+
avg best
79+
Time in s: 1.81336e-06 1.81170e-06
80+
GFLOPS: 1.35528e+02 1.35652e+02
81+
82+
So the FFT implementation achieved 135.7 GFLOPs with a kernel frequency of 297.5MHz.
83+
The kernel uses memory interleaving so the model simplifies to:
84+
85+
![t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s](https://latex.codecogs.com/gif.latex?t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s)
86+
87+
which shows a 5.2% difference to the measurement that resulted in 1.81µs.
88+
The difference may be caused by the latencies of the global memory and the calculation pipeline.
89+
Also the store of the FFT result may interfere with the load operations since they use the same memory banks.
90+

FFT/src/common/parameters.h.in

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#ifndef SRC_COMMON_PARAMETERS_H_
2+
#define SRC_COMMON_PARAMETERS_H_
3+
4+
/**
5+
* Host specific parameters
6+
*/
7+
#define VERSION "@VERSION@"
8+
#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@
9+
#define DEFAULT_ITERATIONS @DEFAULT_ITERATIONS@
10+
#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@
11+
#define DEFAULT_DEVICE @DEFAULT_DEVICE@
12+
#define HOST_DATA_TYPE @HOST_DATA_TYPE@
13+
#define FFT_KERNEL_NAME "@FFT_KERNEL_NAME@"
14+
#define FETCH_KERNEL_NAME "@FETCH_KERNEL_NAME@"
15+
16+
/**
17+
* Kernel Parameters
18+
*/
19+
#define LOG_FFT_SIZE @LOG_FFT_SIZE@
20+
#define FFT_UNROLL @FFT_UNROLL@
21+
22+
/**
23+
* Output separator
24+
*/
25+
#define HLINE "-------------------------------------------------------------\n"
26+
27+
#endif // SRC_COMMON_PARAMETERS_H_

FFT/src/device/CMakeLists.txt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
2+
# Include path passed to aoc; it is expected to resolve to the directory that
# holds the generated parameters.h.
set(AOC_INCLUDES "-I${CMAKE_CURRENT_BINARY_DIR}/../common")

# generate_kernel_targets(<kernel_file_name>...)
#
# Creates the build rules for every given kernel. <kernel_file_name> is the
# name of an OpenCL source file in this directory without the ".cl" suffix.
# For each kernel three custom targets are defined:
#   <kernel_file_name>          builds the bitstream for FPGA_BOARD_NAME
#   <kernel_file_name>_report   builds the RTL report for FPGA_BOARD_NAME
#   <kernel_file_name>_emulate  builds the emulation bitstream
# All outputs are written to EXECUTABLE_OUTPUT_PATH.
#
# Reads from the enclosing scope: IntelFPGAOpenCL_AOC, AOC_INCLUDES, AOC_FLAGS,
# FPGA_BOARD_NAME and EXECUTABLE_OUTPUT_PATH.
function(generate_kernel_targets)
    foreach (kernel_file_name ${ARGN})
        set(source_f ${CMAKE_CURRENT_SOURCE_DIR}/${kernel_file_name}.cl)
        set(report_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_report)
        set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.aocx)
        set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.aocx)
        # Files every kernel output is built from. They are listed as DEPENDS
        # of the commands below; without that a changed kernel source or a
        # regenerated parameters.h would not trigger a rebuild of an output
        # that already exists.
        set(kernel_inputs ${source_f} ${CMAKE_BINARY_DIR}/src/common/parameters.h)

        # Emulation bitstream.
        add_custom_command(OUTPUT ${bitstream_emulate_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -legacy-emulator -march=emulator
                -o ${bitstream_emulate_f}
                DEPENDS ${kernel_inputs}
                VERBATIM
                )
        # Bitstream for the target board.
        add_custom_command(OUTPUT ${bitstream_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -board=${FPGA_BOARD_NAME}
                -o ${bitstream_f}
                DEPENDS ${kernel_inputs}
                VERBATIM
                )
        # Report only (-rtl -report).
        add_custom_command(OUTPUT ${report_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -rtl -report -board=${FPGA_BOARD_NAME}
                -o ${report_f}
                DEPENDS ${kernel_inputs}
                VERBATIM
                )

        # Named targets that drive the rules above. SOURCES only attaches the
        # input files to the target (e.g. for display in IDEs).
        add_custom_target(${kernel_file_name}_report
                DEPENDS ${report_f} ${kernel_inputs}
                SOURCES ${kernel_inputs})
        add_custom_target(${kernel_file_name}
                DEPENDS ${bitstream_f} ${kernel_inputs}
                SOURCES ${kernel_inputs})
        add_custom_target(${kernel_file_name}_emulate
                DEPENDS ${bitstream_emulate_f} ${kernel_inputs}
                SOURCES ${kernel_inputs})
    endforeach ()
endfunction()

generate_kernel_targets(fft1d_float_8)

0 commit comments

Comments
 (0)