Skip to content

Commit 60651eb

Browse files
committed
Merge branch 'beff_synth' into 'master'
Beff synth See merge request pc2/HPCC_FPGA!44
2 parents af5b916 + 5c206e7 commit 60651eb

8 files changed

Lines changed: 81 additions & 12 deletions

File tree

PTRANS/src/host/transpose_handlers.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ std::unique_ptr<transpose::TransposeData> transpose::DistributedDiagonalTranspos
110110

111111
void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose::TransposeData& data) {
112112
#ifndef NDEBUG
113-
std::cout << "Start data exchange " << mpi_comm_rank << std::endl;
113+
// std::cout << "Start data exchange " << mpi_comm_rank << std::endl;
114114
#endif
115115
// Only need to exchange data, if rank has a partner
116116
if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) {
@@ -132,7 +132,7 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose:
132132
while (remaining_data_size > 0) {
133133
int next_chunk = (remaining_data_size > std::numeric_limits<int>::max()) ? std::numeric_limits<int>::max(): remaining_data_size;
134134
#ifndef NDEBUG
135-
std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl;
135+
// std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl;
136136
#endif
137137
if (pair_rank > mpi_comm_rank) {
138138
MPI_Send(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD);
@@ -148,14 +148,14 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose:
148148
}
149149
// MPI_Sendrecv_replace(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, pair_rank, 0, MPI_COMM_WORLD, &status);
150150
#ifndef NDEBUG
151-
std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl;
151+
// std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl;
152152
#endif
153153
remaining_data_size -= next_chunk;
154154
offset += next_chunk;
155155
}
156156
}
157157
#ifndef NDEBUG
158-
std::cout << "End data exchange " << mpi_comm_rank << std::endl;
158+
// std::cout << "End data exchange " << mpi_comm_rank << std::endl;
159159
#endif
160160
}
161161

b_eff/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,16 @@ set(DEFAULT_MAX_LOOP_LENGTH 65536 CACHE STRING "Maximum number of repetitions do
99
set(DEFAULT_MIN_LOOP_LENGTH 256 CACHE STRING "Minimum number of repetitions done for a single message size")#
1010
set(DEFAULT_LOOP_LENGTH_OFFSET 11 CACHE STRING "Offset that is used for the message sizes before reducing the number of repetitions")
1111
set(DEFAULT_LOOP_LENGTH_DECREASE 7 CACHE STRING "Number of steps that are used to decrease the number of repetitions to its minimum")
12+
set(NUM_REPLICATIONS 2 CACHE STRING "")
1213

1314
set(USE_MPI Yes)
1415
set(USE_DEPRECATED_HPP_HEADER No)
1516

1617
set(DATA_TYPE char)
1718
include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
1819
unset(DATA_TYPE CACHE)
19-
if (NOT MPI_FOUND)
20-
message(ERROR "Benchmarks host code requires MPI, but MPI could not be found.")
21-
endif()
20+
find_package(MPI REQUIRED)
21+
2222
if (NOT INTELFPGAOPENCL_FOUND)
2323
message(ERROR "Benchmark does only support the Intel OpenCL SDK")
2424
endif()

b_eff/configs/Bittware_520N.cmake

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# This file contains the default configuration for the Nallatech 520N board
2+
# for use with single precision floating point values.
3+
# To use this configuration file, call cmake with the parameter
4+
#
5+
# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
6+
#
7+
8+
9+
set(USE_MPI Yes CACHE BOOL "" FORCE)
10+
set(USE_SVM No CACHE BOOL "" FORCE)
11+
set(USE_HBM No CACHE BOOL "" FORCE)
12+
set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE)
13+
set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)
14+
15+
# b_eff specific options
16+
set(CHANNEL_WIDTH 32 CACHE STRING "Width of a single external channel in Byte" FORCE)
17+
set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications" FORCE)

b_eff/scripts/build_520n.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/bin/bash
2+
#
3+
# Synthesize the b_eff kernel for the Nallatech 520N board.
4+
# This is an example script showing how the synthesis can be started on Noctua using an HPCC FPGA configuration file.
5+
# Submit this script to sbatch in this folder!
6+
#
7+
#SBATCH -p fpgasyn
8+
#SBATCH -J b_eff
9+
10+
module load intelFPGA_pro/20.4.0
11+
module load nalla_pcie/19.4.0_hpc
12+
module load intel
13+
module load devel/CMake/3.15.3-GCCcore-8.3.0
14+
15+
SCRIPT_PATH=${SLURM_SUBMIT_DIR}
16+
17+
BENCHMARK_DIR=${SCRIPT_PATH}/../
18+
19+
SYNTH_DIR=${PFS_SCRATCH}/synth/520n/multi_fpga/b_eff
20+
21+
22+
mkdir -p ${SYNTH_DIR}
23+
cd ${SYNTH_DIR}
24+
25+
cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/Bittware_520N.cmake
26+
27+
make communication_bw520n_intel Network_intel
28+

b_eff/src/common/parameters.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* Kernel Parameters
1818
*/
1919
#define CHANNEL_WIDTH @CHANNEL_WIDTH@
20+
#define NUM_REPLICATIONS @NUM_REPLICATIONS@
2021

2122
#define HOST_DATA_TYPE @HOST_DATA_TYPE@
2223
#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@

b_eff/src/host/execution_default.cpp

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ namespace bm_execution {
5252
std::vector<cl::Buffer> validationBuffers;
5353

5454
// Create all kernels and buffers. One kernel pair is generated per kernel replication to utilize all channels
55-
for (int r = 0; r < 2; r++) {
55+
for (int r = 0; r < config.programSettings->kernelReplications; r++) {
5656

5757
validationBuffers.push_back(cl::Buffer(*config.context, CL_MEM_WRITE_ONLY, sizeof(HOST_DATA_TYPE) * validationData.size(),0,&err));
5858
ASSERT_CL(err)
@@ -90,23 +90,42 @@ namespace bm_execution {
9090
for (uint r =0; r < config.programSettings->numRepetitions; r++) {
9191
MPI_Barrier(MPI_COMM_WORLD);
9292
auto startCalculation = std::chrono::high_resolution_clock::now();
93-
for (int i = 0; i < 2; i++) {
93+
for (int i = 0; i < config.programSettings->kernelReplications; i++) {
9494
sendQueues[i].enqueueNDRangeKernel(sendKernels[i], cl::NullRange, cl::NDRange(1));
9595
recvQueues[i].enqueueNDRangeKernel(recvKernels[i], cl::NullRange, cl::NDRange(1));
96+
#ifndef NDEBUG
97+
int current_rank;
98+
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
99+
std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl;
100+
#endif
96101
}
97-
for (int i = 0; i < 2; i++) {
102+
for (int i = 0; i < config.programSettings->kernelReplications; i++) {
98103
sendQueues[i].finish();
104+
#ifndef NDEBUG
105+
int current_rank;
106+
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
107+
std::cout << "Rank " << current_rank << ": Send done " << r << "," << i << std::endl;
108+
#endif
99109
recvQueues[i].finish();
110+
#ifndef NDEBUG
111+
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
112+
std::cout << "Rank " << current_rank << ": Recv done " << r << "," << i << std::endl;
113+
#endif
100114
}
101115
auto endCalculation = std::chrono::high_resolution_clock::now();
102116
std::chrono::duration<double> calculationTime =
103117
std::chrono::duration_cast<std::chrono::duration<double>>
104118
(endCalculation - startCalculation);
105119
calculationTimings.push_back(calculationTime.count());
120+
#ifndef NDEBUG
121+
int current_rank;
122+
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
123+
std::cout << "Rank " << current_rank << ": Done " << r << std::endl;
124+
#endif
106125
}
107126
// Validation data read from the FPGA is placed sequentially in the buffer for all replications
108127
// The data order should not matter, because every byte should have the same value!
109-
for (int r = 0; r < 2; r++) {
128+
for (int r = 0; r < config.programSettings->kernelReplications; r++) {
110129
err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / 2, &validationData.data()[r * validationData.size() / 2]);
111130
ASSERT_CL(err);
112131
}

scripts/evaluation/parse_raw_to_csv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
fft_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Batch\\sSize\\s+(?P<batch_size>\d+)\nFFT\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P<error>(\d|\.|\+|-|e)+)\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P<avg_time>(\d|\.|\+|-|e)+)\\s+(?P<best_time>(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P<avg_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_flops>(\d|\.|\+|-|e)+)"
1313
gemm_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P<error>(\d|\.|\+|-|e)+)\\s+(?P<resid>(\d|\.|\+|-|e)+)\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P<best_time>.+)\\s+(?P<avg_time>.+)\\s+(?P<gflops>.+)"
1414
ra_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Array\\sSize\\s+(?P<size>(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P<replications>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+Error:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P<best_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_time>(\d|\.|\+|-|e)+)\\s+(?P<gops>(\d|\.|\+|-|e)+)"
15-
trans_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[GB/s\\]\\s+Mem\\s+\\[GB/s\\]\n\\s*avg:\\s+(?P<avg_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<avg_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<avg_mem_bw>(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P<best_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<best_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<best_mem_bw>(\d|\.|\+|-|e)+)"
15+
trans_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[B/s\\]\\s+Mem\\s+\\[B/s\\]\n\\s*avg:\\s+(?P<avg_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<avg_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<avg_mem_bw>(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P<best_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<best_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<best_mem_bw>(\d|\.|\+|-|e)+)"
1616
stream_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P<size>(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P<data_type>.+)\n(.*\n)+Kernel\\sReplications\\s+(?P<replications>\d+)(.*\n)+Kernel\\sType\\s+(?P<type>.+)\n(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P<add_rate>(\d|\.|\+|-|e)+)\\s+(?P<add_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<add_min_time>(\d|\.|\+|-|e)+)\\s+(?P<add_max_time>(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P<copy_rate>(\d|\.|\+|-|e)+)\\s+(?P<copy_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<copy_min_time>(\d|\.|\+|-|e)+)\\s+(?P<copy_max_time>(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P<pcir_rate>(\d|\.|\+|-|e)+)\\s+(?P<pcir_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<pcir_min_time>(\d|\.|\+|-|e)+)\\s+(?P<pcir_max_time>(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P<pciw_rate>(\d|\.|\+|-|e)+)\\s+(?P<pciw_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<pciw_min_time>(\d|\.|\+|-|e)+)\\s+(?P<pciw_max_time>(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P<scale_rate>(\d|\.|\+|-|e)+)\\s+(?P<scale_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<scale_min_time>(\d|\.|\+|-|e)+)\\s+(?P<scale_max_time>(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P<triad_rate>(\d|\.|\+|-|e)+)\\s+(?P<triad_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<triad_min_time>(\d|\.|\+|-|e)+)\\s+(?P<triad_max_time>(\d|\.|\+|-|e)+)"
1717
linpack_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P<error>((\d|\.|\+|-|e)+|nan))\\s+(?P<resid>((\d|\.|\+|-|e)+|nan))\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P<total_best_time>(\d|\.|\+|-|e)+)\\s+(?P<total_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<total_gflops>(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P<lu_best_time>(\d|\.|\+|-|e)+)\\s+(?P<lu_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<lu_gflops>(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P<sl_best_time>(\d|\.|\+|-|e)+)\\s+(?P<sl_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<sl_gflops>(\d|\.|\+|-|e)+)"
1818

shared/include/hpcc_benchmark.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,11 @@ class BaseSettings {
130130
defaultPlatform(results["platform"].as<int>()),
131131
defaultDevice(results["device"].as<int>()),
132132
kernelFileName(results["f"].as<std::string>()),
133+
#ifdef NUM_REPLICATIONS
134+
kernelReplications(results.count("r") > 0 ? results["r"].as<uint>() : NUM_REPLICATIONS),
135+
#else
133136
kernelReplications(results.count("r") > 0 ? results["r"].as<uint>() : 1),
137+
#endif
134138
testOnly(static_cast<bool>(results.count("test"))) {}
135139

136140
/**

0 commit comments

Comments
 (0)