Merge branch 'beff_synth' into 'master'

Mellich · Mellich · commit 60651eb40fa5 · 2021-04-09T09:23:15.000Z
Beff synth

See merge request pc2/HPCC_FPGA!44
diff --git a/PTRANS/src/host/transpose_handlers.cpp b/PTRANS/src/host/transpose_handlers.cpp
@@ -110,7 +110,7 @@ std::unique_ptr<transpose::TransposeData> transpose::DistributedDiagonalTranspos
 
 void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose::TransposeData& data) {
 #ifndef NDEBUG
-    std::cout << "Start data exchange " << mpi_comm_rank << std::endl;
+    // std::cout << "Start data exchange " << mpi_comm_rank << std::endl;
 #endif
     // Only need to exchange data, if rank has a partner
     if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) {
@@ -132,7 +132,7 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose:
         while (remaining_data_size > 0) {
             int next_chunk = (remaining_data_size > std::numeric_limits<int>::max()) ? std::numeric_limits<int>::max(): remaining_data_size;
 #ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl;
+    // std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl;
 #endif      
             if (pair_rank > mpi_comm_rank) {
                 MPI_Send(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD);
@@ -148,14 +148,14 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose:
             }
             // MPI_Sendrecv_replace(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, pair_rank, 0, MPI_COMM_WORLD, &status);
  #ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl;
+    // std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl;
 #endif  
             remaining_data_size -= next_chunk;
             offset += next_chunk;
         }
     }
 #ifndef NDEBUG
-    std::cout << "End data exchange " << mpi_comm_rank << std::endl;
+    // std::cout << "End data exchange " << mpi_comm_rank << std::endl;
 #endif
 }
 
diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt
@@ -9,16 +9,16 @@ set(DEFAULT_MAX_LOOP_LENGTH 65536 CACHE STRING "Maximum number of repetitions do
 set(DEFAULT_MIN_LOOP_LENGTH 256 CACHE STRING "Minimum number of repetitions done for a single message size")#
 set(DEFAULT_LOOP_LENGTH_OFFSET 11 CACHE STRING "Offset that is used for the message sizes before reducing the number of repetitions")
 set(DEFAULT_LOOP_LENGTH_DECREASE 7 CACHE STRING "Number of steps that are used to decrease the number of repetitions to its minimum")
+set(NUM_REPLICATIONS 2 CACHE STRING "")
 
 set(USE_MPI Yes)
 set(USE_DEPRECATED_HPP_HEADER No)
 
 set(DATA_TYPE char)
 include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
 unset(DATA_TYPE CACHE)
-if (NOT MPI_FOUND) 
-    message(ERROR "Benchmarks host code requires MPI, but MPI could not be found.")
-endif()
+find_package(MPI REQUIRED)
+
 if (NOT INTELFPGAOPENCL_FOUND)
     message(ERROR "Benchmark does only support the Intel OpenCL SDK")
 endif()
diff --git a/b_eff/configs/Bittware_520N.cmake b/b_eff/configs/Bittware_520N.cmake
@@ -0,0 +1,17 @@
+# This file contains the default configuration for the Nallatech 520N board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE)
+set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)
+
+# GEMM specific options
+set(CHANNEL_WIDTH 32 CACHE STRING "Width of a single external channel in Byte" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications" FORCE)
diff --git a/b_eff/scripts/build_520n.sh b/b_eff/scripts/build_520n.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Synthesize the b_eff kernel for the Nallaatech 520N board.
+# This is an example script, how the synthesis can be started on Noctua using a HPCC FPGA configuration file.
+# Submit this script to sbatch in this folder!
+#
+#SBATCH -p fpgasyn
+#SBATCH -J b_eff
+
+module load intelFPGA_pro/20.4.0
+module load nalla_pcie/19.4.0_hpc
+module load intel
+module load devel/CMake/3.15.3-GCCcore-8.3.0
+
+SCRIPT_PATH=${SLURM_SUBMIT_DIR}
+
+BENCHMARK_DIR=${SCRIPT_PATH}/../
+
+SYNTH_DIR=${PFS_SCRATCH}/synth/520n/multi_fpga/b_eff
+
+
+mkdir -p ${SYNTH_DIR}
+cd ${SYNTH_DIR}
+
+cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/Bittware_520N.cmake
+
+make communication_bw520n_intel Network_intel
+
diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in
@@ -17,6 +17,7 @@
  * Kernel Parameters
  */
 #define CHANNEL_WIDTH @CHANNEL_WIDTH@
+#define NUM_REPLICATIONS @NUM_REPLICATIONS@
 
 #define HOST_DATA_TYPE @HOST_DATA_TYPE@
 #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@
diff --git a/b_eff/src/host/execution_default.cpp b/b_eff/src/host/execution_default.cpp
@@ -52,7 +52,7 @@ namespace bm_execution {
         std::vector<cl::Buffer> validationBuffers;
 
         // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
-        for (int r = 0; r < 2; r++) {
+        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
 
             validationBuffers.push_back(cl::Buffer(*config.context, CL_MEM_WRITE_ONLY, sizeof(HOST_DATA_TYPE) * validationData.size(),0,&err));
             ASSERT_CL(err)
@@ -90,23 +90,42 @@ namespace bm_execution {
         for (uint r =0; r < config.programSettings->numRepetitions; r++) {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
-            for (int i = 0; i < 2; i++) {
+            for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 sendQueues[i].enqueueNDRangeKernel(sendKernels[i], cl::NullRange, cl::NDRange(1));
                 recvQueues[i].enqueueNDRangeKernel(recvKernels[i], cl::NullRange, cl::NDRange(1));
+                #ifndef NDEBUG
+                        int current_rank;
+                        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+                        std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl;
+                #endif
             }
-            for (int i = 0; i < 2; i++) {
+            for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 sendQueues[i].finish();
+                #ifndef NDEBUG
+                        int current_rank;
+                        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+                        std::cout << "Rank " << current_rank << ": Send done " << r << "," << i << std::endl;
+                #endif
                 recvQueues[i].finish();
+                #ifndef NDEBUG
+                        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+                        std::cout << "Rank " << current_rank << ": Recv done " << r << "," << i << std::endl;
+                #endif
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             std::chrono::duration<double> calculationTime =
                     std::chrono::duration_cast<std::chrono::duration<double>>
                             (endCalculation - startCalculation);
             calculationTimings.push_back(calculationTime.count());
+#ifndef NDEBUG
+        int current_rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+        std::cout << "Rank " << current_rank << ": Done " << r << std::endl;
+#endif
         }
         // Read validation data from FPGA will be placed sequentially in buffer for all replications
         // The data order should not matter, because every byte should have the same value!
-        for (int r = 0; r < 2; r++) {
+        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
             err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / 2, &validationData.data()[r * validationData.size() / 2]);
             ASSERT_CL(err);
         }
diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py
@@ -12,7 +12,7 @@
 fft_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Batch\\sSize\\s+(?P<batch_size>\d+)\nFFT\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P<error>(\d|\.|\+|-|e)+)\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P<avg_time>(\d|\.|\+|-|e)+)\\s+(?P<best_time>(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P<avg_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_flops>(\d|\.|\+|-|e)+)"
 gemm_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P<error>(\d|\.|\+|-|e)+)\\s+(?P<resid>(\d|\.|\+|-|e)+)\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P<best_time>.+)\\s+(?P<avg_time>.+)\\s+(?P<gflops>.+)"
 ra_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Array\\sSize\\s+(?P<size>(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P<replications>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+Error:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P<best_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_time>(\d|\.|\+|-|e)+)\\s+(?P<gops>(\d|\.|\+|-|e)+)"
-trans_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[GB/s\\]\\s+Mem\\s+\\[GB/s\\]\n\\s*avg:\\s+(?P<avg_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<avg_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<avg_mem_bw>(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P<best_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<best_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<best_mem_bw>(\d|\.|\+|-|e)+)"
+trans_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[B/s\\]\\s+Mem\\s+\\[B/s\\]\n\\s*avg:\\s+(?P<avg_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<avg_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<avg_mem_bw>(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P<best_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<best_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<best_mem_bw>(\d|\.|\+|-|e)+)"
 stream_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P<size>(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P<data_type>.+)\n(.*\n)+Kernel\\sReplications\\s+(?P<replications>\d+)(.*\n)+Kernel\\sType\\s+(?P<type>.+)\n(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P<add_rate>(\d|\.|\+|-|e)+)\\s+(?P<add_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<add_min_time>(\d|\.|\+|-|e)+)\\s+(?P<add_max_time>(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P<copy_rate>(\d|\.|\+|-|e)+)\\s+(?P<copy_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<copy_min_time>(\d|\.|\+|-|e)+)\\s+(?P<copy_max_time>(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P<pcir_rate>(\d|\.|\+|-|e)+)\\s+(?P<pcir_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<pcir_min_time>(\d|\.|\+|-|e)+)\\s+(?P<pcir_max_time>(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P<pciw_rate>(\d|\.|\+|-|e)+)\\s+(?P<pciw_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<pciw_min_time>(\d|\.|\+|-|e)+)\\s+(?P<pciw_max_time>(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P<scale_rate>(\d|\.|\+|-|e)+)\\s+(?P<scale_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<scale_min_time>(\d|\.|\+|-|e)+)\\s+(?P<scale_max_time>(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P<triad_rate>(\d|\.|\+|-|e)+)\\s+(?P<triad_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<triad_min_time>(\d|\.|\+|-|e)+)\\s+(?P<triad_max_time>(\d|\.|\+|-|e)+)"
 linpack_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P<error>((\d|\.|\+|-|e)+|nan))\\s+(?P<resid>((\d|\.|\+|-|e)+|nan))\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P<total_best_time>(\d|\.|\+|-|e)+)\\s+(?P<total_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<total_gflops>(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P<lu_best_time>(\d|\.|\+|-|e)+)\\s+(?P<lu_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<lu_gflops>(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P<sl_best_time>(\d|\.|\+|-|e)+)\\s+(?P<sl_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<sl_gflops>(\d|\.|\+|-|e)+)"
    
diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
@@ -130,7 +130,11 @@ class BaseSettings {
             defaultPlatform(results["platform"].as<int>()),
             defaultDevice(results["device"].as<int>()),
             kernelFileName(results["f"].as<std::string>()),
+#ifdef NUM_REPLICATIONS
+            kernelReplications(results.count("r") > 0 ? results["r"].as<uint>() : NUM_REPLICATIONS),
+#else
             kernelReplications(results.count("r") > 0 ? results["r"].as<uint>() : 1),
+#endif
             testOnly(static_cast<bool>(results.count("test"))) {}
 
     /**