From 92c614b12db8b0b7fc6019780da7a852adabf7e9 Mon Sep 17 00:00:00 2001 From: Tristan AMIOTTE-SUCHET Date: Sat, 30 May 2026 04:22:34 +0300 Subject: [PATCH 1/7] working base --- .gitignore | 2 + CMakeLists.txt | 6 +- main_ans.cpp | 236 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 main_ans.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..45bb338 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +# data folder +build/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b04fd0..b2cf801 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,4 +22,8 @@ add_executable(matmul main_ans.cpp) if(OpenMP_CXX_FOUND) target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX) -endif() \ No newline at end of file +endif() + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data + DESTINATION ${CMAKE_CURRENT_BINARY_DIR} +) diff --git a/main_ans.cpp b/main_ans.cpp new file mode 100644 index 0000000..ae9fb43 --- /dev/null +++ b/main_ans.cpp @@ -0,0 +1,236 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define FLOAT_TOLERANCE 1e-2 + +void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n); +void matfile_read_matrix(std::ifstream& file, float* matrix); +void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n); + +void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p); +void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size); +void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p); +bool validate_result(const std::string &result_file, const std::string &reference_file); + +int main(int argc, char* argv[]) +{ + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + int case_number = std::atoi(argv[1]); + if (case_number < 0 || 
case_number > 9) { + std::cerr << "Case number must be between 0 and 9" << std::endl; + return 1; + } + + // Construct file paths + std::string folder = "data/" + std::to_string(case_number) + "/"; + std::string input0_file = folder + "input0.raw"; + std::string input1_file = folder + "input1.raw"; + std::string result_file = folder + "result.raw"; + std::string reference_file = folder + "output.raw"; + + // open input files + std::ifstream input0(input0_file, std::ios::binary); + std::ifstream input1(input1_file, std::ios::binary); + + // read dimensions of input matrices + uint32_t m, n, n2, p; + matfile_read_dimensions(input0, m, n); + matfile_read_dimensions(input1, n2, p); + // validate dimensions + if (n != n2) { + std::cerr << "Inner dimensions of A and B must match" << std::endl; + return EXIT_FAILURE; + } + + // allocate memory for input matrices + float* A = new float[m * n]; + float* B = new float[n * p]; + + // read input matrices from files + matfile_read_matrix(input0, A); + matfile_read_matrix(input1, B); + + // close input files + input0.close(); + input1.close(); + + // Allocate memory for result matrices + float* C_naive = new float[m * p]; + float* C_blocked = new float[m * p]; + float* C_parallel = new float[m * p]; + + // Measure performance of naive_matmul + double start_time = omp_get_wtime(); + naive_matmul(C_naive, A, B, m, n, p); + double naive_time = omp_get_wtime() - start_time; + + // write naive result to file + matfile_write_matrix(result_file, C_naive, m, p); + + // Validate naive result + bool naive_correct = validate_result(result_file, reference_file); + if (!naive_correct) { + std::cerr << "Naive result validation failed for case " << case_number << std::endl; + } + + // Measure performance of blocked_matmul (use block_size = 32 as default) + start_time = omp_get_wtime(); + blocked_matmul(C_blocked, A, B, m, n, p, 32); + double blocked_time = omp_get_wtime() - start_time; + + // write blocked result to file + 
matfile_write_matrix(result_file, C_blocked, m, p); + + // Validate blocked result + bool blocked_correct = validate_result(result_file, reference_file); + if (!blocked_correct) { + std::cerr << "Blocked result validation failed for case " << case_number << std::endl; + } + + // Measure performance of parallel_matmul + start_time = omp_get_wtime(); + parallel_matmul(C_parallel, A, B, m, n, p); + double parallel_time = omp_get_wtime() - start_time; + + // write parallel result to file + matfile_write_matrix(result_file, C_parallel, m, p); + + // Validate parallel result + bool parallel_correct = validate_result(result_file, reference_file); + if (!parallel_correct) { + std::cerr << "Parallel result validation failed for case " << case_number << std::endl; + } + + // Print performance results + std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n"; + std::cout << "Naive time: " << naive_time << " seconds\n"; + std::cout << "Blocked time: " << blocked_time << " seconds\n"; + std::cout << "Parallel time: " << parallel_time << " seconds\n"; + std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n"; + std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n"; + + // close input files + input0.close(); + input1.close(); + + // Clean up + delete[] A; + delete[] B; + delete[] C_naive; + delete[] C_blocked; + delete[] C_parallel; + + return EXIT_SUCCESS; +} + +void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n) +{ + // read the first line of the file to extract the dimensions of the matrix + std::string str_m, str_n; + file >> str_m >> str_n; + m = std::stoul(str_m); + n = std::stoul(str_n); +} + +void matfile_read_matrix(std::ifstream& file, float* matrix) +{ + // read the file line by line + std::string line; + std::size_t index = 0; + std::string value; + while (file >> value) { + matrix[index++] = static_cast(std::stof(value)); + } +} + +void matfile_write_matrix(std::string const& 
filename, const float* matrix, uint32_t m, uint32_t n) +{ + // open the file for writing + std::ofstream file(filename, std::ios::binary); + // write the matrix dimensions as the first line of the file + file << m << ' ' << n << std::endl; + // write the matrix to the file line by line + for (uint32_t i = 0; i < m; ++i) { + for (uint32_t j = 0; j < n; ++j) { + file << matrix[i*n + j] << " "; + } + file << std::endl; + } + // close the file + file.close(); +} + +void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) +{ + for (uint32_t i = 0; i < m; ++i) { + for (uint32_t j = 0; j < p; ++j) { + float sum = 0.0f; + for (uint32_t k = 0; k < n; ++k) { + sum += A[i*n + k] * B[k*p + j]; + } + C[i*p + j] = sum; + } + } +} + +void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) +{ + // TODO: Implement blocked matrix multiplication + // A is m x n, B is n x p, C is m x p + // Use block_size to divide matrices into submatrices + naive_matmul(C, A, B, m, n, p); // Placeholder: replace with actual blocked implementation +} + +void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) +{ + // TODO: Implement parallel matrix multiplication using OpenMP + // A is m x n, B is n x p, C is m x p + naive_matmul(C, A, B, m, n, p); // Placeholder: replace with actual parallel implementation +} + +bool validate_result(const std::string &result_file, const std::string &reference_file) +{ + // Open result and reference files + std::ifstream result(result_file, std::ios::binary); + std::ifstream reference(reference_file, std::ios::binary); + // Read dimensions of result and reference matrices + uint32_t m_result, n_result, m_ref, n_ref; + matfile_read_dimensions(result, m_result, n_result); + matfile_read_dimensions(reference, m_ref, n_ref); + // Validate dimensions + if (m_result != m_ref || n_result != n_ref) { + std::cerr << "Dimension mismatch: result is " << m_result << "x" << 
n_result + << ", reference is " << m_ref << "x" << n_ref << std::endl; + return false; + } + // Read matrices into memory + float* C_result = new float[m_result * n_result]; + float* C_reference = new float[m_ref * n_ref]; + matfile_read_matrix(result, C_result); + matfile_read_matrix(reference, C_reference); + // Validate values with tolerance + bool valid = true; + for (uint32_t i = 0; i < m_result * n_result; ++i) { + if (std::fabs(C_result[i] - C_reference[i]) > FLOAT_TOLERANCE) { + std::cerr << "Value mismatch at index " << i << ": result is " << C_result[i] + << ", reference is " << C_reference[i] << std::endl; + valid = false; + break; + } + } + // Clean up + delete[] C_result; + delete[] C_reference; + // exit + return valid; +} From 6be18112d9cc102a624ddee7559a3e71cd130d60 Mon Sep 17 00:00:00 2001 From: Tristan AMIOTTE-SUCHET Date: Sat, 30 May 2026 05:26:14 +0300 Subject: [PATCH 2/7] multi-thread matmul Still no improvement with the cached version of matmul. --- load-env.sh | 2 ++ main_ans.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 4 deletions(-) create mode 100755 load-env.sh diff --git a/load-env.sh b/load-env.sh new file mode 100755 index 0000000..dfeae0c --- /dev/null +++ b/load-env.sh @@ -0,0 +1,2 @@ +OMP_NUM_THREADS=8 +export OMP_NUM_THREADS diff --git a/main_ans.cpp b/main_ans.cpp index ae9fb43..50acded 100644 --- a/main_ans.cpp +++ b/main_ans.cpp @@ -7,6 +7,7 @@ #include #include +#define BLOCK_SIZE 128 #define FLOAT_TOLERANCE 1e-2 void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n); @@ -83,9 +84,9 @@ int main(int argc, char* argv[]) std::cerr << "Naive result validation failed for case " << case_number << std::endl; } - // Measure performance of blocked_matmul (use block_size = 32 as default) + // Measure performance of blocked_matmul start_time = omp_get_wtime(); - blocked_matmul(C_blocked, A, B, m, n, p, 32); + blocked_matmul(C_blocked, A, B, m, n, p, BLOCK_SIZE); double 
blocked_time = omp_get_wtime() - start_time; // write blocked result to file @@ -183,19 +184,59 @@ void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t } } +// // C = A * B +// for (ii = 0; ii < m; ii += block_size) +// for (jj = 0; jj < p; jj += block_size) +// for (kk = 0; kk < n; kk += block_size) +// // Process block: C[ii:ii+block_size, jj:jj+block_size] += A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, jj:jj+block_size] +// for (i = ii; i < min(ii + block_size, m); i++) +// for (j = jj; j < min(jj + block_size, p); j++) +// for (k = kk; k < min(kk + block_size, n); k++) +// C[i * p + j] += A[i * n + k] * B[k * p + j] void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) { // TODO: Implement blocked matrix multiplication // A is m x n, B is n x p, C is m x p // Use block_size to divide matrices into submatrices - naive_matmul(C, A, B, m, n, p); // Placeholder: replace with actual blocked implementation + for (uint32_t ii = 0; ii < m; ii += BLOCK_SIZE) { + for (uint32_t kk = 0; kk < n; kk += BLOCK_SIZE) { + for (uint32_t jj = 0; jj < p; jj += BLOCK_SIZE) { + + uint32_t i_end = std::min(ii + BLOCK_SIZE, m); + uint32_t k_end = std::min(kk + BLOCK_SIZE, n); + uint32_t j_end = std::min(jj + BLOCK_SIZE, p); + + for (uint32_t i = ii; i < i_end; ++i) { + for (uint32_t k = kk; k < k_end; ++k) { + + float aik = A[i * n + k]; + + for (uint32_t j = jj; j < j_end; ++j) { + C[i * p + j] += + aik * B[k * p + j]; + } + } + } + } + } + } } void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { // TODO: Implement parallel matrix multiplication using OpenMP // A is m x n, B is n x p, C is m x p - naive_matmul(C, A, B, m, n, p); // Placeholder: replace with actual parallel implementation + // naive_matmul(C, A, B, m, n, p); + #pragma omp parallel for + for (uint32_t i = 0; i < m; ++i) { + for (uint32_t j = 0; j < p; ++j) { + float sum = 0.0f; + for 
(uint32_t k = 0; k < n; ++k) { + sum += A[i*n + k] * B[k*p + j]; + } + C[i*p + j] = sum; + } + } } bool validate_result(const std::string &result_file, const std::string &reference_file) From e6fd549e519d2cf0ceae0af7dc3a157834c02b7b Mon Sep 17 00:00:00 2001 From: Tristan AMIOTTE-SUCHET Date: Sun, 31 May 2026 19:05:20 +0300 Subject: [PATCH 3/7] refacto & random matrix generation --- CMakeLists.txt | 5 +++++ main_ans.cpp | 61 +++++++++----------------------------------------- matfile.cpp | 40 +++++++++++++++++++++++++++++++++ matfile.h | 8 +++++++ randmat.cpp | 49 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 51 deletions(-) create mode 100644 matfile.cpp create mode 100644 matfile.h create mode 100644 randmat.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b2cf801..ef5b383 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,14 +16,19 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof") endif() +add_library(matfile matfile.cpp) add_executable(matmul main_ans.cpp) +add_executable(randmat randmat.cpp) +target_link_libraries(matmul PRIVATE matfile) if(OpenMP_CXX_FOUND) target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX) endif() +target_link_libraries(randmat PRIVATE matfile) + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data DESTINATION ${CMAKE_CURRENT_BINARY_DIR} ) diff --git a/main_ans.cpp b/main_ans.cpp index 50acded..699af0d 100644 --- a/main_ans.cpp +++ b/main_ans.cpp @@ -7,12 +7,10 @@ #include #include -#define BLOCK_SIZE 128 -#define FLOAT_TOLERANCE 1e-2 +#include "matfile.h" -void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n); -void matfile_read_matrix(std::ifstream& file, float* matrix); -void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n); +#define BLOCK_SIZE 32 +#define FLOAT_TOLERANCE 1e-2 void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p); void blocked_matmul(float *C, float *A, float *B, 
uint32_t m, uint32_t n, uint32_t p, uint32_t block_size); @@ -134,45 +132,9 @@ int main(int argc, char* argv[]) return EXIT_SUCCESS; } -void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n) -{ - // read the first line of the file to extract the dimensions of the matrix - std::string str_m, str_n; - file >> str_m >> str_n; - m = std::stoul(str_m); - n = std::stoul(str_n); -} - -void matfile_read_matrix(std::ifstream& file, float* matrix) -{ - // read the file line by line - std::string line; - std::size_t index = 0; - std::string value; - while (file >> value) { - matrix[index++] = static_cast(std::stof(value)); - } -} - -void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n) -{ - // open the file for writing - std::ofstream file(filename, std::ios::binary); - // write the matrix dimensions as the first line of the file - file << m << ' ' << n << std::endl; - // write the matrix to the file line by line - for (uint32_t i = 0; i < m; ++i) { - for (uint32_t j = 0; j < n; ++j) { - file << matrix[i*n + j] << " "; - } - file << std::endl; - } - // close the file - file.close(); -} - void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { + // A is m x n, B is n x p, C is m x p for (uint32_t i = 0; i < m; ++i) { for (uint32_t j = 0; j < p; ++j) { float sum = 0.0f; @@ -195,16 +157,15 @@ void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t // C[i * p + j] += A[i * n + k] * B[k * p + j] void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) { - // TODO: Implement blocked matrix multiplication // A is m x n, B is n x p, C is m x p // Use block_size to divide matrices into submatrices - for (uint32_t ii = 0; ii < m; ii += BLOCK_SIZE) { - for (uint32_t kk = 0; kk < n; kk += BLOCK_SIZE) { - for (uint32_t jj = 0; jj < p; jj += BLOCK_SIZE) { + for (uint32_t ii = 0; ii < m; ii += block_size) { + for 
(uint32_t kk = 0; kk < n; kk += block_size) { + for (uint32_t jj = 0; jj < p; jj += block_size) { - uint32_t i_end = std::min(ii + BLOCK_SIZE, m); - uint32_t k_end = std::min(kk + BLOCK_SIZE, n); - uint32_t j_end = std::min(jj + BLOCK_SIZE, p); + uint32_t i_end = std::min(ii + block_size, m); + uint32_t k_end = std::min(kk + block_size, n); + uint32_t j_end = std::min(jj + block_size, p); for (uint32_t i = ii; i < i_end; ++i) { for (uint32_t k = kk; k < k_end; ++k) { @@ -224,9 +185,7 @@ void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32 void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - // TODO: Implement parallel matrix multiplication using OpenMP // A is m x n, B is n x p, C is m x p - // naive_matmul(C, A, B, m, n, p); #pragma omp parallel for for (uint32_t i = 0; i < m; ++i) { for (uint32_t j = 0; j < p; ++j) { diff --git a/matfile.cpp b/matfile.cpp new file mode 100644 index 0000000..14b5ced --- /dev/null +++ b/matfile.cpp @@ -0,0 +1,40 @@ +#include "matfile.h" + +#include + +void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n) +{ + // read the first line of the file to extract the dimensions of the matrix + std::string str_m, str_n; + file >> str_m >> str_n; + m = std::stoul(str_m); + n = std::stoul(str_n); +} + +void matfile_read_matrix(std::ifstream& file, float* matrix) +{ + // read the file line by line + std::string line; + std::size_t index = 0; + std::string value; + while (file >> value) { + matrix[index++] = static_cast(std::stof(value)); + } +} + +void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n) +{ + // open the file for writing + std::ofstream file(filename, std::ios::binary); + // write the matrix dimensions as the first line of the file + file << m << ' ' << n << std::endl; + // write the matrix to the file line by line + for (uint32_t i = 0; i < m; ++i) { + for (uint32_t j = 0; j < n; ++j) { + file << 
matrix[i*n + j] << " "; + } + file << std::endl; + } + // close the file + file.close(); +} \ No newline at end of file diff --git a/matfile.h b/matfile.h new file mode 100644 index 0000000..1d4bf04 --- /dev/null +++ b/matfile.h @@ -0,0 +1,8 @@ +#pragma once + +#include +#include + +void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n); +void matfile_read_matrix(std::ifstream& file, float* matrix); +void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n); diff --git a/randmat.cpp b/randmat.cpp new file mode 100644 index 0000000..8230134 --- /dev/null +++ b/randmat.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include + +#include "matfile.h" + +void gen_random_matrix(float* matrix, uint32_t m, uint32_t n, std::tuple range = {0.0f, 1.0f}); + +int main(int argc, char* argv[]) +{ + // set random seed based on current time + srand(static_cast(time(nullptr))); + // get the matrix size and filename from command line arguments + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + uint32_t m = std::atoi(argv[1]); + uint32_t n = std::atoi(argv[2]); + std::string filename = argv[3]; + // allocate memory for the matrix + float *matrix = new float[m*n]; + // generate random matrix + gen_random_matrix(matrix, m, n, {0.0f, 4.0f}); + // write matrix to file + matfile_write_matrix(filename, matrix, m, n); + // free memory + delete[] matrix; + // exit + return EXIT_SUCCESS; +} + +/** + * This function should fill the provided matrix with random float values. + * Values are generated with two significant digits. 
+ */ +void gen_random_matrix(float *matrix, uint32_t m, uint32_t n, std::tuple range) +{ + float min = std::get<0>(range); + float max = std::get<1>(range); + int modulo = static_cast((max - min)*100); + for (uint32_t i = 0; i < m; ++i) { + for (uint32_t j = 0; j < n; ++j) { + // Generate random float with two decimal places + matrix[i*n + j] = static_cast(rand() % modulo + static_cast(min*100)) / 100.0f; + } + } +} From 05c6adc7b2804dec813724110f14586283da4a14 Mon Sep 17 00:00:00 2001 From: Tristan AMIOTTE-SUCHET Date: Sun, 31 May 2026 20:22:38 +0300 Subject: [PATCH 4/7] optimized block multiplication --- main_ans.cpp | 61 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/main_ans.cpp b/main_ans.cpp index 699af0d..87d29e4 100644 --- a/main_ans.cpp +++ b/main_ans.cpp @@ -146,36 +146,63 @@ void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t } } -// // C = A * B -// for (ii = 0; ii < m; ii += block_size) -// for (jj = 0; jj < p; jj += block_size) -// for (kk = 0; kk < n; kk += block_size) -// // Process block: C[ii:ii+block_size, jj:jj+block_size] += A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, jj:jj+block_size] -// for (i = ii; i < min(ii + block_size, m); i++) -// for (j = jj; j < min(jj + block_size, p); j++) -// for (k = kk; k < min(kk + block_size, n); k++) -// C[i * p + j] += A[i * n + k] * B[k * p + j] void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) { // A is m x n, B is n x p, C is m x p // Use block_size to divide matrices into submatrices for (uint32_t ii = 0; ii < m; ii += block_size) { + uint32_t i_end = std::min(ii + block_size, m); + for (uint32_t kk = 0; kk < n; kk += block_size) { - for (uint32_t jj = 0; jj < p; jj += block_size) { + uint32_t k_end = std::min(kk + block_size, n); - uint32_t i_end = std::min(ii + block_size, m); - uint32_t k_end = std::min(kk + block_size, n); + for 
(uint32_t jj = 0; jj < p; jj += block_size) { uint32_t j_end = std::min(jj + block_size, p); for (uint32_t i = ii; i < i_end; ++i) { - for (uint32_t k = kk; k < k_end; ++k) { + for (uint32_t j = jj; j + 7 < j_end; j += 8) { // unroll 8 columns at a time + // load C[i][j..j+7] into registers to ensure optimized memory access + float c0 = C[i * p + j + 0]; + float c1 = C[i * p + j + 1]; + float c2 = C[i * p + j + 2]; + float c3 = C[i * p + j + 3]; + float c4 = C[i * p + j + 4]; + float c5 = C[i * p + j + 5]; + float c6 = C[i * p + j + 6]; + float c7 = C[i * p + j + 7]; + + for (uint32_t k = kk; k < k_end; ++k) { + const float aik = A[i * n + k]; + const float* b = B + k*p + j; + // process calculation for the 8 columns in registers + c0 += aik * b[0]; + c1 += aik * b[1]; + c2 += aik * b[2]; + c3 += aik * b[3]; + c4 += aik * b[4]; + c5 += aik * b[5]; + c6 += aik * b[6]; + c7 += aik * b[7]; + } - float aik = A[i * n + k]; + // store results back to C + C[i * p + j + 0] = c0; + C[i * p + j + 1] = c1; + C[i * p + j + 2] = c2; + C[i * p + j + 3] = c3; + C[i * p + j + 4] = c4; + C[i * p + j + 5] = c5; + C[i * p + j + 6] = c6; + C[i * p + j + 7] = c7; + } - for (uint32_t j = jj; j < j_end; ++j) { - C[i * p + j] += - aik * B[k * p + j]; + // cleanup for remaining columns that were not processed in the unrolled loop + for (uint32_t j = j_end - ((j_end - jj) % 8) ; j < j_end ; ++j) { + float sum = C[i * p + j]; + for (uint32_t k = kk; k < k_end; ++k) { + sum += A[i * n + k] * B[k * p + j]; } + C[i * p + j] = sum; } } } From fc33c8726cc1fca610652311fa9b2c39cc7c320b Mon Sep 17 00:00:00 2001 From: Tristan AMIOTTE-SUCHET Date: Sun, 31 May 2026 22:19:22 +0300 Subject: [PATCH 5/7] benchmark script + init data --- CMakeLists.txt | 2 +- benchmark.sh | 17 +++++++++++++++++ main_ans.cpp | 46 +++++++++++++++++++++++++++------------------- matfile.cpp | 2 +- 4 files changed, 46 insertions(+), 21 deletions(-) create mode 100755 benchmark.sh diff --git a/CMakeLists.txt b/CMakeLists.txt 
index ef5b383..f04f6bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,6 @@ endif() target_link_libraries(randmat PRIVATE matfile) -file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data ${CMAKE_CURRENT_SOURCE_DIR}/benchmark.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR} ) diff --git a/benchmark.sh b/benchmark.sh new file mode 100755 index 0000000..ca0b54c --- /dev/null +++ b/benchmark.sh @@ -0,0 +1,17 @@ +#! /bin/sh + +mkdir -p results +echo '|====================================================================================================|' +for data in `seq 0 9`; do + echo "| Running benchmark on $data |" + echo -n '|' + : > .buffer.txt + for i in `seq 100`; do + ./matmul $data | grep -E 'time|speedup' >> .buffer.txt + echo -n '#' + done + echo '|' + sort .buffer.txt > results/benchmark$data.txt +done +echo '|====================================================================================================|' +rm .buffer.txt diff --git a/main_ans.cpp b/main_ans.cpp index 87d29e4..c027a80 100644 --- a/main_ans.cpp +++ b/main_ans.cpp @@ -68,6 +68,11 @@ int main(int argc, char* argv[]) float* C_blocked = new float[m * p]; float* C_parallel = new float[m * p]; + // Initialize result matrices to zero + std::fill(C_naive, C_naive + m*p, 0.0f); + std::fill(C_blocked, C_blocked + m*p, 0.0f); + std::fill(C_parallel, C_parallel + m*p, 0.0f); + // Measure performance of naive_matmul double start_time = omp_get_wtime(); naive_matmul(C_naive, A, B, m, n, p); @@ -162,14 +167,14 @@ void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32 for (uint32_t i = ii; i < i_end; ++i) { for (uint32_t j = jj; j + 7 < j_end; j += 8) { // unroll 8 columns at a time // load C[i][j..j+7] into registers to ensure optimized memory access - float c0 = C[i * p + j + 0]; - float c1 = C[i * p + j + 1]; - float c2 = C[i * p + j + 2]; - float c3 = C[i * p + j + 3]; - float c4 = C[i * p + j + 4]; - float c5 = C[i * p + j + 5]; - 
float c6 = C[i * p + j + 6]; - float c7 = C[i * p + j + 7]; + float c0 = C[i*p + j + 0]; + float c1 = C[i*p + j + 1]; + float c2 = C[i*p + j + 2]; + float c3 = C[i*p + j + 3]; + float c4 = C[i*p + j + 4]; + float c5 = C[i*p + j + 5]; + float c6 = C[i*p + j + 6]; + float c7 = C[i*p + j + 7]; for (uint32_t k = kk; k < k_end; ++k) { const float aik = A[i * n + k]; @@ -186,14 +191,14 @@ void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32 } // store results back to C - C[i * p + j + 0] = c0; - C[i * p + j + 1] = c1; - C[i * p + j + 2] = c2; - C[i * p + j + 3] = c3; - C[i * p + j + 4] = c4; - C[i * p + j + 5] = c5; - C[i * p + j + 6] = c6; - C[i * p + j + 7] = c7; + C[i*p + j + 0] = c0; + C[i*p + j + 1] = c1; + C[i*p + j + 2] = c2; + C[i*p + j + 3] = c3; + C[i*p + j + 4] = c4; + C[i*p + j + 5] = c5; + C[i*p + j + 6] = c6; + C[i*p + j + 7] = c7; } // cleanup for remaining columns that were not processed in the unrolled loop @@ -225,11 +230,11 @@ void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint3 } } -bool validate_result(const std::string &result_file, const std::string &reference_file) +bool validate_result(const std::string &result_filename, const std::string &reference_filename) { // Open result and reference files - std::ifstream result(result_file, std::ios::binary); - std::ifstream reference(reference_file, std::ios::binary); + std::ifstream result(result_filename, std::ios::binary); + std::ifstream reference(reference_filename, std::ios::binary); // Read dimensions of result and reference matrices uint32_t m_result, n_result, m_ref, n_ref; matfile_read_dimensions(result, m_result, n_result); @@ -255,6 +260,9 @@ bool validate_result(const std::string &result_file, const std::string &referenc break; } } + // close files + result.close(); + reference.close(); // Clean up delete[] C_result; delete[] C_reference; diff --git a/matfile.cpp b/matfile.cpp index 14b5ced..8f32a78 100644 --- a/matfile.cpp +++ 
b/matfile.cpp @@ -37,4 +37,4 @@ void matfile_write_matrix(std::string const& filename, const float* matrix, uint } // close the file file.close(); -} \ No newline at end of file +} From 5a130f28ebdc6422dcd2c671e78c1d9b26c359b6 Mon Sep 17 00:00:00 2001 From: Tristan AMIOTTE-SUCHET Date: Sun, 31 May 2026 22:57:40 +0300 Subject: [PATCH 6/7] benchmark summary --- benchmark.sh | 32 +- results/benchmark0.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark1.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark2.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark3.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark4.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark5.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark6.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark7.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark8.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark9.txt | 600 ++++++++++++++++++++++++++++++++++ results/benchmark_summary.csv | 11 + 12 files changed, 6036 insertions(+), 7 deletions(-) create mode 100644 results/benchmark0.txt create mode 100644 results/benchmark1.txt create mode 100644 results/benchmark2.txt create mode 100644 results/benchmark3.txt create mode 100644 results/benchmark4.txt create mode 100644 results/benchmark5.txt create mode 100644 results/benchmark6.txt create mode 100644 results/benchmark7.txt create mode 100644 results/benchmark8.txt create mode 100644 results/benchmark9.txt create mode 100644 results/benchmark_summary.csv diff --git a/benchmark.sh b/benchmark.sh index ca0b54c..48478b7 100755 --- a/benchmark.sh +++ b/benchmark.sh @@ -1,17 +1,35 @@ #! /bin/sh +echo 'Iterating each data folder and running benchmark 100 times on each case...' 
+ +# run benchmark and save results to results/benchmark{case_number}.txt + mkdir -p results -echo '|====================================================================================================|' +echo '+====================================================================================================+' for data in `seq 0 9`; do echo "| Running benchmark on $data |" echo -n '|' - : > .buffer.txt + : > results/benchmark$data.txt for i in `seq 100`; do - ./matmul $data | grep -E 'time|speedup' >> .buffer.txt - echo -n '#' + ./matmul $data >> results/benchmark$data.txt + echo -n '-' done echo '|' - sort .buffer.txt > results/benchmark$data.txt done -echo '|====================================================================================================|' -rm .buffer.txt +echo '+====================================================================================================+' + +# extract statistics from results and save to results/benchmark_summary.csv +echo 'Extracting statistics from benchmark results...' 
+ +RESULT_FILE=results/benchmark_summary.csv +: > $RESULT_FILE +echo 'Test Case,Dimensions (m × n × p),Naive Time (s),Blocked Time (s),Parallel Time (s),Blocked Speedup,Parallel Speedup' > $RESULT_FILE +for data in `seq 0 9`; do + echo -n "$data," >> $RESULT_FILE + grep -oEm1 '[0-9]+x[0-9]+x[0-9]+' results/benchmark$data.txt | tr '\n' ',' >> $RESULT_FILE + grep 'Naive time' results/benchmark$data.txt | cut -d' ' -f3 | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE + grep 'Blocked time' results/benchmark$data.txt | cut -d' ' -f3 | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE + grep 'Parallel time' results/benchmark$data.txt | cut -d' ' -f3 | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE + grep 'Blocked speedup' results/benchmark$data.txt | cut -d' ' -f3 | tr -d 'x' | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE + grep 'Parallel speedup' results/benchmark$data.txt | cut -d' ' -f3 | tr -d 'x' | awk '{sum+=$1; count++} END {print sum/count}' >> $RESULT_FILE +done diff --git a/results/benchmark0.txt b/results/benchmark0.txt new file mode 100644 index 0000000..ad98160 --- /dev/null +++ b/results/benchmark0.txt @@ -0,0 +1,600 @@ +Case 0 (64x64x64): +Naive time: 0.000681318 seconds +Blocked time: 0.000256301 seconds +Parallel time: 0.00301695 seconds +Blocked speedup: 2.65827x +Parallel speedup: 0.22583x +Case 0 (64x64x64): +Naive time: 0.000747252 seconds +Blocked time: 0.000354645 seconds +Parallel time: 0.00299161 seconds +Blocked speedup: 2.10704x +Parallel speedup: 0.249782x +Case 0 (64x64x64): +Naive time: 0.000674946 seconds +Blocked time: 0.000241523 seconds +Parallel time: 0.00341503 seconds +Blocked speedup: 2.79454x +Parallel speedup: 0.19764x +Case 0 (64x64x64): +Naive time: 0.000673233 seconds +Blocked time: 0.000249068 seconds +Parallel time: 0.000688582 seconds +Blocked speedup: 2.70301x +Parallel speedup: 0.977709x +Case 0 (64x64x64): 
+Naive time: 0.000685165 seconds +Blocked time: 0.000270277 seconds +Parallel time: 0.00416857 seconds +Blocked speedup: 2.53505x +Parallel speedup: 0.164364x +Case 0 (64x64x64): +Naive time: 0.000675788 seconds +Blocked time: 0.000276689 seconds +Parallel time: 0.0028603 seconds +Blocked speedup: 2.44241x +Parallel speedup: 0.236265x +Case 0 (64x64x64): +Naive time: 0.00131042 seconds +Blocked time: 0.000286007 seconds +Parallel time: 0.0107864 seconds +Blocked speedup: 4.58177x +Parallel speedup: 0.121488x +Case 0 (64x64x64): +Naive time: 0.000688392 seconds +Blocked time: 0.000248346 seconds +Parallel time: 0.00337183 seconds +Blocked speedup: 2.77191x +Parallel speedup: 0.20416x +Case 0 (64x64x64): +Naive time: 0.000739698 seconds +Blocked time: 0.000241503 seconds +Parallel time: 0.00428252 seconds +Blocked speedup: 3.06289x +Parallel speedup: 0.172725x +Case 0 (64x64x64): +Naive time: 0.000685626 seconds +Blocked time: 0.00023973 seconds +Parallel time: 0.0136889 seconds +Blocked speedup: 2.85999x +Parallel speedup: 0.0500861x +Case 0 (64x64x64): +Naive time: 0.000690155 seconds +Blocked time: 0.000240882 seconds +Parallel time: 0.00478214 seconds +Blocked speedup: 2.86512x +Parallel speedup: 0.144319x +Case 0 (64x64x64): +Naive time: 0.000687099 seconds +Blocked time: 0.000254648 seconds +Parallel time: 0.00764461 seconds +Blocked speedup: 2.69823x +Parallel speedup: 0.0898802x +Case 0 (64x64x64): +Naive time: 0.00068243 seconds +Blocked time: 0.000244319 seconds +Parallel time: 0.00328706 seconds +Blocked speedup: 2.79319x +Parallel speedup: 0.207611x +Case 0 (64x64x64): +Naive time: 0.000682461 seconds +Blocked time: 0.000300203 seconds +Parallel time: 0.00474219 seconds +Blocked speedup: 2.27333x +Parallel speedup: 0.143913x +Case 0 (64x64x64): +Naive time: 0.000694032 seconds +Blocked time: 0.000243887 seconds +Parallel time: 0.00063392 seconds +Blocked speedup: 2.84571x +Parallel speedup: 1.09483x +Case 0 (64x64x64): +Naive time: 0.000682611 seconds 
+Blocked time: 0.000267031 seconds +Parallel time: 0.00383948 seconds +Blocked speedup: 2.5563x +Parallel speedup: 0.177787x +Case 0 (64x64x64): +Naive time: 0.000690575 seconds +Blocked time: 0.000389872 seconds +Parallel time: 0.00337341 seconds +Blocked speedup: 1.77129x +Parallel speedup: 0.204711x +Case 0 (64x64x64): +Naive time: 0.000684424 seconds +Blocked time: 0.000259957 seconds +Parallel time: 0.000669235 seconds +Blocked speedup: 2.63284x +Parallel speedup: 1.0227x +Case 0 (64x64x64): +Naive time: 0.000706265 seconds +Blocked time: 0.000308499 seconds +Parallel time: 0.0024662 seconds +Blocked speedup: 2.28936x +Parallel speedup: 0.286378x +Case 0 (64x64x64): +Naive time: 0.000688081 seconds +Blocked time: 0.000310202 seconds +Parallel time: 0.00376422 seconds +Blocked speedup: 2.21817x +Parallel speedup: 0.182795x +Case 0 (64x64x64): +Naive time: 0.000715592 seconds +Blocked time: 0.000243076 seconds +Parallel time: 0.00937736 seconds +Blocked speedup: 2.9439x +Parallel speedup: 0.0763106x +Case 0 (64x64x64): +Naive time: 0.000682089 seconds +Blocked time: 0.000248546 seconds +Parallel time: 0.00292064 seconds +Blocked speedup: 2.74432x +Parallel speedup: 0.233541x +Case 0 (64x64x64): +Naive time: 0.0007303 seconds +Blocked time: 0.000268944 seconds +Parallel time: 0.0104198 seconds +Blocked speedup: 2.71544x +Parallel speedup: 0.0700878x +Case 0 (64x64x64): +Naive time: 0.000685606 seconds +Blocked time: 0.000243227 seconds +Parallel time: 0.0031099 seconds +Blocked speedup: 2.81879x +Parallel speedup: 0.220459x +Case 0 (64x64x64): +Naive time: 0.000684615 seconds +Blocked time: 0.000256712 seconds +Parallel time: 0.0103187 seconds +Blocked speedup: 2.66686x +Parallel speedup: 0.0663473x +Case 0 (64x64x64): +Naive time: 0.000730791 seconds +Blocked time: 0.000247204 seconds +Parallel time: 0.00555132 seconds +Blocked speedup: 2.95623x +Parallel speedup: 0.131643x +Case 0 (64x64x64): +Naive time: 0.000698581 seconds +Blocked time: 0.000255439 seconds 
+Parallel time: 0.0058838 seconds +Blocked speedup: 2.73483x +Parallel speedup: 0.11873x +Case 0 (64x64x64): +Naive time: 0.000720642 seconds +Blocked time: 0.000343785 seconds +Parallel time: 0.00672881 seconds +Blocked speedup: 2.0962x +Parallel speedup: 0.107098x +Case 0 (64x64x64): +Naive time: 0.00103685 seconds +Blocked time: 0.000305463 seconds +Parallel time: 0.00376709 seconds +Blocked speedup: 3.39434x +Parallel speedup: 0.275238x +Case 0 (64x64x64): +Naive time: 0.000706555 seconds +Blocked time: 0.000239139 seconds +Parallel time: 0.00364308 seconds +Blocked speedup: 2.95458x +Parallel speedup: 0.193945x +Case 0 (64x64x64): +Naive time: 0.000696647 seconds +Blocked time: 0.000306034 seconds +Parallel time: 0.011499 seconds +Blocked speedup: 2.27637x +Parallel speedup: 0.060583x +Case 0 (64x64x64): +Naive time: 0.000702738 seconds +Blocked time: 0.000284293 seconds +Parallel time: 0.00422423 seconds +Blocked speedup: 2.47188x +Parallel speedup: 0.166359x +Case 0 (64x64x64): +Naive time: 0.000683082 seconds +Blocked time: 0.000295674 seconds +Parallel time: 0.00289921 seconds +Blocked speedup: 2.31025x +Parallel speedup: 0.23561x +Case 0 (64x64x64): +Naive time: 0.000686769 seconds +Blocked time: 0.000281418 seconds +Parallel time: 0.0122109 seconds +Blocked speedup: 2.44039x +Parallel speedup: 0.0562425x +Case 0 (64x64x64): +Naive time: 0.000681549 seconds +Blocked time: 0.00025058 seconds +Parallel time: 0.000860444 seconds +Blocked speedup: 2.71989x +Parallel speedup: 0.79209x +Case 0 (64x64x64): +Naive time: 0.000704672 seconds +Blocked time: 0.000247644 seconds +Parallel time: 0.00282223 seconds +Blocked speedup: 2.8455x +Parallel speedup: 0.249687x +Case 0 (64x64x64): +Naive time: 0.00070895 seconds +Blocked time: 0.000265358 seconds +Parallel time: 0.00334206 seconds +Blocked speedup: 2.67167x +Parallel speedup: 0.21213x +Case 0 (64x64x64): +Naive time: 0.000688933 seconds +Blocked time: 0.000239579 seconds +Parallel time: 0.000650801 seconds 
+Blocked speedup: 2.8756x +Parallel speedup: 1.05859x +Case 0 (64x64x64): +Naive time: 0.000687911 seconds +Blocked time: 0.000245741 seconds +Parallel time: 0.00289321 seconds +Blocked speedup: 2.79933x +Parallel speedup: 0.237767x +Case 0 (64x64x64): +Naive time: 0.000686609 seconds +Blocked time: 0.0002452 seconds +Parallel time: 0.00369092 seconds +Blocked speedup: 2.8002x +Parallel speedup: 0.186027x +Case 0 (64x64x64): +Naive time: 0.000685086 seconds +Blocked time: 0.00028249 seconds +Parallel time: 0.00064499 seconds +Blocked speedup: 2.42517x +Parallel speedup: 1.06217x +Case 0 (64x64x64): +Naive time: 0.000681378 seconds +Blocked time: 0.000326934 seconds +Parallel time: 0.00467731 seconds +Blocked speedup: 2.08415x +Parallel speedup: 0.145677x +Case 0 (64x64x64): +Naive time: 0.000682 seconds +Blocked time: 0.000248917 seconds +Parallel time: 0.00107348 seconds +Blocked speedup: 2.73987x +Parallel speedup: 0.635315x +Case 0 (64x64x64): +Naive time: 0.000736001 seconds +Blocked time: 0.000295174 seconds +Parallel time: 0.0107304 seconds +Blocked speedup: 2.49345x +Parallel speedup: 0.0685901x +Case 0 (64x64x64): +Naive time: 0.000681929 seconds +Blocked time: 0.000242726 seconds +Parallel time: 0.00936221 seconds +Blocked speedup: 2.80946x +Parallel speedup: 0.0728385x +Case 0 (64x64x64): +Naive time: 0.00068177 seconds +Blocked time: 0.000259587 seconds +Parallel time: 0.0026838 seconds +Blocked speedup: 2.62636x +Parallel speedup: 0.254032x +Case 0 (64x64x64): +Naive time: 0.000709962 seconds +Blocked time: 0.000409108 seconds +Parallel time: 0.00349611 seconds +Blocked speedup: 1.73539x +Parallel speedup: 0.203072x +Case 0 (64x64x64): +Naive time: 0.000689744 seconds +Blocked time: 0.000238157 seconds +Parallel time: 0.000611418 seconds +Blocked speedup: 2.89617x +Parallel speedup: 1.12811x +Case 0 (64x64x64): +Naive time: 0.00068759 seconds +Blocked time: 0.000247284 seconds +Parallel time: 0.00262697 seconds +Blocked speedup: 2.78057x +Parallel 
speedup: 0.261743x +Case 0 (64x64x64): +Naive time: 0.000695125 seconds +Blocked time: 0.000236624 seconds +Parallel time: 0.00290149 seconds +Blocked speedup: 2.93768x +Parallel speedup: 0.239575x +Case 0 (64x64x64): +Naive time: 0.000686879 seconds +Blocked time: 0.0002986 seconds +Parallel time: 0.00317015 seconds +Blocked speedup: 2.30033x +Parallel speedup: 0.216671x +Case 0 (64x64x64): +Naive time: 0.000681369 seconds +Blocked time: 0.000247915 seconds +Parallel time: 0.00544843 seconds +Blocked speedup: 2.7484x +Parallel speedup: 0.125058x +Case 0 (64x64x64): +Naive time: 0.000699012 seconds +Blocked time: 0.000271199 seconds +Parallel time: 0.00436205 seconds +Blocked speedup: 2.57749x +Parallel speedup: 0.160248x +Case 0 (64x64x64): +Naive time: 0.000693001 seconds +Blocked time: 0.000238407 seconds +Parallel time: 0.000642034 seconds +Blocked speedup: 2.9068x +Parallel speedup: 1.07938x +Case 0 (64x64x64): +Naive time: 0.000708339 seconds +Blocked time: 0.000268954 seconds +Parallel time: 0.00340551 seconds +Blocked speedup: 2.63368x +Parallel speedup: 0.207998x +Case 0 (64x64x64): +Naive time: 0.00069296 seconds +Blocked time: 0.000262142 seconds +Parallel time: 0.00358358 seconds +Blocked speedup: 2.64345x +Parallel speedup: 0.193371x +Case 0 (64x64x64): +Naive time: 0.000691828 seconds +Blocked time: 0.000276028 seconds +Parallel time: 0.00059703 seconds +Blocked speedup: 2.50637x +Parallel speedup: 1.15878x +Case 0 (64x64x64): +Naive time: 0.000690565 seconds +Blocked time: 0.000241503 seconds +Parallel time: 0.00334018 seconds +Blocked speedup: 2.85945x +Parallel speedup: 0.206745x +Case 0 (64x64x64): +Naive time: 0.000687049 seconds +Blocked time: 0.000237335 seconds +Parallel time: 0.00257497 seconds +Blocked speedup: 2.89485x +Parallel speedup: 0.266818x +Case 0 (64x64x64): +Naive time: 0.000772329 seconds +Blocked time: 0.000253135 seconds +Parallel time: 0.00246709 seconds +Blocked speedup: 3.05106x +Parallel speedup: 0.313053x +Case 0 
(64x64x64): +Naive time: 0.000723668 seconds +Blocked time: 0.00028274 seconds +Parallel time: 0.00356478 seconds +Blocked speedup: 2.55948x +Parallel speedup: 0.203005x +Case 0 (64x64x64): +Naive time: 0.000686348 seconds +Blocked time: 0.00027721 seconds +Parallel time: 0.0045835 seconds +Blocked speedup: 2.47591x +Parallel speedup: 0.149743x +Case 0 (64x64x64): +Naive time: 0.000688051 seconds +Blocked time: 0.00026146 seconds +Parallel time: 0.000694112 seconds +Blocked speedup: 2.63157x +Parallel speedup: 0.991268x +Case 0 (64x64x64): +Naive time: 0.00068744 seconds +Blocked time: 0.000256752 seconds +Parallel time: 0.00305083 seconds +Blocked speedup: 2.67745x +Parallel speedup: 0.225328x +Case 0 (64x64x64): +Naive time: 0.000690345 seconds +Blocked time: 0.000247625 seconds +Parallel time: 0.0130646 seconds +Blocked speedup: 2.78786x +Parallel speedup: 0.0528409x +Case 0 (64x64x64): +Naive time: 0.000683112 seconds +Blocked time: 0.000237676 seconds +Parallel time: 0.00305081 seconds +Blocked speedup: 2.87413x +Parallel speedup: 0.223911x +Case 0 (64x64x64): +Naive time: 0.000731462 seconds +Blocked time: 0.000270267 seconds +Parallel time: 0.00423841 seconds +Blocked speedup: 2.70644x +Parallel speedup: 0.172579x +Case 0 (64x64x64): +Naive time: 0.000686588 seconds +Blocked time: 0.000359013 seconds +Parallel time: 0.00322889 seconds +Blocked speedup: 1.91243x +Parallel speedup: 0.212639x +Case 0 (64x64x64): +Naive time: 0.000686108 seconds +Blocked time: 0.00026138 seconds +Parallel time: 0.00162197 seconds +Blocked speedup: 2.62494x +Parallel speedup: 0.423008x +Case 0 (64x64x64): +Naive time: 0.000687771 seconds +Blocked time: 0.00027142 seconds +Parallel time: 0.00308604 seconds +Blocked speedup: 2.53397x +Parallel speedup: 0.222865x +Case 0 (64x64x64): +Naive time: 0.000688992 seconds +Blocked time: 0.000306074 seconds +Parallel time: 0.003015 seconds +Blocked speedup: 2.25106x +Parallel speedup: 0.228522x +Case 0 (64x64x64): +Naive time: 0.000681939 
seconds +Blocked time: 0.000259357 seconds +Parallel time: 0.00263187 seconds +Blocked speedup: 2.62934x +Parallel speedup: 0.259108x +Case 0 (64x64x64): +Naive time: 0.000687389 seconds +Blocked time: 0.000238116 seconds +Parallel time: 0.0028042 seconds +Blocked speedup: 2.88678x +Parallel speedup: 0.245128x +Case 0 (64x64x64): +Naive time: 0.000698751 seconds +Blocked time: 0.000256161 seconds +Parallel time: 0.00260068 seconds +Blocked speedup: 2.72778x +Parallel speedup: 0.26868x +Case 0 (64x64x64): +Naive time: 0.000681749 seconds +Blocked time: 0.000308589 seconds +Parallel time: 0.00518857 seconds +Blocked speedup: 2.20925x +Parallel speedup: 0.131395x +Case 0 (64x64x64): +Naive time: 0.000690326 seconds +Blocked time: 0.00023989 seconds +Parallel time: 0.000647805 seconds +Blocked speedup: 2.87768x +Parallel speedup: 1.06564x +Case 0 (64x64x64): +Naive time: 0.000741912 seconds +Blocked time: 0.000241243 seconds +Parallel time: 0.00509941 seconds +Blocked speedup: 3.07537x +Parallel speedup: 0.14549x +Case 0 (64x64x64): +Naive time: 0.00068178 seconds +Blocked time: 0.000296396 seconds +Parallel time: 0.00260601 seconds +Blocked speedup: 2.30023x +Parallel speedup: 0.261618x +Case 0 (64x64x64): +Naive time: 0.000709551 seconds +Blocked time: 0.000236724 seconds +Parallel time: 0.00342891 seconds +Blocked speedup: 2.99738x +Parallel speedup: 0.206932x +Case 0 (64x64x64): +Naive time: 0.0006873 seconds +Blocked time: 0.000282621 seconds +Parallel time: 0.0027204 seconds +Blocked speedup: 2.43188x +Parallel speedup: 0.252646x +Case 0 (64x64x64): +Naive time: 0.000737614 seconds +Blocked time: 0.000332904 seconds +Parallel time: 0.00288432 seconds +Blocked speedup: 2.2157x +Parallel speedup: 0.255732x +Case 0 (64x64x64): +Naive time: 0.000685396 seconds +Blocked time: 0.000256591 seconds +Parallel time: 0.000637176 seconds +Blocked speedup: 2.67116x +Parallel speedup: 1.07568x +Case 0 (64x64x64): +Naive time: 0.000687731 seconds +Blocked time: 0.000288311 
seconds +Parallel time: 0.0065341 seconds +Blocked speedup: 2.38538x +Parallel speedup: 0.105253x +Case 0 (64x64x64): +Naive time: 0.000691908 seconds +Blocked time: 0.000253165 seconds +Parallel time: 0.00297787 seconds +Blocked speedup: 2.73303x +Parallel speedup: 0.23235x +Case 0 (64x64x64): +Naive time: 0.000687029 seconds +Blocked time: 0.000240581 seconds +Parallel time: 0.00375607 seconds +Blocked speedup: 2.85571x +Parallel speedup: 0.182912x +Case 0 (64x64x64): +Naive time: 0.000692128 seconds +Blocked time: 0.000241383 seconds +Parallel time: 0.00430049 seconds +Blocked speedup: 2.86734x +Parallel speedup: 0.160942x +Case 0 (64x64x64): +Naive time: 0.000768512 seconds +Blocked time: 0.000264706 seconds +Parallel time: 0.00315614 seconds +Blocked speedup: 2.90327x +Parallel speedup: 0.243497x +Case 0 (64x64x64): +Naive time: 0.000681639 seconds +Blocked time: 0.000249167 seconds +Parallel time: 0.000653055 seconds +Blocked speedup: 2.73567x +Parallel speedup: 1.04377x +Case 0 (64x64x64): +Naive time: 0.000681399 seconds +Blocked time: 0.000245941 seconds +Parallel time: 0.00349416 seconds +Blocked speedup: 2.77058x +Parallel speedup: 0.195011x +Case 0 (64x64x64): +Naive time: 0.000681759 seconds +Blocked time: 0.000242285 seconds +Parallel time: 0.0102177 seconds +Blocked speedup: 2.81387x +Parallel speedup: 0.0667236x +Case 0 (64x64x64): +Naive time: 0.000686418 seconds +Blocked time: 0.000279705 seconds +Parallel time: 0.00455304 seconds +Blocked speedup: 2.45408x +Parallel speedup: 0.15076x +Case 0 (64x64x64): +Naive time: 0.000681739 seconds +Blocked time: 0.000246092 seconds +Parallel time: 0.00332617 seconds +Blocked speedup: 2.77026x +Parallel speedup: 0.204962x +Case 0 (64x64x64): +Naive time: 0.000683001 seconds +Blocked time: 0.000240511 seconds +Parallel time: 0.0102468 seconds +Blocked speedup: 2.83979x +Parallel speedup: 0.0666548x +Case 0 (64x64x64): +Naive time: 0.000689734 seconds +Blocked time: 0.000252354 seconds +Parallel time: 0.0103618 
seconds +Blocked speedup: 2.7332x +Parallel speedup: 0.0665651x +Case 0 (64x64x64): +Naive time: 0.00188779 seconds +Blocked time: 0.000330139 seconds +Parallel time: 0.00335011 seconds +Blocked speedup: 5.71817x +Parallel speedup: 0.563502x +Case 0 (64x64x64): +Naive time: 0.000696687 seconds +Blocked time: 0.000247865 seconds +Parallel time: 0.00249908 seconds +Blocked speedup: 2.81075x +Parallel speedup: 0.278778x +Case 0 (64x64x64): +Naive time: 0.000703239 seconds +Blocked time: 0.000251662 seconds +Parallel time: 0.00848719 seconds +Blocked speedup: 2.79438x +Parallel speedup: 0.0828588x +Case 0 (64x64x64): +Naive time: 0.000700795 seconds +Blocked time: 0.000251071 seconds +Parallel time: 0.00292975 seconds +Blocked speedup: 2.79122x +Parallel speedup: 0.2392x +Case 0 (64x64x64): +Naive time: 0.000681679 seconds +Blocked time: 0.000246012 seconds +Parallel time: 0.000535043 seconds +Blocked speedup: 2.77092x +Parallel speedup: 1.27406x +Case 0 (64x64x64): +Naive time: 0.000681498 seconds +Blocked time: 0.0002448 seconds +Parallel time: 0.00250924 seconds +Blocked speedup: 2.7839x +Parallel speedup: 0.271596x diff --git a/results/benchmark1.txt b/results/benchmark1.txt new file mode 100644 index 0000000..f1961eb --- /dev/null +++ b/results/benchmark1.txt @@ -0,0 +1,600 @@ +Case 1 (128x64x128): +Naive time: 0.00280887 seconds +Blocked time: 0.000987683 seconds +Parallel time: 0.00290389 seconds +Blocked speedup: 2.8439x +Parallel speedup: 0.967279x +Case 1 (128x64x128): +Naive time: 0.00275895 seconds +Blocked time: 0.000967025 seconds +Parallel time: 0.00126659 seconds +Blocked speedup: 2.85303x +Parallel speedup: 2.17825x +Case 1 (128x64x128): +Naive time: 0.0027917 seconds +Blocked time: 0.000966704 seconds +Parallel time: 0.00143942 seconds +Blocked speedup: 2.88785x +Parallel speedup: 1.93946x +Case 1 (128x64x128): +Naive time: 0.00271837 seconds +Blocked time: 0.000965542 seconds +Parallel time: 0.0014643 seconds +Blocked speedup: 2.81538x +Parallel 
speedup: 1.85643x +Case 1 (128x64x128): +Naive time: 0.00273778 seconds +Blocked time: 0.000968577 seconds +Parallel time: 0.00151301 seconds +Blocked speedup: 2.8266x +Parallel speedup: 1.80949x +Case 1 (128x64x128): +Naive time: 0.00280591 seconds +Blocked time: 0.000967997 seconds +Parallel time: 0.00144622 seconds +Blocked speedup: 2.89868x +Parallel speedup: 1.94017x +Case 1 (128x64x128): +Naive time: 0.00273415 seconds +Blocked time: 0.000973267 seconds +Parallel time: 0.00146167 seconds +Blocked speedup: 2.80925x +Parallel speedup: 1.87056x +Case 1 (128x64x128): +Naive time: 0.0027283 seconds +Blocked time: 0.00106622 seconds +Parallel time: 0.00144709 seconds +Blocked speedup: 2.55885x +Parallel speedup: 1.88538x +Case 1 (128x64x128): +Naive time: 0.00273105 seconds +Blocked time: 0.000972645 seconds +Parallel time: 0.00144949 seconds +Blocked speedup: 2.80786x +Parallel speedup: 1.88415x +Case 1 (128x64x128): +Naive time: 0.0027344 seconds +Blocked time: 0.00099633 seconds +Parallel time: 0.00168078 seconds +Blocked speedup: 2.74447x +Parallel speedup: 1.62686x +Case 1 (128x64x128): +Naive time: 0.00276004 seconds +Blocked time: 0.000970922 seconds +Parallel time: 0.00145027 seconds +Blocked speedup: 2.8427x +Parallel speedup: 1.90312x +Case 1 (128x64x128): +Naive time: 0.0027376 seconds +Blocked time: 0.000965131 seconds +Parallel time: 0.0014447 seconds +Blocked speedup: 2.8365x +Parallel speedup: 1.89492x +Case 1 (128x64x128): +Naive time: 0.00273363 seconds +Blocked time: 0.000963488 seconds +Parallel time: 0.00146921 seconds +Blocked speedup: 2.83722x +Parallel speedup: 1.86062x +Case 1 (128x64x128): +Naive time: 0.00273036 seconds +Blocked time: 0.000965432 seconds +Parallel time: 0.00161387 seconds +Blocked speedup: 2.82813x +Parallel speedup: 1.69181x +Case 1 (128x64x128): +Naive time: 0.0027491 seconds +Blocked time: 0.00104674 seconds +Parallel time: 0.0014409 seconds +Blocked speedup: 2.62633x +Parallel speedup: 1.9079x +Case 1 (128x64x128): 
+Naive time: 0.00274764 seconds +Blocked time: 0.000966233 seconds +Parallel time: 0.00097532 seconds +Blocked speedup: 2.84366x +Parallel speedup: 2.81716x +Case 1 (128x64x128): +Naive time: 0.00275784 seconds +Blocked time: 0.000970801 seconds +Parallel time: 0.00144414 seconds +Blocked speedup: 2.84079x +Parallel speedup: 1.90968x +Case 1 (128x64x128): +Naive time: 0.00312265 seconds +Blocked time: 0.000966924 seconds +Parallel time: 0.00145369 seconds +Blocked speedup: 3.22947x +Parallel speedup: 2.14809x +Case 1 (128x64x128): +Naive time: 0.00273148 seconds +Blocked time: 0.00098045 seconds +Parallel time: 0.00147226 seconds +Blocked speedup: 2.78594x +Parallel speedup: 1.85529x +Case 1 (128x64x128): +Naive time: 0.002747 seconds +Blocked time: 0.000960292 seconds +Parallel time: 0.00144181 seconds +Blocked speedup: 2.86059x +Parallel speedup: 1.90525x +Case 1 (128x64x128): +Naive time: 0.0027481 seconds +Blocked time: 0.000974178 seconds +Parallel time: 0.00178297 seconds +Blocked speedup: 2.82094x +Parallel speedup: 1.5413x +Case 1 (128x64x128): +Naive time: 0.00273616 seconds +Blocked time: 0.00103188 seconds +Parallel time: 0.00154966 seconds +Blocked speedup: 2.65164x +Parallel speedup: 1.76566x +Case 1 (128x64x128): +Naive time: 0.00282888 seconds +Blocked time: 0.000966243 seconds +Parallel time: 0.00144953 seconds +Blocked speedup: 2.92771x +Parallel speedup: 1.95158x +Case 1 (128x64x128): +Naive time: 0.00273266 seconds +Blocked time: 0.00101455 seconds +Parallel time: 0.00144075 seconds +Blocked speedup: 2.69346x +Parallel speedup: 1.89669x +Case 1 (128x64x128): +Naive time: 0.00272857 seconds +Blocked time: 0.000970761 seconds +Parallel time: 0.00144789 seconds +Blocked speedup: 2.81075x +Parallel speedup: 1.88452x +Case 1 (128x64x128): +Naive time: 0.00273526 seconds +Blocked time: 0.00096471 seconds +Parallel time: 0.00803401 seconds +Blocked speedup: 2.83532x +Parallel speedup: 0.34046x +Case 1 (128x64x128): +Naive time: 0.00272978 seconds 
+Blocked time: 0.000974949 seconds +Parallel time: 0.00325086 seconds +Blocked speedup: 2.79992x +Parallel speedup: 0.839711x +Case 1 (128x64x128): +Naive time: 0.00274793 seconds +Blocked time: 0.000993995 seconds +Parallel time: 0.00151123 seconds +Blocked speedup: 2.76453x +Parallel speedup: 1.81834x +Case 1 (128x64x128): +Naive time: 0.00273617 seconds +Blocked time: 0.000966784 seconds +Parallel time: 0.00145523 seconds +Blocked speedup: 2.83018x +Parallel speedup: 1.88023x +Case 1 (128x64x128): +Naive time: 0.00311162 seconds +Blocked time: 0.00097564 seconds +Parallel time: 0.00774841 seconds +Blocked speedup: 3.18931x +Parallel speedup: 0.401582x +Case 1 (128x64x128): +Naive time: 0.0027257 seconds +Blocked time: 0.000997552 seconds +Parallel time: 0.00150028 seconds +Blocked speedup: 2.73239x +Parallel speedup: 1.81679x +Case 1 (128x64x128): +Naive time: 0.00272832 seconds +Blocked time: 0.000963056 seconds +Parallel time: 0.00144569 seconds +Blocked speedup: 2.83298x +Parallel speedup: 1.8872x +Case 1 (128x64x128): +Naive time: 0.00276288 seconds +Blocked time: 0.000974018 seconds +Parallel time: 0.00287702 seconds +Blocked speedup: 2.83658x +Parallel speedup: 0.960329x +Case 1 (128x64x128): +Naive time: 0.00272922 seconds +Blocked time: 0.000961664 seconds +Parallel time: 0.00651201 seconds +Blocked speedup: 2.83802x +Parallel speedup: 0.419106x +Case 1 (128x64x128): +Naive time: 0.00276917 seconds +Blocked time: 0.000987974 seconds +Parallel time: 0.00143026 seconds +Blocked speedup: 2.80287x +Parallel speedup: 1.93612x +Case 1 (128x64x128): +Naive time: 0.00280679 seconds +Blocked time: 0.000961825 seconds +Parallel time: 0.00145362 seconds +Blocked speedup: 2.91819x +Parallel speedup: 1.9309x +Case 1 (128x64x128): +Naive time: 0.00273192 seconds +Blocked time: 0.000965552 seconds +Parallel time: 0.00150306 seconds +Blocked speedup: 2.82938x +Parallel speedup: 1.81757x +Case 1 (128x64x128): +Naive time: 0.00273072 seconds +Blocked time: 0.000970972 
seconds +Parallel time: 0.00147204 seconds +Blocked speedup: 2.81236x +Parallel speedup: 1.85506x +Case 1 (128x64x128): +Naive time: 0.00317814 seconds +Blocked time: 0.000971623 seconds +Parallel time: 0.00191004 seconds +Blocked speedup: 3.27096x +Parallel speedup: 1.66391x +Case 1 (128x64x128): +Naive time: 0.00275333 seconds +Blocked time: 0.00100225 seconds +Parallel time: 0.00146635 seconds +Blocked speedup: 2.74714x +Parallel speedup: 1.87767x +Case 1 (128x64x128): +Naive time: 0.00273446 seconds +Blocked time: 0.000967525 seconds +Parallel time: 0.00145091 seconds +Blocked speedup: 2.82624x +Parallel speedup: 1.88465x +Case 1 (128x64x128): +Naive time: 0.0027609 seconds +Blocked time: 0.000976683 seconds +Parallel time: 0.00152244 seconds +Blocked speedup: 2.82681x +Parallel speedup: 1.81348x +Case 1 (128x64x128): +Naive time: 0.00276252 seconds +Blocked time: 0.000969009 seconds +Parallel time: 0.00333341 seconds +Blocked speedup: 2.85087x +Parallel speedup: 0.828737x +Case 1 (128x64x128): +Naive time: 0.00274715 seconds +Blocked time: 0.000968207 seconds +Parallel time: 0.00143222 seconds +Blocked speedup: 2.83736x +Parallel speedup: 1.91811x +Case 1 (128x64x128): +Naive time: 0.00311121 seconds +Blocked time: 0.000981742 seconds +Parallel time: 0.00148828 seconds +Blocked speedup: 3.16907x +Parallel speedup: 2.09047x +Case 1 (128x64x128): +Naive time: 0.00276656 seconds +Blocked time: 0.00101413 seconds +Parallel time: 0.00146749 seconds +Blocked speedup: 2.72801x +Parallel speedup: 1.88523x +Case 1 (128x64x128): +Naive time: 0.00272855 seconds +Blocked time: 0.000985369 seconds +Parallel time: 0.00106191 seconds +Blocked speedup: 2.76906x +Parallel speedup: 2.56947x +Case 1 (128x64x128): +Naive time: 0.00274965 seconds +Blocked time: 0.000991371 seconds +Parallel time: 0.00144483 seconds +Blocked speedup: 2.77358x +Parallel speedup: 1.90309x +Case 1 (128x64x128): +Naive time: 0.00273464 seconds +Blocked time: 0.000968647 seconds +Parallel time: 
0.0015607 seconds +Blocked speedup: 2.82316x +Parallel speedup: 1.75219x +Case 1 (128x64x128): +Naive time: 0.00273412 seconds +Blocked time: 0.000982673 seconds +Parallel time: 0.00143623 seconds +Blocked speedup: 2.78233x +Parallel speedup: 1.90367x +Case 1 (128x64x128): +Naive time: 0.00273282 seconds +Blocked time: 0.000969769 seconds +Parallel time: 0.00148871 seconds +Blocked speedup: 2.81801x +Parallel speedup: 1.83569x +Case 1 (128x64x128): +Naive time: 0.00273219 seconds +Blocked time: 0.000966874 seconds +Parallel time: 0.00325521 seconds +Blocked speedup: 2.82579x +Parallel speedup: 0.839328x +Case 1 (128x64x128): +Naive time: 0.00285871 seconds +Blocked time: 0.000962386 seconds +Parallel time: 0.00297508 seconds +Blocked speedup: 2.97044x +Parallel speedup: 0.960886x +Case 1 (128x64x128): +Naive time: 0.00274162 seconds +Blocked time: 0.00105132 seconds +Parallel time: 0.00145107 seconds +Blocked speedup: 2.60779x +Parallel speedup: 1.88938x +Case 1 (128x64x128): +Naive time: 0.00275159 seconds +Blocked time: 0.000977183 seconds +Parallel time: 0.00113591 seconds +Blocked speedup: 2.81584x +Parallel speedup: 2.42236x +Case 1 (128x64x128): +Naive time: 0.00274855 seconds +Blocked time: 0.000987362 seconds +Parallel time: 0.00147445 seconds +Blocked speedup: 2.78373x +Parallel speedup: 1.86412x +Case 1 (128x64x128): +Naive time: 0.00275501 seconds +Blocked time: 0.00098096 seconds +Parallel time: 0.00147049 seconds +Blocked speedup: 2.80848x +Parallel speedup: 1.87353x +Case 1 (128x64x128): +Naive time: 0.00275469 seconds +Blocked time: 0.000976803 seconds +Parallel time: 0.00146522 seconds +Blocked speedup: 2.82011x +Parallel speedup: 1.88005x +Case 1 (128x64x128): +Naive time: 0.00275061 seconds +Blocked time: 0.00107682 seconds +Parallel time: 0.00146048 seconds +Blocked speedup: 2.55438x +Parallel speedup: 1.88336x +Case 1 (128x64x128): +Naive time: 0.00276388 seconds +Blocked time: 0.000963267 seconds +Parallel time: 0.00134498 seconds +Blocked 
speedup: 2.86927x +Parallel speedup: 2.05495x +Case 1 (128x64x128): +Naive time: 0.00276601 seconds +Blocked time: 0.00103104 seconds +Parallel time: 0.00145108 seconds +Blocked speedup: 2.68272x +Parallel speedup: 1.90617x +Case 1 (128x64x128): +Naive time: 0.00274687 seconds +Blocked time: 0.000965482 seconds +Parallel time: 0.0014446 seconds +Blocked speedup: 2.84508x +Parallel speedup: 1.90148x +Case 1 (128x64x128): +Naive time: 0.00274236 seconds +Blocked time: 0.000975421 seconds +Parallel time: 0.00335794 seconds +Blocked speedup: 2.81146x +Parallel speedup: 0.816678x +Case 1 (128x64x128): +Naive time: 0.00276309 seconds +Blocked time: 0.0010331 seconds +Parallel time: 0.00145844 seconds +Blocked speedup: 2.67457x +Parallel speedup: 1.89456x +Case 1 (128x64x128): +Naive time: 0.0027297 seconds +Blocked time: 0.000969669 seconds +Parallel time: 0.00146974 seconds +Blocked speedup: 2.81509x +Parallel speedup: 1.85727x +Case 1 (128x64x128): +Naive time: 0.00273531 seconds +Blocked time: 0.000967335 seconds +Parallel time: 0.00401488 seconds +Blocked speedup: 2.82768x +Parallel speedup: 0.681293x +Case 1 (128x64x128): +Naive time: 0.002728 seconds +Blocked time: 0.00095951 seconds +Parallel time: 0.00250721 seconds +Blocked speedup: 2.84312x +Parallel speedup: 1.08806x +Case 1 (128x64x128): +Naive time: 0.00272579 seconds +Blocked time: 0.00096991 seconds +Parallel time: 0.00145431 seconds +Blocked speedup: 2.81036x +Parallel speedup: 1.87429x +Case 1 (128x64x128): +Naive time: 0.00272732 seconds +Blocked time: 0.000963818 seconds +Parallel time: 0.00167325 seconds +Blocked speedup: 2.8297x +Parallel speedup: 1.62995x +Case 1 (128x64x128): +Naive time: 0.00275109 seconds +Blocked time: 0.00119249 seconds +Parallel time: 0.00166368 seconds +Blocked speedup: 2.30702x +Parallel speedup: 1.65362x +Case 1 (128x64x128): +Naive time: 0.00274089 seconds +Blocked time: 0.000962466 seconds +Parallel time: 0.00150352 seconds +Blocked speedup: 2.84778x +Parallel speedup: 
1.82298x +Case 1 (128x64x128): +Naive time: 0.00276146 seconds +Blocked time: 0.000972835 seconds +Parallel time: 0.0068163 seconds +Blocked speedup: 2.83857x +Parallel speedup: 0.405126x +Case 1 (128x64x128): +Naive time: 0.00274453 seconds +Blocked time: 0.000963207 seconds +Parallel time: 0.00184379 seconds +Blocked speedup: 2.84937x +Parallel speedup: 1.48853x +Case 1 (128x64x128): +Naive time: 0.00274385 seconds +Blocked time: 0.000987764 seconds +Parallel time: 0.00153817 seconds +Blocked speedup: 2.77784x +Parallel speedup: 1.78384x +Case 1 (128x64x128): +Naive time: 0.00277227 seconds +Blocked time: 0.000971483 seconds +Parallel time: 0.00825747 seconds +Blocked speedup: 2.85365x +Parallel speedup: 0.335729x +Case 1 (128x64x128): +Naive time: 0.00275663 seconds +Blocked time: 0.000974518 seconds +Parallel time: 0.00145623 seconds +Blocked speedup: 2.82871x +Parallel speedup: 1.89299x +Case 1 (128x64x128): +Naive time: 0.00272383 seconds +Blocked time: 0.000966534 seconds +Parallel time: 0.00309411 seconds +Blocked speedup: 2.81814x +Parallel speedup: 0.880326x +Case 1 (128x64x128): +Naive time: 0.00274999 seconds +Blocked time: 0.000968417 seconds +Parallel time: 0.00412795 seconds +Blocked speedup: 2.83968x +Parallel speedup: 0.666187x +Case 1 (128x64x128): +Naive time: 0.00275924 seconds +Blocked time: 0.000986791 seconds +Parallel time: 0.00145578 seconds +Blocked speedup: 2.79617x +Parallel speedup: 1.89536x +Case 1 (128x64x128): +Naive time: 0.00272612 seconds +Blocked time: 0.000981872 seconds +Parallel time: 0.00146197 seconds +Blocked speedup: 2.77646x +Parallel speedup: 1.86469x +Case 1 (128x64x128): +Naive time: 0.00275762 seconds +Blocked time: 0.000967566 seconds +Parallel time: 0.00144235 seconds +Blocked speedup: 2.85006x +Parallel speedup: 1.9119x +Case 1 (128x64x128): +Naive time: 0.00273694 seconds +Blocked time: 0.000960963 seconds +Parallel time: 0.00149861 seconds +Blocked speedup: 2.84812x +Parallel speedup: 1.82631x +Case 1 
(128x64x128): +Naive time: 0.0027432 seconds +Blocked time: 0.000966363 seconds +Parallel time: 0.00349575 seconds +Blocked speedup: 2.83868x +Parallel speedup: 0.784724x +Case 1 (128x64x128): +Naive time: 0.00275275 seconds +Blocked time: 0.000957426 seconds +Parallel time: 0.00225371 seconds +Blocked speedup: 2.87515x +Parallel speedup: 1.22143x +Case 1 (128x64x128): +Naive time: 0.00273599 seconds +Blocked time: 0.000972534 seconds +Parallel time: 0.00145622 seconds +Blocked speedup: 2.81326x +Parallel speedup: 1.87883x +Case 1 (128x64x128): +Naive time: 0.0027724 seconds +Blocked time: 0.00099119 seconds +Parallel time: 0.00146454 seconds +Blocked speedup: 2.79704x +Parallel speedup: 1.89302x +Case 1 (128x64x128): +Naive time: 0.00274533 seconds +Blocked time: 0.00105092 seconds +Parallel time: 0.00150281 seconds +Blocked speedup: 2.61231x +Parallel speedup: 1.8268x +Case 1 (128x64x128): +Naive time: 0.00275816 seconds +Blocked time: 0.000978055 seconds +Parallel time: 0.00144413 seconds +Blocked speedup: 2.82005x +Parallel speedup: 1.90991x +Case 1 (128x64x128): +Naive time: 0.00276951 seconds +Blocked time: 0.000972455 seconds +Parallel time: 0.00455727 seconds +Blocked speedup: 2.84795x +Parallel speedup: 0.607712x +Case 1 (128x64x128): +Naive time: 0.00271372 seconds +Blocked time: 0.00104264 seconds +Parallel time: 0.00144764 seconds +Blocked speedup: 2.60275x +Parallel speedup: 1.87459x +Case 1 (128x64x128): +Naive time: 0.00274478 seconds +Blocked time: 0.00102562 seconds +Parallel time: 0.00147062 seconds +Blocked speedup: 2.6762x +Parallel speedup: 1.86641x +Case 1 (128x64x128): +Naive time: 0.00273827 seconds +Blocked time: 0.00098588 seconds +Parallel time: 0.00147696 seconds +Blocked speedup: 2.77749x +Parallel speedup: 1.85399x +Case 1 (128x64x128): +Naive time: 0.0027559 seconds +Blocked time: 0.00103623 seconds +Parallel time: 0.00144664 seconds +Blocked speedup: 2.65954x +Parallel speedup: 1.90503x +Case 1 (128x64x128): +Naive time: 0.00274392 
seconds +Blocked time: 0.000967385 seconds +Parallel time: 0.00150194 seconds +Blocked speedup: 2.83643x +Parallel speedup: 1.82692x +Case 1 (128x64x128): +Naive time: 0.00273461 seconds +Blocked time: 0.000971834 seconds +Parallel time: 0.00143347 seconds +Blocked speedup: 2.81387x +Parallel speedup: 1.90769x +Case 1 (128x64x128): +Naive time: 0.00275463 seconds +Blocked time: 0.000986541 seconds +Parallel time: 0.00151384 seconds +Blocked speedup: 2.79221x +Parallel speedup: 1.81963x +Case 1 (128x64x128): +Naive time: 0.00275227 seconds +Blocked time: 0.000985549 seconds +Parallel time: 0.00105038 seconds +Blocked speedup: 2.79263x +Parallel speedup: 2.62027x +Case 1 (128x64x128): +Naive time: 0.00272837 seconds +Blocked time: 0.000964029 seconds +Parallel time: 0.00154762 seconds +Blocked speedup: 2.83017x +Parallel speedup: 1.76294x +Case 1 (128x64x128): +Naive time: 0.00295809 seconds +Blocked time: 0.000975721 seconds +Parallel time: 0.00147505 seconds +Blocked speedup: 3.0317x +Parallel speedup: 2.00542x +Case 1 (128x64x128): +Naive time: 0.00277215 seconds +Blocked time: 0.00096988 seconds +Parallel time: 0.0014496 seconds +Blocked speedup: 2.85824x +Parallel speedup: 1.91236x diff --git a/results/benchmark2.txt b/results/benchmark2.txt new file mode 100644 index 0000000..755317e --- /dev/null +++ b/results/benchmark2.txt @@ -0,0 +1,600 @@ +Case 2 (100x128x56): +Naive time: 0.00188693 seconds +Blocked time: 0.000772239 seconds +Parallel time: 0.00199778 seconds +Blocked speedup: 2.44345x +Parallel speedup: 0.944515x +Case 2 (100x128x56): +Naive time: 0.00188168 seconds +Blocked time: 0.000656863 seconds +Parallel time: 0.00327373 seconds +Blocked speedup: 2.86465x +Parallel speedup: 0.574781x +Case 2 (100x128x56): +Naive time: 0.00190676 seconds +Blocked time: 0.000654207 seconds +Parallel time: 0.00257246 seconds +Blocked speedup: 2.91461x +Parallel speedup: 0.741221x +Case 2 (100x128x56): +Naive time: 0.00189839 seconds +Blocked time: 0.000674275 seconds 
+Parallel time: 0.00779216 seconds +Blocked speedup: 2.81546x +Parallel speedup: 0.243628x +Case 2 (100x128x56): +Naive time: 0.00190202 seconds +Blocked time: 0.0006826 seconds +Parallel time: 0.00287211 seconds +Blocked speedup: 2.78643x +Parallel speedup: 0.662238x +Case 2 (100x128x56): +Naive time: 0.00189843 seconds +Blocked time: 0.000645581 seconds +Parallel time: 0.0044668 seconds +Blocked speedup: 2.94066x +Parallel speedup: 0.425009x +Case 2 (100x128x56): +Naive time: 0.00190583 seconds +Blocked time: 0.000668915 seconds +Parallel time: 0.00645108 seconds +Blocked speedup: 2.84913x +Parallel speedup: 0.295427x +Case 2 (100x128x56): +Naive time: 0.00197184 seconds +Blocked time: 0.000686237 seconds +Parallel time: 0.00575571 seconds +Blocked speedup: 2.87341x +Parallel speedup: 0.342588x +Case 2 (100x128x56): +Naive time: 0.00190509 seconds +Blocked time: 0.000664427 seconds +Parallel time: 0.00240791 seconds +Blocked speedup: 2.86727x +Parallel speedup: 0.791182x +Case 2 (100x128x56): +Naive time: 0.0019355 seconds +Blocked time: 0.000673324 seconds +Parallel time: 0.00346452 seconds +Blocked speedup: 2.87455x +Parallel speedup: 0.558664x +Case 2 (100x128x56): +Naive time: 0.00190522 seconds +Blocked time: 0.000653857 seconds +Parallel time: 0.00762078 seconds +Blocked speedup: 2.91381x +Parallel speedup: 0.250003x +Case 2 (100x128x56): +Naive time: 0.00190509 seconds +Blocked time: 0.000650109 seconds +Parallel time: 0.00109153 seconds +Blocked speedup: 2.93042x +Parallel speedup: 1.74535x +Case 2 (100x128x56): +Naive time: 0.00193345 seconds +Blocked time: 0.00116679 seconds +Parallel time: 0.00267754 seconds +Blocked speedup: 1.65707x +Parallel speedup: 0.722097x +Case 2 (100x128x56): +Naive time: 0.00203113 seconds +Blocked time: 0.00076212 seconds +Parallel time: 0.00433853 seconds +Blocked speedup: 2.66511x +Parallel speedup: 0.468161x +Case 2 (100x128x56): +Naive time: 0.00189178 seconds +Blocked time: 0.000643547 seconds +Parallel time: 0.00637055 
seconds +Blocked speedup: 2.93961x +Parallel speedup: 0.296957x +Case 2 (100x128x56): +Naive time: 0.00190657 seconds +Blocked time: 0.000660469 seconds +Parallel time: 0.00106389 seconds +Blocked speedup: 2.88669x +Parallel speedup: 1.79208x +Case 2 (100x128x56): +Naive time: 0.00197204 seconds +Blocked time: 0.000696857 seconds +Parallel time: 0.00304037 seconds +Blocked speedup: 2.82991x +Parallel speedup: 0.648618x +Case 2 (100x128x56): +Naive time: 0.00191836 seconds +Blocked time: 0.000706476 seconds +Parallel time: 0.00307635 seconds +Blocked speedup: 2.71539x +Parallel speedup: 0.623583x +Case 2 (100x128x56): +Naive time: 0.00190816 seconds +Blocked time: 0.000657383 seconds +Parallel time: 0.00790343 seconds +Blocked speedup: 2.90266x +Parallel speedup: 0.241434x +Case 2 (100x128x56): +Naive time: 0.00190931 seconds +Blocked time: 0.000656271 seconds +Parallel time: 0.00102557 seconds +Blocked speedup: 2.90934x +Parallel speedup: 1.8617x +Case 2 (100x128x56): +Naive time: 0.00191433 seconds +Blocked time: 0.000676219 seconds +Parallel time: 0.00815034 seconds +Blocked speedup: 2.83093x +Parallel speedup: 0.234877x +Case 2 (100x128x56): +Naive time: 0.00193095 seconds +Blocked time: 0.000653326 seconds +Parallel time: 0.00772192 seconds +Blocked speedup: 2.95557x +Parallel speedup: 0.250061x +Case 2 (100x128x56): +Naive time: 0.00190568 seconds +Blocked time: 0.000674485 seconds +Parallel time: 0.000950393 seconds +Blocked speedup: 2.82538x +Parallel speedup: 2.00515x +Case 2 (100x128x56): +Naive time: 0.00192761 seconds +Blocked time: 0.000706756 seconds +Parallel time: 0.00295737 seconds +Blocked speedup: 2.7274x +Parallel speedup: 0.651798x +Case 2 (100x128x56): +Naive time: 0.0019508 seconds +Blocked time: 0.000701998 seconds +Parallel time: 0.00285254 seconds +Blocked speedup: 2.77893x +Parallel speedup: 0.683881x +Case 2 (100x128x56): +Naive time: 0.00190593 seconds +Blocked time: 0.000669627 seconds +Parallel time: 0.002636 seconds +Blocked speedup: 
2.84625x +Parallel speedup: 0.723038x +Case 2 (100x128x56): +Naive time: 0.00221422 seconds +Blocked time: 0.000661391 seconds +Parallel time: 0.00089013 seconds +Blocked speedup: 3.34782x +Parallel speedup: 2.48752x +Case 2 (100x128x56): +Naive time: 0.00192599 seconds +Blocked time: 0.000728126 seconds +Parallel time: 0.0033727 seconds +Blocked speedup: 2.64514x +Parallel speedup: 0.571054x +Case 2 (100x128x56): +Naive time: 0.00192071 seconds +Blocked time: 0.000679745 seconds +Parallel time: 0.00385201 seconds +Blocked speedup: 2.82564x +Parallel speedup: 0.498627x +Case 2 (100x128x56): +Naive time: 0.0019136 seconds +Blocked time: 0.000675448 seconds +Parallel time: 0.00372914 seconds +Blocked speedup: 2.83308x +Parallel speedup: 0.513148x +Case 2 (100x128x56): +Naive time: 0.001906 seconds +Blocked time: 0.000662964 seconds +Parallel time: 0.00287048 seconds +Blocked speedup: 2.87496x +Parallel speedup: 0.664x +Case 2 (100x128x56): +Naive time: 0.00190965 seconds +Blocked time: 0.000657884 seconds +Parallel time: 0.0104832 seconds +Blocked speedup: 2.90272x +Parallel speedup: 0.182164x +Case 2 (100x128x56): +Naive time: 0.00195179 seconds +Blocked time: 0.000679104 seconds +Parallel time: 0.00743501 seconds +Blocked speedup: 2.87407x +Parallel speedup: 0.262514x +Case 2 (100x128x56): +Naive time: 0.00191855 seconds +Blocked time: 0.000674776 seconds +Parallel time: 0.00786396 seconds +Blocked speedup: 2.84324x +Parallel speedup: 0.243967x +Case 2 (100x128x56): +Naive time: 0.00190266 seconds +Blocked time: 0.00071407 seconds +Parallel time: 0.00286821 seconds +Blocked speedup: 2.66453x +Parallel speedup: 0.663361x +Case 2 (100x128x56): +Naive time: 0.00190516 seconds +Blocked time: 0.000677982 seconds +Parallel time: 0.00823752 seconds +Blocked speedup: 2.81004x +Parallel speedup: 0.231278x +Case 2 (100x128x56): +Naive time: 0.00192164 seconds +Blocked time: 0.000691627 seconds +Parallel time: 0.0028619 seconds +Blocked speedup: 2.77843x +Parallel speedup: 
0.671455x +Case 2 (100x128x56): +Naive time: 0.00190527 seconds +Blocked time: 0.000679254 seconds +Parallel time: 0.00337161 seconds +Blocked speedup: 2.80495x +Parallel speedup: 0.565094x +Case 2 (100x128x56): +Naive time: 0.00192063 seconds +Blocked time: 0.00068177 seconds +Parallel time: 0.00213528 seconds +Blocked speedup: 2.81713x +Parallel speedup: 0.899478x +Case 2 (100x128x56): +Naive time: 0.00190543 seconds +Blocked time: 0.000694503 seconds +Parallel time: 0.00291889 seconds +Blocked speedup: 2.7436x +Parallel speedup: 0.652795x +Case 2 (100x128x56): +Naive time: 0.00194449 seconds +Blocked time: 0.000694343 seconds +Parallel time: 0.00442509 seconds +Blocked speedup: 2.80047x +Parallel speedup: 0.439423x +Case 2 (100x128x56): +Naive time: 0.00190112 seconds +Blocked time: 0.000684936 seconds +Parallel time: 0.00264863 seconds +Blocked speedup: 2.77561x +Parallel speedup: 0.717774x +Case 2 (100x128x56): +Naive time: 0.00191295 seconds +Blocked time: 0.000669707 seconds +Parallel time: 0.00343602 seconds +Blocked speedup: 2.8564x +Parallel speedup: 0.556735x +Case 2 (100x128x56): +Naive time: 0.00189649 seconds +Blocked time: 0.000646893 seconds +Parallel time: 0.00629332 seconds +Blocked speedup: 2.93169x +Parallel speedup: 0.301349x +Case 2 (100x128x56): +Naive time: 0.00191696 seconds +Blocked time: 0.000653236 seconds +Parallel time: 0.00314587 seconds +Blocked speedup: 2.93456x +Parallel speedup: 0.609356x +Case 2 (100x128x56): +Naive time: 0.00190286 seconds +Blocked time: 0.000667964 seconds +Parallel time: 0.00641429 seconds +Blocked speedup: 2.84875x +Parallel speedup: 0.296659x +Case 2 (100x128x56): +Naive time: 0.00190846 seconds +Blocked time: 0.000664447 seconds +Parallel time: 0.00229637 seconds +Blocked speedup: 2.87225x +Parallel speedup: 0.831078x +Case 2 (100x128x56): +Naive time: 0.00190492 seconds +Blocked time: 0.000662924 seconds +Parallel time: 0.00106572 seconds +Blocked speedup: 2.87352x +Parallel speedup: 1.78745x +Case 2 
(100x128x56): +Naive time: 0.00194727 seconds +Blocked time: 0.000693602 seconds +Parallel time: 0.00581548 seconds +Blocked speedup: 2.80748x +Parallel speedup: 0.334843x +Case 2 (100x128x56): +Naive time: 0.00189662 seconds +Blocked time: 0.000657123 seconds +Parallel time: 0.00105164 seconds +Blocked speedup: 2.88625x +Parallel speedup: 1.80348x +Case 2 (100x128x56): +Naive time: 0.00190248 seconds +Blocked time: 0.000760196 seconds +Parallel time: 0.00275049 seconds +Blocked speedup: 2.50262x +Parallel speedup: 0.691687x +Case 2 (100x128x56): +Naive time: 0.00193417 seconds +Blocked time: 0.000684244 seconds +Parallel time: 0.00754415 seconds +Blocked speedup: 2.82672x +Parallel speedup: 0.25638x +Case 2 (100x128x56): +Naive time: 0.00190576 seconds +Blocked time: 0.000665329 seconds +Parallel time: 0.00492096 seconds +Blocked speedup: 2.86438x +Parallel speedup: 0.387273x +Case 2 (100x128x56): +Naive time: 0.00190456 seconds +Blocked time: 0.000660238 seconds +Parallel time: 0.000862488 seconds +Blocked speedup: 2.88466x +Parallel speedup: 2.20822x +Case 2 (100x128x56): +Naive time: 0.00191422 seconds +Blocked time: 0.000847641 seconds +Parallel time: 0.00341581 seconds +Blocked speedup: 2.25829x +Parallel speedup: 0.5604x +Case 2 (100x128x56): +Naive time: 0.00190132 seconds +Blocked time: 0.000682941 seconds +Parallel time: 0.00790153 seconds +Blocked speedup: 2.78402x +Parallel speedup: 0.240627x +Case 2 (100x128x56): +Naive time: 0.00196003 seconds +Blocked time: 0.000786836 seconds +Parallel time: 0.00691742 seconds +Blocked speedup: 2.49102x +Parallel speedup: 0.283347x +Case 2 (100x128x56): +Naive time: 0.00190829 seconds +Blocked time: 0.000657143 seconds +Parallel time: 0.00328116 seconds +Blocked speedup: 2.90392x +Parallel speedup: 0.581591x +Case 2 (100x128x56): +Naive time: 0.00190971 seconds +Blocked time: 0.000667092 seconds +Parallel time: 0.00320071 seconds +Blocked speedup: 2.86274x +Parallel speedup: 0.596654x +Case 2 (100x128x56): +Naive 
time: 0.0019056 seconds +Blocked time: 0.000667382 seconds +Parallel time: 0.00369267 seconds +Blocked speedup: 2.85534x +Parallel speedup: 0.516051x +Case 2 (100x128x56): +Naive time: 0.00191097 seconds +Blocked time: 0.000657914 seconds +Parallel time: 0.00322641 seconds +Blocked speedup: 2.90458x +Parallel speedup: 0.592288x +Case 2 (100x128x56): +Naive time: 0.00190661 seconds +Blocked time: 0.000767721 seconds +Parallel time: 0.00784511 seconds +Blocked speedup: 2.48346x +Parallel speedup: 0.243031x +Case 2 (100x128x56): +Naive time: 0.0019011 seconds +Blocked time: 0.000678754 seconds +Parallel time: 0.00441378 seconds +Blocked speedup: 2.80086x +Parallel speedup: 0.430718x +Case 2 (100x128x56): +Naive time: 0.00191129 seconds +Blocked time: 0.000657584 seconds +Parallel time: 0.00549519 seconds +Blocked speedup: 2.90653x +Parallel speedup: 0.347811x +Case 2 (100x128x56): +Naive time: 0.00196355 seconds +Blocked time: 0.000670458 seconds +Parallel time: 0.00316909 seconds +Blocked speedup: 2.92868x +Parallel speedup: 0.619596x +Case 2 (100x128x56): +Naive time: 0.00190996 seconds +Blocked time: 0.000659176 seconds +Parallel time: 0.00336236 seconds +Blocked speedup: 2.8975x +Parallel speedup: 0.568043x +Case 2 (100x128x56): +Naive time: 0.00192514 seconds +Blocked time: 0.000677391 seconds +Parallel time: 0.00295746 seconds +Blocked speedup: 2.842x +Parallel speedup: 0.650945x +Case 2 (100x128x56): +Naive time: 0.00190914 seconds +Blocked time: 0.000686939 seconds +Parallel time: 0.00766331 seconds +Blocked speedup: 2.7792x +Parallel speedup: 0.249128x +Case 2 (100x128x56): +Naive time: 0.00194181 seconds +Blocked time: 0.000650381 seconds +Parallel time: 0.000992923 seconds +Blocked speedup: 2.98565x +Parallel speedup: 1.95565x +Case 2 (100x128x56): +Naive time: 0.00191722 seconds +Blocked time: 0.000719099 seconds +Parallel time: 0.00642576 seconds +Blocked speedup: 2.66614x +Parallel speedup: 0.298365x +Case 2 (100x128x56): +Naive time: 0.00190404 seconds 
+Blocked time: 0.000657072 seconds +Parallel time: 0.000857198 seconds +Blocked speedup: 2.89777x +Parallel speedup: 2.22124x +Case 2 (100x128x56): +Naive time: 0.00197299 seconds +Blocked time: 0.000726894 seconds +Parallel time: 0.00279164 seconds +Blocked speedup: 2.71428x +Parallel speedup: 0.706751x +Case 2 (100x128x56): +Naive time: 0.00191092 seconds +Blocked time: 0.00100716 seconds +Parallel time: 0.00111023 seconds +Blocked speedup: 1.89733x +Parallel speedup: 1.72118x +Case 2 (100x128x56): +Naive time: 0.00195923 seconds +Blocked time: 0.000670438 seconds +Parallel time: 0.00255585 seconds +Blocked speedup: 2.92231x +Parallel speedup: 0.766567x +Case 2 (100x128x56): +Naive time: 0.00190645 seconds +Blocked time: 0.000660379 seconds +Parallel time: 0.00438804 seconds +Blocked speedup: 2.8869x +Parallel speedup: 0.434464x +Case 2 (100x128x56): +Naive time: 0.00191128 seconds +Blocked time: 0.000660519 seconds +Parallel time: 0.00347746 seconds +Blocked speedup: 2.8936x +Parallel speedup: 0.549619x +Case 2 (100x128x56): +Naive time: 0.00190505 seconds +Blocked time: 0.000777989 seconds +Parallel time: 0.00760739 seconds +Blocked speedup: 2.44869x +Parallel speedup: 0.250421x +Case 2 (100x128x56): +Naive time: 0.0019062 seconds +Blocked time: 0.000648427 seconds +Parallel time: 0.0010963 seconds +Blocked speedup: 2.93972x +Parallel speedup: 1.73876x +Case 2 (100x128x56): +Naive time: 0.00193422 seconds +Blocked time: 0.000716284 seconds +Parallel time: 0.0029507 seconds +Blocked speedup: 2.70035x +Parallel speedup: 0.655513x +Case 2 (100x128x56): +Naive time: 0.00190616 seconds +Blocked time: 0.000663444 seconds +Parallel time: 0.00421681 seconds +Blocked speedup: 2.87312x +Parallel speedup: 0.452037x +Case 2 (100x128x56): +Naive time: 0.0019053 seconds +Blocked time: 0.000975911 seconds +Parallel time: 0.00354557 seconds +Blocked speedup: 1.95233x +Parallel speedup: 0.537376x +Case 2 (100x128x56): +Naive time: 0.00192431 seconds +Blocked time: 0.000660479 
seconds +Parallel time: 0.0027166 seconds +Blocked speedup: 2.91351x +Parallel speedup: 0.708353x +Case 2 (100x128x56): +Naive time: 0.0019212 seconds +Blocked time: 0.000659157 seconds +Parallel time: 0.00805672 seconds +Blocked speedup: 2.91462x +Parallel speedup: 0.238459x +Case 2 (100x128x56): +Naive time: 0.00191305 seconds +Blocked time: 0.00066102 seconds +Parallel time: 0.00262204 seconds +Blocked speedup: 2.89409x +Parallel speedup: 0.729603x +Case 2 (100x128x56): +Naive time: 0.00191924 seconds +Blocked time: 0.000674395 seconds +Parallel time: 0.00827042 seconds +Blocked speedup: 2.84587x +Parallel speedup: 0.232061x +Case 2 (100x128x56): +Naive time: 0.00191194 seconds +Blocked time: 0.000667221 seconds +Parallel time: 0.0027087 seconds +Blocked speedup: 2.86552x +Parallel speedup: 0.70585x +Case 2 (100x128x56): +Naive time: 0.00189387 seconds +Blocked time: 0.000669276 seconds +Parallel time: 0.000827683 seconds +Blocked speedup: 2.82973x +Parallel speedup: 2.28816x +Case 2 (100x128x56): +Naive time: 0.00193975 seconds +Blocked time: 0.000683122 seconds +Parallel time: 0.00348742 seconds +Blocked speedup: 2.83954x +Parallel speedup: 0.556213x +Case 2 (100x128x56): +Naive time: 0.00201315 seconds +Blocked time: 0.000751891 seconds +Parallel time: 0.00354999 seconds +Blocked speedup: 2.67745x +Parallel speedup: 0.567085x +Case 2 (100x128x56): +Naive time: 0.00193903 seconds +Blocked time: 0.000742643 seconds +Parallel time: 0.00261244 seconds +Blocked speedup: 2.61098x +Parallel speedup: 0.742228x +Case 2 (100x128x56): +Naive time: 0.00190515 seconds +Blocked time: 0.000645832 seconds +Parallel time: 0.00331267 seconds +Blocked speedup: 2.94992x +Parallel speedup: 0.575112x +Case 2 (100x128x56): +Naive time: 0.00191232 seconds +Blocked time: 0.000684094 seconds +Parallel time: 0.00298458 seconds +Blocked speedup: 2.7954x +Parallel speedup: 0.640733x +Case 2 (100x128x56): +Naive time: 0.00190743 seconds +Blocked time: 0.000657935 seconds +Parallel time: 
0.00772057 seconds +Blocked speedup: 2.89911x +Parallel speedup: 0.247058x +Case 2 (100x128x56): +Naive time: 0.00192706 seconds +Blocked time: 0.000658365 seconds +Parallel time: 0.00787343 seconds +Blocked speedup: 2.92703x +Parallel speedup: 0.244754x +Case 2 (100x128x56): +Naive time: 0.00197836 seconds +Blocked time: 0.000666481 seconds +Parallel time: 0.0037008 seconds +Blocked speedup: 2.96837x +Parallel speedup: 0.534576x +Case 2 (100x128x56): +Naive time: 0.00190755 seconds +Blocked time: 0.000784932 seconds +Parallel time: 0.00254131 seconds +Blocked speedup: 2.43021x +Parallel speedup: 0.750617x +Case 2 (100x128x56): +Naive time: 0.00190544 seconds +Blocked time: 0.000653937 seconds +Parallel time: 0.00324639 seconds +Blocked speedup: 2.91379x +Parallel speedup: 0.586939x +Case 2 (100x128x56): +Naive time: 0.00192371 seconds +Blocked time: 0.000683713 seconds +Parallel time: 0.00370925 seconds +Blocked speedup: 2.81362x +Parallel speedup: 0.518625x +Case 2 (100x128x56): +Naive time: 0.00191025 seconds +Blocked time: 0.00067125 seconds +Parallel time: 0.00567068 seconds +Blocked speedup: 2.84582x +Parallel speedup: 0.336865x +Case 2 (100x128x56): +Naive time: 0.00190374 seconds +Blocked time: 0.000654157 seconds +Parallel time: 0.0030329 seconds +Blocked speedup: 2.91022x +Parallel speedup: 0.627697x diff --git a/results/benchmark3.txt b/results/benchmark3.txt new file mode 100644 index 0000000..819fe3b --- /dev/null +++ b/results/benchmark3.txt @@ -0,0 +1,600 @@ +Case 3 (128x64x128): +Naive time: 0.00278907 seconds +Blocked time: 0.000993795 seconds +Parallel time: 0.006631 seconds +Blocked speedup: 2.80649x +Parallel speedup: 0.420611x +Case 3 (128x64x128): +Naive time: 0.00272661 seconds +Blocked time: 0.000972304 seconds +Parallel time: 0.00145378 seconds +Blocked speedup: 2.80427x +Parallel speedup: 1.87553x +Case 3 (128x64x128): +Naive time: 0.00273466 seconds +Blocked time: 0.000962846 seconds +Parallel time: 0.00145361 seconds +Blocked speedup: 
2.84019x +Parallel speedup: 1.88129x +Case 3 (128x64x128): +Naive time: 0.00276543 seconds +Blocked time: 0.00107144 seconds +Parallel time: 0.00145778 seconds +Blocked speedup: 2.58104x +Parallel speedup: 1.89702x +Case 3 (128x64x128): +Naive time: 0.00274059 seconds +Blocked time: 0.000989917 seconds +Parallel time: 0.00147042 seconds +Blocked speedup: 2.76851x +Parallel speedup: 1.86382x +Case 3 (128x64x128): +Naive time: 0.00275091 seconds +Blocked time: 0.000976051 seconds +Parallel time: 0.00148982 seconds +Blocked speedup: 2.81841x +Parallel speedup: 1.84648x +Case 3 (128x64x128): +Naive time: 0.00284611 seconds +Blocked time: 0.000980229 seconds +Parallel time: 0.00143548 seconds +Blocked speedup: 2.90352x +Parallel speedup: 1.98268x +Case 3 (128x64x128): +Naive time: 0.00275 seconds +Blocked time: 0.000989206 seconds +Parallel time: 0.0026834 seconds +Blocked speedup: 2.78001x +Parallel speedup: 1.02482x +Case 3 (128x64x128): +Naive time: 0.00274859 seconds +Blocked time: 0.0010293 seconds +Parallel time: 0.00155008 seconds +Blocked speedup: 2.67034x +Parallel speedup: 1.77319x +Case 3 (128x64x128): +Naive time: 0.00274578 seconds +Blocked time: 0.000983024 seconds +Parallel time: 0.00147977 seconds +Blocked speedup: 2.7932x +Parallel speedup: 1.85555x +Case 3 (128x64x128): +Naive time: 0.00274683 seconds +Blocked time: 0.000972305 seconds +Parallel time: 0.00149458 seconds +Blocked speedup: 2.82507x +Parallel speedup: 1.83786x +Case 3 (128x64x128): +Naive time: 0.00275405 seconds +Blocked time: 0.000986531 seconds +Parallel time: 0.00145224 seconds +Blocked speedup: 2.79165x +Parallel speedup: 1.89642x +Case 3 (128x64x128): +Naive time: 0.00274218 seconds +Blocked time: 0.000981081 seconds +Parallel time: 0.00148947 seconds +Blocked speedup: 2.79505x +Parallel speedup: 1.84105x +Case 3 (128x64x128): +Naive time: 0.00275258 seconds +Blocked time: 0.000984638 seconds +Parallel time: 0.0014455 seconds +Blocked speedup: 2.79552x +Parallel speedup: 1.90423x 
+Case 3 (128x64x128): +Naive time: 0.00279489 seconds +Blocked time: 0.00102914 seconds +Parallel time: 0.00148257 seconds +Blocked speedup: 2.71575x +Parallel speedup: 1.88517x +Case 3 (128x64x128): +Naive time: 0.00280451 seconds +Blocked time: 0.000978647 seconds +Parallel time: 0.00144694 seconds +Blocked speedup: 2.8657x +Parallel speedup: 1.93823x +Case 3 (128x64x128): +Naive time: 0.00280258 seconds +Blocked time: 0.001011 seconds +Parallel time: 0.00817233 seconds +Blocked speedup: 2.77209x +Parallel speedup: 0.342935x +Case 3 (128x64x128): +Naive time: 0.00271103 seconds +Blocked time: 0.000984458 seconds +Parallel time: 0.00543568 seconds +Blocked speedup: 2.75383x +Parallel speedup: 0.498747x +Case 3 (128x64x128): +Naive time: 0.00315377 seconds +Blocked time: 0.00101481 seconds +Parallel time: 0.00431718 seconds +Blocked speedup: 3.10773x +Parallel speedup: 0.730516x +Case 3 (128x64x128): +Naive time: 0.00281973 seconds +Blocked time: 0.00100225 seconds +Parallel time: 0.00240974 seconds +Blocked speedup: 2.8134x +Parallel speedup: 1.17014x +Case 3 (128x64x128): +Naive time: 0.00277158 seconds +Blocked time: 0.00102995 seconds +Parallel time: 0.00147034 seconds +Blocked speedup: 2.69098x +Parallel speedup: 1.88499x +Case 3 (128x64x128): +Naive time: 0.00281488 seconds +Blocked time: 0.000972715 seconds +Parallel time: 0.00148778 seconds +Blocked speedup: 2.89384x +Parallel speedup: 1.892x +Case 3 (128x64x128): +Naive time: 0.00280287 seconds +Blocked time: 0.00098617 seconds +Parallel time: 0.00147312 seconds +Blocked speedup: 2.84218x +Parallel speedup: 1.90267x +Case 3 (128x64x128): +Naive time: 0.00276355 seconds +Blocked time: 0.000957237 seconds +Parallel time: 0.0016649 seconds +Blocked speedup: 2.887x +Parallel speedup: 1.65988x +Case 3 (128x64x128): +Naive time: 0.0027956 seconds +Blocked time: 0.000952467 seconds +Parallel time: 0.00146382 seconds +Blocked speedup: 2.93511x +Parallel speedup: 1.9098x +Case 3 (128x64x128): +Naive time: 
0.00275083 seconds +Blocked time: 0.000970922 seconds +Parallel time: 0.0023153 seconds +Blocked speedup: 2.83322x +Parallel speedup: 1.18811x +Case 3 (128x64x128): +Naive time: 0.00273792 seconds +Blocked time: 0.000973948 seconds +Parallel time: 0.00144732 seconds +Blocked speedup: 2.81115x +Parallel speedup: 1.89172x +Case 3 (128x64x128): +Naive time: 0.00273375 seconds +Blocked time: 0.0009912 seconds +Parallel time: 0.00432368 seconds +Blocked speedup: 2.75802x +Parallel speedup: 0.632273x +Case 3 (128x64x128): +Naive time: 0.00277512 seconds +Blocked time: 0.000971202 seconds +Parallel time: 0.00146949 seconds +Blocked speedup: 2.8574x +Parallel speedup: 1.88849x +Case 3 (128x64x128): +Naive time: 0.00273612 seconds +Blocked time: 0.000977143 seconds +Parallel time: 0.00145817 seconds +Blocked speedup: 2.80013x +Parallel speedup: 1.87641x +Case 3 (128x64x128): +Naive time: 0.00277576 seconds +Blocked time: 0.000977675 seconds +Parallel time: 0.00112396 seconds +Blocked speedup: 2.83914x +Parallel speedup: 2.46963x +Case 3 (128x64x128): +Naive time: 0.00274987 seconds +Blocked time: 0.0010693 seconds +Parallel time: 0.00146596 seconds +Blocked speedup: 2.57166x +Parallel speedup: 1.87581x +Case 3 (128x64x128): +Naive time: 0.00274098 seconds +Blocked time: 0.000968347 seconds +Parallel time: 0.00145722 seconds +Blocked speedup: 2.83058x +Parallel speedup: 1.88096x +Case 3 (128x64x128): +Naive time: 0.00274915 seconds +Blocked time: 0.00120853 seconds +Parallel time: 0.00152598 seconds +Blocked speedup: 2.27479x +Parallel speedup: 1.80156x +Case 3 (128x64x128): +Naive time: 0.00276553 seconds +Blocked time: 0.000986 seconds +Parallel time: 0.00136748 seconds +Blocked speedup: 2.8048x +Parallel speedup: 2.02236x +Case 3 (128x64x128): +Naive time: 0.0027624 seconds +Blocked time: 0.000976022 seconds +Parallel time: 0.00148998 seconds +Blocked speedup: 2.83027x +Parallel speedup: 1.85399x +Case 3 (128x64x128): +Naive time: 0.00276811 seconds +Blocked time: 
0.000973437 seconds +Parallel time: 0.00149679 seconds +Blocked speedup: 2.84365x +Parallel speedup: 1.84937x +Case 3 (128x64x128): +Naive time: 0.0027458 seconds +Blocked time: 0.000975891 seconds +Parallel time: 0.00144135 seconds +Blocked speedup: 2.81364x +Parallel speedup: 1.90502x +Case 3 (128x64x128): +Naive time: 0.00277863 seconds +Blocked time: 0.000982303 seconds +Parallel time: 0.00170521 seconds +Blocked speedup: 2.82869x +Parallel speedup: 1.6295x +Case 3 (128x64x128): +Naive time: 0.00281472 seconds +Blocked time: 0.000968647 seconds +Parallel time: 0.00147677 seconds +Blocked speedup: 2.90583x +Parallel speedup: 1.906x +Case 3 (128x64x128): +Naive time: 0.00278837 seconds +Blocked time: 0.000973146 seconds +Parallel time: 0.00812964 seconds +Blocked speedup: 2.86532x +Parallel speedup: 0.342988x +Case 3 (128x64x128): +Naive time: 0.00276398 seconds +Blocked time: 0.000993664 seconds +Parallel time: 0.00147975 seconds +Blocked speedup: 2.7816x +Parallel speedup: 1.86787x +Case 3 (128x64x128): +Naive time: 0.00273186 seconds +Blocked time: 0.000974999 seconds +Parallel time: 0.001486 seconds +Blocked speedup: 2.80191x +Parallel speedup: 1.8384x +Case 3 (128x64x128): +Naive time: 0.00276307 seconds +Blocked time: 0.00110419 seconds +Parallel time: 0.00144941 seconds +Blocked speedup: 2.50235x +Parallel speedup: 1.90635x +Case 3 (128x64x128): +Naive time: 0.00277102 seconds +Blocked time: 0.000966423 seconds +Parallel time: 0.00145903 seconds +Blocked speedup: 2.86729x +Parallel speedup: 1.89922x +Case 3 (128x64x128): +Naive time: 0.00276852 seconds +Blocked time: 0.000973286 seconds +Parallel time: 0.00357108 seconds +Blocked speedup: 2.84451x +Parallel speedup: 0.775262x +Case 3 (128x64x128): +Naive time: 0.00279397 seconds +Blocked time: 0.000968638 seconds +Parallel time: 0.00146067 seconds +Blocked speedup: 2.88443x +Parallel speedup: 1.9128x +Case 3 (128x64x128): +Naive time: 0.00276364 seconds +Blocked time: 0.000985679 seconds +Parallel time: 
0.00143778 seconds +Blocked speedup: 2.80379x +Parallel speedup: 1.92216x +Case 3 (128x64x128): +Naive time: 0.00275208 seconds +Blocked time: 0.00104609 seconds +Parallel time: 0.00144804 seconds +Blocked speedup: 2.63082x +Parallel speedup: 1.90056x +Case 3 (128x64x128): +Naive time: 0.00279844 seconds +Blocked time: 0.000964059 seconds +Parallel time: 0.00149103 seconds +Blocked speedup: 2.90277x +Parallel speedup: 1.87685x +Case 3 (128x64x128): +Naive time: 0.00275825 seconds +Blocked time: 0.000981702 seconds +Parallel time: 0.00144068 seconds +Blocked speedup: 2.80966x +Parallel speedup: 1.91454x +Case 3 (128x64x128): +Naive time: 0.00276583 seconds +Blocked time: 0.000979297 seconds +Parallel time: 0.00156438 seconds +Blocked speedup: 2.8243x +Parallel speedup: 1.76801x +Case 3 (128x64x128): +Naive time: 0.00276848 seconds +Blocked time: 0.000971162 seconds +Parallel time: 0.00164325 seconds +Blocked speedup: 2.85068x +Parallel speedup: 1.68475x +Case 3 (128x64x128): +Naive time: 0.00277405 seconds +Blocked time: 0.00096468 seconds +Parallel time: 0.001711 seconds +Blocked speedup: 2.87562x +Parallel speedup: 1.62131x +Case 3 (128x64x128): +Naive time: 0.00278005 seconds +Blocked time: 0.00096998 seconds +Parallel time: 0.00143725 seconds +Blocked speedup: 2.86609x +Parallel speedup: 1.93429x +Case 3 (128x64x128): +Naive time: 0.00277215 seconds +Blocked time: 0.000967034 seconds +Parallel time: 0.00155194 seconds +Blocked speedup: 2.86665x +Parallel speedup: 1.78625x +Case 3 (128x64x128): +Naive time: 0.00274792 seconds +Blocked time: 0.0010853 seconds +Parallel time: 0.00146026 seconds +Blocked speedup: 2.53195x +Parallel speedup: 1.8818x +Case 3 (128x64x128): +Naive time: 0.00274756 seconds +Blocked time: 0.000971072 seconds +Parallel time: 0.00143572 seconds +Blocked speedup: 2.8294x +Parallel speedup: 1.91371x +Case 3 (128x64x128): +Naive time: 0.00278406 seconds +Blocked time: 0.000969198 seconds +Parallel time: 0.00144995 seconds +Blocked speedup: 
2.87254x +Parallel speedup: 1.92011x +Case 3 (128x64x128): +Naive time: 0.00285824 seconds +Blocked time: 0.000970571 seconds +Parallel time: 0.00145002 seconds +Blocked speedup: 2.94491x +Parallel speedup: 1.97117x +Case 3 (128x64x128): +Naive time: 0.00280014 seconds +Blocked time: 0.000973166 seconds +Parallel time: 0.00340417 seconds +Blocked speedup: 2.87735x +Parallel speedup: 0.822563x +Case 3 (128x64x128): +Naive time: 0.00277716 seconds +Blocked time: 0.000976904 seconds +Parallel time: 0.00146553 seconds +Blocked speedup: 2.84282x +Parallel speedup: 1.89499x +Case 3 (128x64x128): +Naive time: 0.0027487 seconds +Blocked time: 0.000967135 seconds +Parallel time: 0.00282378 seconds +Blocked speedup: 2.8421x +Parallel speedup: 0.973411x +Case 3 (128x64x128): +Naive time: 0.0027533 seconds +Blocked time: 0.000974708 seconds +Parallel time: 0.00145951 seconds +Blocked speedup: 2.82474x +Parallel speedup: 1.88645x +Case 3 (128x64x128): +Naive time: 0.00276761 seconds +Blocked time: 0.000969299 seconds +Parallel time: 0.00144583 seconds +Blocked speedup: 2.85527x +Parallel speedup: 1.9142x +Case 3 (128x64x128): +Naive time: 0.00311311 seconds +Blocked time: 0.000976783 seconds +Parallel time: 0.00155278 seconds +Blocked speedup: 3.18711x +Parallel speedup: 2.00486x +Case 3 (128x64x128): +Naive time: 0.00274487 seconds +Blocked time: 0.000974708 seconds +Parallel time: 0.0014704 seconds +Blocked speedup: 2.81609x +Parallel speedup: 1.86675x +Case 3 (128x64x128): +Naive time: 0.00276675 seconds +Blocked time: 0.000970601 seconds +Parallel time: 0.00146639 seconds +Blocked speedup: 2.85056x +Parallel speedup: 1.88678x +Case 3 (128x64x128): +Naive time: 0.00275245 seconds +Blocked time: 0.00097027 seconds +Parallel time: 0.00145152 seconds +Blocked speedup: 2.83679x +Parallel speedup: 1.89625x +Case 3 (128x64x128): +Naive time: 0.00282561 seconds +Blocked time: 0.000966193 seconds +Parallel time: 0.00101469 seconds +Blocked speedup: 2.92448x +Parallel speedup: 
2.78469x +Case 3 (128x64x128): +Naive time: 0.00275284 seconds +Blocked time: 0.000973336 seconds +Parallel time: 0.00152979 seconds +Blocked speedup: 2.82825x +Parallel speedup: 1.79949x +Case 3 (128x64x128): +Naive time: 0.00275742 seconds +Blocked time: 0.000968537 seconds +Parallel time: 0.00144379 seconds +Blocked speedup: 2.847x +Parallel speedup: 1.90985x +Case 3 (128x64x128): +Naive time: 0.00275115 seconds +Blocked time: 0.00100427 seconds +Parallel time: 0.00929713 seconds +Blocked speedup: 2.73944x +Parallel speedup: 0.295914x +Case 3 (128x64x128): +Naive time: 0.00274371 seconds +Blocked time: 0.00100084 seconds +Parallel time: 0.00144512 seconds +Blocked speedup: 2.74141x +Parallel speedup: 1.8986x +Case 3 (128x64x128): +Naive time: 0.00277891 seconds +Blocked time: 0.00096987 seconds +Parallel time: 0.00151201 seconds +Blocked speedup: 2.86524x +Parallel speedup: 1.8379x +Case 3 (128x64x128): +Naive time: 0.00274946 seconds +Blocked time: 0.00097504 seconds +Parallel time: 0.00148467 seconds +Blocked speedup: 2.81984x +Parallel speedup: 1.85189x +Case 3 (128x64x128): +Naive time: 0.00274208 seconds +Blocked time: 0.000968347 seconds +Parallel time: 0.00150242 seconds +Blocked speedup: 2.83172x +Parallel speedup: 1.82511x +Case 3 (128x64x128): +Naive time: 0.00274916 seconds +Blocked time: 0.00107299 seconds +Parallel time: 0.00146855 seconds +Blocked speedup: 2.56214x +Parallel speedup: 1.87203x +Case 3 (128x64x128): +Naive time: 0.00278274 seconds +Blocked time: 0.000973957 seconds +Parallel time: 0.00145648 seconds +Blocked speedup: 2.85715x +Parallel speedup: 1.91059x +Case 3 (128x64x128): +Naive time: 0.0027438 seconds +Blocked time: 0.000972254 seconds +Parallel time: 0.00149912 seconds +Blocked speedup: 2.8221x +Parallel speedup: 1.83027x +Case 3 (128x64x128): +Naive time: 0.00280528 seconds +Blocked time: 0.000973908 seconds +Parallel time: 0.00762172 seconds +Blocked speedup: 2.88044x +Parallel speedup: 0.368064x +Case 3 (128x64x128): +Naive 
time: 0.00278593 seconds +Blocked time: 0.000960493 seconds +Parallel time: 0.00257962 seconds +Blocked speedup: 2.90052x +Parallel speedup: 1.07998x +Case 3 (128x64x128): +Naive time: 0.00318784 seconds +Blocked time: 0.000974629 seconds +Parallel time: 0.00164879 seconds +Blocked speedup: 3.27083x +Parallel speedup: 1.93344x +Case 3 (128x64x128): +Naive time: 0.00275671 seconds +Blocked time: 0.000988635 seconds +Parallel time: 0.00788969 seconds +Blocked speedup: 2.7884x +Parallel speedup: 0.349407x +Case 3 (128x64x128): +Naive time: 0.0027347 seconds +Blocked time: 0.000969288 seconds +Parallel time: 0.00146241 seconds +Blocked speedup: 2.82135x +Parallel speedup: 1.86999x +Case 3 (128x64x128): +Naive time: 0.00284022 seconds +Blocked time: 0.000979238 seconds +Parallel time: 0.00150014 seconds +Blocked speedup: 2.90044x +Parallel speedup: 1.89331x +Case 3 (128x64x128): +Naive time: 0.00275371 seconds +Blocked time: 0.00097538 seconds +Parallel time: 0.00125346 seconds +Blocked speedup: 2.82321x +Parallel speedup: 2.19688x +Case 3 (128x64x128): +Naive time: 0.00275919 seconds +Blocked time: 0.000965401 seconds +Parallel time: 0.00146997 seconds +Blocked speedup: 2.85807x +Parallel speedup: 1.87704x +Case 3 (128x64x128): +Naive time: 0.00275996 seconds +Blocked time: 0.00100813 seconds +Parallel time: 0.00145309 seconds +Blocked speedup: 2.7377x +Parallel speedup: 1.89938x +Case 3 (128x64x128): +Naive time: 0.00275158 seconds +Blocked time: 0.000973066 seconds +Parallel time: 0.0064484 seconds +Blocked speedup: 2.82774x +Parallel speedup: 0.426708x +Case 3 (128x64x128): +Naive time: 0.00312095 seconds +Blocked time: 0.00103824 seconds +Parallel time: 0.00177206 seconds +Blocked speedup: 3.006x +Parallel speedup: 1.76119x +Case 3 (128x64x128): +Naive time: 0.00274225 seconds +Blocked time: 0.000970892 seconds +Parallel time: 0.00268688 seconds +Blocked speedup: 2.82446x +Parallel speedup: 1.0206x +Case 3 (128x64x128): +Naive time: 0.00277585 seconds +Blocked 
time: 0.000983656 seconds +Parallel time: 0.00298814 seconds +Blocked speedup: 2.82197x +Parallel speedup: 0.928956x +Case 3 (128x64x128): +Naive time: 0.0027982 seconds +Blocked time: 0.00109159 seconds +Parallel time: 0.00843077 seconds +Blocked speedup: 2.56342x +Parallel speedup: 0.331903x +Case 3 (128x64x128): +Naive time: 0.00281644 seconds +Blocked time: 0.00109352 seconds +Parallel time: 0.00138877 seconds +Blocked speedup: 2.57557x +Parallel speedup: 2.02802x +Case 3 (128x64x128): +Naive time: 0.00278008 seconds +Blocked time: 0.000988074 seconds +Parallel time: 0.00146205 seconds +Blocked speedup: 2.81363x +Parallel speedup: 1.90149x +Case 3 (128x64x128): +Naive time: 0.00279186 seconds +Blocked time: 0.00100635 seconds +Parallel time: 0.00456488 seconds +Blocked speedup: 2.77425x +Parallel speedup: 0.611596x +Case 3 (128x64x128): +Naive time: 0.00282107 seconds +Blocked time: 0.00100253 seconds +Parallel time: 0.00135158 seconds +Blocked speedup: 2.81395x +Parallel speedup: 2.08725x +Case 3 (128x64x128): +Naive time: 0.00278576 seconds +Blocked time: 0.000989256 seconds +Parallel time: 0.00376582 seconds +Blocked speedup: 2.81601x +Parallel speedup: 0.739749x +Case 3 (128x64x128): +Naive time: 0.00275917 seconds +Blocked time: 0.00100829 seconds +Parallel time: 0.00820324 seconds +Blocked speedup: 2.73648x +Parallel speedup: 0.336351x diff --git a/results/benchmark4.txt b/results/benchmark4.txt new file mode 100644 index 0000000..fbdda75 --- /dev/null +++ b/results/benchmark4.txt @@ -0,0 +1,600 @@ +Case 4 (32x128x32): +Naive time: 0.000347612 seconds +Blocked time: 0.000125345 seconds +Parallel time: 0.00726666 seconds +Blocked speedup: 2.77324x +Parallel speedup: 0.0478366x +Case 4 (32x128x32): +Naive time: 0.000346981 seconds +Blocked time: 0.000119013 seconds +Parallel time: 0.000387988 seconds +Blocked speedup: 2.91549x +Parallel speedup: 0.894309x +Case 4 (32x128x32): +Naive time: 0.000355988 seconds +Blocked time: 0.000200025 seconds +Parallel 
time: 0.00692344 seconds +Blocked speedup: 1.77972x +Parallel speedup: 0.0514178x +Case 4 (32x128x32): +Naive time: 0.000350397 seconds +Blocked time: 0.000170259 seconds +Parallel time: 0.00293162 seconds +Blocked speedup: 2.05802x +Parallel speedup: 0.119523x +Case 4 (32x128x32): +Naive time: 0.000347081 seconds +Blocked time: 0.000118993 seconds +Parallel time: 0.000389431 seconds +Blocked speedup: 2.91682x +Parallel speedup: 0.891252x +Case 4 (32x128x32): +Naive time: 0.000350207 seconds +Blocked time: 0.000136496 seconds +Parallel time: 0.00534252 seconds +Blocked speedup: 2.56569x +Parallel speedup: 0.0655509x +Case 4 (32x128x32): +Naive time: 0.000349997 seconds +Blocked time: 0.000128942 seconds +Parallel time: 0.00344805 seconds +Blocked speedup: 2.71438x +Parallel speedup: 0.101506x +Case 4 (32x128x32): +Naive time: 0.000347021 seconds +Blocked time: 0.000123151 seconds +Parallel time: 0.00340279 seconds +Blocked speedup: 2.81785x +Parallel speedup: 0.101981x +Case 4 (32x128x32): +Naive time: 0.000360316 seconds +Blocked time: 0.000121318 seconds +Parallel time: 0.000421832 seconds +Blocked speedup: 2.97001x +Parallel speedup: 0.854169x +Case 4 (32x128x32): +Naive time: 0.000351169 seconds +Blocked time: 0.000157957 seconds +Parallel time: 0.00138375 seconds +Blocked speedup: 2.22319x +Parallel speedup: 0.253781x +Case 4 (32x128x32): +Naive time: 0.000366918 seconds +Blocked time: 0.000121838 seconds +Parallel time: 0.00293283 seconds +Blocked speedup: 3.01152x +Parallel speedup: 0.125107x +Case 4 (32x128x32): +Naive time: 0.000688041 seconds +Blocked time: 0.000125786 seconds +Parallel time: 0.000353834 seconds +Blocked speedup: 5.46993x +Parallel speedup: 1.94453x +Case 4 (32x128x32): +Naive time: 0.000352502 seconds +Blocked time: 0.000123352 seconds +Parallel time: 0.000397836 seconds +Blocked speedup: 2.85769x +Parallel speedup: 0.886049x +Case 4 (32x128x32): +Naive time: 0.000348444 seconds +Blocked time: 0.000159319 seconds +Parallel time: 
0.0105995 seconds +Blocked speedup: 2.18708x +Parallel speedup: 0.0328735x +Case 4 (32x128x32): +Naive time: 0.000353524 seconds +Blocked time: 0.000126758 seconds +Parallel time: 0.00787665 seconds +Blocked speedup: 2.78897x +Parallel speedup: 0.0448825x +Case 4 (32x128x32): +Naive time: 0.000347041 seconds +Blocked time: 0.000120607 seconds +Parallel time: 0.000403618 seconds +Blocked speedup: 2.87745x +Parallel speedup: 0.859825x +Case 4 (32x128x32): +Naive time: 0.00035193 seconds +Blocked time: 0.000142267 seconds +Parallel time: 0.00409052 seconds +Blocked speedup: 2.47373x +Parallel speedup: 0.0860354x +Case 4 (32x128x32): +Naive time: 0.000351369 seconds +Blocked time: 0.000130665 seconds +Parallel time: 0.0026261 seconds +Blocked speedup: 2.68908x +Parallel speedup: 0.133799x +Case 4 (32x128x32): +Naive time: 0.000349095 seconds +Blocked time: 0.000124734 seconds +Parallel time: 0.000346039 seconds +Blocked speedup: 2.79872x +Parallel speedup: 1.00883x +Case 4 (32x128x32): +Naive time: 0.000348423 seconds +Blocked time: 0.000120005 seconds +Parallel time: 0.000377679 seconds +Blocked speedup: 2.9034x +Parallel speedup: 0.922537x +Case 4 (32x128x32): +Naive time: 0.000352211 seconds +Blocked time: 0.000137538 seconds +Parallel time: 0.00320409 seconds +Blocked speedup: 2.56083x +Parallel speedup: 0.109925x +Case 4 (32x128x32): +Naive time: 0.00038371 seconds +Blocked time: 0.000126597 seconds +Parallel time: 0.000378621 seconds +Blocked speedup: 3.03096x +Parallel speedup: 1.01344x +Case 4 (32x128x32): +Naive time: 0.000349135 seconds +Blocked time: 0.000120446 seconds +Parallel time: 0.00233669 seconds +Blocked speedup: 2.89868x +Parallel speedup: 0.149414x +Case 4 (32x128x32): +Naive time: 0.000372959 seconds +Blocked time: 0.0002558 seconds +Parallel time: 0.012814 seconds +Blocked speedup: 1.45801x +Parallel speedup: 0.0291056x +Case 4 (32x128x32): +Naive time: 0.000354255 seconds +Blocked time: 0.000121588 seconds +Parallel time: 0.000294102 seconds 
+Blocked speedup: 2.91357x +Parallel speedup: 1.20453x +Case 4 (32x128x32): +Naive time: 0.000349696 seconds +Blocked time: 0.000122339 seconds +Parallel time: 0.00595878 seconds +Blocked speedup: 2.85842x +Parallel speedup: 0.0586858x +Case 4 (32x128x32): +Naive time: 0.000348944 seconds +Blocked time: 0.000121819 seconds +Parallel time: 0.00765977 seconds +Blocked speedup: 2.86445x +Parallel speedup: 0.0455554x +Case 4 (32x128x32): +Naive time: 0.000349045 seconds +Blocked time: 0.000120997 seconds +Parallel time: 0.000403637 seconds +Blocked speedup: 2.88474x +Parallel speedup: 0.86475x +Case 4 (32x128x32): +Naive time: 0.000351139 seconds +Blocked time: 0.0001277 seconds +Parallel time: 0.00244891 seconds +Blocked speedup: 2.74972x +Parallel speedup: 0.143386x +Case 4 (32x128x32): +Naive time: 0.000349596 seconds +Blocked time: 0.000131476 seconds +Parallel time: 0.00380671 seconds +Blocked speedup: 2.65901x +Parallel speedup: 0.0918367x +Case 4 (32x128x32): +Naive time: 0.000346951 seconds +Blocked time: 0.000119163 seconds +Parallel time: 0.000338605 seconds +Blocked speedup: 2.91157x +Parallel speedup: 1.02465x +Case 4 (32x128x32): +Naive time: 0.000359275 seconds +Blocked time: 0.000193162 seconds +Parallel time: 0.0110934 seconds +Blocked speedup: 1.85997x +Parallel speedup: 0.0323864x +Case 4 (32x128x32): +Naive time: 0.000361488 seconds +Blocked time: 0.000126307 seconds +Parallel time: 0.000320231 seconds +Blocked speedup: 2.86198x +Parallel speedup: 1.12884x +Case 4 (32x128x32): +Naive time: 0.00035156 seconds +Blocked time: 0.000121237 seconds +Parallel time: 0.000399329 seconds +Blocked speedup: 2.89977x +Parallel speedup: 0.880377x +Case 4 (32x128x32): +Naive time: 0.000358332 seconds +Blocked time: 0.000124263 seconds +Parallel time: 0.00406836 seconds +Blocked speedup: 2.88366x +Parallel speedup: 0.0880777x +Case 4 (32x128x32): +Naive time: 0.000361558 seconds +Blocked time: 0.000169638 seconds +Parallel time: 0.00574538 seconds +Blocked speedup: 
2.13135x +Parallel speedup: 0.0629302x +Case 4 (32x128x32): +Naive time: 0.000353313 seconds +Blocked time: 0.000123882 seconds +Parallel time: 0.00581588 seconds +Blocked speedup: 2.85201x +Parallel speedup: 0.0607497x +Case 4 (32x128x32): +Naive time: 0.000347111 seconds +Blocked time: 0.000119153 seconds +Parallel time: 0.000377509 seconds +Blocked speedup: 2.91315x +Parallel speedup: 0.919477x +Case 4 (32x128x32): +Naive time: 0.000363271 seconds +Blocked time: 0.000131266 seconds +Parallel time: 0.00260199 seconds +Blocked speedup: 2.76744x +Parallel speedup: 0.139613x +Case 4 (32x128x32): +Naive time: 0.000349866 seconds +Blocked time: 0.00012273 seconds +Parallel time: 0.0108818 seconds +Blocked speedup: 2.8507x +Parallel speedup: 0.0321514x +Case 4 (32x128x32): +Naive time: 0.000349285 seconds +Blocked time: 0.000129443 seconds +Parallel time: 0.000388629 seconds +Blocked speedup: 2.69837x +Parallel speedup: 0.898762x +Case 4 (32x128x32): +Naive time: 0.000347151 seconds +Blocked time: 0.000120246 seconds +Parallel time: 0.000301325 seconds +Blocked speedup: 2.88701x +Parallel speedup: 1.15208x +Case 4 (32x128x32): +Naive time: 0.000354946 seconds +Blocked time: 0.000119735 seconds +Parallel time: 0.00789181 seconds +Blocked speedup: 2.96443x +Parallel speedup: 0.0449765x +Case 4 (32x128x32): +Naive time: 0.000350398 seconds +Blocked time: 0.000121929 seconds +Parallel time: 0.00846147 seconds +Blocked speedup: 2.87379x +Parallel speedup: 0.041411x +Case 4 (32x128x32): +Naive time: 0.000349065 seconds +Blocked time: 0.000121167 seconds +Parallel time: 0.00029878 seconds +Blocked speedup: 2.88086x +Parallel speedup: 1.1683x +Case 4 (32x128x32): +Naive time: 0.000363282 seconds +Blocked time: 0.000137869 seconds +Parallel time: 0.002367 seconds +Blocked speedup: 2.63498x +Parallel speedup: 0.153478x +Case 4 (32x128x32): +Naive time: 0.000353142 seconds +Blocked time: 0.000124273 seconds +Parallel time: 0.00299622 seconds +Blocked speedup: 2.84166x +Parallel 
speedup: 0.117862x +Case 4 (32x128x32): +Naive time: 0.000370095 seconds +Blocked time: 0.000121908 seconds +Parallel time: 0.000392637 seconds +Blocked speedup: 3.03585x +Parallel speedup: 0.942588x +Case 4 (32x128x32): +Naive time: 0.000351449 seconds +Blocked time: 0.000131867 seconds +Parallel time: 0.00266511 seconds +Blocked speedup: 2.66518x +Parallel speedup: 0.13187x +Case 4 (32x128x32): +Naive time: 0.000348964 seconds +Blocked time: 0.000147336 seconds +Parallel time: 0.00307981 seconds +Blocked speedup: 2.36849x +Parallel speedup: 0.113307x +Case 4 (32x128x32): +Naive time: 0.000379962 seconds +Blocked time: 0.000164418 seconds +Parallel time: 0.00832678 seconds +Blocked speedup: 2.31095x +Parallel speedup: 0.0456313x +Case 4 (32x128x32): +Naive time: 0.000378931 seconds +Blocked time: 0.000144671 seconds +Parallel time: 0.00690857 seconds +Blocked speedup: 2.61926x +Parallel speedup: 0.0548494x +Case 4 (32x128x32): +Naive time: 0.000382117 seconds +Blocked time: 0.000151053 seconds +Parallel time: 0.00320673 seconds +Blocked speedup: 2.52969x +Parallel speedup: 0.119161x +Case 4 (32x128x32): +Naive time: 0.000351059 seconds +Blocked time: 0.000130295 seconds +Parallel time: 0.00235691 seconds +Blocked speedup: 2.69434x +Parallel speedup: 0.148949x +Case 4 (32x128x32): +Naive time: 0.000355147 seconds +Blocked time: 0.000130915 seconds +Parallel time: 0.00336286 seconds +Blocked speedup: 2.71281x +Parallel speedup: 0.105609x +Case 4 (32x128x32): +Naive time: 0.000860124 seconds +Blocked time: 0.000218059 seconds +Parallel time: 0.00569773 seconds +Blocked speedup: 3.94446x +Parallel speedup: 0.150959x +Case 4 (32x128x32): +Naive time: 0.000349235 seconds +Blocked time: 0.000132579 seconds +Parallel time: 0.00282417 seconds +Blocked speedup: 2.63417x +Parallel speedup: 0.123659x +Case 4 (32x128x32): +Naive time: 0.000349356 seconds +Blocked time: 0.000139302 seconds +Parallel time: 0.00312775 seconds +Blocked speedup: 2.5079x +Parallel speedup: 0.111696x 
+Case 4 (32x128x32): +Naive time: 0.000352572 seconds +Blocked time: 0.000128832 seconds +Parallel time: 0.010599 seconds +Blocked speedup: 2.73668x +Parallel speedup: 0.0332647x +Case 4 (32x128x32): +Naive time: 0.000358643 seconds +Blocked time: 0.000130314 seconds +Parallel time: 0.00236068 seconds +Blocked speedup: 2.75214x +Parallel speedup: 0.151924x +Case 4 (32x128x32): +Naive time: 0.000349035 seconds +Blocked time: 0.000122901 seconds +Parallel time: 0.012132 seconds +Blocked speedup: 2.83997x +Parallel speedup: 0.0287697x +Case 4 (32x128x32): +Naive time: 0.000350728 seconds +Blocked time: 0.000151354 seconds +Parallel time: 0.00947093 seconds +Blocked speedup: 2.31727x +Parallel speedup: 0.0370321x +Case 4 (32x128x32): +Naive time: 0.000358052 seconds +Blocked time: 0.000121849 seconds +Parallel time: 0.00284007 seconds +Blocked speedup: 2.93849x +Parallel speedup: 0.126072x +Case 4 (32x128x32): +Naive time: 0.000349315 seconds +Blocked time: 0.000126317 seconds +Parallel time: 0.00239782 seconds +Blocked speedup: 2.76538x +Parallel speedup: 0.14568x +Case 4 (32x128x32): +Naive time: 0.000375474 seconds +Blocked time: 0.00012242 seconds +Parallel time: 0.00248736 seconds +Blocked speedup: 3.0671x +Parallel speedup: 0.150953x +Case 4 (32x128x32): +Naive time: 0.000349836 seconds +Blocked time: 0.000140644 seconds +Parallel time: 0.00312397 seconds +Blocked speedup: 2.48739x +Parallel speedup: 0.111984x +Case 4 (32x128x32): +Naive time: 0.000349295 seconds +Blocked time: 0.000141275 seconds +Parallel time: 0.00301367 seconds +Blocked speedup: 2.47245x +Parallel speedup: 0.115903x +Case 4 (32x128x32): +Naive time: 0.000349376 seconds +Blocked time: 0.000131106 seconds +Parallel time: 0.00316758 seconds +Blocked speedup: 2.66484x +Parallel speedup: 0.110297x +Case 4 (32x128x32): +Naive time: 0.000355938 seconds +Blocked time: 0.000136196 seconds +Parallel time: 0.00335319 seconds +Blocked speedup: 2.61342x +Parallel speedup: 0.106149x +Case 4 (32x128x32): 
+Naive time: 0.000349265 seconds +Blocked time: 0.00012222 seconds +Parallel time: 0.00260173 seconds +Blocked speedup: 2.85767x +Parallel speedup: 0.134243x +Case 4 (32x128x32): +Naive time: 0.000353083 seconds +Blocked time: 0.000146635 seconds +Parallel time: 0.000298541 seconds +Blocked speedup: 2.4079x +Parallel speedup: 1.1827x +Case 4 (32x128x32): +Naive time: 0.000349265 seconds +Blocked time: 0.000120796 seconds +Parallel time: 0.0135533 seconds +Blocked speedup: 2.89136x +Parallel speedup: 0.0257697x +Case 4 (32x128x32): +Naive time: 0.000349075 seconds +Blocked time: 0.000144821 seconds +Parallel time: 0.00265487 seconds +Blocked speedup: 2.41039x +Parallel speedup: 0.131485x +Case 4 (32x128x32): +Naive time: 0.000349335 seconds +Blocked time: 0.000136857 seconds +Parallel time: 0.00549122 seconds +Blocked speedup: 2.55255x +Parallel speedup: 0.063617x +Case 4 (32x128x32): +Naive time: 0.000349095 seconds +Blocked time: 0.000121638 seconds +Parallel time: 0.00330691 seconds +Blocked speedup: 2.86995x +Parallel speedup: 0.105565x +Case 4 (32x128x32): +Naive time: 0.000364203 seconds +Blocked time: 0.00012242 seconds +Parallel time: 0.00299429 seconds +Blocked speedup: 2.97503x +Parallel speedup: 0.121633x +Case 4 (32x128x32): +Naive time: 0.000349235 seconds +Blocked time: 0.000150893 seconds +Parallel time: 0.00220541 seconds +Blocked speedup: 2.31445x +Parallel speedup: 0.158354x +Case 4 (32x128x32): +Naive time: 0.000347653 seconds +Blocked time: 0.000131466 seconds +Parallel time: 0.0026123 seconds +Blocked speedup: 2.64443x +Parallel speedup: 0.133083x +Case 4 (32x128x32): +Naive time: 0.000347672 seconds +Blocked time: 0.000129052 seconds +Parallel time: 0.00247373 seconds +Blocked speedup: 2.69405x +Parallel speedup: 0.140546x +Case 4 (32x128x32): +Naive time: 0.00038381 seconds +Blocked time: 0.000119684 seconds +Parallel time: 0.00242334 seconds +Blocked speedup: 3.20686x +Parallel speedup: 0.158381x +Case 4 (32x128x32): +Naive time: 0.000355036 
seconds +Blocked time: 0.000126017 seconds +Parallel time: 0.00273599 seconds +Blocked speedup: 2.81737x +Parallel speedup: 0.129765x +Case 4 (32x128x32): +Naive time: 0.000347211 seconds +Blocked time: 0.000118903 seconds +Parallel time: 0.000386726 seconds +Blocked speedup: 2.92012x +Parallel speedup: 0.897822x +Case 4 (32x128x32): +Naive time: 0.000347312 seconds +Blocked time: 0.000119314 seconds +Parallel time: 0.00300162 seconds +Blocked speedup: 2.91091x +Parallel speedup: 0.115708x +Case 4 (32x128x32): +Naive time: 0.000347292 seconds +Blocked time: 0.000124663 seconds +Parallel time: 0.00313436 seconds +Blocked speedup: 2.78585x +Parallel speedup: 0.110802x +Case 4 (32x128x32): +Naive time: 0.000355958 seconds +Blocked time: 0.000125736 seconds +Parallel time: 0.00851669 seconds +Blocked speedup: 2.831x +Parallel speedup: 0.0417953x +Case 4 (32x128x32): +Naive time: 0.000367209 seconds +Blocked time: 0.000121187 seconds +Parallel time: 0.00286492 seconds +Blocked speedup: 3.0301x +Parallel speedup: 0.128174x +Case 4 (32x128x32): +Naive time: 0.000347242 seconds +Blocked time: 0.000123412 seconds +Parallel time: 0.00337721 seconds +Blocked speedup: 2.81368x +Parallel speedup: 0.102819x +Case 4 (32x128x32): +Naive time: 0.000349195 seconds +Blocked time: 0.000126948 seconds +Parallel time: 0.00329863 seconds +Blocked speedup: 2.75069x +Parallel speedup: 0.105861x +Case 4 (32x128x32): +Naive time: 0.000358393 seconds +Blocked time: 0.000121708 seconds +Parallel time: 0.00347178 seconds +Blocked speedup: 2.9447x +Parallel speedup: 0.10323x +Case 4 (32x128x32): +Naive time: 0.000353834 seconds +Blocked time: 0.000128631 seconds +Parallel time: 0.00764412 seconds +Blocked speedup: 2.75077x +Parallel speedup: 0.0462884x +Case 4 (32x128x32): +Naive time: 0.000371006 seconds +Blocked time: 0.000124213 seconds +Parallel time: 0.00331433 seconds +Blocked speedup: 2.98685x +Parallel speedup: 0.11194x +Case 4 (32x128x32): +Naive time: 0.000347402 seconds +Blocked time: 
0.000121828 seconds +Parallel time: 0.00509892 seconds +Blocked speedup: 2.85158x +Parallel speedup: 0.0681325x +Case 4 (32x128x32): +Naive time: 0.000347352 seconds +Blocked time: 0.000118853 seconds +Parallel time: 0.000376667 seconds +Blocked speedup: 2.92253x +Parallel speedup: 0.922173x +Case 4 (32x128x32): +Naive time: 0.000348885 seconds +Blocked time: 0.00012796 seconds +Parallel time: 0.00385769 seconds +Blocked speedup: 2.72652x +Parallel speedup: 0.0904389x +Case 4 (32x128x32): +Naive time: 0.000356329 seconds +Blocked time: 0.000137298 seconds +Parallel time: 0.00360399 seconds +Blocked speedup: 2.5953x +Parallel speedup: 0.0988706x +Case 4 (32x128x32): +Naive time: 0.000350738 seconds +Blocked time: 0.000160921 seconds +Parallel time: 0.00356422 seconds +Blocked speedup: 2.17957x +Parallel speedup: 0.0984053x +Case 4 (32x128x32): +Naive time: 0.000353132 seconds +Blocked time: 0.000145803 seconds +Parallel time: 0.00315427 seconds +Blocked speedup: 2.42198x +Parallel speedup: 0.111954x +Case 4 (32x128x32): +Naive time: 0.000349496 seconds +Blocked time: 0.000126297 seconds +Parallel time: 0.00281166 seconds +Blocked speedup: 2.76725x +Parallel speedup: 0.124302x +Case 4 (32x128x32): +Naive time: 0.000347652 seconds +Blocked time: 0.000124594 seconds +Parallel time: 0.00264448 seconds +Blocked speedup: 2.79028x +Parallel speedup: 0.131463x +Case 4 (32x128x32): +Naive time: 0.000347131 seconds +Blocked time: 0.000123202 seconds +Parallel time: 0.00348588 seconds +Blocked speedup: 2.81758x +Parallel speedup: 0.099582x diff --git a/results/benchmark5.txt b/results/benchmark5.txt new file mode 100644 index 0000000..9ede726 --- /dev/null +++ b/results/benchmark5.txt @@ -0,0 +1,600 @@ +Case 5 (200x100x256): +Naive time: 0.0141818 seconds +Blocked time: 0.00505984 seconds +Parallel time: 0.00395211 seconds +Blocked speedup: 2.80281x +Parallel speedup: 3.5884x +Case 5 (200x100x256): +Naive time: 0.0137366 seconds +Blocked time: 0.00496477 seconds +Parallel 
time: 0.00375972 seconds +Blocked speedup: 2.7668x +Parallel speedup: 3.65362x +Case 5 (200x100x256): +Naive time: 0.0135554 seconds +Blocked time: 0.00488445 seconds +Parallel time: 0.00418621 seconds +Blocked speedup: 2.77521x +Parallel speedup: 3.2381x +Case 5 (200x100x256): +Naive time: 0.0138531 seconds +Blocked time: 0.00494483 seconds +Parallel time: 0.00317052 seconds +Blocked speedup: 2.80153x +Parallel speedup: 4.36934x +Case 5 (200x100x256): +Naive time: 0.0136325 seconds +Blocked time: 0.00500023 seconds +Parallel time: 0.00401678 seconds +Blocked speedup: 2.72638x +Parallel speedup: 3.39389x +Case 5 (200x100x256): +Naive time: 0.0136184 seconds +Blocked time: 0.00489597 seconds +Parallel time: 0.00384499 seconds +Blocked speedup: 2.78156x +Parallel speedup: 3.54186x +Case 5 (200x100x256): +Naive time: 0.013961 seconds +Blocked time: 0.00509912 seconds +Parallel time: 0.00390276 seconds +Blocked speedup: 2.73792x +Parallel speedup: 3.57721x +Case 5 (200x100x256): +Naive time: 0.0136246 seconds +Blocked time: 0.00499972 seconds +Parallel time: 0.00384947 seconds +Blocked speedup: 2.72507x +Parallel speedup: 3.53934x +Case 5 (200x100x256): +Naive time: 0.0136209 seconds +Blocked time: 0.00492715 seconds +Parallel time: 0.00922291 seconds +Blocked speedup: 2.76446x +Parallel speedup: 1.47686x +Case 5 (200x100x256): +Naive time: 0.0137748 seconds +Blocked time: 0.0049287 seconds +Parallel time: 0.00371219 seconds +Blocked speedup: 2.79483x +Parallel speedup: 3.71071x +Case 5 (200x100x256): +Naive time: 0.0136863 seconds +Blocked time: 0.00494847 seconds +Parallel time: 0.00389972 seconds +Blocked speedup: 2.76576x +Parallel speedup: 3.50956x +Case 5 (200x100x256): +Naive time: 0.0138084 seconds +Blocked time: 0.00492977 seconds +Parallel time: 0.00795933 seconds +Blocked speedup: 2.80101x +Parallel speedup: 1.73486x +Case 5 (200x100x256): +Naive time: 0.013841 seconds +Blocked time: 0.00505818 seconds +Parallel time: 0.00410179 seconds +Blocked speedup: 
2.73635x +Parallel speedup: 3.37437x +Case 5 (200x100x256): +Naive time: 0.0151431 seconds +Blocked time: 0.00556704 seconds +Parallel time: 0.00727099 seconds +Blocked speedup: 2.72014x +Parallel speedup: 2.08267x +Case 5 (200x100x256): +Naive time: 0.0136434 seconds +Blocked time: 0.00510782 seconds +Parallel time: 0.00402616 seconds +Blocked speedup: 2.67108x +Parallel speedup: 3.38869x +Case 5 (200x100x256): +Naive time: 0.0138959 seconds +Blocked time: 0.00512436 seconds +Parallel time: 0.00406105 seconds +Blocked speedup: 2.71174x +Parallel speedup: 3.42176x +Case 5 (200x100x256): +Naive time: 0.0137395 seconds +Blocked time: 0.00508282 seconds +Parallel time: 0.0037429 seconds +Blocked speedup: 2.70312x +Parallel speedup: 3.6708x +Case 5 (200x100x256): +Naive time: 0.0136377 seconds +Blocked time: 0.0051026 seconds +Parallel time: 0.00397826 seconds +Blocked speedup: 2.6727x +Parallel speedup: 3.42807x +Case 5 (200x100x256): +Naive time: 0.0137948 seconds +Blocked time: 0.00490593 seconds +Parallel time: 0.00423019 seconds +Blocked speedup: 2.81186x +Parallel speedup: 3.26104x +Case 5 (200x100x256): +Naive time: 0.0189816 seconds +Blocked time: 0.00497887 seconds +Parallel time: 0.00367286 seconds +Blocked speedup: 3.81244x +Parallel speedup: 5.16808x +Case 5 (200x100x256): +Naive time: 0.0137875 seconds +Blocked time: 0.00484597 seconds +Parallel time: 0.0042465 seconds +Blocked speedup: 2.84515x +Parallel speedup: 3.24679x +Case 5 (200x100x256): +Naive time: 0.0137387 seconds +Blocked time: 0.00489503 seconds +Parallel time: 0.00419846 seconds +Blocked speedup: 2.80667x +Parallel speedup: 3.27232x +Case 5 (200x100x256): +Naive time: 0.0140041 seconds +Blocked time: 0.00490994 seconds +Parallel time: 0.00375039 seconds +Blocked speedup: 2.8522x +Parallel speedup: 3.73405x +Case 5 (200x100x256): +Naive time: 0.0187592 seconds +Blocked time: 0.0049206 seconds +Parallel time: 0.00397927 seconds +Blocked speedup: 3.81238x +Parallel speedup: 4.71424x +Case 5 
(200x100x256): +Naive time: 0.0136476 seconds +Blocked time: 0.00493897 seconds +Parallel time: 0.003746 seconds +Blocked speedup: 2.76324x +Parallel speedup: 3.64324x +Case 5 (200x100x256): +Naive time: 0.013704 seconds +Blocked time: 0.005086 seconds +Parallel time: 0.00379163 seconds +Blocked speedup: 2.69445x +Parallel speedup: 3.61427x +Case 5 (200x100x256): +Naive time: 0.0144121 seconds +Blocked time: 0.00491926 seconds +Parallel time: 0.00486072 seconds +Blocked speedup: 2.92972x +Parallel speedup: 2.96501x +Case 5 (200x100x256): +Naive time: 0.0136428 seconds +Blocked time: 0.00487247 seconds +Parallel time: 0.00391357 seconds +Blocked speedup: 2.79997x +Parallel speedup: 3.48602x +Case 5 (200x100x256): +Naive time: 0.013747 seconds +Blocked time: 0.00491945 seconds +Parallel time: 0.00836923 seconds +Blocked speedup: 2.79442x +Parallel speedup: 1.64257x +Case 5 (200x100x256): +Naive time: 0.013697 seconds +Blocked time: 0.00497483 seconds +Parallel time: 0.00714046 seconds +Blocked speedup: 2.75326x +Parallel speedup: 1.91823x +Case 5 (200x100x256): +Naive time: 0.0137142 seconds +Blocked time: 0.00495567 seconds +Parallel time: 0.00726657 seconds +Blocked speedup: 2.76738x +Parallel speedup: 1.8873x +Case 5 (200x100x256): +Naive time: 0.0136878 seconds +Blocked time: 0.00495307 seconds +Parallel time: 0.00414523 seconds +Blocked speedup: 2.76349x +Parallel speedup: 3.30205x +Case 5 (200x100x256): +Naive time: 0.0137811 seconds +Blocked time: 0.00491611 seconds +Parallel time: 0.00966298 seconds +Blocked speedup: 2.80325x +Parallel speedup: 1.42617x +Case 5 (200x100x256): +Naive time: 0.0137646 seconds +Blocked time: 0.0049456 seconds +Parallel time: 0.0108 seconds +Blocked speedup: 2.7832x +Parallel speedup: 1.2745x +Case 5 (200x100x256): +Naive time: 0.0138294 seconds +Blocked time: 0.00493469 seconds +Parallel time: 0.00392752 seconds +Blocked speedup: 2.80249x +Parallel speedup: 3.52116x +Case 5 (200x100x256): +Naive time: 0.0138289 seconds +Blocked 
time: 0.00494477 seconds +Parallel time: 0.00393648 seconds +Blocked speedup: 2.79667x +Parallel speedup: 3.51301x +Case 5 (200x100x256): +Naive time: 0.0137595 seconds +Blocked time: 0.00492227 seconds +Parallel time: 0.00378293 seconds +Blocked speedup: 2.79535x +Parallel speedup: 3.63726x +Case 5 (200x100x256): +Naive time: 0.0137397 seconds +Blocked time: 0.00495646 seconds +Parallel time: 0.00415039 seconds +Blocked speedup: 2.77209x +Parallel speedup: 3.31047x +Case 5 (200x100x256): +Naive time: 0.0137205 seconds +Blocked time: 0.00502405 seconds +Parallel time: 0.00386429 seconds +Blocked speedup: 2.73097x +Parallel speedup: 3.55059x +Case 5 (200x100x256): +Naive time: 0.0138516 seconds +Blocked time: 0.00546404 seconds +Parallel time: 0.00411977 seconds +Blocked speedup: 2.53504x +Parallel speedup: 3.36222x +Case 5 (200x100x256): +Naive time: 0.0137724 seconds +Blocked time: 0.0048899 seconds +Parallel time: 0.00415344 seconds +Blocked speedup: 2.81649x +Parallel speedup: 3.3159x +Case 5 (200x100x256): +Naive time: 0.0137975 seconds +Blocked time: 0.00504488 seconds +Parallel time: 0.00817251 seconds +Blocked speedup: 2.73496x +Parallel speedup: 1.68829x +Case 5 (200x100x256): +Naive time: 0.0136732 seconds +Blocked time: 0.00498834 seconds +Parallel time: 0.00386415 seconds +Blocked speedup: 2.74103x +Parallel speedup: 3.53847x +Case 5 (200x100x256): +Naive time: 0.0145219 seconds +Blocked time: 0.00507599 seconds +Parallel time: 0.00848677 seconds +Blocked speedup: 2.8609x +Parallel speedup: 1.71112x +Case 5 (200x100x256): +Naive time: 0.0140905 seconds +Blocked time: 0.00499296 seconds +Parallel time: 0.00373929 seconds +Blocked speedup: 2.82208x +Parallel speedup: 3.76825x +Case 5 (200x100x256): +Naive time: 0.0138095 seconds +Blocked time: 0.00490097 seconds +Parallel time: 0.00407806 seconds +Blocked speedup: 2.81771x +Parallel speedup: 3.38629x +Case 5 (200x100x256): +Naive time: 0.0137678 seconds +Blocked time: 0.00492499 seconds +Parallel time: 
0.0075985 seconds +Blocked speedup: 2.7955x +Parallel speedup: 1.81191x +Case 5 (200x100x256): +Naive time: 0.0137515 seconds +Blocked time: 0.00490445 seconds +Parallel time: 0.00380987 seconds +Blocked speedup: 2.80387x +Parallel speedup: 3.60943x +Case 5 (200x100x256): +Naive time: 0.0144823 seconds +Blocked time: 0.00491052 seconds +Parallel time: 0.00430915 seconds +Blocked speedup: 2.94923x +Parallel speedup: 3.36081x +Case 5 (200x100x256): +Naive time: 0.0156949 seconds +Blocked time: 0.00497554 seconds +Parallel time: 0.00767275 seconds +Blocked speedup: 3.15442x +Parallel speedup: 2.04554x +Case 5 (200x100x256): +Naive time: 0.0137774 seconds +Blocked time: 0.00489915 seconds +Parallel time: 0.00877419 seconds +Blocked speedup: 2.8122x +Parallel speedup: 1.57022x +Case 5 (200x100x256): +Naive time: 0.0145479 seconds +Blocked time: 0.00492335 seconds +Parallel time: 0.00403532 seconds +Blocked speedup: 2.95488x +Parallel speedup: 3.60514x +Case 5 (200x100x256): +Naive time: 0.0138068 seconds +Blocked time: 0.00491546 seconds +Parallel time: 0.00410404 seconds +Blocked speedup: 2.80884x +Parallel speedup: 3.36419x +Case 5 (200x100x256): +Naive time: 0.0139532 seconds +Blocked time: 0.00498609 seconds +Parallel time: 0.0040777 seconds +Blocked speedup: 2.79843x +Parallel speedup: 3.42183x +Case 5 (200x100x256): +Naive time: 0.0137324 seconds +Blocked time: 0.0049214 seconds +Parallel time: 0.00379375 seconds +Blocked speedup: 2.79035x +Parallel speedup: 3.61975x +Case 5 (200x100x256): +Naive time: 0.0138286 seconds +Blocked time: 0.00487089 seconds +Parallel time: 0.00401539 seconds +Blocked speedup: 2.83904x +Parallel speedup: 3.4439x +Case 5 (200x100x256): +Naive time: 0.0138724 seconds +Blocked time: 0.004919 seconds +Parallel time: 0.00418014 seconds +Blocked speedup: 2.82018x +Parallel speedup: 3.31865x +Case 5 (200x100x256): +Naive time: 0.0137362 seconds +Blocked time: 0.00513237 seconds +Parallel time: 0.00762247 seconds +Blocked speedup: 2.67638x 
+Parallel speedup: 1.80206x +Case 5 (200x100x256): +Naive time: 0.0137977 seconds +Blocked time: 0.00491971 seconds +Parallel time: 0.0111822 seconds +Blocked speedup: 2.80457x +Parallel speedup: 1.23389x +Case 5 (200x100x256): +Naive time: 0.0138221 seconds +Blocked time: 0.0049207 seconds +Parallel time: 0.00728289 seconds +Blocked speedup: 2.80898x +Parallel speedup: 1.89789x +Case 5 (200x100x256): +Naive time: 0.0137167 seconds +Blocked time: 0.0049199 seconds +Parallel time: 0.00431727 seconds +Blocked speedup: 2.78801x +Parallel speedup: 3.17717x +Case 5 (200x100x256): +Naive time: 0.0137439 seconds +Blocked time: 0.00490577 seconds +Parallel time: 0.00407902 seconds +Blocked speedup: 2.80157x +Parallel speedup: 3.3694x +Case 5 (200x100x256): +Naive time: 0.0194599 seconds +Blocked time: 0.00488227 seconds +Parallel time: 0.00368736 seconds +Blocked speedup: 3.98583x +Parallel speedup: 5.27746x +Case 5 (200x100x256): +Naive time: 0.0137625 seconds +Blocked time: 0.00494397 seconds +Parallel time: 0.00383066 seconds +Blocked speedup: 2.7837x +Parallel speedup: 3.59273x +Case 5 (200x100x256): +Naive time: 0.0140041 seconds +Blocked time: 0.00487789 seconds +Parallel time: 0.0044748 seconds +Blocked speedup: 2.87093x +Parallel speedup: 3.12955x +Case 5 (200x100x256): +Naive time: 0.013763 seconds +Blocked time: 0.00491564 seconds +Parallel time: 0.0037171 seconds +Blocked speedup: 2.79983x +Parallel speedup: 3.7026x +Case 5 (200x100x256): +Naive time: 0.0137894 seconds +Blocked time: 0.00503984 seconds +Parallel time: 0.00400856 seconds +Blocked speedup: 2.73608x +Parallel speedup: 3.43998x +Case 5 (200x100x256): +Naive time: 0.0137192 seconds +Blocked time: 0.00492825 seconds +Parallel time: 0.00430076 seconds +Blocked speedup: 2.7838x +Parallel speedup: 3.18996x +Case 5 (200x100x256): +Naive time: 0.0138367 seconds +Blocked time: 0.00498079 seconds +Parallel time: 0.00422043 seconds +Blocked speedup: 2.77801x +Parallel speedup: 3.2785x +Case 5 (200x100x256): 
+Naive time: 0.0138487 seconds +Blocked time: 0.0049656 seconds +Parallel time: 0.0041257 seconds +Blocked speedup: 2.78892x +Parallel speedup: 3.35668x +Case 5 (200x100x256): +Naive time: 0.0138392 seconds +Blocked time: 0.00499209 seconds +Parallel time: 0.00397831 seconds +Blocked speedup: 2.77222x +Parallel speedup: 3.47866x +Case 5 (200x100x256): +Naive time: 0.0137905 seconds +Blocked time: 0.00492993 seconds +Parallel time: 0.00401207 seconds +Blocked speedup: 2.79729x +Parallel speedup: 3.43725x +Case 5 (200x100x256): +Naive time: 0.013816 seconds +Blocked time: 0.00488495 seconds +Parallel time: 0.003823 seconds +Blocked speedup: 2.82827x +Parallel speedup: 3.6139x +Case 5 (200x100x256): +Naive time: 0.0138128 seconds +Blocked time: 0.0048676 seconds +Parallel time: 0.00360418 seconds +Blocked speedup: 2.8377x +Parallel speedup: 3.83244x +Case 5 (200x100x256): +Naive time: 0.0137479 seconds +Blocked time: 0.00500716 seconds +Parallel time: 0.00418979 seconds +Blocked speedup: 2.74565x +Parallel speedup: 3.28129x +Case 5 (200x100x256): +Naive time: 0.0137295 seconds +Blocked time: 0.00492813 seconds +Parallel time: 0.00384449 seconds +Blocked speedup: 2.78594x +Parallel speedup: 3.5712x +Case 5 (200x100x256): +Naive time: 0.013872 seconds +Blocked time: 0.0050661 seconds +Parallel time: 0.00377925 seconds +Blocked speedup: 2.7382x +Parallel speedup: 3.67056x +Case 5 (200x100x256): +Naive time: 0.0137031 seconds +Blocked time: 0.00496091 seconds +Parallel time: 0.00412225 seconds +Blocked speedup: 2.76222x +Parallel speedup: 3.32419x +Case 5 (200x100x256): +Naive time: 0.013998 seconds +Blocked time: 0.00504733 seconds +Parallel time: 0.00373942 seconds +Blocked speedup: 2.77335x +Parallel speedup: 3.74336x +Case 5 (200x100x256): +Naive time: 0.0139458 seconds +Blocked time: 0.00497581 seconds +Parallel time: 0.00374339 seconds +Blocked speedup: 2.80271x +Parallel speedup: 3.72543x +Case 5 (200x100x256): +Naive time: 0.0144825 seconds +Blocked time: 
0.00492806 seconds +Parallel time: 0.00675897 seconds +Blocked speedup: 2.93878x +Parallel speedup: 2.1427x +Case 5 (200x100x256): +Naive time: 0.0140053 seconds +Blocked time: 0.0048475 seconds +Parallel time: 0.00409623 seconds +Blocked speedup: 2.88918x +Parallel speedup: 3.41907x +Case 5 (200x100x256): +Naive time: 0.0139257 seconds +Blocked time: 0.00490855 seconds +Parallel time: 0.0037734 seconds +Blocked speedup: 2.83702x +Parallel speedup: 3.69049x +Case 5 (200x100x256): +Naive time: 0.0138833 seconds +Blocked time: 0.00502048 seconds +Parallel time: 0.00376517 seconds +Blocked speedup: 2.76534x +Parallel speedup: 3.6873x +Case 5 (200x100x256): +Naive time: 0.0137491 seconds +Blocked time: 0.0049623 seconds +Parallel time: 0.00400777 seconds +Blocked speedup: 2.7707x +Parallel speedup: 3.4306x +Case 5 (200x100x256): +Naive time: 0.013845 seconds +Blocked time: 0.0049566 seconds +Parallel time: 0.00418313 seconds +Blocked speedup: 2.79324x +Parallel speedup: 3.30972x +Case 5 (200x100x256): +Naive time: 0.0138161 seconds +Blocked time: 0.00494601 seconds +Parallel time: 0.0041888 seconds +Blocked speedup: 2.79339x +Parallel speedup: 3.29835x +Case 5 (200x100x256): +Naive time: 0.0138474 seconds +Blocked time: 0.00492877 seconds +Parallel time: 0.00307498 seconds +Blocked speedup: 2.80951x +Parallel speedup: 4.50326x +Case 5 (200x100x256): +Naive time: 0.0138701 seconds +Blocked time: 0.00500072 seconds +Parallel time: 0.00391571 seconds +Blocked speedup: 2.77361x +Parallel speedup: 3.54216x +Case 5 (200x100x256): +Naive time: 0.0149368 seconds +Blocked time: 0.00486423 seconds +Parallel time: 0.00371422 seconds +Blocked speedup: 3.07075x +Parallel speedup: 4.02152x +Case 5 (200x100x256): +Naive time: 0.0136946 seconds +Blocked time: 0.00493451 seconds +Parallel time: 0.00395864 seconds +Blocked speedup: 2.77528x +Parallel speedup: 3.45943x +Case 5 (200x100x256): +Naive time: 0.0138132 seconds +Blocked time: 0.00488946 seconds +Parallel time: 0.00400483 
seconds +Blocked speedup: 2.8251x +Parallel speedup: 3.44913x +Case 5 (200x100x256): +Naive time: 0.0137636 seconds +Blocked time: 0.00507447 seconds +Parallel time: 0.00403637 seconds +Blocked speedup: 2.71233x +Parallel speedup: 3.4099x +Case 5 (200x100x256): +Naive time: 0.0145092 seconds +Blocked time: 0.00504553 seconds +Parallel time: 0.00340497 seconds +Blocked speedup: 2.87566x +Parallel speedup: 4.26119x +Case 5 (200x100x256): +Naive time: 0.0138304 seconds +Blocked time: 0.00490792 seconds +Parallel time: 0.00426228 seconds +Blocked speedup: 2.81798x +Parallel speedup: 3.24484x +Case 5 (200x100x256): +Naive time: 0.0137917 seconds +Blocked time: 0.0049644 seconds +Parallel time: 0.00400217 seconds +Blocked speedup: 2.77811x +Parallel speedup: 3.44605x +Case 5 (200x100x256): +Naive time: 0.0138386 seconds +Blocked time: 0.00494716 seconds +Parallel time: 0.0070312 seconds +Blocked speedup: 2.79728x +Parallel speedup: 1.96817x +Case 5 (200x100x256): +Naive time: 0.0138662 seconds +Blocked time: 0.00498871 seconds +Parallel time: 0.00573514 seconds +Blocked speedup: 2.77952x +Parallel speedup: 2.41777x +Case 5 (200x100x256): +Naive time: 0.0139028 seconds +Blocked time: 0.00501899 seconds +Parallel time: 0.00391183 seconds +Blocked speedup: 2.77004x +Parallel speedup: 3.55404x +Case 5 (200x100x256): +Naive time: 0.0139039 seconds +Blocked time: 0.0049234 seconds +Parallel time: 0.00523082 seconds +Blocked speedup: 2.82405x +Parallel speedup: 2.65808x diff --git a/results/benchmark6.txt b/results/benchmark6.txt new file mode 100644 index 0000000..117ad5a --- /dev/null +++ b/results/benchmark6.txt @@ -0,0 +1,600 @@ +Case 6 (256x256x256): +Naive time: 0.0452807 seconds +Blocked time: 0.0152689 seconds +Parallel time: 0.0100956 seconds +Blocked speedup: 2.96554x +Parallel speedup: 4.48521x +Case 6 (256x256x256): +Naive time: 0.0457101 seconds +Blocked time: 0.0155576 seconds +Parallel time: 0.0161184 seconds +Blocked speedup: 2.93813x +Parallel speedup: 2.8359x 
+Case 6 (256x256x256): +Naive time: 0.0454275 seconds +Blocked time: 0.0151832 seconds +Parallel time: 0.00951767 seconds +Blocked speedup: 2.99197x +Parallel speedup: 4.77297x +Case 6 (256x256x256): +Naive time: 0.0452754 seconds +Blocked time: 0.0152108 seconds +Parallel time: 0.0101775 seconds +Blocked speedup: 2.97652x +Parallel speedup: 4.44857x +Case 6 (256x256x256): +Naive time: 0.0456725 seconds +Blocked time: 0.0153111 seconds +Parallel time: 0.0117766 seconds +Blocked speedup: 2.98297x +Parallel speedup: 3.87825x +Case 6 (256x256x256): +Naive time: 0.0451976 seconds +Blocked time: 0.0152577 seconds +Parallel time: 0.00982767 seconds +Blocked speedup: 2.96228x +Parallel speedup: 4.59901x +Case 6 (256x256x256): +Naive time: 0.0454264 seconds +Blocked time: 0.0153102 seconds +Parallel time: 0.00999083 seconds +Blocked speedup: 2.96707x +Parallel speedup: 4.54681x +Case 6 (256x256x256): +Naive time: 0.0453641 seconds +Blocked time: 0.0154971 seconds +Parallel time: 0.0100002 seconds +Blocked speedup: 2.92727x +Parallel speedup: 4.53631x +Case 6 (256x256x256): +Naive time: 0.0482299 seconds +Blocked time: 0.015343 seconds +Parallel time: 0.00995462 seconds +Blocked speedup: 3.14345x +Parallel speedup: 4.84497x +Case 6 (256x256x256): +Naive time: 0.0455195 seconds +Blocked time: 0.0153445 seconds +Parallel time: 0.0188169 seconds +Blocked speedup: 2.96651x +Parallel speedup: 2.41907x +Case 6 (256x256x256): +Naive time: 0.0457791 seconds +Blocked time: 0.0156444 seconds +Parallel time: 0.0219032 seconds +Blocked speedup: 2.92623x +Parallel speedup: 2.09007x +Case 6 (256x256x256): +Naive time: 0.0455767 seconds +Blocked time: 0.0155133 seconds +Parallel time: 0.0101494 seconds +Blocked speedup: 2.93791x +Parallel speedup: 4.49058x +Case 6 (256x256x256): +Naive time: 0.0451704 seconds +Blocked time: 0.0154746 seconds +Parallel time: 0.0101635 seconds +Blocked speedup: 2.91901x +Parallel speedup: 4.44439x +Case 6 (256x256x256): +Naive time: 0.0452646 seconds 
+Blocked time: 0.0154634 seconds +Parallel time: 0.0103145 seconds +Blocked speedup: 2.92721x +Parallel speedup: 4.38844x +Case 6 (256x256x256): +Naive time: 0.0450794 seconds +Blocked time: 0.0155608 seconds +Parallel time: 0.00997712 seconds +Blocked speedup: 2.897x +Parallel speedup: 4.51828x +Case 6 (256x256x256): +Naive time: 0.0453421 seconds +Blocked time: 0.0153938 seconds +Parallel time: 0.0104112 seconds +Blocked speedup: 2.94548x +Parallel speedup: 4.35511x +Case 6 (256x256x256): +Naive time: 0.0450714 seconds +Blocked time: 0.015204 seconds +Parallel time: 0.0102931 seconds +Blocked speedup: 2.96445x +Parallel speedup: 4.3788x +Case 6 (256x256x256): +Naive time: 0.046125 seconds +Blocked time: 0.0157518 seconds +Parallel time: 0.0099011 seconds +Blocked speedup: 2.92823x +Parallel speedup: 4.65857x +Case 6 (256x256x256): +Naive time: 0.0452981 seconds +Blocked time: 0.0153202 seconds +Parallel time: 0.0102487 seconds +Blocked speedup: 2.95676x +Parallel speedup: 4.41988x +Case 6 (256x256x256): +Naive time: 0.0463519 seconds +Blocked time: 0.0152159 seconds +Parallel time: 0.0100029 seconds +Blocked speedup: 3.04628x +Parallel speedup: 4.63382x +Case 6 (256x256x256): +Naive time: 0.0449959 seconds +Blocked time: 0.0155262 seconds +Parallel time: 0.0100917 seconds +Blocked speedup: 2.89806x +Parallel speedup: 4.4587x +Case 6 (256x256x256): +Naive time: 0.0454609 seconds +Blocked time: 0.0153302 seconds +Parallel time: 0.00988107 seconds +Blocked speedup: 2.96545x +Parallel speedup: 4.60081x +Case 6 (256x256x256): +Naive time: 0.0461095 seconds +Blocked time: 0.015612 seconds +Parallel time: 0.0101771 seconds +Blocked speedup: 2.95348x +Parallel speedup: 4.53073x +Case 6 (256x256x256): +Naive time: 0.0452974 seconds +Blocked time: 0.0151806 seconds +Parallel time: 0.0100701 seconds +Blocked speedup: 2.9839x +Parallel speedup: 4.4982x +Case 6 (256x256x256): +Naive time: 0.0454944 seconds +Blocked time: 0.0156679 seconds +Parallel time: 0.00959204 seconds 
+Blocked speedup: 2.90368x +Parallel speedup: 4.74294x +Case 6 (256x256x256): +Naive time: 0.0468505 seconds +Blocked time: 0.0158827 seconds +Parallel time: 0.00995963 seconds +Blocked speedup: 2.94979x +Parallel speedup: 4.70404x +Case 6 (256x256x256): +Naive time: 0.0453835 seconds +Blocked time: 0.0155696 seconds +Parallel time: 0.00994648 seconds +Blocked speedup: 2.91488x +Parallel speedup: 4.56277x +Case 6 (256x256x256): +Naive time: 0.0453964 seconds +Blocked time: 0.0153043 seconds +Parallel time: 0.016516 seconds +Blocked speedup: 2.96624x +Parallel speedup: 2.74863x +Case 6 (256x256x256): +Naive time: 0.0455251 seconds +Blocked time: 0.0152732 seconds +Parallel time: 0.0096209 seconds +Blocked speedup: 2.98071x +Parallel speedup: 4.73189x +Case 6 (256x256x256): +Naive time: 0.045377 seconds +Blocked time: 0.0153287 seconds +Parallel time: 0.00984177 seconds +Blocked speedup: 2.96027x +Parallel speedup: 4.61066x +Case 6 (256x256x256): +Naive time: 0.0462015 seconds +Blocked time: 0.0153854 seconds +Parallel time: 0.0129801 seconds +Blocked speedup: 3.00295x +Parallel speedup: 3.55941x +Case 6 (256x256x256): +Naive time: 0.045475 seconds +Blocked time: 0.0153114 seconds +Parallel time: 0.00978234 seconds +Blocked speedup: 2.97001x +Parallel speedup: 4.64868x +Case 6 (256x256x256): +Naive time: 0.0453278 seconds +Blocked time: 0.015366 seconds +Parallel time: 0.0112817 seconds +Blocked speedup: 2.94988x +Parallel speedup: 4.0178x +Case 6 (256x256x256): +Naive time: 0.0452225 seconds +Blocked time: 0.0153636 seconds +Parallel time: 0.0100784 seconds +Blocked speedup: 2.94349x +Parallel speedup: 4.48708x +Case 6 (256x256x256): +Naive time: 0.045281 seconds +Blocked time: 0.0153078 seconds +Parallel time: 0.0125033 seconds +Blocked speedup: 2.95804x +Parallel speedup: 3.62152x +Case 6 (256x256x256): +Naive time: 0.0459397 seconds +Blocked time: 0.015343 seconds +Parallel time: 0.0169398 seconds +Blocked speedup: 2.99418x +Parallel speedup: 2.71193x +Case 6 
(256x256x256): +Naive time: 0.0455515 seconds +Blocked time: 0.0153106 seconds +Parallel time: 0.0101252 seconds +Blocked speedup: 2.97515x +Parallel speedup: 4.49883x +Case 6 (256x256x256): +Naive time: 0.0452186 seconds +Blocked time: 0.0154017 seconds +Parallel time: 0.00995892 seconds +Blocked speedup: 2.93595x +Parallel speedup: 4.54051x +Case 6 (256x256x256): +Naive time: 0.0458003 seconds +Blocked time: 0.0154253 seconds +Parallel time: 0.0100473 seconds +Blocked speedup: 2.96917x +Parallel speedup: 4.55848x +Case 6 (256x256x256): +Naive time: 0.0460104 seconds +Blocked time: 0.0156301 seconds +Parallel time: 0.016658 seconds +Blocked speedup: 2.94371x +Parallel speedup: 2.76206x +Case 6 (256x256x256): +Naive time: 0.0453293 seconds +Blocked time: 0.0152988 seconds +Parallel time: 0.00989413 seconds +Blocked speedup: 2.96294x +Parallel speedup: 4.58143x +Case 6 (256x256x256): +Naive time: 0.0448585 seconds +Blocked time: 0.0153228 seconds +Parallel time: 0.0104111 seconds +Blocked speedup: 2.92757x +Parallel speedup: 4.30874x +Case 6 (256x256x256): +Naive time: 0.045675 seconds +Blocked time: 0.0156197 seconds +Parallel time: 0.00963212 seconds +Blocked speedup: 2.9242x +Parallel speedup: 4.74195x +Case 6 (256x256x256): +Naive time: 0.0453968 seconds +Blocked time: 0.0153201 seconds +Parallel time: 0.00985928 seconds +Blocked speedup: 2.96322x +Parallel speedup: 4.60448x +Case 6 (256x256x256): +Naive time: 0.0455068 seconds +Blocked time: 0.0155392 seconds +Parallel time: 0.00980876 seconds +Blocked speedup: 2.92852x +Parallel speedup: 4.6394x +Case 6 (256x256x256): +Naive time: 0.0453884 seconds +Blocked time: 0.0152997 seconds +Parallel time: 0.0102024 seconds +Blocked speedup: 2.96661x +Parallel speedup: 4.44878x +Case 6 (256x256x256): +Naive time: 0.0454425 seconds +Blocked time: 0.0155598 seconds +Parallel time: 0.00977037 seconds +Blocked speedup: 2.92051x +Parallel speedup: 4.65105x +Case 6 (256x256x256): +Naive time: 0.0454344 seconds +Blocked time: 
0.0154042 seconds +Parallel time: 0.00982539 seconds +Blocked speedup: 2.94949x +Parallel speedup: 4.62418x +Case 6 (256x256x256): +Naive time: 0.0454296 seconds +Blocked time: 0.0154074 seconds +Parallel time: 0.00990537 seconds +Blocked speedup: 2.94855x +Parallel speedup: 4.58636x +Case 6 (256x256x256): +Naive time: 0.0454913 seconds +Blocked time: 0.015454 seconds +Parallel time: 0.00974496 seconds +Blocked speedup: 2.94366x +Parallel speedup: 4.66819x +Case 6 (256x256x256): +Naive time: 0.0456475 seconds +Blocked time: 0.0153183 seconds +Parallel time: 0.00996773 seconds +Blocked speedup: 2.97994x +Parallel speedup: 4.57952x +Case 6 (256x256x256): +Naive time: 0.0461789 seconds +Blocked time: 0.0163316 seconds +Parallel time: 0.0108994 seconds +Blocked speedup: 2.82758x +Parallel speedup: 4.23682x +Case 6 (256x256x256): +Naive time: 0.0456839 seconds +Blocked time: 0.0154831 seconds +Parallel time: 0.00999202 seconds +Blocked speedup: 2.95057x +Parallel speedup: 4.57204x +Case 6 (256x256x256): +Naive time: 0.0444807 seconds +Blocked time: 0.0156192 seconds +Parallel time: 0.0103789 seconds +Blocked speedup: 2.84782x +Parallel speedup: 4.28567x +Case 6 (256x256x256): +Naive time: 0.0452035 seconds +Blocked time: 0.0154341 seconds +Parallel time: 0.0105306 seconds +Blocked speedup: 2.9288x +Parallel speedup: 4.2926x +Case 6 (256x256x256): +Naive time: 0.0449055 seconds +Blocked time: 0.015346 seconds +Parallel time: 0.00994484 seconds +Blocked speedup: 2.92621x +Parallel speedup: 4.51546x +Case 6 (256x256x256): +Naive time: 0.0447268 seconds +Blocked time: 0.0154101 seconds +Parallel time: 0.0102186 seconds +Blocked speedup: 2.90243x +Parallel speedup: 4.37699x +Case 6 (256x256x256): +Naive time: 0.0461344 seconds +Blocked time: 0.0153448 seconds +Parallel time: 0.00985479 seconds +Blocked speedup: 3.00652x +Parallel speedup: 4.68141x +Case 6 (256x256x256): +Naive time: 0.0477806 seconds +Blocked time: 0.0153502 seconds +Parallel time: 0.0202362 seconds +Blocked 
speedup: 3.1127x +Parallel speedup: 2.36115x +Case 6 (256x256x256): +Naive time: 0.0458862 seconds +Blocked time: 0.0162147 seconds +Parallel time: 0.0101518 seconds +Blocked speedup: 2.82992x +Parallel speedup: 4.52003x +Case 6 (256x256x256): +Naive time: 0.0458499 seconds +Blocked time: 0.0155966 seconds +Parallel time: 0.0105833 seconds +Blocked speedup: 2.93973x +Parallel speedup: 4.33229x +Case 6 (256x256x256): +Naive time: 0.0456252 seconds +Blocked time: 0.0157118 seconds +Parallel time: 0.00992912 seconds +Blocked speedup: 2.90387x +Parallel speedup: 4.59509x +Case 6 (256x256x256): +Naive time: 0.0454457 seconds +Blocked time: 0.0154116 seconds +Parallel time: 0.00970612 seconds +Blocked speedup: 2.94879x +Parallel speedup: 4.68217x +Case 6 (256x256x256): +Naive time: 0.0453158 seconds +Blocked time: 0.0153712 seconds +Parallel time: 0.0102181 seconds +Blocked speedup: 2.9481x +Parallel speedup: 4.43486x +Case 6 (256x256x256): +Naive time: 0.0457523 seconds +Blocked time: 0.0153487 seconds +Parallel time: 0.0105412 seconds +Blocked speedup: 2.98086x +Parallel speedup: 4.34032x +Case 6 (256x256x256): +Naive time: 0.0458299 seconds +Blocked time: 0.0152797 seconds +Parallel time: 0.0103225 seconds +Blocked speedup: 2.99941x +Parallel speedup: 4.43983x +Case 6 (256x256x256): +Naive time: 0.0459811 seconds +Blocked time: 0.015241 seconds +Parallel time: 0.0100019 seconds +Blocked speedup: 3.01693x +Parallel speedup: 4.59725x +Case 6 (256x256x256): +Naive time: 0.0453136 seconds +Blocked time: 0.0153857 seconds +Parallel time: 0.00983217 seconds +Blocked speedup: 2.94517x +Parallel speedup: 4.60871x +Case 6 (256x256x256): +Naive time: 0.0451383 seconds +Blocked time: 0.0153992 seconds +Parallel time: 0.00981385 seconds +Blocked speedup: 2.93122x +Parallel speedup: 4.59944x +Case 6 (256x256x256): +Naive time: 0.0450827 seconds +Blocked time: 0.0156314 seconds +Parallel time: 0.00971662 seconds +Blocked speedup: 2.88412x +Parallel speedup: 4.63975x +Case 6 
(256x256x256): +Naive time: 0.0451417 seconds +Blocked time: 0.0153592 seconds +Parallel time: 0.0100049 seconds +Blocked speedup: 2.93907x +Parallel speedup: 4.51196x +Case 6 (256x256x256): +Naive time: 0.045101 seconds +Blocked time: 0.0153916 seconds +Parallel time: 0.0102189 seconds +Blocked speedup: 2.93024x +Parallel speedup: 4.41351x +Case 6 (256x256x256): +Naive time: 0.045588 seconds +Blocked time: 0.0154928 seconds +Parallel time: 0.0171663 seconds +Blocked speedup: 2.94253x +Parallel speedup: 2.65567x +Case 6 (256x256x256): +Naive time: 0.0458853 seconds +Blocked time: 0.0157963 seconds +Parallel time: 0.011717 seconds +Blocked speedup: 2.90482x +Parallel speedup: 3.91614x +Case 6 (256x256x256): +Naive time: 0.0463516 seconds +Blocked time: 0.0154547 seconds +Parallel time: 0.0100733 seconds +Blocked speedup: 2.9992x +Parallel speedup: 4.60144x +Case 6 (256x256x256): +Naive time: 0.0453487 seconds +Blocked time: 0.0152766 seconds +Parallel time: 0.00988157 seconds +Blocked speedup: 2.96851x +Parallel speedup: 4.58922x +Case 6 (256x256x256): +Naive time: 0.0461811 seconds +Blocked time: 0.015425 seconds +Parallel time: 0.0099814 seconds +Blocked speedup: 2.99391x +Parallel speedup: 4.62672x +Case 6 (256x256x256): +Naive time: 0.0454056 seconds +Blocked time: 0.0152555 seconds +Parallel time: 0.00971813 seconds +Blocked speedup: 2.97634x +Parallel speedup: 4.67226x +Case 6 (256x256x256): +Naive time: 0.0458209 seconds +Blocked time: 0.0160377 seconds +Parallel time: 0.00957803 seconds +Blocked speedup: 2.85708x +Parallel speedup: 4.78396x +Case 6 (256x256x256): +Naive time: 0.0453948 seconds +Blocked time: 0.0153213 seconds +Parallel time: 0.00998656 seconds +Blocked speedup: 2.96285x +Parallel speedup: 4.54559x +Case 6 (256x256x256): +Naive time: 0.045316 seconds +Blocked time: 0.0155198 seconds +Parallel time: 0.00987513 seconds +Blocked speedup: 2.91989x +Parallel speedup: 4.5889x +Case 6 (256x256x256): +Naive time: 0.0456647 seconds +Blocked time: 
0.0153532 seconds +Parallel time: 0.00997381 seconds +Blocked speedup: 2.97428x +Parallel speedup: 4.57846x +Case 6 (256x256x256): +Naive time: 0.0454162 seconds +Blocked time: 0.0153213 seconds +Parallel time: 0.00987589 seconds +Blocked speedup: 2.96426x +Parallel speedup: 4.5987x +Case 6 (256x256x256): +Naive time: 0.0451677 seconds +Blocked time: 0.0154099 seconds +Parallel time: 0.00965742 seconds +Blocked speedup: 2.93109x +Parallel speedup: 4.67699x +Case 6 (256x256x256): +Naive time: 0.0453854 seconds +Blocked time: 0.0153493 seconds +Parallel time: 0.00983445 seconds +Blocked speedup: 2.95683x +Parallel speedup: 4.61494x +Case 6 (256x256x256): +Naive time: 0.044726 seconds +Blocked time: 0.0154255 seconds +Parallel time: 0.00970372 seconds +Blocked speedup: 2.89948x +Parallel speedup: 4.60916x +Case 6 (256x256x256): +Naive time: 0.0461186 seconds +Blocked time: 0.0152648 seconds +Parallel time: 0.0174549 seconds +Blocked speedup: 3.02124x +Parallel speedup: 2.64216x +Case 6 (256x256x256): +Naive time: 0.0461375 seconds +Blocked time: 0.0154755 seconds +Parallel time: 0.0100039 seconds +Blocked speedup: 2.98133x +Parallel speedup: 4.61197x +Case 6 (256x256x256): +Naive time: 0.0452216 seconds +Blocked time: 0.0151979 seconds +Parallel time: 0.0105764 seconds +Blocked speedup: 2.97553x +Parallel speedup: 4.27573x +Case 6 (256x256x256): +Naive time: 0.0449207 seconds +Blocked time: 0.0154817 seconds +Parallel time: 0.0105505 seconds +Blocked speedup: 2.90153x +Parallel speedup: 4.25767x +Case 6 (256x256x256): +Naive time: 0.0451556 seconds +Blocked time: 0.0153208 seconds +Parallel time: 0.00973942 seconds +Blocked speedup: 2.94734x +Parallel speedup: 4.63638x +Case 6 (256x256x256): +Naive time: 0.0452826 seconds +Blocked time: 0.015369 seconds +Parallel time: 0.00985212 seconds +Blocked speedup: 2.94636x +Parallel speedup: 4.59623x +Case 6 (256x256x256): +Naive time: 0.0461638 seconds +Blocked time: 0.0153769 seconds +Parallel time: 0.0101423 seconds 
+Blocked speedup: 3.00215x +Parallel speedup: 4.55162x +Case 6 (256x256x256): +Naive time: 0.0450233 seconds +Blocked time: 0.0152994 seconds +Parallel time: 0.0100329 seconds +Blocked speedup: 2.94282x +Parallel speedup: 4.48758x +Case 6 (256x256x256): +Naive time: 0.0454826 seconds +Blocked time: 0.0155032 seconds +Parallel time: 0.0101704 seconds +Blocked speedup: 2.93376x +Parallel speedup: 4.47206x +Case 6 (256x256x256): +Naive time: 0.0458893 seconds +Blocked time: 0.0153437 seconds +Parallel time: 0.00975324 seconds +Blocked speedup: 2.99075x +Parallel speedup: 4.70503x +Case 6 (256x256x256): +Naive time: 0.0459361 seconds +Blocked time: 0.0153219 seconds +Parallel time: 0.0153828 seconds +Blocked speedup: 2.99807x +Parallel speedup: 2.98621x +Case 6 (256x256x256): +Naive time: 0.0461685 seconds +Blocked time: 0.0152783 seconds +Parallel time: 0.0100524 seconds +Blocked speedup: 3.02183x +Parallel speedup: 4.59276x +Case 6 (256x256x256): +Naive time: 0.0457994 seconds +Blocked time: 0.0153017 seconds +Parallel time: 0.00961546 seconds +Blocked speedup: 2.99309x +Parallel speedup: 4.7631x +Case 6 (256x256x256): +Naive time: 0.0459951 seconds +Blocked time: 0.015656 seconds +Parallel time: 0.0100969 seconds +Blocked speedup: 2.93785x +Parallel speedup: 4.55536x diff --git a/results/benchmark7.txt b/results/benchmark7.txt new file mode 100644 index 0000000..ebb916e --- /dev/null +++ b/results/benchmark7.txt @@ -0,0 +1,600 @@ +Case 7 (256x300x256): +Naive time: 0.0531918 seconds +Blocked time: 0.0184958 seconds +Parallel time: 0.0126409 seconds +Blocked speedup: 2.87589x +Parallel speedup: 4.2079x +Case 7 (256x300x256): +Naive time: 0.053771 seconds +Blocked time: 0.0184112 seconds +Parallel time: 0.011689 seconds +Blocked speedup: 2.92056x +Parallel speedup: 4.60013x +Case 7 (256x300x256): +Naive time: 0.0541285 seconds +Blocked time: 0.0186816 seconds +Parallel time: 0.0159331 seconds +Blocked speedup: 2.89743x +Parallel speedup: 3.39723x +Case 7 
(256x300x256): +Naive time: 0.0528428 seconds +Blocked time: 0.0179913 seconds +Parallel time: 0.0113889 seconds +Blocked speedup: 2.93713x +Parallel speedup: 4.63986x +Case 7 (256x300x256): +Naive time: 0.0528247 seconds +Blocked time: 0.0183841 seconds +Parallel time: 0.0114989 seconds +Blocked speedup: 2.8734x +Parallel speedup: 4.59391x +Case 7 (256x300x256): +Naive time: 0.0534567 seconds +Blocked time: 0.0181572 seconds +Parallel time: 0.0115275 seconds +Blocked speedup: 2.9441x +Parallel speedup: 4.63733x +Case 7 (256x300x256): +Naive time: 0.0530374 seconds +Blocked time: 0.0184798 seconds +Parallel time: 0.0114992 seconds +Blocked speedup: 2.87002x +Parallel speedup: 4.61227x +Case 7 (256x300x256): +Naive time: 0.0537043 seconds +Blocked time: 0.0182815 seconds +Parallel time: 0.0116327 seconds +Blocked speedup: 2.93764x +Parallel speedup: 4.61669x +Case 7 (256x300x256): +Naive time: 0.0537575 seconds +Blocked time: 0.0184839 seconds +Parallel time: 0.0113125 seconds +Blocked speedup: 2.90834x +Parallel speedup: 4.75206x +Case 7 (256x300x256): +Naive time: 0.0531222 seconds +Blocked time: 0.0182059 seconds +Parallel time: 0.0113336 seconds +Blocked speedup: 2.91787x +Parallel speedup: 4.68716x +Case 7 (256x300x256): +Naive time: 0.0536285 seconds +Blocked time: 0.0181251 seconds +Parallel time: 0.0115569 seconds +Blocked speedup: 2.95879x +Parallel speedup: 4.6404x +Case 7 (256x300x256): +Naive time: 0.0538054 seconds +Blocked time: 0.0189797 seconds +Parallel time: 0.0206023 seconds +Blocked speedup: 2.83489x +Parallel speedup: 2.61162x +Case 7 (256x300x256): +Naive time: 0.0531848 seconds +Blocked time: 0.0182059 seconds +Parallel time: 0.0113872 seconds +Blocked speedup: 2.92129x +Parallel speedup: 4.67056x +Case 7 (256x300x256): +Naive time: 0.0536322 seconds +Blocked time: 0.0183776 seconds +Parallel time: 0.0109237 seconds +Blocked speedup: 2.91835x +Parallel speedup: 4.90972x +Case 7 (256x300x256): +Naive time: 0.0532998 seconds +Blocked time: 
0.0189253 seconds +Parallel time: 0.0116227 seconds +Blocked speedup: 2.81633x +Parallel speedup: 4.58586x +Case 7 (256x300x256): +Naive time: 0.0531844 seconds +Blocked time: 0.0180474 seconds +Parallel time: 0.0118201 seconds +Blocked speedup: 2.94693x +Parallel speedup: 4.4995x +Case 7 (256x300x256): +Naive time: 0.0528096 seconds +Blocked time: 0.0182217 seconds +Parallel time: 0.0111161 seconds +Blocked speedup: 2.89817x +Parallel speedup: 4.75074x +Case 7 (256x300x256): +Naive time: 0.0527978 seconds +Blocked time: 0.0180347 seconds +Parallel time: 0.0122796 seconds +Blocked speedup: 2.92757x +Parallel speedup: 4.29963x +Case 7 (256x300x256): +Naive time: 0.0533187 seconds +Blocked time: 0.0190983 seconds +Parallel time: 0.0110339 seconds +Blocked speedup: 2.79181x +Parallel speedup: 4.83225x +Case 7 (256x300x256): +Naive time: 0.0540193 seconds +Blocked time: 0.0184342 seconds +Parallel time: 0.0228036 seconds +Blocked speedup: 2.93039x +Parallel speedup: 2.36889x +Case 7 (256x300x256): +Naive time: 0.0533537 seconds +Blocked time: 0.0186003 seconds +Parallel time: 0.0114687 seconds +Blocked speedup: 2.86843x +Parallel speedup: 4.65213x +Case 7 (256x300x256): +Naive time: 0.0531692 seconds +Blocked time: 0.0181897 seconds +Parallel time: 0.0117939 seconds +Blocked speedup: 2.92303x +Parallel speedup: 4.50821x +Case 7 (256x300x256): +Naive time: 0.0529893 seconds +Blocked time: 0.0186337 seconds +Parallel time: 0.0110297 seconds +Blocked speedup: 2.84374x +Parallel speedup: 4.80423x +Case 7 (256x300x256): +Naive time: 0.0530809 seconds +Blocked time: 0.0181938 seconds +Parallel time: 0.0115464 seconds +Blocked speedup: 2.91754x +Parallel speedup: 4.59717x +Case 7 (256x300x256): +Naive time: 0.053261 seconds +Blocked time: 0.017948 seconds +Parallel time: 0.0180674 seconds +Blocked speedup: 2.96752x +Parallel speedup: 2.9479x +Case 7 (256x300x256): +Naive time: 0.0536768 seconds +Blocked time: 0.0182445 seconds +Parallel time: 0.0113602 seconds +Blocked 
speedup: 2.94208x +Parallel speedup: 4.72498x +Case 7 (256x300x256): +Naive time: 0.0532257 seconds +Blocked time: 0.0181173 seconds +Parallel time: 0.0122447 seconds +Blocked speedup: 2.93783x +Parallel speedup: 4.34682x +Case 7 (256x300x256): +Naive time: 0.0525993 seconds +Blocked time: 0.0181414 seconds +Parallel time: 0.0109894 seconds +Blocked speedup: 2.89941x +Parallel speedup: 4.78635x +Case 7 (256x300x256): +Naive time: 0.0530849 seconds +Blocked time: 0.0182227 seconds +Parallel time: 0.0121695 seconds +Blocked speedup: 2.91312x +Parallel speedup: 4.36213x +Case 7 (256x300x256): +Naive time: 0.0530095 seconds +Blocked time: 0.0181086 seconds +Parallel time: 0.0110939 seconds +Blocked speedup: 2.92731x +Parallel speedup: 4.77825x +Case 7 (256x300x256): +Naive time: 0.0526339 seconds +Blocked time: 0.0180812 seconds +Parallel time: 0.0192056 seconds +Blocked speedup: 2.91097x +Parallel speedup: 2.74055x +Case 7 (256x300x256): +Naive time: 0.0539085 seconds +Blocked time: 0.0183416 seconds +Parallel time: 0.0109939 seconds +Blocked speedup: 2.93914x +Parallel speedup: 4.90349x +Case 7 (256x300x256): +Naive time: 0.0532673 seconds +Blocked time: 0.0180238 seconds +Parallel time: 0.0112945 seconds +Blocked speedup: 2.95538x +Parallel speedup: 4.7162x +Case 7 (256x300x256): +Naive time: 0.0528933 seconds +Blocked time: 0.0180629 seconds +Parallel time: 0.0117495 seconds +Blocked speedup: 2.92828x +Parallel speedup: 4.50173x +Case 7 (256x300x256): +Naive time: 0.0527477 seconds +Blocked time: 0.0180203 seconds +Parallel time: 0.0188939 seconds +Blocked speedup: 2.92712x +Parallel speedup: 2.79179x +Case 7 (256x300x256): +Naive time: 0.0531594 seconds +Blocked time: 0.0181367 seconds +Parallel time: 0.0143706 seconds +Blocked speedup: 2.93104x +Parallel speedup: 3.69919x +Case 7 (256x300x256): +Naive time: 0.0537889 seconds +Blocked time: 0.0194321 seconds +Parallel time: 0.02041 seconds +Blocked speedup: 2.76804x +Parallel speedup: 2.63541x +Case 7 
(256x300x256): +Naive time: 0.056966 seconds +Blocked time: 0.018922 seconds +Parallel time: 0.0135422 seconds +Blocked speedup: 3.01057x +Parallel speedup: 4.20656x +Case 7 (256x300x256): +Naive time: 0.0535451 seconds +Blocked time: 0.0184047 seconds +Parallel time: 0.0111388 seconds +Blocked speedup: 2.90931x +Parallel speedup: 4.80706x +Case 7 (256x300x256): +Naive time: 0.0694305 seconds +Blocked time: 0.0188796 seconds +Parallel time: 0.0245491 seconds +Blocked speedup: 3.67754x +Parallel speedup: 2.82823x +Case 7 (256x300x256): +Naive time: 0.0533623 seconds +Blocked time: 0.0187706 seconds +Parallel time: 0.012047 seconds +Blocked speedup: 2.84286x +Parallel speedup: 4.4295x +Case 7 (256x300x256): +Naive time: 0.0542122 seconds +Blocked time: 0.0183245 seconds +Parallel time: 0.0116396 seconds +Blocked speedup: 2.95845x +Parallel speedup: 4.65755x +Case 7 (256x300x256): +Naive time: 0.0560297 seconds +Blocked time: 0.0197751 seconds +Parallel time: 0.0132659 seconds +Blocked speedup: 2.83335x +Parallel speedup: 4.2236x +Case 7 (256x300x256): +Naive time: 0.054536 seconds +Blocked time: 0.0183204 seconds +Parallel time: 0.0108841 seconds +Blocked speedup: 2.9768x +Parallel speedup: 5.0106x +Case 7 (256x300x256): +Naive time: 0.0554836 seconds +Blocked time: 0.0200266 seconds +Parallel time: 0.0151349 seconds +Blocked speedup: 2.7705x +Parallel speedup: 3.66595x +Case 7 (256x300x256): +Naive time: 0.0536382 seconds +Blocked time: 0.0184644 seconds +Parallel time: 0.0113092 seconds +Blocked speedup: 2.90496x +Parallel speedup: 4.74288x +Case 7 (256x300x256): +Naive time: 0.0533395 seconds +Blocked time: 0.0182773 seconds +Parallel time: 0.0109353 seconds +Blocked speedup: 2.91835x +Parallel speedup: 4.87772x +Case 7 (256x300x256): +Naive time: 0.0532561 seconds +Blocked time: 0.0184244 seconds +Parallel time: 0.0111087 seconds +Blocked speedup: 2.89053x +Parallel speedup: 4.79409x +Case 7 (256x300x256): +Naive time: 0.0523266 seconds +Blocked time: 0.0179744 
seconds +Parallel time: 0.0119574 seconds +Blocked speedup: 2.91118x +Parallel speedup: 4.3761x +Case 7 (256x300x256): +Naive time: 0.0530394 seconds +Blocked time: 0.0180911 seconds +Parallel time: 0.0115401 seconds +Blocked speedup: 2.93179x +Parallel speedup: 4.59609x +Case 7 (256x300x256): +Naive time: 0.0530457 seconds +Blocked time: 0.0181047 seconds +Parallel time: 0.0113862 seconds +Blocked speedup: 2.92995x +Parallel speedup: 4.65875x +Case 7 (256x300x256): +Naive time: 0.0523409 seconds +Blocked time: 0.0181262 seconds +Parallel time: 0.012415 seconds +Blocked speedup: 2.88758x +Parallel speedup: 4.21595x +Case 7 (256x300x256): +Naive time: 0.0529497 seconds +Blocked time: 0.0191797 seconds +Parallel time: 0.0192685 seconds +Blocked speedup: 2.76071x +Parallel speedup: 2.74799x +Case 7 (256x300x256): +Naive time: 0.0533209 seconds +Blocked time: 0.0182973 seconds +Parallel time: 0.0114825 seconds +Blocked speedup: 2.91414x +Parallel speedup: 4.64369x +Case 7 (256x300x256): +Naive time: 0.0534895 seconds +Blocked time: 0.0183646 seconds +Parallel time: 0.0128017 seconds +Blocked speedup: 2.91264x +Parallel speedup: 4.1783x +Case 7 (256x300x256): +Naive time: 0.0531378 seconds +Blocked time: 0.0186915 seconds +Parallel time: 0.0121075 seconds +Blocked speedup: 2.84289x +Parallel speedup: 4.38883x +Case 7 (256x300x256): +Naive time: 0.0527773 seconds +Blocked time: 0.0184785 seconds +Parallel time: 0.011473 seconds +Blocked speedup: 2.85615x +Parallel speedup: 4.60014x +Case 7 (256x300x256): +Naive time: 0.0539169 seconds +Blocked time: 0.0191497 seconds +Parallel time: 0.0117145 seconds +Blocked speedup: 2.81554x +Parallel speedup: 4.60259x +Case 7 (256x300x256): +Naive time: 0.0529826 seconds +Blocked time: 0.0183164 seconds +Parallel time: 0.0113532 seconds +Blocked speedup: 2.89263x +Parallel speedup: 4.66675x +Case 7 (256x300x256): +Naive time: 0.0536875 seconds +Blocked time: 0.020199 seconds +Parallel time: 0.0192709 seconds +Blocked speedup: 2.65792x 
+Parallel speedup: 2.78593x +Case 7 (256x300x256): +Naive time: 0.0535452 seconds +Blocked time: 0.0181297 seconds +Parallel time: 0.0114084 seconds +Blocked speedup: 2.95346x +Parallel speedup: 4.6935x +Case 7 (256x300x256): +Naive time: 0.0532024 seconds +Blocked time: 0.0185265 seconds +Parallel time: 0.0143662 seconds +Blocked speedup: 2.8717x +Parallel speedup: 3.70329x +Case 7 (256x300x256): +Naive time: 0.0533254 seconds +Blocked time: 0.0178584 seconds +Parallel time: 0.0193796 seconds +Blocked speedup: 2.986x +Parallel speedup: 2.75163x +Case 7 (256x300x256): +Naive time: 0.0564549 seconds +Blocked time: 0.0183252 seconds +Parallel time: 0.0117656 seconds +Blocked speedup: 3.08073x +Parallel speedup: 4.7983x +Case 7 (256x300x256): +Naive time: 0.0536732 seconds +Blocked time: 0.0235558 seconds +Parallel time: 0.0272381 seconds +Blocked speedup: 2.27855x +Parallel speedup: 1.97052x +Case 7 (256x300x256): +Naive time: 0.0555447 seconds +Blocked time: 0.0189127 seconds +Parallel time: 0.0222016 seconds +Blocked speedup: 2.9369x +Parallel speedup: 2.50183x +Case 7 (256x300x256): +Naive time: 0.0531683 seconds +Blocked time: 0.0185708 seconds +Parallel time: 0.0202026 seconds +Blocked speedup: 2.86301x +Parallel speedup: 2.63175x +Case 7 (256x300x256): +Naive time: 0.05364 seconds +Blocked time: 0.0184233 seconds +Parallel time: 0.0114016 seconds +Blocked speedup: 2.91154x +Parallel speedup: 4.70459x +Case 7 (256x300x256): +Naive time: 0.0524498 seconds +Blocked time: 0.018384 seconds +Parallel time: 0.011239 seconds +Blocked speedup: 2.85301x +Parallel speedup: 4.66675x +Case 7 (256x300x256): +Naive time: 0.0528376 seconds +Blocked time: 0.0181088 seconds +Parallel time: 0.0116682 seconds +Blocked speedup: 2.91778x +Parallel speedup: 4.52833x +Case 7 (256x300x256): +Naive time: 0.0537422 seconds +Blocked time: 0.0181213 seconds +Parallel time: 0.0115403 seconds +Blocked speedup: 2.9657x +Parallel speedup: 4.65693x +Case 7 (256x300x256): +Naive time: 0.0544979 
seconds +Blocked time: 0.0184362 seconds +Parallel time: 0.0199619 seconds +Blocked speedup: 2.95602x +Parallel speedup: 2.7301x +Case 7 (256x300x256): +Naive time: 0.0529261 seconds +Blocked time: 0.0206974 seconds +Parallel time: 0.0124199 seconds +Blocked speedup: 2.55714x +Parallel speedup: 4.2614x +Case 7 (256x300x256): +Naive time: 0.0532451 seconds +Blocked time: 0.018475 seconds +Parallel time: 0.011994 seconds +Blocked speedup: 2.882x +Parallel speedup: 4.43932x +Case 7 (256x300x256): +Naive time: 0.0559655 seconds +Blocked time: 0.0186506 seconds +Parallel time: 0.0143325 seconds +Blocked speedup: 3.00074x +Parallel speedup: 3.9048x +Case 7 (256x300x256): +Naive time: 0.0531711 seconds +Blocked time: 0.0187986 seconds +Parallel time: 0.0107028 seconds +Blocked speedup: 2.82846x +Parallel speedup: 4.96795x +Case 7 (256x300x256): +Naive time: 0.0542288 seconds +Blocked time: 0.0194958 seconds +Parallel time: 0.0202113 seconds +Blocked speedup: 2.78157x +Parallel speedup: 2.68309x +Case 7 (256x300x256): +Naive time: 0.0549096 seconds +Blocked time: 0.0195524 seconds +Parallel time: 0.0174604 seconds +Blocked speedup: 2.80833x +Parallel speedup: 3.14481x +Case 7 (256x300x256): +Naive time: 0.0524339 seconds +Blocked time: 0.0189821 seconds +Parallel time: 0.0172885 seconds +Blocked speedup: 2.76228x +Parallel speedup: 3.03288x +Case 7 (256x300x256): +Naive time: 0.0545585 seconds +Blocked time: 0.0200089 seconds +Parallel time: 0.0149744 seconds +Blocked speedup: 2.72672x +Parallel speedup: 3.64345x +Case 7 (256x300x256): +Naive time: 0.0537673 seconds +Blocked time: 0.0185997 seconds +Parallel time: 0.0128904 seconds +Blocked speedup: 2.89075x +Parallel speedup: 4.17112x +Case 7 (256x300x256): +Naive time: 0.0524091 seconds +Blocked time: 0.0181467 seconds +Parallel time: 0.0114743 seconds +Blocked speedup: 2.88808x +Parallel speedup: 4.56754x +Case 7 (256x300x256): +Naive time: 0.0526396 seconds +Blocked time: 0.0181662 seconds +Parallel time: 0.010855 
seconds +Blocked speedup: 2.89766x +Parallel speedup: 4.84934x +Case 7 (256x300x256): +Naive time: 0.0524124 seconds +Blocked time: 0.0182909 seconds +Parallel time: 0.0110826 seconds +Blocked speedup: 2.86548x +Parallel speedup: 4.72924x +Case 7 (256x300x256): +Naive time: 0.0530021 seconds +Blocked time: 0.0183126 seconds +Parallel time: 0.0115913 seconds +Blocked speedup: 2.89429x +Parallel speedup: 4.57257x +Case 7 (256x300x256): +Naive time: 0.0523287 seconds +Blocked time: 0.0180252 seconds +Parallel time: 0.0173366 seconds +Blocked speedup: 2.90308x +Parallel speedup: 3.01839x +Case 7 (256x300x256): +Naive time: 0.0536349 seconds +Blocked time: 0.0184272 seconds +Parallel time: 0.0195485 seconds +Blocked speedup: 2.91064x +Parallel speedup: 2.74368x +Case 7 (256x300x256): +Naive time: 0.0535984 seconds +Blocked time: 0.0182611 seconds +Parallel time: 0.013739 seconds +Blocked speedup: 2.93512x +Parallel speedup: 3.90119x +Case 7 (256x300x256): +Naive time: 0.0537863 seconds +Blocked time: 0.0185456 seconds +Parallel time: 0.014177 seconds +Blocked speedup: 2.90022x +Parallel speedup: 3.79391x +Case 7 (256x300x256): +Naive time: 0.0539372 seconds +Blocked time: 0.0184849 seconds +Parallel time: 0.011329 seconds +Blocked speedup: 2.9179x +Parallel speedup: 4.76098x +Case 7 (256x300x256): +Naive time: 0.0530088 seconds +Blocked time: 0.0181572 seconds +Parallel time: 0.0126587 seconds +Blocked speedup: 2.91943x +Parallel speedup: 4.18754x +Case 7 (256x300x256): +Naive time: 0.0530843 seconds +Blocked time: 0.0181258 seconds +Parallel time: 0.0113267 seconds +Blocked speedup: 2.92867x +Parallel speedup: 4.68667x +Case 7 (256x300x256): +Naive time: 0.0526127 seconds +Blocked time: 0.0180781 seconds +Parallel time: 0.0111282 seconds +Blocked speedup: 2.9103x +Parallel speedup: 4.72787x +Case 7 (256x300x256): +Naive time: 0.0533993 seconds +Blocked time: 0.0181911 seconds +Parallel time: 0.011554 seconds +Blocked speedup: 2.93546x +Parallel speedup: 4.62171x +Case 
7 (256x300x256): +Naive time: 0.0526876 seconds +Blocked time: 0.0185929 seconds +Parallel time: 0.0113316 seconds +Blocked speedup: 2.83374x +Parallel speedup: 4.6496x +Case 7 (256x300x256): +Naive time: 0.0527098 seconds +Blocked time: 0.0190162 seconds +Parallel time: 0.0118842 seconds +Blocked speedup: 2.77184x +Parallel speedup: 4.43528x +Case 7 (256x300x256): +Naive time: 0.0554998 seconds +Blocked time: 0.0182383 seconds +Parallel time: 0.0116254 seconds +Blocked speedup: 3.04303x +Parallel speedup: 4.77401x +Case 7 (256x300x256): +Naive time: 0.0534728 seconds +Blocked time: 0.0180969 seconds +Parallel time: 0.011326 seconds +Blocked speedup: 2.9548x +Parallel speedup: 4.72124x +Case 7 (256x300x256): +Naive time: 0.0527386 seconds +Blocked time: 0.018196 seconds +Parallel time: 0.0111838 seconds +Blocked speedup: 2.89837x +Parallel speedup: 4.71563x +Case 7 (256x300x256): +Naive time: 0.0526101 seconds +Blocked time: 0.0181757 seconds +Parallel time: 0.0116644 seconds +Blocked speedup: 2.89452x +Parallel speedup: 4.5103x diff --git a/results/benchmark8.txt b/results/benchmark8.txt new file mode 100644 index 0000000..6b790fc --- /dev/null +++ b/results/benchmark8.txt @@ -0,0 +1,600 @@ +Case 8 (64x128x64): +Naive time: 0.00137638 seconds +Blocked time: 0.000587012 seconds +Parallel time: 0.0115767 seconds +Blocked speedup: 2.34473x +Parallel speedup: 0.118892x +Case 8 (64x128x64): +Naive time: 0.00139993 seconds +Blocked time: 0.000524083 seconds +Parallel time: 0.00266865 seconds +Blocked speedup: 2.67119x +Parallel speedup: 0.524583x +Case 8 (64x128x64): +Naive time: 0.00140871 seconds +Blocked time: 0.000534583 seconds +Parallel time: 0.000677351 seconds +Blocked speedup: 2.63516x +Parallel speedup: 2.07974x +Case 8 (64x128x64): +Naive time: 0.00140452 seconds +Blocked time: 0.000535204 seconds +Parallel time: 0.0110311 seconds +Blocked speedup: 2.62426x +Parallel speedup: 0.127323x +Case 8 (64x128x64): +Naive time: 0.00141301 seconds +Blocked time: 
0.000488727 seconds +Parallel time: 0.00305373 seconds +Blocked speedup: 2.89121x +Parallel speedup: 0.462716x +Case 8 (64x128x64): +Naive time: 0.00142641 seconds +Blocked time: 0.000545343 seconds +Parallel time: 0.00357761 seconds +Blocked speedup: 2.61561x +Parallel speedup: 0.398703x +Case 8 (64x128x64): +Naive time: 0.00140704 seconds +Blocked time: 0.000487455 seconds +Parallel time: 0.000944402 seconds +Blocked speedup: 2.8865x +Parallel speedup: 1.48987x +Case 8 (64x128x64): +Naive time: 0.00145351 seconds +Blocked time: 0.000517371 seconds +Parallel time: 0.00750484 seconds +Blocked speedup: 2.80941x +Parallel speedup: 0.193676x +Case 8 (64x128x64): +Naive time: 0.00140871 seconds +Blocked time: 0.000538681 seconds +Parallel time: 0.00273245 seconds +Blocked speedup: 2.61512x +Parallel speedup: 0.51555x +Case 8 (64x128x64): +Naive time: 0.00141087 seconds +Blocked time: 0.000481253 seconds +Parallel time: 0.000845546 seconds +Blocked speedup: 2.93165x +Parallel speedup: 1.66859x +Case 8 (64x128x64): +Naive time: 0.00141895 seconds +Blocked time: 0.000521178 seconds +Parallel time: 0.00298601 seconds +Blocked speedup: 2.72259x +Parallel speedup: 0.4752x +Case 8 (64x128x64): +Naive time: 0.00141336 seconds +Blocked time: 0.000492564 seconds +Parallel time: 0.0028355 seconds +Blocked speedup: 2.8694x +Parallel speedup: 0.498452x +Case 8 (64x128x64): +Naive time: 0.00142707 seconds +Blocked time: 0.000503786 seconds +Parallel time: 0.00383533 seconds +Blocked speedup: 2.83269x +Parallel speedup: 0.372085x +Case 8 (64x128x64): +Naive time: 0.00141107 seconds +Blocked time: 0.000489298 seconds +Parallel time: 0.000838223 seconds +Blocked speedup: 2.88386x +Parallel speedup: 1.6834x +Case 8 (64x128x64): +Naive time: 0.00141092 seconds +Blocked time: 0.000483767 seconds +Parallel time: 0.00433276 seconds +Blocked speedup: 2.91652x +Parallel speedup: 0.325639x +Case 8 (64x128x64): +Naive time: 0.00146646 seconds +Blocked time: 0.000495499 seconds +Parallel time: 
0.00589203 seconds +Blocked speedup: 2.95956x +Parallel speedup: 0.248889x +Case 8 (64x128x64): +Naive time: 0.00141031 seconds +Blocked time: 0.000487996 seconds +Parallel time: 0.000863109 seconds +Blocked speedup: 2.89x +Parallel speedup: 1.63398x +Case 8 (64x128x64): +Naive time: 0.00142494 seconds +Blocked time: 0.000494548 seconds +Parallel time: 0.00268586 seconds +Blocked speedup: 2.88131x +Parallel speedup: 0.530535x +Case 8 (64x128x64): +Naive time: 0.00148381 seconds +Blocked time: 0.000492825 seconds +Parallel time: 0.00266055 seconds +Blocked speedup: 3.01084x +Parallel speedup: 0.557709x +Case 8 (64x128x64): +Naive time: 0.00141024 seconds +Blocked time: 0.000508764 seconds +Parallel time: 0.00385824 seconds +Blocked speedup: 2.77189x +Parallel speedup: 0.365513x +Case 8 (64x128x64): +Naive time: 0.00141098 seconds +Blocked time: 0.000479399 seconds +Parallel time: 0.000816662 seconds +Blocked speedup: 2.94322x +Parallel speedup: 1.72774x +Case 8 (64x128x64): +Naive time: 0.00144567 seconds +Blocked time: 0.000553117 seconds +Parallel time: 0.00847711 seconds +Blocked speedup: 2.61368x +Parallel speedup: 0.170538x +Case 8 (64x128x64): +Naive time: 0.00140764 seconds +Blocked time: 0.000513444 seconds +Parallel time: 0.00465424 seconds +Blocked speedup: 2.74157x +Parallel speedup: 0.302443x +Case 8 (64x128x64): +Naive time: 0.00141085 seconds +Blocked time: 0.000486482 seconds +Parallel time: 0.000849784 seconds +Blocked speedup: 2.9001x +Parallel speedup: 1.66024x +Case 8 (64x128x64): +Naive time: 0.00145959 seconds +Blocked time: 0.000489599 seconds +Parallel time: 0.000748435 seconds +Blocked speedup: 2.98119x +Parallel speedup: 1.95019x +Case 8 (64x128x64): +Naive time: 0.00145174 seconds +Blocked time: 0.000845256 seconds +Parallel time: 0.00394378 seconds +Blocked speedup: 1.71752x +Parallel speedup: 0.36811x +Case 8 (64x128x64): +Naive time: 0.00144198 seconds +Blocked time: 0.000518944 seconds +Parallel time: 0.00344897 seconds +Blocked 
speedup: 2.77867x +Parallel speedup: 0.418088x +Case 8 (64x128x64): +Naive time: 0.00142831 seconds +Blocked time: 0.000483827 seconds +Parallel time: 0.000827453 seconds +Blocked speedup: 2.95211x +Parallel speedup: 1.72615x +Case 8 (64x128x64): +Naive time: 0.00143193 seconds +Blocked time: 0.000497954 seconds +Parallel time: 0.00084748 seconds +Blocked speedup: 2.87562x +Parallel speedup: 1.68963x +Case 8 (64x128x64): +Naive time: 0.00141472 seconds +Blocked time: 0.000493266 seconds +Parallel time: 0.0018599 seconds +Blocked speedup: 2.86808x +Parallel speedup: 0.760645x +Case 8 (64x128x64): +Naive time: 0.00141027 seconds +Blocked time: 0.000496902 seconds +Parallel time: 0.00588257 seconds +Blocked speedup: 2.83812x +Parallel speedup: 0.239736x +Case 8 (64x128x64): +Naive time: 0.00142421 seconds +Blocked time: 0.000490611 seconds +Parallel time: 0.00920297 seconds +Blocked speedup: 2.90294x +Parallel speedup: 0.154756x +Case 8 (64x128x64): +Naive time: 0.0014132 seconds +Blocked time: 0.000486423 seconds +Parallel time: 0.000836149 seconds +Blocked speedup: 2.90529x +Parallel speedup: 1.69013x +Case 8 (64x128x64): +Naive time: 0.0014252 seconds +Blocked time: 0.000498485 seconds +Parallel time: 0.00634329 seconds +Blocked speedup: 2.85907x +Parallel speedup: 0.224679x +Case 8 (64x128x64): +Naive time: 0.00141172 seconds +Blocked time: 0.000492795 seconds +Parallel time: 0.00233255 seconds +Blocked speedup: 2.86472x +Parallel speedup: 0.605226x +Case 8 (64x128x64): +Naive time: 0.00141161 seconds +Blocked time: 0.000481654 seconds +Parallel time: 0.00084754 seconds +Blocked speedup: 2.93075x +Parallel speedup: 1.66554x +Case 8 (64x128x64): +Naive time: 0.00144567 seconds +Blocked time: 0.00049001 seconds +Parallel time: 0.0104143 seconds +Blocked speedup: 2.95029x +Parallel speedup: 0.138816x +Case 8 (64x128x64): +Naive time: 0.00141142 seconds +Blocked time: 0.000503325 seconds +Parallel time: 0.00383612 seconds +Blocked speedup: 2.80419x +Parallel speedup: 
0.367929x +Case 8 (64x128x64): +Naive time: 0.00177035 seconds +Blocked time: 0.000531447 seconds +Parallel time: 0.0061977 seconds +Blocked speedup: 3.33119x +Parallel speedup: 0.285647x +Case 8 (64x128x64): +Naive time: 0.00140688 seconds +Blocked time: 0.000493215 seconds +Parallel time: 0.000851127 seconds +Blocked speedup: 2.85247x +Parallel speedup: 1.65296x +Case 8 (64x128x64): +Naive time: 0.001436 seconds +Blocked time: 0.000497994 seconds +Parallel time: 0.000873299 seconds +Blocked speedup: 2.88356x +Parallel speedup: 1.64433x +Case 8 (64x128x64): +Naive time: 0.00141581 seconds +Blocked time: 0.00048522 seconds +Parallel time: 0.000853722 seconds +Blocked speedup: 2.91786x +Parallel speedup: 1.65839x +Case 8 (64x128x64): +Naive time: 0.00157888 seconds +Blocked time: 0.000492324 seconds +Parallel time: 0.000520857 seconds +Blocked speedup: 3.207x +Parallel speedup: 3.03132x +Case 8 (64x128x64): +Naive time: 0.00166055 seconds +Blocked time: 0.000487645 seconds +Parallel time: 0.000811452 seconds +Blocked speedup: 3.40524x +Parallel speedup: 2.04639x +Case 8 (64x128x64): +Naive time: 0.0014935 seconds +Blocked time: 0.000493216 seconds +Parallel time: 0.00396942 seconds +Blocked speedup: 3.02809x +Parallel speedup: 0.376252x +Case 8 (64x128x64): +Naive time: 0.00140645 seconds +Blocked time: 0.000487975 seconds +Parallel time: 0.00322503 seconds +Blocked speedup: 2.88222x +Parallel speedup: 0.436104x +Case 8 (64x128x64): +Naive time: 0.00142913 seconds +Blocked time: 0.000504606 seconds +Parallel time: 0.00309983 seconds +Blocked speedup: 2.83217x +Parallel speedup: 0.461036x +Case 8 (64x128x64): +Naive time: 0.00141652 seconds +Blocked time: 0.000491141 seconds +Parallel time: 0.00310775 seconds +Blocked speedup: 2.88414x +Parallel speedup: 0.455801x +Case 8 (64x128x64): +Naive time: 0.00141185 seconds +Blocked time: 0.000491943 seconds +Parallel time: 0.00190914 seconds +Blocked speedup: 2.86995x +Parallel speedup: 0.739521x +Case 8 (64x128x64): +Naive 
time: 0.00142468 seconds +Blocked time: 0.000527259 seconds +Parallel time: 0.00283984 seconds +Blocked speedup: 2.70206x +Parallel speedup: 0.501678x +Case 8 (64x128x64): +Naive time: 0.00140626 seconds +Blocked time: 0.000493546 seconds +Parallel time: 0.000519956 seconds +Blocked speedup: 2.8493x +Parallel speedup: 2.70457x +Case 8 (64x128x64): +Naive time: 0.00145559 seconds +Blocked time: 0.000538069 seconds +Parallel time: 0.00259105 seconds +Blocked speedup: 2.70521x +Parallel speedup: 0.561776x +Case 8 (64x128x64): +Naive time: 0.00140746 seconds +Blocked time: 0.000496692 seconds +Parallel time: 0.00287732 seconds +Blocked speedup: 2.83367x +Parallel speedup: 0.489157x +Case 8 (64x128x64): +Naive time: 0.00142039 seconds +Blocked time: 0.000493256 seconds +Parallel time: 0.00333618 seconds +Blocked speedup: 2.87961x +Parallel speedup: 0.425752x +Case 8 (64x128x64): +Naive time: 0.00152354 seconds +Blocked time: 0.000484028 seconds +Parallel time: 0.000833814 seconds +Blocked speedup: 3.14763x +Parallel speedup: 1.82719x +Case 8 (64x128x64): +Naive time: 0.00144646 seconds +Blocked time: 0.000489057 seconds +Parallel time: 0.00333323 seconds +Blocked speedup: 2.95766x +Parallel speedup: 0.433952x +Case 8 (64x128x64): +Naive time: 0.00141331 seconds +Blocked time: 0.00054346 seconds +Parallel time: 0.00273407 seconds +Blocked speedup: 2.60058x +Parallel speedup: 0.516926x +Case 8 (64x128x64): +Naive time: 0.00141837 seconds +Blocked time: 0.000513544 seconds +Parallel time: 0.00299075 seconds +Blocked speedup: 2.76193x +Parallel speedup: 0.474253x +Case 8 (64x128x64): +Naive time: 0.00140926 seconds +Blocked time: 0.000487485 seconds +Parallel time: 0.000838583 seconds +Blocked speedup: 2.89089x +Parallel speedup: 1.68053x +Case 8 (64x128x64): +Naive time: 0.00144396 seconds +Blocked time: 0.000496983 seconds +Parallel time: 0.00295315 seconds +Blocked speedup: 2.90545x +Parallel speedup: 0.488955x +Case 8 (64x128x64): +Naive time: 0.00141985 seconds 
+Blocked time: 0.000502142 seconds +Parallel time: 0.00296071 seconds +Blocked speedup: 2.82759x +Parallel speedup: 0.479565x +Case 8 (64x128x64): +Naive time: 0.001407 seconds +Blocked time: 0.000655981 seconds +Parallel time: 0.00258369 seconds +Blocked speedup: 2.14488x +Parallel speedup: 0.54457x +Case 8 (64x128x64): +Naive time: 0.00141121 seconds +Blocked time: 0.000492674 seconds +Parallel time: 0.00410671 seconds +Blocked speedup: 2.86438x +Parallel speedup: 0.343635x +Case 8 (64x128x64): +Naive time: 0.00178818 seconds +Blocked time: 0.000488857 seconds +Parallel time: 0.00252988 seconds +Blocked speedup: 3.65789x +Parallel speedup: 0.706827x +Case 8 (64x128x64): +Naive time: 0.00140708 seconds +Blocked time: 0.000488837 seconds +Parallel time: 0.000499698 seconds +Blocked speedup: 2.87842x +Parallel speedup: 2.81586x +Case 8 (64x128x64): +Naive time: 0.00143635 seconds +Blocked time: 0.000620334 seconds +Parallel time: 0.00306309 seconds +Blocked speedup: 2.31545x +Parallel speedup: 0.468924x +Case 8 (64x128x64): +Naive time: 0.00140373 seconds +Blocked time: 0.000526358 seconds +Parallel time: 0.000507211 seconds +Blocked speedup: 2.66688x +Parallel speedup: 2.76755x +Case 8 (64x128x64): +Naive time: 0.00140891 seconds +Blocked time: 0.000491261 seconds +Parallel time: 0.0075513 seconds +Blocked speedup: 2.86795x +Parallel speedup: 0.186579x +Case 8 (64x128x64): +Naive time: 0.00141661 seconds +Blocked time: 0.000490831 seconds +Parallel time: 0.00320736 seconds +Blocked speedup: 2.88614x +Parallel speedup: 0.441674x +Case 8 (64x128x64): +Naive time: 0.00141616 seconds +Blocked time: 0.000491702 seconds +Parallel time: 0.00049031 seconds +Blocked speedup: 2.88011x +Parallel speedup: 2.88829x +Case 8 (64x128x64): +Naive time: 0.00140581 seconds +Blocked time: 0.0005061 seconds +Parallel time: 0.00431193 seconds +Blocked speedup: 2.77773x +Parallel speedup: 0.326028x +Case 8 (64x128x64): +Naive time: 0.00152279 seconds +Blocked time: 0.000484028 seconds 
+Parallel time: 0.000842631 seconds +Blocked speedup: 3.14607x +Parallel speedup: 1.80718x +Case 8 (64x128x64): +Naive time: 0.00141412 seconds +Blocked time: 0.000488707 seconds +Parallel time: 0.00331882 seconds +Blocked speedup: 2.8936x +Parallel speedup: 0.426093x +Case 8 (64x128x64): +Naive time: 0.00140708 seconds +Blocked time: 0.000506189 seconds +Parallel time: 0.00723026 seconds +Blocked speedup: 2.77975x +Parallel speedup: 0.19461x +Case 8 (64x128x64): +Naive time: 0.00140868 seconds +Blocked time: 0.000581421 seconds +Parallel time: 0.000849924 seconds +Blocked speedup: 2.42283x +Parallel speedup: 1.65742x +Case 8 (64x128x64): +Naive time: 0.00140682 seconds +Blocked time: 0.000502052 seconds +Parallel time: 0.0025156 seconds +Blocked speedup: 2.80214x +Parallel speedup: 0.559238x +Case 8 (64x128x64): +Naive time: 0.00141772 seconds +Blocked time: 0.000480382 seconds +Parallel time: 0.000512041 seconds +Blocked speedup: 2.95123x +Parallel speedup: 2.76876x +Case 8 (64x128x64): +Naive time: 0.0014382 seconds +Blocked time: 0.000500299 seconds +Parallel time: 0.00303046 seconds +Blocked speedup: 2.87468x +Parallel speedup: 0.474581x +Case 8 (64x128x64): +Naive time: 0.00172011 seconds +Blocked time: 0.000494689 seconds +Parallel time: 0.00287581 seconds +Blocked speedup: 3.47715x +Parallel speedup: 0.59813x +Case 8 (64x128x64): +Naive time: 0.00142044 seconds +Blocked time: 0.000521509 seconds +Parallel time: 0.00289992 seconds +Blocked speedup: 2.72372x +Parallel speedup: 0.489822x +Case 8 (64x128x64): +Naive time: 0.00140636 seconds +Blocked time: 0.000488316 seconds +Parallel time: 0.0028826 seconds +Blocked speedup: 2.88002x +Parallel speedup: 0.487879x +Case 8 (64x128x64): +Naive time: 0.00141754 seconds +Blocked time: 0.000489809 seconds +Parallel time: 0.0030873 seconds +Blocked speedup: 2.89407x +Parallel speedup: 0.459152x +Case 8 (64x128x64): +Naive time: 0.00142327 seconds +Blocked time: 0.000498074 seconds +Parallel time: 0.00347564 seconds 
+Blocked speedup: 2.85755x +Parallel speedup: 0.409498x +Case 8 (64x128x64): +Naive time: 0.00141696 seconds +Blocked time: 0.000498025 seconds +Parallel time: 0.00590253 seconds +Blocked speedup: 2.84516x +Parallel speedup: 0.24006x +Case 8 (64x128x64): +Naive time: 0.00142784 seconds +Blocked time: 0.000498265 seconds +Parallel time: 0.00669348 seconds +Blocked speedup: 2.86562x +Parallel speedup: 0.213318x +Case 8 (64x128x64): +Naive time: 0.0014204 seconds +Blocked time: 0.000481013 seconds +Parallel time: 0.000839906 seconds +Blocked speedup: 2.95293x +Parallel speedup: 1.69114x +Case 8 (64x128x64): +Naive time: 0.00143871 seconds +Blocked time: 0.000491271 seconds +Parallel time: 0.00326441 seconds +Blocked speedup: 2.92855x +Parallel speedup: 0.440727x +Case 8 (64x128x64): +Naive time: 0.00141169 seconds +Blocked time: 0.000491853 seconds +Parallel time: 0.00838865 seconds +Blocked speedup: 2.87014x +Parallel speedup: 0.168286x +Case 8 (64x128x64): +Naive time: 0.00150384 seconds +Blocked time: 0.000487123 seconds +Parallel time: 0.00258412 seconds +Blocked speedup: 3.08719x +Parallel speedup: 0.581955x +Case 8 (64x128x64): +Naive time: 0.00142805 seconds +Blocked time: 0.000505228 seconds +Parallel time: 0.00797808 seconds +Blocked speedup: 2.82654x +Parallel speedup: 0.178997x +Case 8 (64x128x64): +Naive time: 0.00142315 seconds +Blocked time: 0.000482616 seconds +Parallel time: 0.000842791 seconds +Blocked speedup: 2.94882x +Parallel speedup: 1.68862x +Case 8 (64x128x64): +Naive time: 0.00141795 seconds +Blocked time: 0.000491112 seconds +Parallel time: 0.00317047 seconds +Blocked speedup: 2.88723x +Parallel speedup: 0.447237x +Case 8 (64x128x64): +Naive time: 0.00141515 seconds +Blocked time: 0.000543821 seconds +Parallel time: 0.00320163 seconds +Blocked speedup: 2.60223x +Parallel speedup: 0.442008x +Case 8 (64x128x64): +Naive time: 0.00140685 seconds +Blocked time: 0.000491121 seconds +Parallel time: 0.000808296 seconds +Blocked speedup: 2.86457x 
+Parallel speedup: 1.74051x +Case 8 (64x128x64): +Naive time: 0.00145343 seconds +Blocked time: 0.000494137 seconds +Parallel time: 0.000870784 seconds +Blocked speedup: 2.94134x +Parallel speedup: 1.6691x +Case 8 (64x128x64): +Naive time: 0.00145153 seconds +Blocked time: 0.000556905 seconds +Parallel time: 0.00292074 seconds +Blocked speedup: 2.60643x +Parallel speedup: 0.496975x +Case 8 (64x128x64): +Naive time: 0.00140676 seconds +Blocked time: 0.000487745 seconds +Parallel time: 0.0028824 seconds +Blocked speedup: 2.88421x +Parallel speedup: 0.488052x +Case 8 (64x128x64): +Naive time: 0.00141049 seconds +Blocked time: 0.000494377 seconds +Parallel time: 0.00223388 seconds +Blocked speedup: 2.85306x +Parallel speedup: 0.631406x +Case 8 (64x128x64): +Naive time: 0.0014208 seconds +Blocked time: 0.000626676 seconds +Parallel time: 0.00239316 seconds +Blocked speedup: 2.26719x +Parallel speedup: 0.59369x +Case 8 (64x128x64): +Naive time: 0.00141287 seconds +Blocked time: 0.00050092 seconds +Parallel time: 0.000834796 seconds +Blocked speedup: 2.82055x +Parallel speedup: 1.69247x diff --git a/results/benchmark9.txt b/results/benchmark9.txt new file mode 100644 index 0000000..903bae1 --- /dev/null +++ b/results/benchmark9.txt @@ -0,0 +1,600 @@ +Case 9 (256x256x257): +Naive time: 0.0458363 seconds +Blocked time: 0.0156956 seconds +Parallel time: 0.00997131 seconds +Blocked speedup: 2.92032x +Parallel speedup: 4.59681x +Case 9 (256x256x257): +Naive time: 0.0452691 seconds +Blocked time: 0.0154866 seconds +Parallel time: 0.00966331 seconds +Blocked speedup: 2.92312x +Parallel speedup: 4.68464x +Case 9 (256x256x257): +Naive time: 0.0449855 seconds +Blocked time: 0.0158177 seconds +Parallel time: 0.0099134 seconds +Blocked speedup: 2.84399x +Parallel speedup: 4.53784x +Case 9 (256x256x257): +Naive time: 0.0450339 seconds +Blocked time: 0.0157722 seconds +Parallel time: 0.0169654 seconds +Blocked speedup: 2.85527x +Parallel speedup: 2.65445x +Case 9 (256x256x257): +Naive 
time: 0.0456335 seconds +Blocked time: 0.0158952 seconds +Parallel time: 0.00959538 seconds +Blocked speedup: 2.87091x +Parallel speedup: 4.75578x +Case 9 (256x256x257): +Naive time: 0.045433 seconds +Blocked time: 0.0160423 seconds +Parallel time: 0.00972065 seconds +Blocked speedup: 2.83208x +Parallel speedup: 4.67387x +Case 9 (256x256x257): +Naive time: 0.0456344 seconds +Blocked time: 0.0157404 seconds +Parallel time: 0.00989929 seconds +Blocked speedup: 2.89918x +Parallel speedup: 4.60986x +Case 9 (256x256x257): +Naive time: 0.0459051 seconds +Blocked time: 0.0157189 seconds +Parallel time: 0.0114866 seconds +Blocked speedup: 2.92038x +Parallel speedup: 3.99641x +Case 9 (256x256x257): +Naive time: 0.0451341 seconds +Blocked time: 0.015755 seconds +Parallel time: 0.0125654 seconds +Blocked speedup: 2.86475x +Parallel speedup: 3.59192x +Case 9 (256x256x257): +Naive time: 0.0454338 seconds +Blocked time: 0.0159955 seconds +Parallel time: 0.00905122 seconds +Blocked speedup: 2.84041x +Parallel speedup: 5.01964x +Case 9 (256x256x257): +Naive time: 0.0456228 seconds +Blocked time: 0.015856 seconds +Parallel time: 0.0183269 seconds +Blocked speedup: 2.87732x +Parallel speedup: 2.48939x +Case 9 (256x256x257): +Naive time: 0.0455372 seconds +Blocked time: 0.0158361 seconds +Parallel time: 0.0100541 seconds +Blocked speedup: 2.87552x +Parallel speedup: 4.52923x +Case 9 (256x256x257): +Naive time: 0.0459496 seconds +Blocked time: 0.0156864 seconds +Parallel time: 0.00955539 seconds +Blocked speedup: 2.92926x +Parallel speedup: 4.80877x +Case 9 (256x256x257): +Naive time: 0.0452856 seconds +Blocked time: 0.0155983 seconds +Parallel time: 0.0110828 seconds +Blocked speedup: 2.90324x +Parallel speedup: 4.08609x +Case 9 (256x256x257): +Naive time: 0.0454791 seconds +Blocked time: 0.0158121 seconds +Parallel time: 0.0164942 seconds +Blocked speedup: 2.87622x +Parallel speedup: 2.75729x +Case 9 (256x256x257): +Naive time: 0.0458192 seconds +Blocked time: 0.015536 seconds 
+Parallel time: 0.00984169 seconds +Blocked speedup: 2.94923x +Parallel speedup: 4.65562x +Case 9 (256x256x257): +Naive time: 0.044792 seconds +Blocked time: 0.0158106 seconds +Parallel time: 0.00979835 seconds +Blocked speedup: 2.83303x +Parallel speedup: 4.57138x +Case 9 (256x256x257): +Naive time: 0.0454449 seconds +Blocked time: 0.0156567 seconds +Parallel time: 0.0100306 seconds +Blocked speedup: 2.90259x +Parallel speedup: 4.53065x +Case 9 (256x256x257): +Naive time: 0.0447508 seconds +Blocked time: 0.0155715 seconds +Parallel time: 0.0097049 seconds +Blocked speedup: 2.8739x +Parallel speedup: 4.61116x +Case 9 (256x256x257): +Naive time: 0.0447298 seconds +Blocked time: 0.0156236 seconds +Parallel time: 0.0100422 seconds +Blocked speedup: 2.86296x +Parallel speedup: 4.4542x +Case 9 (256x256x257): +Naive time: 0.0448764 seconds +Blocked time: 0.0158559 seconds +Parallel time: 0.00986414 seconds +Blocked speedup: 2.83027x +Parallel speedup: 4.54945x +Case 9 (256x256x257): +Naive time: 0.0457666 seconds +Blocked time: 0.0158265 seconds +Parallel time: 0.00999937 seconds +Blocked speedup: 2.89178x +Parallel speedup: 4.57695x +Case 9 (256x256x257): +Naive time: 0.0453138 seconds +Blocked time: 0.0155756 seconds +Parallel time: 0.0151195 seconds +Blocked speedup: 2.90928x +Parallel speedup: 2.99704x +Case 9 (256x256x257): +Naive time: 0.0456052 seconds +Blocked time: 0.0156793 seconds +Parallel time: 0.00991958 seconds +Blocked speedup: 2.90862x +Parallel speedup: 4.59749x +Case 9 (256x256x257): +Naive time: 0.0457349 seconds +Blocked time: 0.0157651 seconds +Parallel time: 0.00949613 seconds +Blocked speedup: 2.90102x +Parallel speedup: 4.81617x +Case 9 (256x256x257): +Naive time: 0.0454875 seconds +Blocked time: 0.0156938 seconds +Parallel time: 0.00952874 seconds +Blocked speedup: 2.89844x +Parallel speedup: 4.77372x +Case 9 (256x256x257): +Naive time: 0.0450199 seconds +Blocked time: 0.0156372 seconds +Parallel time: 0.0101262 seconds +Blocked speedup: 
2.87903x +Parallel speedup: 4.4459x +Case 9 (256x256x257): +Naive time: 0.0449061 seconds +Blocked time: 0.0155124 seconds +Parallel time: 0.00960486 seconds +Blocked speedup: 2.89484x +Parallel speedup: 4.67535x +Case 9 (256x256x257): +Naive time: 0.0455697 seconds +Blocked time: 0.0158853 seconds +Parallel time: 0.00976411 seconds +Blocked speedup: 2.86866x +Parallel speedup: 4.66706x +Case 9 (256x256x257): +Naive time: 0.0454374 seconds +Blocked time: 0.0154289 seconds +Parallel time: 0.00988387 seconds +Blocked speedup: 2.94495x +Parallel speedup: 4.59712x +Case 9 (256x256x257): +Naive time: 0.0449716 seconds +Blocked time: 0.0156081 seconds +Parallel time: 0.00993053 seconds +Blocked speedup: 2.8813x +Parallel speedup: 4.52862x +Case 9 (256x256x257): +Naive time: 0.0452002 seconds +Blocked time: 0.0155331 seconds +Parallel time: 0.0101849 seconds +Blocked speedup: 2.90994x +Parallel speedup: 4.43798x +Case 9 (256x256x257): +Naive time: 0.045012 seconds +Blocked time: 0.015789 seconds +Parallel time: 0.0097909 seconds +Blocked speedup: 2.85085x +Parallel speedup: 4.59733x +Case 9 (256x256x257): +Naive time: 0.04514 seconds +Blocked time: 0.0156814 seconds +Parallel time: 0.0141484 seconds +Blocked speedup: 2.87858x +Parallel speedup: 3.19047x +Case 9 (256x256x257): +Naive time: 0.0455188 seconds +Blocked time: 0.0158033 seconds +Parallel time: 0.00980702 seconds +Blocked speedup: 2.88033x +Parallel speedup: 4.64145x +Case 9 (256x256x257): +Naive time: 0.0446889 seconds +Blocked time: 0.0158532 seconds +Parallel time: 0.0102104 seconds +Blocked speedup: 2.81891x +Parallel speedup: 4.37679x +Case 9 (256x256x257): +Naive time: 0.045889 seconds +Blocked time: 0.0162247 seconds +Parallel time: 0.0174762 seconds +Blocked speedup: 2.82833x +Parallel speedup: 2.6258x +Case 9 (256x256x257): +Naive time: 0.0449065 seconds +Blocked time: 0.0155723 seconds +Parallel time: 0.0100989 seconds +Blocked speedup: 2.88375x +Parallel speedup: 4.44665x +Case 9 (256x256x257): +Naive 
time: 0.0456427 seconds +Blocked time: 0.0155387 seconds +Parallel time: 0.0103228 seconds +Blocked speedup: 2.93736x +Parallel speedup: 4.42152x +Case 9 (256x256x257): +Naive time: 0.0447598 seconds +Blocked time: 0.0156182 seconds +Parallel time: 0.00895316 seconds +Blocked speedup: 2.86587x +Parallel speedup: 4.99933x +Case 9 (256x256x257): +Naive time: 0.0449452 seconds +Blocked time: 0.01547 seconds +Parallel time: 0.0107586 seconds +Blocked speedup: 2.90531x +Parallel speedup: 4.17759x +Case 9 (256x256x257): +Naive time: 0.0449798 seconds +Blocked time: 0.0157805 seconds +Parallel time: 0.00978623 seconds +Blocked speedup: 2.85035x +Parallel speedup: 4.59623x +Case 9 (256x256x257): +Naive time: 0.0453498 seconds +Blocked time: 0.0155345 seconds +Parallel time: 0.00982009 seconds +Blocked speedup: 2.9193x +Parallel speedup: 4.61806x +Case 9 (256x256x257): +Naive time: 0.0447243 seconds +Blocked time: 0.0155075 seconds +Parallel time: 0.00972437 seconds +Blocked speedup: 2.88404x +Parallel speedup: 4.5992x +Case 9 (256x256x257): +Naive time: 0.0454752 seconds +Blocked time: 0.0157514 seconds +Parallel time: 0.0100175 seconds +Blocked speedup: 2.88706x +Parallel speedup: 4.53957x +Case 9 (256x256x257): +Naive time: 0.0451681 seconds +Blocked time: 0.0155438 seconds +Parallel time: 0.0169812 seconds +Blocked speedup: 2.90585x +Parallel speedup: 2.6599x +Case 9 (256x256x257): +Naive time: 0.0453981 seconds +Blocked time: 0.0155574 seconds +Parallel time: 0.00997469 seconds +Blocked speedup: 2.91811x +Parallel speedup: 4.55133x +Case 9 (256x256x257): +Naive time: 0.0449871 seconds +Blocked time: 0.0153905 seconds +Parallel time: 0.012318 seconds +Blocked speedup: 2.92305x +Parallel speedup: 3.65214x +Case 9 (256x256x257): +Naive time: 0.0452925 seconds +Blocked time: 0.016027 seconds +Parallel time: 0.0171921 seconds +Blocked speedup: 2.82601x +Parallel speedup: 2.63449x +Case 9 (256x256x257): +Naive time: 0.0451727 seconds +Blocked time: 0.0156166 seconds 
+Parallel time: 0.00987122 seconds +Blocked speedup: 2.8926x +Parallel speedup: 4.5762x +Case 9 (256x256x257): +Naive time: 0.0447709 seconds +Blocked time: 0.015487 seconds +Parallel time: 0.00984545 seconds +Blocked speedup: 2.89087x +Parallel speedup: 4.54736x +Case 9 (256x256x257): +Naive time: 0.0453117 seconds +Blocked time: 0.0157616 seconds +Parallel time: 0.00906524 seconds +Blocked speedup: 2.87482x +Parallel speedup: 4.9984x +Case 9 (256x256x257): +Naive time: 0.0456947 seconds +Blocked time: 0.0158839 seconds +Parallel time: 0.0116593 seconds +Blocked speedup: 2.8768x +Parallel speedup: 3.91915x +Case 9 (256x256x257): +Naive time: 0.0455831 seconds +Blocked time: 0.0160981 seconds +Parallel time: 0.00967608 seconds +Blocked speedup: 2.83157x +Parallel speedup: 4.7109x +Case 9 (256x256x257): +Naive time: 0.045085 seconds +Blocked time: 0.015668 seconds +Parallel time: 0.00998108 seconds +Blocked speedup: 2.87752x +Parallel speedup: 4.51705x +Case 9 (256x256x257): +Naive time: 0.0450644 seconds +Blocked time: 0.0158224 seconds +Parallel time: 0.0107814 seconds +Blocked speedup: 2.84814x +Parallel speedup: 4.17982x +Case 9 (256x256x257): +Naive time: 0.0456737 seconds +Blocked time: 0.0155454 seconds +Parallel time: 0.0103399 seconds +Blocked speedup: 2.93808x +Parallel speedup: 4.41723x +Case 9 (256x256x257): +Naive time: 0.0451516 seconds +Blocked time: 0.0155683 seconds +Parallel time: 0.0100115 seconds +Blocked speedup: 2.90022x +Parallel speedup: 4.50996x +Case 9 (256x256x257): +Naive time: 0.0457319 seconds +Blocked time: 0.0159523 seconds +Parallel time: 0.00983904 seconds +Blocked speedup: 2.86679x +Parallel speedup: 4.648x +Case 9 (256x256x257): +Naive time: 0.0451859 seconds +Blocked time: 0.0156693 seconds +Parallel time: 0.00967706 seconds +Blocked speedup: 2.88371x +Parallel speedup: 4.66938x +Case 9 (256x256x257): +Naive time: 0.0447029 seconds +Blocked time: 0.0155605 seconds +Parallel time: 0.009624 seconds +Blocked speedup: 2.87284x 
+Parallel speedup: 4.64494x +Case 9 (256x256x257): +Naive time: 0.0447364 seconds +Blocked time: 0.0162374 seconds +Parallel time: 0.0095948 seconds +Blocked speedup: 2.75514x +Parallel speedup: 4.66257x +Case 9 (256x256x257): +Naive time: 0.044913 seconds +Blocked time: 0.0157358 seconds +Parallel time: 0.00890298 seconds +Blocked speedup: 2.8542x +Parallel speedup: 5.04471x +Case 9 (256x256x257): +Naive time: 0.0450081 seconds +Blocked time: 0.0156911 seconds +Parallel time: 0.0190034 seconds +Blocked speedup: 2.86838x +Parallel speedup: 2.36843x +Case 9 (256x256x257): +Naive time: 0.0467208 seconds +Blocked time: 0.0161946 seconds +Parallel time: 0.00976039 seconds +Blocked speedup: 2.88496x +Parallel speedup: 4.78678x +Case 9 (256x256x257): +Naive time: 0.0455111 seconds +Blocked time: 0.0156734 seconds +Parallel time: 0.00991084 seconds +Blocked speedup: 2.90372x +Parallel speedup: 4.59206x +Case 9 (256x256x257): +Naive time: 0.044853 seconds +Blocked time: 0.015733 seconds +Parallel time: 0.00991039 seconds +Blocked speedup: 2.85089x +Parallel speedup: 4.52586x +Case 9 (256x256x257): +Naive time: 0.0450302 seconds +Blocked time: 0.0156712 seconds +Parallel time: 0.01053 seconds +Blocked speedup: 2.87344x +Parallel speedup: 4.27638x +Case 9 (256x256x257): +Naive time: 0.0449579 seconds +Blocked time: 0.0157764 seconds +Parallel time: 0.010577 seconds +Blocked speedup: 2.84969x +Parallel speedup: 4.25052x +Case 9 (256x256x257): +Naive time: 0.0449595 seconds +Blocked time: 0.0157129 seconds +Parallel time: 0.0101034 seconds +Blocked speedup: 2.86131x +Parallel speedup: 4.44992x +Case 9 (256x256x257): +Naive time: 0.0456928 seconds +Blocked time: 0.0158497 seconds +Parallel time: 0.00972843 seconds +Blocked speedup: 2.88287x +Parallel speedup: 4.69683x +Case 9 (256x256x257): +Naive time: 0.0447145 seconds +Blocked time: 0.015752 seconds +Parallel time: 0.0101454 seconds +Blocked speedup: 2.83865x +Parallel speedup: 4.40738x +Case 9 (256x256x257): +Naive time: 
0.052914 seconds +Blocked time: 0.0274547 seconds +Parallel time: 0.0100565 seconds +Blocked speedup: 1.92732x +Parallel speedup: 5.26169x +Case 9 (256x256x257): +Naive time: 0.0459994 seconds +Blocked time: 0.0164421 seconds +Parallel time: 0.00982636 seconds +Blocked speedup: 2.79767x +Parallel speedup: 4.68123x +Case 9 (256x256x257): +Naive time: 0.0461508 seconds +Blocked time: 0.0163674 seconds +Parallel time: 0.0179622 seconds +Blocked speedup: 2.81967x +Parallel speedup: 2.56933x +Case 9 (256x256x257): +Naive time: 0.0484875 seconds +Blocked time: 0.0160423 seconds +Parallel time: 0.00997365 seconds +Blocked speedup: 3.02248x +Parallel speedup: 4.86156x +Case 9 (256x256x257): +Naive time: 0.0484594 seconds +Blocked time: 0.0167374 seconds +Parallel time: 0.0154817 seconds +Blocked speedup: 2.89528x +Parallel speedup: 3.13011x +Case 9 (256x256x257): +Naive time: 0.0508698 seconds +Blocked time: 0.0158154 seconds +Parallel time: 0.010242 seconds +Blocked speedup: 3.21648x +Parallel speedup: 4.96678x +Case 9 (256x256x257): +Naive time: 0.0462769 seconds +Blocked time: 0.0155248 seconds +Parallel time: 0.0101148 seconds +Blocked speedup: 2.98084x +Parallel speedup: 4.57517x +Case 9 (256x256x257): +Naive time: 0.0478485 seconds +Blocked time: 0.0161021 seconds +Parallel time: 0.0103969 seconds +Blocked speedup: 2.97158x +Parallel speedup: 4.6022x +Case 9 (256x256x257): +Naive time: 0.045476 seconds +Blocked time: 0.0160659 seconds +Parallel time: 0.011371 seconds +Blocked speedup: 2.83059x +Parallel speedup: 3.99928x +Case 9 (256x256x257): +Naive time: 0.0455981 seconds +Blocked time: 0.0155369 seconds +Parallel time: 0.00945442 seconds +Blocked speedup: 2.93482x +Parallel speedup: 4.82294x +Case 9 (256x256x257): +Naive time: 0.0455443 seconds +Blocked time: 0.0156662 seconds +Parallel time: 0.0092576 seconds +Blocked speedup: 2.90717x +Parallel speedup: 4.91967x +Case 9 (256x256x257): +Naive time: 0.0460333 seconds +Blocked time: 0.0159344 seconds +Parallel 
time: 0.0100237 seconds +Blocked speedup: 2.88892x +Parallel speedup: 4.59243x +Case 9 (256x256x257): +Naive time: 0.0461325 seconds +Blocked time: 0.0159006 seconds +Parallel time: 0.0105871 seconds +Blocked speedup: 2.9013x +Parallel speedup: 4.35744x +Case 9 (256x256x257): +Naive time: 0.0455614 seconds +Blocked time: 0.0157337 seconds +Parallel time: 0.0100294 seconds +Blocked speedup: 2.89579x +Parallel speedup: 4.54278x +Case 9 (256x256x257): +Naive time: 0.0488668 seconds +Blocked time: 0.0165166 seconds +Parallel time: 0.00923295 seconds +Blocked speedup: 2.95865x +Parallel speedup: 5.29265x +Case 9 (256x256x257): +Naive time: 0.0457618 seconds +Blocked time: 0.0160988 seconds +Parallel time: 0.00985999 seconds +Blocked speedup: 2.84256x +Parallel speedup: 4.64116x +Case 9 (256x256x257): +Naive time: 0.0459279 seconds +Blocked time: 0.0159577 seconds +Parallel time: 0.00922182 seconds +Blocked speedup: 2.8781x +Parallel speedup: 4.98035x +Case 9 (256x256x257): +Naive time: 0.0454468 seconds +Blocked time: 0.0158052 seconds +Parallel time: 0.00969373 seconds +Blocked speedup: 2.87542x +Parallel speedup: 4.68826x +Case 9 (256x256x257): +Naive time: 0.0476042 seconds +Blocked time: 0.0160627 seconds +Parallel time: 0.0109184 seconds +Blocked speedup: 2.96364x +Parallel speedup: 4.35999x +Case 9 (256x256x257): +Naive time: 0.0462428 seconds +Blocked time: 0.0158922 seconds +Parallel time: 0.00968015 seconds +Blocked speedup: 2.90978x +Parallel speedup: 4.77707x +Case 9 (256x256x257): +Naive time: 0.0462012 seconds +Blocked time: 0.0157565 seconds +Parallel time: 0.0104213 seconds +Blocked speedup: 2.9322x +Parallel speedup: 4.43332x +Case 9 (256x256x257): +Naive time: 0.0453675 seconds +Blocked time: 0.0158135 seconds +Parallel time: 0.0102043 seconds +Blocked speedup: 2.86891x +Parallel speedup: 4.44591x +Case 9 (256x256x257): +Naive time: 0.0455999 seconds +Blocked time: 0.0155676 seconds +Parallel time: 0.010077 seconds +Blocked speedup: 2.92916x +Parallel 
speedup: 4.52514x +Case 9 (256x256x257): +Naive time: 0.0450715 seconds +Blocked time: 0.0154219 seconds +Parallel time: 0.0100226 seconds +Blocked speedup: 2.92257x +Parallel speedup: 4.49699x +Case 9 (256x256x257): +Naive time: 0.0450698 seconds +Blocked time: 0.0155349 seconds +Parallel time: 0.00967175 seconds +Blocked speedup: 2.90119x +Parallel speedup: 4.65994x +Case 9 (256x256x257): +Naive time: 0.0450132 seconds +Blocked time: 0.0154666 seconds +Parallel time: 0.0106215 seconds +Blocked speedup: 2.91035x +Parallel speedup: 4.23793x +Case 9 (256x256x257): +Naive time: 0.0452847 seconds +Blocked time: 0.0157047 seconds +Parallel time: 0.00969443 seconds +Blocked speedup: 2.88351x +Parallel speedup: 4.67121x +Case 9 (256x256x257): +Naive time: 0.0453644 seconds +Blocked time: 0.0154472 seconds +Parallel time: 0.00991193 seconds +Blocked speedup: 2.93675x +Parallel speedup: 4.57675x diff --git a/results/benchmark_summary.csv b/results/benchmark_summary.csv new file mode 100644 index 0000000..3d880a9 --- /dev/null +++ b/results/benchmark_summary.csv @@ -0,0 +1,11 @@ +Test Case,Dimensions (m x n x p),Naive Time (s),Blocked Time (s),Parallel Time (s),Blocked Speedup,Parallel Speedup +0,64x64x64,0.000716916,0.000267748,0.00418022,2.69936,0.316059 +1,128x64x128,0.00276601,0.000983891,0.00202219,2.81429,1.67944 +2,100x128x56,0.00192184,0.000693677,0.00407294,2.79296,0.697054 +3,128x64x128,0.00278392,0.00099082,0.00232068,2.81336,1.61629 +4,32x128x32,0.000362589,0.000133682,0.00373411,2.74501,0.302438 +5,200x100x256,0.0140396,0.00496904,0.0047582,2.82651,3.2177 +6,256x256x256,0.0455719,0.0154318,0.0108826,2.95349,4.32004 +7,256x300x256,0.0536304,0.0185423,0.0134807,2.89548,4.17668 +8,64x128x64,0.00143829,0.000509443,0.0031398,2.84041,0.887305 +9,256x256x257,0.0456388,0.0158858,0.0107711,2.88015,4.36675 From 5a2eded80cdefea0a4da05695d46de4d2d3c1f64 Mon Sep 17 00:00:00 2001 From: Tristan AMIOTTE-SUCHET Date: Sun, 31 May 2026 23:46:55 +0300 Subject: [PATCH 7/7] writeup 
--- writeup.md | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 writeup.md diff --git a/writeup.md b/writeup.md new file mode 100644 index 0000000..3981774 --- /dev/null +++ b/writeup.md @@ -0,0 +1,90 @@ +# Parallel Programming + +**Åbo Akademi University, Information Technology Department** + +**Instructor: Alireza Olama** + +**Student: Tristan Amiotte-Suchet** + +**Student ID: 2501127** + +## Homework Assignment 4: Optimizing Matrix Multiplication in C++ + +**Due Date**: 31/05/2026 + +**Points**: 100 + +--- + +## Challenge of the Assignment + +In this assignment, we are tasked with optimizing the performance of a naive matrix multiplication implementation in C++ using two techniques: cache optimization via blocked matrix multiplication and parallelization using OpenMP. + +Before starting the optimizations, it is crucial to ensure that the naive matrix multiplication implementation is correct. Also, because all the benchmarks and tests rely on the correctness of the matrix multiplication, validating the results against a reference implementation is essential. It is therefore important to start by implementing the validation functions and all the other ones that are used to read and write the matrix files. + +Let's take a brief look at this pre-optimization phase and at the next steps taken when implementing the different optimizations. + +### Pre-Optimization Phase + +I decided to split the work and implement the read/write code in a separate library called `matfile` to keep the code organized and modular. This also allows me to later write a side program that generates random matrices with specific dimensions for testing, without having to duplicate the read/write code. So my implementation of the work in the `main_ans.cpp` file has the same structure as the one provided in `main.cpp`. 
+ +### Cache Optimization (Blocked Matrix Multiplication) + +This part is the one that caused me the most difficulty. At the beginning, I implemented it using the pseudocode provided in the assignment instructions. However, I quickly ran into a performance issue: even with all the different block sizes I tried, there was no speedup on the provided matrices. Using my side executable to generate bigger matrices like 1000x1000, I was finally able to notice a small speedup, between 1.4x and 1.6x in general. However, even with bigger matrices the gain was not significant. Maybe the fact that the first implementation does not increase the speedup is due to my hardware limitations, but I unfortunately have no idea about the exact cause. The code used at that point is the following: + +```cpp +void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) +{ + // A is m x n, B is n x p, C is m x p + // Use block_size to divide matrices into submatrices + for (uint32_t ii = 0; ii < m; ii += BLOCK_SIZE) { + for (uint32_t kk = 0; kk < n; kk += BLOCK_SIZE) { + for (uint32_t jj = 0; jj < p; jj += BLOCK_SIZE) { + + uint32_t i_end = std::min(ii + BLOCK_SIZE, m); + uint32_t k_end = std::min(kk + BLOCK_SIZE, n); + uint32_t j_end = std::min(jj + BLOCK_SIZE, p); + + for (uint32_t i = ii; i < i_end; ++i) { + for (uint32_t k = kk; k < k_end; ++k) { + + float aik = A[i * n + k]; + + for (uint32_t j = jj; j < j_end; ++j) { + C[i * p + j] += + aik * B[k * p + j]; + } + } + } + } + } + } +} +``` + +After that, I decided to use a different approach. The concept is still the same and I continue to divide the matrices into blocks, but I also use a register blocking technique. The idea is to load the values of the output matrix \( C \) into registers, and then perform the calculations for a small block of columns (e.g., 8 columns at a time) while keeping the intermediate results in registers. 
This way, we can reduce the number of memory accesses and take advantage of the CPU's ability to perform multiple operations on the data stored in registers. With this new approach, I was able to achieve a much better speedup, around 2.8x in general on the provided matrices. The code used for this new approach is the one available in `main_ans.cpp`, in the last committed version of the `blocked_matmul` function. + +### Parallel Matrix Multiplication using OpenMP + +This one was more straightforward to implement. I just had to add the OpenMP pragma to the naive matrix multiplication implementation as explained in the assignment instructions, placing exactly the following line, `#pragma omp parallel for`, before the first loop of the naive matrix multiplication implementation. + +I also tried to vary the number of threads used by the parallel implementation by setting the `OMP_NUM_THREADS` environment variable. I noticed that up to 4 threads the speedup is perfectly linear, but beyond that it stays a bit over 4x. This is probably because of the overhead of creating threads and the fact that my CPU has 4 physical cores, so using more than 4 threads does not provide any additional performance benefit. + +## Results and benchmarks + +After implementing both optimizations, I wrote a simple shell script to run my program several times on each of the provided matrices. The script is available in the `benchmark.sh` file. 
The results are shown in the following table: + +| Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup | +|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------| +| 0 | 64x64x64 | 0.000716916 | 0.000267748 | 0.00418022 | 2.69936 | 0.316059 | +| 1 | 128x64x128 | 0.00276601 | 0.000983891 | 0.00202219 | 2.81429 | 1.67944 | +| 2 | 100x128x56 | 0.00192184 | 0.000693677 | 0.00407294 | 2.79296 | 0.697054 | +| 3 | 128x64x128 | 0.00278392 | 0.00099082 | 0.00232068 | 2.81336 | 1.61629 | +| 4 | 32x128x32 | 0.000362589 | 0.000133682 | 0.00373411 | 2.74501 | 0.302438 | +| 5 | 200x100x256 | 0.0140396 | 0.00496904 | 0.0047582 | 2.82651 | 3.2177 | +| 6 | 256x256x256 | 0.0455719 | 0.0154318 | 0.0108826 | 2.95349 | 4.32004 | +| 7 | 256x300x256 | 0.0536304 | 0.0185423 | 0.0134807 | 2.89548 | 4.17668 | +| 8 | 64x128x64 | 0.00143829 | 0.000509443 | 0.0031398 | 2.84041 | 0.887305 | +| 9 | 256x256x257 | 0.0456388 | 0.0158858 | 0.0107711 | 2.88015 | 4.36675 | + +We can see in this table that the blocked matrix multiplication implementation provides a significant speedup over the naive implementation, with an average speedup of around 2.8x across all test cases. The parallel matrix multiplication implementation also provides a significant speedup on the larger matrices (above 4x), with an average speedup of around 2.2x across all test cases. However, the speedup from parallelization is not consistent across all test cases, and in some cases, it is even slower than the naive implementation. This could be due to various factors such as the overhead of creating threads, the size of the matrices, and the number of available CPU cores. The blocked version, on the other hand, consistently outperforms the naive version, and the speedup is roughly the same across all test cases, which indicates that the cache optimization technique is effective regardless of the matrix size.