diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3a1ad18 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +build/ +data/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b04fd0..156eaeb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,10 +2,12 @@ cmake_minimum_required(VERSION 3.10) project(MatrixMultiplication) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 23) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + find_package(OpenMP REQUIRED) if(OpenMP_CXX_FOUND) @@ -17,9 +19,10 @@ if(APPLE) endif() -add_executable(matmul main_ans.cpp) +add_executable(matmul main.cpp) +target_compile_options(matmul PRIVATE -g -O3) if(OpenMP_CXX_FOUND) target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX) -endif() \ No newline at end of file +endif() diff --git a/README.md b/README.md index 51c7f2a..b2432bd 100644 --- a/README.md +++ b/README.md @@ -235,3 +235,11 @@ git push origin student-name - Use small test cases to debug your blocked and parallel implementations. Good luck, and enjoy optimizing your matrix multiplication! + +## Report + +The table of results is in `timings.csv` and also `report.pdf`, +please read report.pdf, it has a table and graph of the results. + +`report.typ` has the plaintext report before compilation. It depends on +my personal assignment template so it is not of much use. diff --git a/main.cpp b/main.cpp index 65bf108..c92e7a8 100644 --- a/main.cpp +++ b/main.cpp @@ -1,114 +1,237 @@ -#include +#include +#include +#include #include -#include +#include +#include #include -#include - -void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - //TODO : Implement naive matrix multiplication -} - -void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) { - // TODO: Implement blocked matrix multiplication - // A is m x n, B is n x p, C is m x p - // Use block_size to divide matrices into submatrices -} - -void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - // TODO: Implement parallel matrix multiplication using OpenMP - // A is m x n, B is n x p, C is m x p -} +#include +#include +#include -bool validate_result(const std::string &result_file, const std::string &reference_file) { - //TODO : Implement result validation +void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, + uint32_t p) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < p; j++) { + for (int k = 0; k < n; k++) { + C[i * p + j] += A[i * n + k] * B[k * p + j]; + } + } + } } -int main(int argc, char *argv[]) { - if (argc != 2) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; - return 1; +void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, + uint32_t p, uint32_t block_size) { + for (int ii = 0; ii < m; ii += block_size) { + for (int jj = 0; jj < p; jj += block_size) { + for (int kk = 0; kk < n; kk += block_size) { + // Process block: C[ii:ii+block_size, jj:jj+block_size] += + // A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, + // jj:jj+block_size] + for (int i = ii; i < std::min(ii + block_size, m); i++) { + for (int j = jj; j < std::min(jj + block_size, p); j++) { + for (int k = kk; k < std::min(kk + block_size, n); k++) { + C[i * p + j] += A[i * n + k] * B[k * p + j]; + } + } + } + } } + } +} - int case_number = std::atoi(argv[1]); - if (case_number < 0 || case_number > 9) { - std::cerr << "Case number must be between 0 and 9" << std::endl; - return 1; +void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, + uint32_t p) { +#pragma omp parallel for + for (int i = 0; i < m; i++) { + for (int j = 0; j < p; j++) { + for (int k = 0; k < n; k++) { + C[i * p + j] += A[i * n + k] * B[k * p + j]; + } } + } +} - // Construct file paths - std::string folder = "data/" + std::to_string(case_number) + "/"; - std::string input0_file = folder + "input0.raw"; - std::string input1_file = folder + "input1.raw"; - std::string result_file = folder + "result.raw"; - std::string reference_file = folder + "output.raw"; - - // TODO Read input0.raw (matrix A) - - - // TODO Read input1.raw (matrix B) - - - // Allocate memory for result matrices - float *C_naive = new float[m * p]; - float *C_blocked = new float[m * p]; - float *C_parallel = new float[m * p]; - - // Measure performance of naive_matmul - double start_time = omp_get_wtime(); - naive_matmul(C_naive, A, B, m, n, p); - double naive_time = omp_get_wtime() - start_time; - - // TODO Write naive result to file - - - // Validate naive result - bool naive_correct = validate_result(result_file, reference_file); - if (!naive_correct) { - std::cerr << "Naive result validation failed for case " << case_number << std::endl; +bool validate_result(const std::string &result_file, + const std::string &reference_file) { + std::ifstream res_file(result_file); + std::ifstream ref_file(reference_file); + + int res_dimsM, ref_dimsM; + int res_dimsN, ref_dimsN; + float curr_res, curr_ref; + + res_file >> res_dimsM; + ref_file >> ref_dimsM; + res_file >> res_dimsN; + ref_file >> ref_dimsN; + if (res_dimsM != ref_dimsM || res_dimsN != ref_dimsN) { + return false; + } + int M = res_dimsM; + int N = res_dimsN; + const int total = M * N; + for (int i = 0; i < total; ++i) { + res_file >> curr_res; + ref_file >> curr_ref; + if (curr_res != curr_ref) { + std::cout << "Got: " << curr_res << " Expected:" << curr_ref + << " at index = " << i << std::endl; + return false; } + } + return true; +} - // Measure performance of blocked_matmul (use block_size = 32 as default) - start_time = omp_get_wtime(); - blocked_matmul(C_blocked, A, B, m, n, p, 32); - double blocked_time = omp_get_wtime() - start_time; - - // TODO Write blocked result to file - - - // Validate blocked result - bool blocked_correct = validate_result(result_file, reference_file); - if (!blocked_correct) { - std::cerr << "Blocked result validation failed for case " << case_number << std::endl; +void write_float_matrix(const std::string filename, const float *data, int M, + int N) { + std::ofstream file(filename); + std::string result; + file << M << " " << N << "\n"; + std::stringstream ss; + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + // setting the float precision explicitly to be 2 decimals to match input + // file format + file << std::fixed << std::setprecision(2) << data[i * N + j]; + // Add whitespaces as long as we have not hit a new row of the matrix + if (j < N - 1) + file << " "; } + file << "\n"; + } + file.close(); +} - // Measure performance of parallel_matmul - start_time = omp_get_wtime(); - parallel_matmul(C_parallel, A, B, m, n, p); - double parallel_time = omp_get_wtime() - start_time; - - // TODO Write parallel result to file - - - // Validate parallel result - bool parallel_correct = validate_result(result_file, reference_file); - if (!parallel_correct) { - std::cerr << "Parallel result validation failed for case " << case_number << std::endl; +std::pair read_float_matrix(const std::string filename, + float **data_ptr) { + std::ifstream file(filename); + int M, N; + file >> M; + file >> N; + float *data = (float *)calloc(M * N, sizeof(float)); + + const int total = M * N; + for (int i = 0; i < total; ++i) { + // the insanity of c++... + // the overloaded >> operator implicitly skips all types of whitespace. + // Oh, and since the type of data is float* it correctly converts to float + if (!(file >> data[i])) { + throw std::runtime_error("Parse error or unexpected EOF at element " + + std::to_string(i)); } + } + *data_ptr = data; + file.close(); + return {M, N}; +} - // Print performance results - std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n"; - std::cout << "Naive time: " << naive_time << " seconds\n"; - std::cout << "Blocked time: " << blocked_time << " seconds\n"; - std::cout << "Parallel time: " << parallel_time << " seconds\n"; - std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n"; - std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n"; - - // Clean up - delete[] A; - delete[] B; - delete[] C_naive; - delete[] C_blocked; - delete[] C_parallel; - - return 0; -} \ No newline at end of file +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + int case_number = std::atoi(argv[1]); + if (case_number < 0 || case_number > 9) { + std::cerr << "Case number must be between 0 and 9" << std::endl; + return 1; + } + + // Construct file paths + std::string folder = "data/" + std::to_string(case_number) + "/"; + std::string input0_file = folder + "input0.raw"; + std::string input1_file = folder + "input1.raw"; + std::string result_file = folder + "result.raw"; + std::string reference_file = folder + "output.raw"; + + float *data_ptr; + auto A_size = read_float_matrix(input0_file, &data_ptr); + float *A = data_ptr; + auto B_size = read_float_matrix(input1_file, &data_ptr); + float *B = data_ptr; + int m = A_size.first; + int n = A_size.second; + int p = B_size.second; + + // Allocate memory for result matrices + // NOTE: Use calloc to zero init memory and eliminate any problems that comes from + // using += in the matmul implementation + float *C_naive = (float *)calloc(m * p, sizeof(float)); + float *C_blocked = (float *)calloc(m * p, sizeof(float)); + float *C_parallel = (float *)calloc(m * p, sizeof(float)); + + // Measure performance of naive_matmul + double start_time = omp_get_wtime(); + naive_matmul(C_naive, A, B, m, n, p); + double naive_time = omp_get_wtime() - start_time; + + write_float_matrix(result_file, C_naive, m, p); + + // Validate naive result + bool naive_correct = validate_result(result_file, reference_file); + if (!naive_correct) { + std::cerr << "Naive result validation failed for case " << case_number + << std::endl; + } + + // Measure performance of blocked_matmul (use block_size = 32 as default) + start_time = omp_get_wtime(); + blocked_matmul(C_blocked, A, B, m, n, p, 32); + double blocked_time = omp_get_wtime() - start_time; + + write_float_matrix(result_file, C_blocked, m, p); + + // Validate blocked result + bool blocked_correct = validate_result(result_file, reference_file); + if (!blocked_correct) { + std::cerr << "Blocked result validation failed for case " << case_number + << std::endl; + } + + // Measure performance of parallel_matmul + start_time = omp_get_wtime(); + parallel_matmul(C_parallel, A, B, m, n, p); + double parallel_time = omp_get_wtime() - start_time; + + // TODO Write parallel result to file + write_float_matrix(result_file, C_parallel, m, p); + + // Validate parallel result + bool parallel_correct = validate_result(result_file, reference_file); + if (!parallel_correct) { + std::cerr << "Parallel result validation failed for case " << case_number + << std::endl; + } + + // Print performance results + std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p + << "):\n"; + std::cout << std::fixed << std::setprecision(8) + << "Naive time: " << naive_time << " seconds\n"; + std::cout << std::fixed << std::setprecision(8) + << "Blocked time: " << blocked_time << " seconds\n"; + std::cout << std::fixed << std::setprecision(8) + << "Parallel time: " << parallel_time << " seconds\n"; + std::cout << std::fixed << std::setprecision(8) + << "Blocked speedup: " << (naive_time / blocked_time) << "x\n"; + std::cout << std::fixed << std::setprecision(8) + << "Parallel speedup: " << (naive_time / parallel_time) << "x\n"; + + // Append timing results to CSV + std::ofstream csv_file("timings.csv", std::ios::app); + csv_file << std::fixed << naive_time << ";" << blocked_time << ";" + << parallel_time << "\n"; + csv_file.close(); + + // Clean up + // NOTE: I use calloc so free must be used, I think it is + // UB to use delete on malloced memory + free(A); + free(B); + free(C_naive); + free(C_blocked); + free(C_parallel); + + return 0; +} diff --git a/report.pdf b/report.pdf new file mode 100644 index 0000000..c63a1e6 Binary files /dev/null and b/report.pdf differ diff --git a/report.typ b/report.typ new file mode 100644 index 0000000..1c2c791 --- /dev/null +++ b/report.typ @@ -0,0 +1,35 @@ +#import "@local/assignment-template:0.1.0": conf + +#show link: set text(font: "Iosevka Term", fallback: false) + +#show: doc => conf( + title: [Parallel Programming - Assignment 4], + authors: "Mirek Kudinoff (2102812)", + doc, + ) + +== Results + +#figure( + image("graph1.png", width: 80%), + caption: [ + Graph plotting case number vs time taken to run each variant of matmul + ], +) + + +== Results as a table (times are in seconds) + +#let results = csv("timings.csv", delimiter: ";") + +#table( +columns: 4, +..results.flatten()) + +CSV with results obtained by running `for num in $(seq 9); ./build/matmul $num end` in fish shell, +the added code at the end appends results to a file called timings.csv. It is then used to create the +graph and the table above. + +I think the reason that blocked implementation does not consistently show an improvement has to +do with cache size and the fact that the smaller matrix inputs might fit into cache in such a way +that the naive implementation is not negatively affected. diff --git a/timings.csv b/timings.csv new file mode 100644 index 0000000..a92803b --- /dev/null +++ b/timings.csv @@ -0,0 +1,19 @@ +Input Size;Naive;Blocked;Parallel +128x64x128;0.000663;0.000619;0.002903 +100x128x56;0.000668;0.000542;0.002990 +128x64x128;0.000685;0.000632;0.002343 +32x128x32;0.000183;0.000146;0.002642 +200x100x256;0.005250;0.004530;0.001293 +256x256x256;0.017633;0.016888;0.003996 +256x300x256;0.022895;0.019400;0.004624 +64x128x64;0.000612;0.000294;0.000343 +256x256x257;0.012911;0.009770;0.002916 + + + + + + + + +