AA-parallel-computing · letriton25 · May 30, 2026 · May 30, 2026 · May 31, 2026 · May 31, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+# build output folder
+build/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,10 +16,19 @@ if(APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof")
 endif()
 
+# Matrix file I/O helpers, built once and linked into both executables below.
+add_library(matfile matfile.cpp)
 
 add_executable(matmul main_ans.cpp)
+# Second executable, built from randmat.cpp.
+add_executable(randmat randmat.cpp)
 
 
+# matmul always needs the matfile helpers; OpenMP is linked only when it was found.
+target_link_libraries(matmul PRIVATE matfile)
 if(OpenMP_CXX_FOUND)
     target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
-endif()
+endif()
+
+target_link_libraries(randmat PRIVATE matfile)
+
+# Copy the data directory and the benchmark script next to the built binaries.
+# file(COPY) executes when CMake configures the project, not on every build.
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data ${CMAKE_CURRENT_SOURCE_DIR}/benchmark.sh
+     DESTINATION ${CMAKE_CURRENT_BINARY_DIR}
+)
diff --git a/benchmark.sh b/benchmark.sh
@@ -0,0 +1,35 @@
+#! /bin/sh
+
+echo 'Iterating each data folder and running benchmark 100 times on each case...'
+
+# run benchmark and save results to results/benchmark{case_number}.txt
+
+mkdir -p results
+echo '+====================================================================================================+'
+for data in `seq 0 9`; do
+    echo "| Running benchmark on $data                                                                             |"
+    echo -n '|'
+    : > results/benchmark$data.txt
+    for i in `seq 100`; do
+        ./matmul $data >> results/benchmark$data.txt
+        echo -n '-'
+    done
+    echo '|'
+done
+echo '+====================================================================================================+'
+
+# extract statistics from results and save to results/benchmark_summary.csv
+echo 'Extracting statistics from benchmark results...'
+
+RESULT_FILE=results/benchmark_summary.csv
+: > $RESULT_FILE
+echo 'Test Case,Dimensions (m × n × p),Naive Time (s),Blocked Time (s),Parallel Time (s),Blocked Speedup,Parallel Speedup' > $RESULT_FILE
+for data in `seq 0 9`; do
+    echo -n "$data," >> $RESULT_FILE
+    grep -oEm1 '[0-9]+x[0-9]+x[0-9]+' results/benchmark$data.txt | tr '\n' ',' >> $RESULT_FILE
+    grep 'Naive time' results/benchmark$data.txt | cut -d' ' -f3 | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE
+    grep 'Blocked time' results/benchmark$data.txt | cut -d' ' -f3 | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE
+    grep 'Parallel time' results/benchmark$data.txt | cut -d' ' -f3 | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE
+    grep 'Blocked speedup' results/benchmark$data.txt | cut -d' ' -f3 | tr -d 'x' | awk '{sum+=$1; count++} END {print sum/count}' | tr '\n' ',' >> $RESULT_FILE
+    grep 'Parallel speedup' results/benchmark$data.txt | cut -d' ' -f3 | tr -d 'x' | awk '{sum+=$1; count++} END {print sum/count}' >> $RESULT_FILE
+done
diff --git a/load-env.sh b/load-env.sh
@@ -0,0 +1,2 @@
+OMP_NUM_THREADS=8
+export OMP_NUM_THREADS
diff --git a/main_ans.cpp b/main_ans.cpp
@@ -0,0 +1,271 @@
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <omp.h>
+
+#include "matfile.h"
+
+// Tile edge length, in elements, that main passes to blocked_matmul.
+constexpr uint32_t BLOCK_SIZE = 32;
+// Largest absolute per-element difference accepted by validate_result.
+constexpr double FLOAT_TOLERANCE = 1e-2;
+
+// All matrices are dense, row-major: A is m x n, B is n x p, C is m x p.
+
+// Overwrites C with A * B using the plain triple loop.
+void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p);
+// Adds A * B onto the existing contents of C, tile by tile; C must be zeroed
+// by the caller to obtain the plain product.
+void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size);
+// Overwrites C with A * B, distributing the rows of C with an OpenMP parallel for.
+void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p);
+// Returns true when both matrix files have the same dimensions and every pair
+// of elements differs by at most FLOAT_TOLERANCE.
+bool validate_result(const std::string &result_file, const std::string &reference_file);
+
+// Writes one kernel's output to result_file, compares it with reference_file
+// and reports a mismatch on stderr under the kernel's label.
+static bool store_and_check(const char* label, const std::vector<float>& C, uint32_t m, uint32_t p,
+                            const std::string& result_file, const std::string& reference_file,
+                            long case_number)
+{
+    matfile_write_matrix(result_file, C.data(), m, p);
+    const bool correct = validate_result(result_file, reference_file);
+    if (!correct) {
+        std::cerr << label << " result validation failed for case " << case_number << std::endl;
+    }
+    return correct;
+}
+
+// Loads data/<case>/input{0,1}.raw, times the three kernels, validates each
+// result against data/<case>/output.raw and prints the timing report.
+// Returns the process exit status.
+static int run_case(long case_number)
+{
+    // Construct file paths
+    const std::string folder = "data/" + std::to_string(case_number) + "/";
+    const std::string result_file = folder + "result.raw";
+    const std::string reference_file = folder + "output.raw";
+
+    std::ifstream input0(folder + "input0.raw", std::ios::binary);
+    std::ifstream input1(folder + "input1.raw", std::ios::binary);
+    if (!input0 || !input1) {
+        std::cerr << "Cannot open input matrices in " << folder << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // read and validate the dimensions of the input matrices
+    uint32_t m = 0;
+    uint32_t n = 0;
+    uint32_t n2 = 0;
+    uint32_t p = 0;
+    matfile_read_dimensions(input0, m, n);
+    matfile_read_dimensions(input1, n2, p);
+    if (n != n2) {
+        std::cerr << "Inner dimensions of A and B must match" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Element counts are computed in size_t so large matrices cannot wrap at 2^32.
+    // NOTE: matfile_read_matrix has no size parameter; it trusts the file to hold
+    // no more values than its header announced.
+    std::vector<float> A(static_cast<std::size_t>(m) * n);
+    std::vector<float> B(static_cast<std::size_t>(n) * p);
+    matfile_read_matrix(input0, A.data());
+    matfile_read_matrix(input1, B.data());
+    input0.close();
+    input1.close();
+
+    // Result buffers start at zero; blocked_matmul accumulates into its output.
+    const std::size_t c_count = static_cast<std::size_t>(m) * p;
+    std::vector<float> C_naive(c_count, 0.0f);
+    std::vector<float> C_blocked(c_count, 0.0f);
+    std::vector<float> C_parallel(c_count, 0.0f);
+
+    // Only the kernel call sits between the two clock reads; file output and
+    // validation happen outside the timed region.
+    double start_time = omp_get_wtime();
+    naive_matmul(C_naive.data(), A.data(), B.data(), m, n, p);
+    const double naive_time = omp_get_wtime() - start_time;
+    bool all_correct = store_and_check("Naive", C_naive, m, p, result_file, reference_file, case_number);
+
+    start_time = omp_get_wtime();
+    blocked_matmul(C_blocked.data(), A.data(), B.data(), m, n, p, BLOCK_SIZE);
+    const double blocked_time = omp_get_wtime() - start_time;
+    all_correct = store_and_check("Blocked", C_blocked, m, p, result_file, reference_file, case_number) && all_correct;
+
+    start_time = omp_get_wtime();
+    parallel_matmul(C_parallel.data(), A.data(), B.data(), m, n, p);
+    const double parallel_time = omp_get_wtime() - start_time;
+    all_correct = store_and_check("Parallel", C_parallel, m, p, result_file, reference_file, case_number) && all_correct;
+
+    // Print performance results (benchmark.sh parses these exact labels)
+    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
+    std::cout << "Naive time: " << naive_time << " seconds\n";
+    std::cout << "Blocked time: " << blocked_time << " seconds\n";
+    std::cout << "Parallel time: " << parallel_time << " seconds\n";
+    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
+    std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";
+
+    // The report is always printed, but a wrong result is a failed run.
+    return all_correct ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/**
+ * Entry point: `matmul <case_number>` with case_number in 0..9.
+ *
+ * Returns EXIT_SUCCESS when all three kernels reproduce the reference output,
+ * EXIT_FAILURE on a usage error, unreadable or inconsistent input, an
+ * exception from the matfile helpers, or a failed validation.
+ */
+int main(int argc, char* argv[])
+{
+    if (argc != 2) {
+        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Parse strictly: std::atoi would silently turn "abc" into case 0.
+    char* parse_end = nullptr;
+    const long case_number = std::strtol(argv[1], &parse_end, 10);
+    if (parse_end == argv[1] || *parse_end != '\0' || case_number < 0 || case_number > 9) {
+        std::cerr << "Case number must be between 0 and 9" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    try {
+        return run_case(case_number);
+    } catch (const std::exception& error) {
+        // the matfile helpers report malformed files by throwing
+        std::cerr << "Error: " << error.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
+
+/**
+ * Reference triple-loop product: overwrites C (m x p) with A (m x n) * B (n x p).
+ * All matrices are dense and row-major. The previous contents of C are ignored.
+ */
+void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
+{
+    // Offsets are computed in size_t: i*n and k*p would wrap in 32-bit
+    // arithmetic once a matrix holds more than 2^32 elements.
+    const std::size_t rows = m;
+    const std::size_t inner = n;
+    const std::size_t cols = p;
+
+    for (std::size_t i = 0; i < rows; ++i) {
+        const float* a_row = A + i * inner;
+        float* c_row = C + i * cols;
+        for (std::size_t j = 0; j < cols; ++j) {
+            float sum = 0.0f;
+            for (std::size_t k = 0; k < inner; ++k) {
+                sum += a_row[k] * B[k * cols + j];
+            }
+            c_row[j] = sum;
+        }
+    }
+}
+
+/**
+ * Cache-blocked product: adds A (m x n) * B (n x p) onto C (m x p).
+ *
+ * All matrices are dense and row-major. C is accumulated into, not
+ * overwritten, because each k-tile contributes a partial sum; the caller must
+ * zero C beforehand to obtain the plain product.
+ *
+ * block_size is the tile edge length in elements. A block_size of 0 (which
+ * would otherwise never advance the tile loops) is treated as "no blocking":
+ * a single tile spanning the largest dimension.
+ */
+void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size)
+{
+    // size_t arithmetic keeps ii + tile and the row offsets from wrapping at 2^32.
+    const std::size_t rows = m;
+    const std::size_t inner = n;
+    const std::size_t cols = p;
+
+    std::size_t tile = block_size;
+    if (tile == 0) {
+        tile = std::max(rows, std::max(inner, cols));
+    }
+
+    for (std::size_t ii = 0; ii < rows; ii += tile) {
+        const std::size_t i_end = std::min(ii + tile, rows);
+
+        for (std::size_t kk = 0; kk < inner; kk += tile) {
+            const std::size_t k_end = std::min(kk + tile, inner);
+
+            for (std::size_t jj = 0; jj < cols; jj += tile) {
+                const std::size_t j_end = std::min(jj + tile, cols);
+                // first column of this tile that the 8-wide loop does not cover
+                const std::size_t j_tail = j_end - ((j_end - jj) % 8);
+
+                for (std::size_t i = ii; i < i_end; ++i) {
+                    const float* a_row = A + i * inner;
+                    float* c_row = C + i * cols;
+
+                    // Eight columns at a time, with the partial sums held in
+                    // locals across the whole k-tile.
+                    for (std::size_t j = jj; j < j_tail; j += 8) {
+                        float c0 = c_row[j + 0];
+                        float c1 = c_row[j + 1];
+                        float c2 = c_row[j + 2];
+                        float c3 = c_row[j + 3];
+                        float c4 = c_row[j + 4];
+                        float c5 = c_row[j + 5];
+                        float c6 = c_row[j + 6];
+                        float c7 = c_row[j + 7];
+
+                        for (std::size_t k = kk; k < k_end; ++k) {
+                            const float aik = a_row[k];
+                            const float* b = B + k * cols + j;
+                            c0 += aik * b[0];
+                            c1 += aik * b[1];
+                            c2 += aik * b[2];
+                            c3 += aik * b[3];
+                            c4 += aik * b[4];
+                            c5 += aik * b[5];
+                            c6 += aik * b[6];
+                            c7 += aik * b[7];
+                        }
+
+                        c_row[j + 0] = c0;
+                        c_row[j + 1] = c1;
+                        c_row[j + 2] = c2;
+                        c_row[j + 3] = c3;
+                        c_row[j + 4] = c4;
+                        c_row[j + 5] = c5;
+                        c_row[j + 6] = c6;
+                        c_row[j + 7] = c7;
+                    }
+
+                    // remaining (fewer than eight) columns of the tile
+                    for (std::size_t j = j_tail; j < j_end; ++j) {
+                        float sum = c_row[j];
+                        for (std::size_t k = kk; k < k_end; ++k) {
+                            sum += a_row[k] * B[k * cols + j];
+                        }
+                        c_row[j] = sum;
+                    }
+                }
+            }
+        }
+    }
+}
+
+/**
+ * OpenMP version of the triple-loop product: overwrites C (m x p) with
+ * A (m x n) * B (n x p), all dense and row-major.
+ *
+ * The rows of C are shared out by the `parallel for`; each iteration writes
+ * only its own row of C, so no synchronization is needed.
+ */
+void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
+{
+    #pragma omp parallel for
+    for (uint32_t i = 0; i < m; ++i) {
+        // Row offsets are widened to size_t: i*n and k*p would wrap in 32-bit
+        // arithmetic once a matrix holds more than 2^32 elements.
+        const float* a_row = A + static_cast<std::size_t>(i) * n;
+        float* c_row = C + static_cast<std::size_t>(i) * p;
+        for (uint32_t j = 0; j < p; ++j) {
+            float sum = 0.0f;
+            for (uint32_t k = 0; k < n; ++k) {
+                sum += a_row[k] * B[static_cast<std::size_t>(k) * p + j];
+            }
+            c_row[j] = sum;
+        }
+    }
+}
+
+/**
+ * Compares the matrix stored in result_filename with the one in
+ * reference_filename.
+ *
+ * Returns true when both files open, announce the same dimensions, and every
+ * pair of elements is either equal or differs by at most FLOAT_TOLERANCE.
+ * A NaN on either side counts as a mismatch. The first problem found is
+ * reported on stderr and ends the comparison.
+ */
+bool validate_result(const std::string &result_filename, const std::string &reference_filename)
+{
+    std::ifstream result(result_filename, std::ios::binary);
+    if (!result) {
+        std::cerr << "Cannot open result file " << result_filename << std::endl;
+        return false;
+    }
+    std::ifstream reference(reference_filename, std::ios::binary);
+    if (!reference) {
+        std::cerr << "Cannot open reference file " << reference_filename << std::endl;
+        return false;
+    }
+
+    // Read dimensions of result and reference matrices
+    uint32_t m_result = 0;
+    uint32_t n_result = 0;
+    uint32_t m_ref = 0;
+    uint32_t n_ref = 0;
+    matfile_read_dimensions(result, m_result, n_result);
+    matfile_read_dimensions(reference, m_ref, n_ref);
+    if (m_result != m_ref || n_result != n_ref) {
+        std::cerr << "Dimension mismatch: result is " << m_result << "x" << n_result
+                  << ", reference is " << m_ref << "x" << n_ref << std::endl;
+        return false;
+    }
+
+    // size_t product: m * n must not wrap at 2^32 for large matrices.
+    const std::size_t count = static_cast<std::size_t>(m_result) * n_result;
+    std::vector<float> C_result(count);
+    std::vector<float> C_reference(count);
+    matfile_read_matrix(result, C_result.data());
+    matfile_read_matrix(reference, C_reference.data());
+
+    for (std::size_t i = 0; i < count; ++i) {
+        const float got = C_result[i];
+        const float want = C_reference[i];
+        if (got == want) {
+            continue;  // also accepts matching infinities, whose difference is NaN
+        }
+        // Written as !(diff <= tol) so that a NaN difference fails the check;
+        // `diff > tol` is false for NaN and would let it through.
+        if (!(std::fabs(got - want) <= FLOAT_TOLERANCE)) {
+            std::cerr << "Value mismatch at index " << i << ": result is " << got
+                      << ", reference is " << want << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
diff --git a/matfile.cpp b/matfile.cpp
@@ -0,0 +1,40 @@
+#include "matfile.h"
+
+#include <fstream>
+
+/**
+ * Reads the two leading integers of a matrix file into m (rows) and n (columns).
+ *
+ * The stream is left positioned just after the second integer.
+ *
+ * @throws std::ios_base::failure when two integers cannot be extracted or
+ *         either value is negative or does not fit in uint32_t.
+ */
+void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n)
+{
+    // Extract into a wider signed type so that "-1" or "4294967296" is
+    // detected instead of silently wrapping into a bogus 32-bit dimension.
+    std::int64_t rows = 0;
+    std::int64_t cols = 0;
+    if (!(file >> rows >> cols)) {
+        throw std::ios_base::failure("matfile: cannot read matrix dimensions");
+    }
+
+    const std::int64_t max_dim = static_cast<std::int64_t>(UINT32_MAX);
+    if (rows < 0 || rows > max_dim || cols < 0 || cols > max_dim) {
+        throw std::ios_base::failure("matfile: matrix dimensions out of range");
+    }
+
+    m = static_cast<uint32_t>(rows);
+    n = static_cast<uint32_t>(cols);
+}
+
+// Reads every remaining whitespace-separated token of the stream, converts
+// each one with std::stof and stores the values consecutively in matrix.
+//
+// Reading stops when no further token can be extracted. There is no size
+// parameter: the caller must supply a buffer large enough for every value
+// left in the stream. std::stof throws on a token it cannot convert.
+void matfile_read_matrix(std::ifstream& file, float* matrix)
+{
+    float* out = matrix;
+    for (std::string token; file >> token; ++out) {
+        *out = std::stof(token);
+    }
+}
+
+/**
+ * Writes a row-major m x n matrix to filename, replacing any existing file.
+ *
+ * Format: a first line "m n", then one line per row with every value followed
+ * by a single space.
+ *
+ * @throws std::ios_base::failure when the file cannot be opened or the data
+ *         cannot be written completely.
+ */
+void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n)
+{
+    std::ofstream file(filename, std::ios::binary);
+    if (!file) {
+        throw std::ios_base::failure("matfile: cannot open " + filename + " for writing");
+    }
+
+    // 9 significant digits (max_digits10 for IEEE-754 binary32) make every
+    // float survive a write/read round trip; the default of 6 does not.
+    file.precision(9);
+
+    // '\n' instead of std::endl: same bytes, without a flush after every row.
+    file << m << ' ' << n << '\n';
+    for (uint32_t i = 0; i < m; ++i) {
+        const float* row = matrix + static_cast<std::size_t>(i) * n;
+        for (uint32_t j = 0; j < n; ++j) {
+            file << row[j] << ' ';
+        }
+        file << '\n';
+    }
+
+    // close() flushes; a failure here means the file on disk is incomplete
+    file.close();
+    if (!file) {
+        throw std::ios_base::failure("matfile: failed while writing " + filename);
+    }
+}
diff --git a/matfile.h b/matfile.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+// Text matrix files: a first line "m n" (rows, columns), followed by the
+// m * n values in row-major order, separated by whitespace.
+
+/// Reads the two leading values of the file into m (rows) and n (columns).
+/// Throws an exception derived from std::exception when they cannot be parsed.
+void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n);
+
+/// Reads every remaining value of the stream into matrix, in file order.
+/// There is no size parameter: the caller must provide a buffer large enough
+/// for all values left in the stream.
+void matfile_read_matrix(std::ifstream& file, float* matrix);
+
+/// Writes the row-major m x n matrix to filename, replacing any existing file:
+/// the "m n" line first, then one line per row.
+void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n);