Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# build output folder
build/
11 changes: 10 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,19 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof")
endif()

add_library(matfile matfile.cpp)

add_executable(matmul main_ans.cpp)
add_executable(randmat randmat.cpp)


target_link_libraries(matmul PRIVATE matfile)
if(OpenMP_CXX_FOUND)
target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
endif()
endif()

target_link_libraries(randmat PRIVATE matfile)

file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data ${CMAKE_CURRENT_SOURCE_DIR}/benchmark.sh
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}
)
35 changes: 35 additions & 0 deletions benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#! /bin/sh
#
# Runs ./matmul on every test case (data/0 .. data/9) RUNS times, stores the
# raw program output in results/benchmark<case>.txt and then condenses the
# per-case averages into results/benchmark_summary.csv.
#
# printf is used instead of `echo -n`: the behaviour of `echo -n` is
# implementation-defined for /bin/sh and some shells print a literal "-n".

# Number of runs per case. The progress bar below is drawn 100 columns wide,
# so adjust the frame strings if this value is changed.
RUNS=100
RESULT_DIR=results
RESULT_FILE=$RESULT_DIR/benchmark_summary.csv

# mean_of PATTERN FILE
# Prints the arithmetic mean of the third space-separated field of every line
# in FILE that matches PATTERN. A trailing "x" (speedup lines such as "1.5x")
# is stripped first. Prints "NA" when no line matches, so the CSV row keeps
# its column count instead of silently losing a field.
mean_of() {
    grep "$1" "$2" | cut -d' ' -f3 | tr -d 'x' |
        awk '{ sum += $1; count++ } END { if (count > 0) print sum / count; else print "NA" }'
}

echo "Iterating each data folder and running benchmark $RUNS times on each case..."

# run benchmark and save results to results/benchmark{case_number}.txt

mkdir -p "$RESULT_DIR"
echo '+====================================================================================================+'
for data in $(seq 0 9); do
    log=$RESULT_DIR/benchmark$data.txt
    echo "| Running benchmark on $data |"
    printf '|'
    # start every case from an empty log file
    : > "$log"
    for i in $(seq "$RUNS"); do
        ./matmul "$data" >> "$log"
        printf '-'
    done
    echo '|'
done
echo '+====================================================================================================+'

# extract statistics from results and save to results/benchmark_summary.csv
echo 'Extracting statistics from benchmark results...'

echo 'Test Case,Dimensions (m × n × p),Naive Time (s),Blocked Time (s),Parallel Time (s),Blocked Speedup,Parallel Speedup' > "$RESULT_FILE"
for data in $(seq 0 9); do
    log=$RESULT_DIR/benchmark$data.txt
    # the first "<m>x<n>x<p>" token in the log is the case's dimensions
    dims=$(grep -oE '[0-9]+x[0-9]+x[0-9]+' "$log" | head -n 1)
    printf '%s,%s,%s,%s,%s,%s,%s\n' \
        "$data" \
        "${dims:-NA}" \
        "$(mean_of 'Naive time' "$log")" \
        "$(mean_of 'Blocked time' "$log")" \
        "$(mean_of 'Parallel time' "$log")" \
        "$(mean_of 'Blocked speedup' "$log")" \
        "$(mean_of 'Parallel speedup' "$log")" \
        >> "$RESULT_FILE"
done
2 changes: 2 additions & 0 deletions load-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Exports OMP_NUM_THREADS=8 into the environment of the shell that runs this.
# NOTE(review): presumably meant to be sourced (". ./load-env.sh") before
# running the benchmark so the setting reaches ./matmul -- confirm.
export OMP_NUM_THREADS=8
271 changes: 271 additions & 0 deletions main_ans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
#include <omp.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

#include "matfile.h"

// Tile edge length (in elements) that main() passes to blocked_matmul().
#define BLOCK_SIZE 32
// Largest absolute per-element difference validate_result() accepts.
#define FLOAT_TOLERANCE 1e-2

// Matrix-multiply kernels. In all three, A is m x n, B is n x p and C is
// m x p, each stored row-major in a flat float array.
// Straightforward triple loop; overwrites every element of C.
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p);
// Tiled variant; accumulates into the existing contents of C.
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size);
// Triple loop with the outer (row) loop under "#pragma omp parallel for".
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p);
// Compares the matrix stored in result_file against the one in reference_file.
bool validate_result(const std::string &result_file, const std::string &reference_file);

/**
 * Benchmark driver.
 *
 * Usage: matmul <case_number>, with case_number in [0, 9].
 *
 * Loads data/<case>/input0.raw (A) and data/<case>/input1.raw (B), multiplies
 * them with the naive, blocked and parallel kernels, times each kernel with
 * omp_get_wtime(), writes each product to data/<case>/result.raw and compares
 * it against data/<case>/output.raw.
 *
 * The timing report is always printed to stdout once the inputs were loaded.
 *
 * @return 1 on a usage error, EXIT_FAILURE when the inputs cannot be loaded
 *         or any kernel fails validation, EXIT_SUCCESS otherwise.
 */
int main(int argc, char* argv[])
{
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
        return 1;
    }

    // strtol (unlike atoi) lets us reject non-numeric input instead of
    // silently treating it as case 0.
    char* parse_end = nullptr;
    const long parsed_case = std::strtol(argv[1], &parse_end, 10);
    const bool is_number = (parse_end != argv[1]) && (*parse_end == '\0');
    if (!is_number || parsed_case < 0 || parsed_case > 9) {
        std::cerr << "Case number must be between 0 and 9" << std::endl;
        return 1;
    }
    const int case_number = static_cast<int>(parsed_case);

    // Construct file paths
    const std::string folder = "data/" + std::to_string(case_number) + "/";
    const std::string input0_file = folder + "input0.raw";
    const std::string input1_file = folder + "input1.raw";
    const std::string result_file = folder + "result.raw";
    const std::string reference_file = folder + "output.raw";

    // open input files
    std::ifstream input0(input0_file, std::ios::binary);
    std::ifstream input1(input1_file, std::ios::binary);
    if (!input0 || !input1) {
        std::cerr << "Failed to open input file "
                  << (!input0 ? input0_file : input1_file) << std::endl;
        return EXIT_FAILURE;
    }

    uint32_t m = 0;
    uint32_t n = 0;
    uint32_t n2 = 0;
    uint32_t p = 0;
    std::vector<float> A;
    std::vector<float> B;
    try {
        // read dimensions of input matrices
        matfile_read_dimensions(input0, m, n);
        matfile_read_dimensions(input1, n2, p);
        // validate dimensions
        if (n != n2) {
            std::cerr << "Inner dimensions of A and B must match" << std::endl;
            return EXIT_FAILURE;
        }

        // Element counts are computed in size_t so that large matrices do not
        // wrap around in 32-bit arithmetic.
        A.assign(static_cast<std::size_t>(m) * n, 0.0f);
        B.assign(static_cast<std::size_t>(n) * p, 0.0f);

        // read input matrices from files
        matfile_read_matrix(input0, A.data());
        matfile_read_matrix(input1, B.data());
    } catch (const std::exception& e) {
        // the matfile readers parse with std::stoul / std::stof, which throw
        // on malformed text
        std::cerr << "Failed to read input matrices for case " << case_number
                  << ": " << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    // close input files
    input0.close();
    input1.close();

    // Result matrices, zero-initialised (blocked_matmul accumulates into C)
    const std::size_t c_count = static_cast<std::size_t>(m) * p;
    std::vector<float> C_naive(c_count, 0.0f);
    std::vector<float> C_blocked(c_count, 0.0f);
    std::vector<float> C_parallel(c_count, 0.0f);

    // Writes one product to result_file and checks it against the reference.
    bool all_valid = true;
    const auto write_and_validate = [&](const std::vector<float>& C, const char* label) {
        matfile_write_matrix(result_file, C.data(), m, p);
        if (!validate_result(result_file, reference_file)) {
            std::cerr << label << " result validation failed for case " << case_number << std::endl;
            all_valid = false;
        }
    };

    // Measure performance of naive_matmul
    double start_time = omp_get_wtime();
    naive_matmul(C_naive.data(), A.data(), B.data(), m, n, p);
    const double naive_time = omp_get_wtime() - start_time;
    write_and_validate(C_naive, "Naive");

    // Measure performance of blocked_matmul
    start_time = omp_get_wtime();
    blocked_matmul(C_blocked.data(), A.data(), B.data(), m, n, p, BLOCK_SIZE);
    const double blocked_time = omp_get_wtime() - start_time;
    write_and_validate(C_blocked, "Blocked");

    // Measure performance of parallel_matmul
    start_time = omp_get_wtime();
    parallel_matmul(C_parallel.data(), A.data(), B.data(), m, n, p);
    const double parallel_time = omp_get_wtime() - start_time;
    write_and_validate(C_parallel, "Parallel");

    // Print performance results
    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
    std::cout << "Naive time: " << naive_time << " seconds\n";
    std::cout << "Blocked time: " << blocked_time << " seconds\n";
    std::cout << "Parallel time: " << parallel_time << " seconds\n";
    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
    std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";

    // A wrong product must not look like a successful run to the caller.
    return all_valid ? EXIT_SUCCESS : EXIT_FAILURE;
}

/**
 * Reference matrix product C = A * B using the plain triple loop.
 *
 * All matrices are flat row-major arrays: A is m x n, B is n x p and C is
 * m x p. Every element of C is overwritten; its previous contents are ignored.
 */
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
{
    for (uint32_t row = 0; row < m; ++row) {
        const uint32_t a_base = row * n;  // offset of A's row
        const uint32_t c_base = row * p;  // offset of C's row
        for (uint32_t col = 0; col < p; ++col) {
            // dot product of A's row with B's column
            float acc = 0.0f;
            uint32_t idx = 0;
            while (idx < n) {
                acc += A[a_base + idx] * B[idx * p + col];
                ++idx;
            }
            C[c_base + col] = acc;
        }
    }
}

/**
 * Cache-blocked matrix product that ACCUMULATES into C: C += A * B.
 *
 * All matrices are flat row-major arrays: A is m x n, B is n x p and C is
 * m x p. The caller must zero C beforehand to obtain the plain product,
 * because each k-tile adds its partial sums onto what is already stored.
 *
 * The iteration space is cut into block_size-wide tiles over i, k and j.
 * Inside a tile, eight adjacent columns of C are kept in local accumulators
 * while the k range of the tile is swept; leftover columns (tile width not a
 * multiple of eight) are handled by a scalar tail loop.
 *
 * @param block_size Tile edge length in elements. A value of 0 would never
 *                   advance the tile loops, so it is treated as 1.
 */
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size)
{
    // Guard against an endless loop: "ii += 0" never terminates.
    if (block_size == 0) {
        block_size = 1;
    }

    // Work in size_t so that tile bounds and flat indices cannot wrap around
    // in 32-bit arithmetic for large matrices.
    const std::size_t rows = m;
    const std::size_t inner = n;
    const std::size_t cols = p;
    const std::size_t tile = block_size;
    constexpr std::size_t kUnroll = 8;

    for (std::size_t ii = 0; ii < rows; ii += tile) {
        const std::size_t i_end = std::min(ii + tile, rows);

        for (std::size_t kk = 0; kk < inner; kk += tile) {
            const std::size_t k_end = std::min(kk + tile, inner);

            for (std::size_t jj = 0; jj < cols; jj += tile) {
                const std::size_t j_end = std::min(jj + tile, cols);
                // first column of this tile that the unrolled loop does not cover
                const std::size_t j_tail = j_end - ((j_end - jj) % kUnroll);

                for (std::size_t i = ii; i < i_end; ++i) {
                    const float* a_row = A + i * inner;
                    float* c_row = C + i * cols;

                    for (std::size_t j = jj; j < j_tail; j += kUnroll) { // 8 columns at a time
                        // keep C[i][j..j+7] in locals while sweeping k
                        float c0 = c_row[j + 0];
                        float c1 = c_row[j + 1];
                        float c2 = c_row[j + 2];
                        float c3 = c_row[j + 3];
                        float c4 = c_row[j + 4];
                        float c5 = c_row[j + 5];
                        float c6 = c_row[j + 6];
                        float c7 = c_row[j + 7];

                        for (std::size_t k = kk; k < k_end; ++k) {
                            const float aik = a_row[k];
                            const float* b = B + k * cols + j;
                            c0 += aik * b[0];
                            c1 += aik * b[1];
                            c2 += aik * b[2];
                            c3 += aik * b[3];
                            c4 += aik * b[4];
                            c5 += aik * b[5];
                            c6 += aik * b[6];
                            c7 += aik * b[7];
                        }

                        // store results back to C
                        c_row[j + 0] = c0;
                        c_row[j + 1] = c1;
                        c_row[j + 2] = c2;
                        c_row[j + 3] = c3;
                        c_row[j + 4] = c4;
                        c_row[j + 5] = c5;
                        c_row[j + 6] = c6;
                        c_row[j + 7] = c7;
                    }

                    // scalar tail for the columns the unrolled loop skipped
                    for (std::size_t j = j_tail; j < j_end; ++j) {
                        float sum = c_row[j];
                        for (std::size_t k = kk; k < k_end; ++k) {
                            sum += a_row[k] * B[k * cols + j];
                        }
                        c_row[j] = sum;
                    }
                }
            }
        }
    }
}

/**
 * Matrix product C = A * B with the row loop marked
 * "#pragma omp parallel for".
 *
 * All matrices are flat row-major arrays: A is m x n, B is n x p and C is
 * m x p. Each iteration of the outer loop writes only its own row of C.
 * Every element of C is overwritten.
 */
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
{
    #pragma omp parallel for
    for (uint32_t row = 0; row < m; ++row) {
        const uint32_t a_offset = row * n;
        const uint32_t c_offset = row * p;
        for (uint32_t col = 0; col != p; ++col) {
            float dot = 0.0f;
            for (uint32_t t = 0; t != n; ++t) {
                dot += A[a_offset + t] * B[t * p + col];
            }
            C[c_offset + col] = dot;
        }
    }
}

/**
 * Checks a computed matrix file against a reference matrix file.
 *
 * Both files are read with matfile_read_dimensions / matfile_read_matrix.
 * The matrices match when their dimensions are equal and every pair of
 * elements differs by at most FLOAT_TOLERANCE. The first problem found is
 * reported on stderr.
 *
 * @param result_filename    File holding the computed matrix.
 * @param reference_filename File holding the expected matrix.
 * @return true when the matrices match, false on an unreadable file, a
 *         dimension mismatch or a value mismatch.
 */
bool validate_result(const std::string &result_filename, const std::string &reference_filename)
{
    // Open result and reference files
    std::ifstream result(result_filename, std::ios::binary);
    std::ifstream reference(reference_filename, std::ios::binary);
    if (!result || !reference) {
        std::cerr << "Failed to open " << (!result ? result_filename : reference_filename)
                  << " for validation" << std::endl;
        return false;
    }

    // Read dimensions of result and reference matrices
    uint32_t m_result = 0;
    uint32_t n_result = 0;
    uint32_t m_ref = 0;
    uint32_t n_ref = 0;
    matfile_read_dimensions(result, m_result, n_result);
    matfile_read_dimensions(reference, m_ref, n_ref);
    // Validate dimensions
    if (m_result != m_ref || n_result != n_ref) {
        std::cerr << "Dimension mismatch: result is " << m_result << "x" << n_result
                  << ", reference is " << m_ref << "x" << n_ref << std::endl;
        return false;
    }

    // Element count in size_t so large matrices do not wrap around in 32 bits.
    const std::size_t count = static_cast<std::size_t>(m_result) * n_result;

    // Pre-fill with NaN: if a file holds fewer values than its header
    // promises, the missing elements stay NaN and fail the comparison below
    // instead of being read as uninitialised memory.
    std::vector<float> C_result(count, NAN);
    std::vector<float> C_reference(count, NAN);
    matfile_read_matrix(result, C_result.data());
    matfile_read_matrix(reference, C_reference.data());

    // Validate values with tolerance
    for (std::size_t i = 0; i < count; ++i) {
        const float diff = std::fabs(C_result[i] - C_reference[i]);
        // Written as !(diff <= tol) rather than (diff > tol) so that a NaN
        // difference counts as a mismatch; every ordered comparison with NaN
        // is false.
        if (!(diff <= FLOAT_TOLERANCE)) {
            std::cerr << "Value mismatch at index " << i << ": result is " << C_result[i]
                      << ", reference is " << C_reference[i] << std::endl;
            return false;
        }
    }
    return true;
}
40 changes: 40 additions & 0 deletions matfile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#include "matfile.h"

#include <fstream>

/**
 * Reads the matrix header: the first two whitespace-separated tokens at the
 * stream's current position, parsed as the row count and the column count.
 *
 * @param file Stream to read from; it is left positioned after the two tokens.
 * @param m    Receives the first value (rows).
 * @param n    Receives the second value (columns).
 *
 * Parsing uses std::stoul, so a missing or non-numeric token raises the
 * exception std::stoul throws.
 */
void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n)
{
    std::string rows_token;
    std::string cols_token;
    file >> rows_token;
    file >> cols_token;

    m = std::stoul(rows_token);
    n = std::stoul(cols_token);
}

/**
 * Reads every remaining whitespace-separated token from the stream, parses
 * each with std::stof and stores the values consecutively in matrix.
 *
 * @param file   Stream positioned at the first matrix value (i.e. after the
 *               dimension header has been consumed).
 * @param matrix Destination buffer.
 *
 * NOTE: the number of values written is dictated by the file, not by the
 * caller. The buffer must be large enough for every value left in the
 * stream; the signature offers no way to bound the write.
 * A token std::stof cannot parse raises the exception std::stof throws.
 */
void matfile_read_matrix(std::ifstream& file, float* matrix)
{
    std::size_t index = 0;
    std::string token;
    // token by token until extraction fails (end of file)
    while (file >> token) {
        matrix[index++] = std::stof(token);
    }
}

/**
 * Writes an m x n row-major matrix to filename as text.
 *
 * Layout: a first line "m n", then one line per matrix row with every value
 * followed by a single space. Values use the stream's default float
 * formatting. An existing file is overwritten.
 *
 * If the file cannot be opened, a message is printed to stderr and nothing
 * is written.
 */
void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n)
{
    // open the file for writing
    std::ofstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Failed to open " << filename << " for writing" << std::endl;
        return;
    }

    // '\n' instead of std::endl: endl would flush the stream after every
    // row, which is pure overhead for a file that is closed right below.
    file << m << ' ' << n << '\n';

    const std::size_t cols = n;  // size_t so the flat index cannot wrap in 32 bits
    for (std::size_t i = 0; i < m; ++i) {
        const float* row = matrix + i * cols;
        for (std::size_t j = 0; j < cols; ++j) {
            file << row[j] << " ";
        }
        file << '\n';
    }

    // close (and thereby flush) the file
    file.close();
}
8 changes: 8 additions & 0 deletions matfile.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#pragma once

// Include what the declarations below actually use: std::ifstream lives in
// <fstream> and std::string in <string>; <iostream> alone does not guarantee
// either.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

/// Reads the two leading whitespace-separated tokens of `file` into `m`
/// (rows) and `n` (columns).
void matfile_read_dimensions(std::ifstream& file, uint32_t& m, uint32_t& n);

/// Reads every remaining value of `file` into `matrix`, in file order.
/// The buffer must be large enough for all values left in the stream.
void matfile_read_matrix(std::ifstream& file, float* matrix);

/// Writes the `m` x `n` row-major `matrix` to `filename`: a "m n" header line
/// followed by one text line per matrix row.
void matfile_write_matrix(std::string const& filename, const float* matrix, uint32_t m, uint32_t n);
Loading