Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
build/
data/
9 changes: 6 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ cmake_minimum_required(VERSION 3.10)

project(MatrixMultiplication)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 23)

set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

find_package(OpenMP REQUIRED)

if(OpenMP_CXX_FOUND)
Expand All @@ -17,9 +19,10 @@ if(APPLE)
endif()


add_executable(matmul main_ans.cpp)
add_executable(matmul main.cpp)
target_compile_options(matmul PRIVATE -g -O3)


if(OpenMP_CXX_FOUND)
target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
endif()
endif()
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,3 +235,11 @@ git push origin student-name
- Use small test cases to debug your blocked and parallel implementations.

Good luck, and enjoy optimizing your matrix multiplication!

## Report

The table of results is in `timings.csv` and also `report.pdf`,
please read report.pdf, it has a table and graph of the results.

`report.typ` has the plaintext report before compilation. It depends on
my personal assignment template so it is not of much use.
321 changes: 222 additions & 99 deletions main.cpp
Original file line number Diff line number Diff line change
@@ -1,114 +1,237 @@
#include <iostream>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <string>
#include <iomanip>
#include <iostream>
#include <omp.h>
#include <cmath>

void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
//TODO : Implement naive matrix multiplication
}

void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
// TODO: Implement blocked matrix multiplication
// A is m x n, B is n x p, C is m x p
// Use block_size to divide matrices into submatrices
}

void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
// TODO: Implement parallel matrix multiplication using OpenMP
// A is m x n, B is n x p, C is m x p
}
#include <pthread.h>
#include <stdexcept>
#include <string>

bool validate_result(const std::string &result_file, const std::string &reference_file) {
//TODO : Implement result validation
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
uint32_t p) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < p; j++) {
for (int k = 0; k < n; k++) {
C[i * p + j] += A[i * n + k] * B[k * p + j];
}
}
}
}

int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
return 1;
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
uint32_t p, uint32_t block_size) {
for (int ii = 0; ii < m; ii += block_size) {
for (int jj = 0; jj < p; jj += block_size) {
for (int kk = 0; kk < n; kk += block_size) {
// Process block: C[ii:ii+block_size, jj:jj+block_size] +=
// A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size,
// jj:jj+block_size]
for (int i = ii; i < std::min(ii + block_size, m); i++) {
for (int j = jj; j < std::min(jj + block_size, p); j++) {
for (int k = kk; k < std::min(kk + block_size, n); k++) {
C[i * p + j] += A[i * n + k] * B[k * p + j];
}
}
}
}
}
}
}

int case_number = std::atoi(argv[1]);
if (case_number < 0 || case_number > 9) {
std::cerr << "Case number must be between 0 and 9" << std::endl;
return 1;
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n,
uint32_t p) {
#pragma omp parallel for
for (int i = 0; i < m; i++) {
for (int j = 0; j < p; j++) {
for (int k = 0; k < n; k++) {
C[i * p + j] += A[i * n + k] * B[k * p + j];
}
}
}
}

// Construct file paths
std::string folder = "data/" + std::to_string(case_number) + "/";
std::string input0_file = folder + "input0.raw";
std::string input1_file = folder + "input1.raw";
std::string result_file = folder + "result.raw";
std::string reference_file = folder + "output.raw";

// TODO Read input0.raw (matrix A)


// TODO Read input1.raw (matrix B)


// Allocate memory for result matrices
float *C_naive = new float[m * p];
float *C_blocked = new float[m * p];
float *C_parallel = new float[m * p];

// Measure performance of naive_matmul
double start_time = omp_get_wtime();
naive_matmul(C_naive, A, B, m, n, p);
double naive_time = omp_get_wtime() - start_time;

// TODO Write naive result to file


// Validate naive result
bool naive_correct = validate_result(result_file, reference_file);
if (!naive_correct) {
std::cerr << "Naive result validation failed for case " << case_number << std::endl;
bool validate_result(const std::string &result_file,
const std::string &reference_file) {
std::ifstream res_file(result_file);
std::ifstream ref_file(reference_file);

int res_dimsM, ref_dimsM;
int res_dimsN, ref_dimsN;
float curr_res, curr_ref;

res_file >> res_dimsM;
ref_file >> ref_dimsM;
res_file >> res_dimsN;
ref_file >> ref_dimsN;
if (res_dimsM != ref_dimsM || res_dimsN != ref_dimsN) {
return false;
}
int M = res_dimsM;
int N = res_dimsN;
const int total = M * N;
for (int i = 0; i < total; ++i) {
res_file >> curr_res;
ref_file >> curr_ref;
if (curr_res != curr_ref) {
std::cout << "Got: " << curr_res << " Expected:" << curr_ref
<< " at index = " << i << std::endl;
return false;
}
}
return true;
}

// Measure performance of blocked_matmul (use block_size = 32 as default)
start_time = omp_get_wtime();
blocked_matmul(C_blocked, A, B, m, n, p, 32);
double blocked_time = omp_get_wtime() - start_time;

// TODO Write blocked result to file


// Validate blocked result
bool blocked_correct = validate_result(result_file, reference_file);
if (!blocked_correct) {
std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
void write_float_matrix(const std::string filename, const float *data, int M,
int N) {
std::ofstream file(filename);
std::string result;
file << M << " " << N << "\n";
std::stringstream ss;
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
// setting the float precision explicitly to be 2 decimals to match input
// file format
file << std::fixed << std::setprecision(2) << data[i * N + j];
// Add whitespaces as long as we have not hit a new row of the matrix
if (j < N - 1)
file << " ";
}
file << "\n";
}
file.close();
}

// Measure performance of parallel_matmul
start_time = omp_get_wtime();
parallel_matmul(C_parallel, A, B, m, n, p);
double parallel_time = omp_get_wtime() - start_time;

// TODO Write parallel result to file


// Validate parallel result
bool parallel_correct = validate_result(result_file, reference_file);
if (!parallel_correct) {
std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
std::pair<int, int> read_float_matrix(const std::string filename,
float **data_ptr) {
std::ifstream file(filename);
int M, N;
file >> M;
file >> N;
float *data = (float *)calloc(M * N, sizeof(float));

const int total = M * N;
for (int i = 0; i < total; ++i) {
// the insanity of c++...
// the overloaded >> operator implicitly skips all types of whitespace.
// Oh, and since the type of data is float* it correctly converts to float
if (!(file >> data[i])) {
throw std::runtime_error("Parse error or unexpected EOF at element " +
std::to_string(i));
}
}
*data_ptr = data;
file.close();
return {M, N};
}

// Print performance results
std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
std::cout << "Naive time: " << naive_time << " seconds\n";
std::cout << "Blocked time: " << blocked_time << " seconds\n";
std::cout << "Parallel time: " << parallel_time << " seconds\n";
std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";

// Clean up
delete[] A;
delete[] B;
delete[] C_naive;
delete[] C_blocked;
delete[] C_parallel;

return 0;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
return 1;
}

int case_number = std::atoi(argv[1]);
if (case_number < 0 || case_number > 9) {
std::cerr << "Case number must be between 0 and 9" << std::endl;
return 1;
}

// Construct file paths
std::string folder = "data/" + std::to_string(case_number) + "/";
std::string input0_file = folder + "input0.raw";
std::string input1_file = folder + "input1.raw";
std::string result_file = folder + "result.raw";
std::string reference_file = folder + "output.raw";

float *data_ptr;
auto A_size = read_float_matrix(input0_file, &data_ptr);
float *A = data_ptr;
auto B_size = read_float_matrix(input1_file, &data_ptr);
float *B = data_ptr;
int m = A_size.first;
int n = A_size.second;
int p = B_size.second;

// Allocate memory for result matrices
// NOTE: Use calloc to zero init memory and eliminate any problems that comes from
// using += in the matmul implementation
float *C_naive = (float *)calloc(m * p, sizeof(float));
float *C_blocked = (float *)calloc(m * p, sizeof(float));
float *C_parallel = (float *)calloc(m * p, sizeof(float));

// Measure performance of naive_matmul
double start_time = omp_get_wtime();
naive_matmul(C_naive, A, B, m, n, p);
double naive_time = omp_get_wtime() - start_time;

write_float_matrix(result_file, C_naive, m, p);

// Validate naive result
bool naive_correct = validate_result(result_file, reference_file);
if (!naive_correct) {
std::cerr << "Naive result validation failed for case " << case_number
<< std::endl;
}

// Measure performance of blocked_matmul (use block_size = 32 as default)
start_time = omp_get_wtime();
blocked_matmul(C_blocked, A, B, m, n, p, 32);
double blocked_time = omp_get_wtime() - start_time;

write_float_matrix(result_file, C_blocked, m, p);

// Validate blocked result
bool blocked_correct = validate_result(result_file, reference_file);
if (!blocked_correct) {
std::cerr << "Blocked result validation failed for case " << case_number
<< std::endl;
}

// Measure performance of parallel_matmul
start_time = omp_get_wtime();
parallel_matmul(C_parallel, A, B, m, n, p);
double parallel_time = omp_get_wtime() - start_time;

// TODO Write parallel result to file
write_float_matrix(result_file, C_parallel, m, p);

// Validate parallel result
bool parallel_correct = validate_result(result_file, reference_file);
if (!parallel_correct) {
std::cerr << "Parallel result validation failed for case " << case_number
<< std::endl;
}

// Print performance results
std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p
<< "):\n";
std::cout << std::fixed << std::setprecision(8)
<< "Naive time: " << naive_time << " seconds\n";
std::cout << std::fixed << std::setprecision(8)
<< "Blocked time: " << blocked_time << " seconds\n";
std::cout << std::fixed << std::setprecision(8)
<< "Parallel time: " << parallel_time << " seconds\n";
std::cout << std::fixed << std::setprecision(8)
<< "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
std::cout << std::fixed << std::setprecision(8)
<< "Parallel speedup: " << (naive_time / parallel_time) << "x\n";

// Append timing results to CSV
std::ofstream csv_file("timings.csv", std::ios::app);
csv_file << std::fixed << naive_time << ";" << blocked_time << ";"
<< parallel_time << "\n";
csv_file.close();

// Clean up
// NOTE: I use calloc so free must be used, I think it is
// UB to use delete on malloced memory
free(A);
free(B);
free(C_naive);
free(C_blocked);
free(C_parallel);

return 0;
}
Binary file added report.pdf
Binary file not shown.
Loading