diff --git a/README.md b/README.md index 51c7f2a..f634165 100644 --- a/README.md +++ b/README.md @@ -109,22 +109,37 @@ threads (e.g., 2, 4, 8) by setting the environment variable `OMP_NUM_THREADS`. For each test case (0 through 9 in the `data` folder): - Measure the **wall clock time** for: - - Naive matrix multiplication (`naive_matmul`). - - Cache-optimized matrix multiplication (`blocked_matmul`). - - Parallel matrix multiplication (`parallel_matmul`). + - Naive matrix multiplication (`naive_matmul`). + - Cache-optimized matrix multiplication (`blocked_matmul`). + - Parallel matrix multiplication (`parallel_matmul`). - Use `omp_get_wtime()` for timing, as it provides high-resolution wall clock time. - Report the times in a table in your submission README.md, including: - - Test case number. - - Matrix dimensions (m × n × p). - - Wall clock time for each implementation (in seconds). - - Speedup of blocked and parallel implementations over the naive implementation. + - Test case number. + - Matrix dimensions (m × n × p). + - Wall clock time for each implementation (in seconds). + - Speedup of blocked and parallel implementations over the naive implementation. Example table format: | Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup | -|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------| +| --------- | ---------------------- | -------------- | ---------------- | ----------------- | --------------- | ---------------- | | 0 | 512 × 512 × 512 | 2.345 | 0.987 | 0.543 | 2.38× | 4.32× | +## Results - Group H + +| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup | +| --------- | ---------------------- | -------------- | ---------------- | ----------------- | --------------- | ---------------- | +| 0 | 64 x 64 x 64 | 0.000397436 | 0.000262500 | 0.000198718 | 1.51x | 2.00x | +| 1 | 128 x 64 x 128 | 0.001657894 | 0.000984128 | 0.000712641 | 1.68x | 2.33x | +| 2 | 100 x 128 x 56 | 0.001347827 | 0.000594339 | 0.000440559 | 2.27x | 3.06x | +| 3 | 128 x 64 x 128 | 0.001729733 | 0.001550001 | 0.000840000 | 1.12x | 2.06x | +| 4 | 32 x 128 x 32 | 0.000229629 | 0.000165354 | 0.000174157 | 1.39x | 1.32x | +| 5 | 200 x 100 x 256 | 0.007874995 | 0.004499997 | 0.002863635 | 1.75x | 2.75x | +| 6 | 256 x 256 x 256 | 0.052999973 | 0.013999999 | 0.011199999 | 3.79x | 4.73x | +| 7 | 256 x 300 x 256 | 0.034000039 | 0.019500017 | 0.013499975 | 1.74x | 2.52x | +| 8 | 64 x 128 x 64 | 0.000819675 | 0.000439656 | 0.000398438 | 1.86x | 2.06x | +| 9 | 256 x 256 x 257 | 0.026499987 | 0.019500017 | 0.007750005 | 1.36x | 3.42x | + --- #### Matrix Storage and Memory Management @@ -138,9 +153,9 @@ Example table format: #### Input/Output and Validation - Use the same input/output format as Assignment 1: - - Input files: `data//input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)). - - Output file: `data//result.raw` (matrix \( C \)). - - Reference file: `data//output.raw` for validation. + - Input files: `data//input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)). + - Output file: `data//result.raw` (matrix \( C \)). + - Reference file: `data//output.raw` for validation. - The executable accepts a case number (0–9) as a command-line argument. - Validate correctness by comparing `result.raw` with `output.raw` for each implementation. @@ -150,22 +165,22 @@ Example table format: - Use the provided `CMakeLists.txt` to build the project. - **Additional Requirements**: - - Ensure OpenMP is enabled in your compiler (e.g., `-fopenmp` for GCC). - - The provided CMake file includes OpenMP support. + - Ensure OpenMP is enabled in your compiler (e.g., `-fopenmp` for GCC). + - The provided CMake file includes OpenMP support. - **Windows Users**: - - Use CLion or Visual Studio with CMake. - - Alternatively, use MinGW with `cmake -G "MinGW Makefiles"` and `make`. + - Use CLion or Visual Studio with CMake. + - Alternatively, use MinGW with `cmake -G "MinGW Makefiles"` and `make`. - **Linux/Mac Users**: - - Make sure the GCC compiler is installed (`brew install gcc` on Mac). - - Configure CMake to use the correct compiler: - ```bash - cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ . - ``` - - Run `cmake .` to generate a Makefile, then `make`. + - Make sure the GCC compiler is installed (`brew install gcc` on Mac). + - Configure CMake to use the correct compiler: + ```bash + cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ . + ``` + - Run `cmake .` to generate a Makefile, then `make`. - **Testing OpenMP**: - - Set the number of threads using the environment variable `OMP_NUM_THREADS` (e.g., `export OMP_NUM_THREADS=4` on - Linux/Mac, or `set OMP_NUM_THREADS=4` on Windows). - - Test with different thread counts to find the best performance. + - Set the number of threads using the environment variable `OMP_NUM_THREADS` (e.g., `export OMP_NUM_THREADS=4` on + Linux/Mac, or `set OMP_NUM_THREADS=4` on Windows). + - Test with different thread counts to find the best performance. --- @@ -209,7 +224,7 @@ git push origin student-name ### Grading (100 Points Total) | Subtask | Points | -|---------------------------------------------|--------| +| ------------------------------------------- | ------ | | Correct implementation of `blocked_matmul` | 30 | | Correct implementation of `parallel_matmul` | 30 | | Accurate performance measurements | 20 | @@ -222,16 +237,16 @@ git push origin student-name ### Tips for Success - **Cache Optimization**: - - Experiment with different block sizes. Start with powers of 2 (e.g., 16, 32, 64). - - Use a block size that balances cache usage without excessive overhead. + - Experiment with different block sizes. Start with powers of 2 (e.g., 16, 32, 64). + - Use a block size that balances cache usage without excessive overhead. - **OpenMP**: - - Test with different thread counts to find the optimal number for your system. - - Be cautious of false sharing (when threads access nearby memory locations, causing cache coherence issues). + - Test with different thread counts to find the optimal number for your system. + - Be cautious of false sharing (when threads access nearby memory locations, causing cache coherence issues). - **Performance Measurement**: - - Run multiple iterations for each test case and report the average time to reduce variability. - - Ensure no other heavy processes are running during measurements. + - Run multiple iterations for each test case and report the average time to reduce variability. + - Ensure no other heavy processes are running during measurements. - **Debugging**: - - Validate each implementation against `output.raw` to ensure correctness before optimizing. - - Use small test cases to debug your blocked and parallel implementations. + - Validate each implementation against `output.raw` to ensure correctness before optimizing. + - Use small test cases to debug your blocked and parallel implementations. Good luck, and enjoy optimizing your matrix multiplication! diff --git a/main.cpp b/main.cpp index 65bf108..78cc3f7 100644 --- a/main.cpp +++ b/main.cpp @@ -3,34 +3,190 @@ #include #include #include +#include +#include +#include +#include -void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - //TODO : Implement naive matrix multiplication +const double MIN_TIMING_SECONDS = 0.05; + +bool read_matrix(const std::string &file_name, float *&matrix, uint32_t &rows, uint32_t &cols) +{ + std::ifstream file(file_name); + if (!file) + { + std::cerr << "Failed to open " << file_name << std::endl; + return false; + } + + if (!(file >> rows >> cols)) + { + std::cerr << "Failed to read matrix dimensions from " << file_name << std::endl; + return false; + } + + matrix = new float[rows * cols]; + for (uint32_t i = 0; i < rows * cols; i++) + { + if (!(file >> matrix[i])) + { + std::cerr << "Failed to read matrix value " << i << " from " << file_name << std::endl; + delete[] matrix; + matrix = nullptr; + return false; + } + } + + return true; } -void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) { - // TODO: Implement blocked matrix multiplication - // A is m x n, B is n x p, C is m x p - // Use block_size to divide matrices into submatrices +bool write_matrix(const std::string &file_name, const float *matrix, uint32_t rows, uint32_t cols) +{ + std::ofstream file(file_name); + if (!file) + { + std::cerr << "Failed to open " << file_name << " for writing" << std::endl; + return false; + } + + file << rows << " " << cols << "\n"; + file << std::fixed << std::setprecision(2); + for (uint32_t i = 0; i < rows; i++) + { + for (uint32_t j = 0; j < cols; j++) + { + if (j > 0) + { + file << " "; + } + file << matrix[i * cols + j]; + } + file << "\n"; + } + + return true; } -void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - // TODO: Implement parallel matrix multiplication using OpenMP - // A is m x n, B is n x p, C is m x p +void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) +{ + std::fill(C, C + m * p, 0.0f); + + for (uint32_t i = 0; i < m; i++) + { + for (uint32_t j = 0; j < p; j++) + { + for (uint32_t k = 0; k < n; k++) + { + C[i * p + j] += A[i * n + k] * B[k * p + j]; + } + } + } } -bool validate_result(const std::string &result_file, const std::string &reference_file) { - //TODO : Implement result validation +void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) +{ + std::fill(C, C + m * p, 0.0f); + + for (uint32_t ii = 0; ii < m; ii += block_size) + { + for (uint32_t kk = 0; kk < n; kk += block_size) + { + for (uint32_t jj = 0; jj < p; jj += block_size) + { + const uint32_t i_end = std::min(ii + block_size, m); + const uint32_t k_end = std::min(kk + block_size, n); + const uint32_t j_end = std::min(jj + block_size, p); + + for (uint32_t i = ii; i < i_end; i++) + { + for (uint32_t k = kk; k < k_end; k++) + { + const float a = A[i * n + k]; + for (uint32_t j = jj; j < j_end; j++) + { + C[i * p + j] += a * B[k * p + j]; + } + } + } + } + } + } } -int main(int argc, char *argv[]) { - if (argc != 2) { +void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) +{ + std::fill(C, C + m * p, 0.0f); + +#pragma omp parallel for schedule(static) + for (int i = 0; i < static_cast(m); i++) + { + for (uint32_t j = 0; j < p; j++) + { + for (uint32_t k = 0; k < n; k++) + { + C[i * p + j] += A[i * n + k] * B[k * p + j]; + } + } + } +} + +bool validate_result(const std::string &result_file, const std::string &reference_file) +{ + float *result = nullptr; + float *reference = nullptr; + uint32_t result_rows = 0; + uint32_t result_cols = 0; + uint32_t reference_rows = 0; + uint32_t reference_cols = 0; + + if (!read_matrix(result_file, result, result_rows, result_cols) || + !read_matrix(reference_file, reference, reference_rows, reference_cols)) + { + delete[] result; + delete[] reference; + return false; + } + + if (result_rows != reference_rows || result_cols != reference_cols) + { + std::cerr << "Dimension mismatch: result is " << result_rows << "x" << result_cols + << ", reference is " << reference_rows << "x" << reference_cols << std::endl; + delete[] result; + delete[] reference; + return false; + } + + const float epsilon = 5e-2f; + const uint32_t count = result_rows * result_cols; + for (uint32_t i = 0; i < count; i++) + { + const float diff = std::fabs(result[i] - reference[i]); + if (diff > epsilon) + { + std::cerr << "Value mismatch at index " << i << ": result=" << result[i] + << ", reference=" << reference[i] << ", diff=" << diff << std::endl; + delete[] result; + delete[] reference; + return false; + } + } + + delete[] result; + delete[] reference; + return true; +} + +int main(int argc, char *argv[]) +{ + if (argc != 2) + { std::cerr << "Usage: " << argv[0] << " " << std::endl; return 1; } int case_number = std::atoi(argv[1]); - if (case_number < 0 || case_number > 9) { + if (case_number < 0 || case_number > 9) + { std::cerr << "Case number must be between 0 and 9" << std::endl; return 1; } @@ -42,60 +198,124 @@ int main(int argc, char *argv[]) { std::string result_file = folder + "result.raw"; std::string reference_file = folder + "output.raw"; - // TODO Read input0.raw (matrix A) + float *A = nullptr; + float *B = nullptr; + uint32_t m = 0; + uint32_t n = 0; + uint32_t b_rows = 0; + uint32_t p = 0; + if (!read_matrix(input0_file, A, m, n)) + { + return 1; + } - // TODO Read input1.raw (matrix B) + if (!read_matrix(input1_file, B, b_rows, p)) + { + delete[] A; + return 1; + } + if (n != b_rows) + { + std::cerr << "Incompatible matrix dimensions: A is " << m << "x" << n + << ", B is " << b_rows << "x" << p << std::endl; + delete[] A; + delete[] B; + return 1; + } // Allocate memory for result matrices float *C_naive = new float[m * p]; float *C_blocked = new float[m * p]; float *C_parallel = new float[m * p]; - // Measure performance of naive_matmul double start_time = omp_get_wtime(); - naive_matmul(C_naive, A, B, m, n, p); - double naive_time = omp_get_wtime() - start_time; - - // TODO Write naive result to file + double elapsed_time = 0.0; + int timing_repetitions = 0; + do + { + naive_matmul(C_naive, A, B, m, n, p); + timing_repetitions++; + elapsed_time = omp_get_wtime() - start_time; + } while (elapsed_time < MIN_TIMING_SECONDS); + double naive_time = elapsed_time / timing_repetitions; + if (!write_matrix(result_file, C_naive, m, p)) + { + delete[] A; + delete[] B; + delete[] C_naive; + delete[] C_blocked; + delete[] C_parallel; + return 1; + } // Validate naive result bool naive_correct = validate_result(result_file, reference_file); - if (!naive_correct) { + if (!naive_correct) + { std::cerr << "Naive result validation failed for case " << case_number << std::endl; } - // Measure performance of blocked_matmul (use block_size = 32 as default) start_time = omp_get_wtime(); - blocked_matmul(C_blocked, A, B, m, n, p, 32); - double blocked_time = omp_get_wtime() - start_time; - - // TODO Write blocked result to file + elapsed_time = 0.0; + timing_repetitions = 0; + do + { + blocked_matmul(C_blocked, A, B, m, n, p, 32); + timing_repetitions++; + elapsed_time = omp_get_wtime() - start_time; + } while (elapsed_time < MIN_TIMING_SECONDS); + double blocked_time = elapsed_time / timing_repetitions; + if (!write_matrix(result_file, C_blocked, m, p)) + { + delete[] A; + delete[] B; + delete[] C_naive; + delete[] C_blocked; + delete[] C_parallel; + return 1; + } // Validate blocked result bool blocked_correct = validate_result(result_file, reference_file); - if (!blocked_correct) { + if (!blocked_correct) + { std::cerr << "Blocked result validation failed for case " << case_number << std::endl; } - // Measure performance of parallel_matmul start_time = omp_get_wtime(); - parallel_matmul(C_parallel, A, B, m, n, p); - double parallel_time = omp_get_wtime() - start_time; - - // TODO Write parallel result to file + elapsed_time = 0.0; + timing_repetitions = 0; + do + { + parallel_matmul(C_parallel, A, B, m, n, p); + timing_repetitions++; + elapsed_time = omp_get_wtime() - start_time; + } while (elapsed_time < MIN_TIMING_SECONDS); + double parallel_time = elapsed_time / timing_repetitions; + if (!write_matrix(result_file, C_parallel, m, p)) + { + delete[] A; + delete[] B; + delete[] C_naive; + delete[] C_blocked; + delete[] C_parallel; + return 1; + } // Validate parallel result bool parallel_correct = validate_result(result_file, reference_file); - if (!parallel_correct) { + if (!parallel_correct) + { std::cerr << "Parallel result validation failed for case " << case_number << std::endl; } // Print performance results + std::cout << std::fixed << std::setprecision(9); std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n"; std::cout << "Naive time: " << naive_time << " seconds\n"; std::cout << "Blocked time: " << blocked_time << " seconds\n"; @@ -111,4 +331,4 @@ int main(int argc, char *argv[]) { delete[] C_parallel; return 0; -} \ No newline at end of file +}