From 5d4bebd1a24e9c37cc66f7b30dc0a5e8e8b5738d Mon Sep 17 00:00:00 2001 From: Khyati Kiyawat Date: Mon, 17 Mar 2025 19:05:37 -0400 Subject: [PATCH 1/3] added rmsnorm v1 --- PIMbench/rmsnorm/Makefile | 16 ++ PIMbench/rmsnorm/PIM/Makefile | 24 ++ PIMbench/rmsnorm/PIM/rmsnorm.cpp | 250 +++++++++++++++++++++ PIMbench/rmsnorm/PIM/run_rmsnorm.sh | 1 + PIMbench/rmsnorm/README.md | 96 ++++++++ PIMbench/rmsnorm/baselines/CPU/Makefile | 24 ++ PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp | 112 +++++++++ 7 files changed, 523 insertions(+) create mode 100644 PIMbench/rmsnorm/Makefile create mode 100644 PIMbench/rmsnorm/PIM/Makefile create mode 100644 PIMbench/rmsnorm/PIM/rmsnorm.cpp create mode 100755 PIMbench/rmsnorm/PIM/run_rmsnorm.sh create mode 100644 PIMbench/rmsnorm/README.md create mode 100644 PIMbench/rmsnorm/baselines/CPU/Makefile create mode 100644 PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp diff --git a/PIMbench/rmsnorm/Makefile b/PIMbench/rmsnorm/Makefile new file mode 100644 index 00000000..b3e4ce58 --- /dev/null +++ b/PIMbench/rmsnorm/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of matrix vector multiplication +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) diff --git a/PIMbench/rmsnorm/PIM/Makefile b/PIMbench/rmsnorm/PIM/Makefile new file mode 100644 index 00000000..53225bf0 --- /dev/null +++ b/PIMbench/rmsnorm/PIM/Makefile @@ -0,0 +1,24 @@ +# Makefile: C++ version of matrix vector multiplication +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. 
+include ${PROJ_ROOT}/Makefile.common + +# make USE_OPENMP=1 +USE_OPENMP ?= 0 +ifeq ($(USE_OPENMP),1) + CXXFLAGS += -fopenmp +endif + +EXEC := rmsnorm.out +SRC := rmsnorm.cpp + +debug perf dramsim3_integ: $(EXEC) + +$(EXEC): $(SRC) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + +clean: + rm -rf $(EXEC) *.dSYM diff --git a/PIMbench/rmsnorm/PIM/rmsnorm.cpp b/PIMbench/rmsnorm/PIM/rmsnorm.cpp new file mode 100644 index 00000000..07a09547 --- /dev/null +++ b/PIMbench/rmsnorm/PIM/rmsnorm.cpp @@ -0,0 +1,250 @@ +// Test: C++ version of matrix vector multiplication +// Copyright (c) 2024 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. + +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" +#include +#include + +std::chrono::duration hostElapsedTime = std::chrono::duration::zero(); + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t vectorLength; + char *configFile; + char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./rmsnorm.out [options]" + "\n" + "\n -l vectorLength (default=128 elements)" + "\n -c dramsim config file" + "\n -i input file containing two vectors (default=generates vector with random numbers)" + "\n -v t = verifies PIM output with host output. 
(default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.vectorLength = 128; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:l:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'l': + p.vectorLength = strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't') ? true : false; + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + return p; +} + +// Newton-Raphson iterative integer square root +uint32_t newton_sqrt(uint32_t x) { + if (x == 0) return 0; // Handle zero case + + uint32_t guess = x; // Initial guess + uint32_t prev_guess = 0; + + while (guess != prev_guess) { // Continue until convergence + prev_guess = guess; + guess = (guess + x / guess) / 2; // Newton-Raphson iteration + } + + //std::cout << "newton sqrt: " << guess << std::endl; + return guess; +} + +void rmsnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) +{ + PimObjId srcObj1 = pimAlloc(PIM_ALLOC_AUTO, vectorLength, PIM_INT32); + if (srcObj1 == -1) + { + std::cout << "Abort" << std::endl; + return; + } + + PimObjId dstObj = pimAllocAssociated(srcObj1, PIM_INT32); + if (dstObj == -1) + { + std::cout << "Abort" << std::endl; + return; + } + + PimStatus status; + + status = pimCopyHostToDevice((void *)srcVector.data(), srcObj1); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + // Square the element of the vector + status = pimMul(srcObj1, srcObj1, dstObj); //TODO: How to take care of overflow? 
+ if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + + // Sum of the squared elements - reduction + uint32_t sum = 0; + status = pimRedSum(dstObj, static_cast(&sum), 0, vectorLength); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + + auto start_cpu = std::chrono::high_resolution_clock::now(); + // divide to get mean + uint32_t mean = sum/vectorLength; + + // Compute RMS using Newton-Raphson square root + uint32_t rms = newton_sqrt(mean + 1); // +1 to prevent division by zero + //uint32_t rms = 0; + auto stop_cpu = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu - start_cpu); + + // Scale srcVector + status = pimDivScalar(srcObj1, dstObj, rms+1); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + dst.resize(vectorLength); + status = pimCopyDeviceToHost(dstObj, (void *)dst.data()); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + } + pimFree(srcObj1); + pimFree(dstObj); +} + +int main(int argc, char *argv[]) +{ + struct Params params = getInputParams(argc, argv); + std::cout << "Running RMSNORM for vector of size: " << params.vectorLength << std::endl; + + std::vector srcVector (params.vectorLength, 1), resultVector; + + if (params.shouldVerify) { + if (params.inputFile == nullptr) + { + getVector(params.vectorLength, srcVector); + } + else + { + std::cout << "Reading from input file is not implemented yet." << std::endl; + return 1; + } + } + + if (!createDevice(params.configFile)) + { + return 1; + } + + // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration. 
+ rmsnorm(params.vectorLength, srcVector, resultVector); + + if (params.shouldVerify) + { + bool shouldBreak = false; // shared flag variable + + // verify result + + std::vector result (params.vectorLength, 0); + + //rms norm + uint32_t sum_sq = 0; + + // Compute sum of squares + for (size_t i = 0; i < params.vectorLength; i++) { + sum_sq += (uint32_t)(srcVector[i] * srcVector[i]); // Prevent overflow + } + + // Compute mean squared value + uint32_t mean_sq = sum_sq / params.vectorLength; // Integer division + + // Compute RMS using Newton-Raphson square root + //uint32_t rms = newton_sqrt(mean_sq + 1); // +1 to prevent division by zero + uint32_t rms = sqrt(mean_sq+1); + //std::cout << "sqrt(): " << rms << std::endl; + + // Normalize each element: Y[i] = X[i] / RMS + for (size_t i = 0; i < params.vectorLength; i++) { + result[i] = srcVector[i] / (rms + 1); // Prevent division by zero + } + + for (size_t i = 0; i < params.vectorLength; i++) + { + if (result[i] != resultVector[i]) + { + #pragma omp critical + { + if (!shouldBreak) + { // check the flag again in a critical section + std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl; + shouldBreak = true; // set the flag to true + } + } + } + } + + + if (!shouldBreak) { + std::cout << "\n\nCorrect Answer!!\n\n"; + } + } + + pimShowStats(); + std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." 
<< endl; + + return 0; +} diff --git a/PIMbench/rmsnorm/PIM/run_rmsnorm.sh b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh new file mode 100755 index 00000000..3d3d8b46 --- /dev/null +++ b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh @@ -0,0 +1 @@ +./rmsnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 12000 diff --git a/PIMbench/rmsnorm/README.md b/PIMbench/rmsnorm/README.md new file mode 100644 index 00000000..6698b958 --- /dev/null +++ b/PIMbench/rmsnorm/README.md @@ -0,0 +1,96 @@ +#TO BE UPDATED FOR RMSNORM +# General Matrix Vector Multiplication (GEMV) + +The RMSNorm is a normalization function mostly used in AI models + + +For a detailed description of RMSNorm, you can refer to the [torch.nn.RMSNorm](https://pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html) or the [paper](https://dl.acm.org/doi/pdf/10.5555/3454287.3455397) + +## Directory Structure + +``` +rmsnorm/ +├── PIM/ +│ ├── Makefile +│ ├── rmsnorm.cpp +├── baselines/ +│ ├── CPU/ +│ │ ├── Makefile +│ │ ├── rmsnorm.cpp +│ ├── GPU/ **TODO** +│ │ ├── Makefile +│ │ ├── rmsnorm.cu +├── README.md +├── Makefile +``` + +## Implementation Description + +This repository contains three different implementations of the RMSNORM benchmark: + +1. CPU +2. GPU **TODO** +3. PIM + +### Baseline Implementation + +CPU and GPU have been used as baselines. + +#### CPU + +The CPU variant of GEMV has been implemented using standard C++ library and OpenBLAS for parallel execution. + +#### GPU + +The GPU variant (**TODO** Try torch rmsnorm) + +### PIM Implementation + +The PIM variant is implemented using C++ and three different PIM architectures can be tested with this. + +## Compilation Instructions for Specific Variants + +### CPU Variant + +To compile for the CPU variant, use: + +```bash +cd baselines/CPU +make +``` + +### GPU Variant + +To compile for the GPU variant, use: + +```bash +cd baselines/GPU +make +``` + +*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. 
To run it on a different GPU, please manually change this in the makefile. + +### PIM Variant + +To compile for the PIM variant, use: + +```bash +cd PIM +make -j USE_OPENMP=1 +``` + +## Execution Instructions + +### Running the Executable + +After compiling, run the each executable with the following command that will run it for default parameters: + +```bash +./rmsnorm.out +``` + +To see help text on all usages and how to modify any of the input parameters, use following command: + +```bash +./rmsnorm.out -h +``` diff --git a/PIMbench/rmsnorm/baselines/CPU/Makefile b/PIMbench/rmsnorm/baselines/CPU/Makefile new file mode 100644 index 00000000..ac1a058c --- /dev/null +++ b/PIMbench/rmsnorm/baselines/CPU/Makefile @@ -0,0 +1,24 @@ +# Compiler +CXX := g++ + +# Compiler flags +CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp +LDFLAGS = -lopenblas + +# Executable name +EXEC := rmsnorm.out + +# Source files +SRC_FILES := $(wildcard *.cpp) + + +.PHONY: all clean + +all: $(EXEC) + +$(EXEC): $(SRC_FILES) | + $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDFLAGS) + +clean: + rm -rf $(EXEC) + diff --git a/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp b/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp new file mode 100644 index 00000000..9fff1878 --- /dev/null +++ b/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp @@ -0,0 +1,112 @@ +/** + * @file rmsnorm.cpp + * @brief RMSNORM. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../../util/utilBaselines.h" + +using namespace std; + +// Global Vectors +vector A; +vector B; + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t vectorLength; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./gemv.out [options]" + "\n" + "\n -l vector size (default=128 elements)" + "\n -v t = verifies PIM output with host output. 
(default=false)" + "\n"); +} + +/** + * @brief Parses command line input parameters + * @param argc Number of command line arguments + * @param argv Array of command line arguments + * @return Parsed parameters + */ +struct Params parseParams(int argc, char **argv) +{ + struct Params p; + p.vectorLength = 128; + + int opt; + while ((opt = getopt(argc, argv, ":l:h:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + case 'l': + p.vectorLength = stoull(optarg); + break; + default: + cerr << "\nUnrecognized option: " << opt << "\n"; + usage(); + exit(1); + } + } + return p; +} + +void rmsnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) +{ +uint32_t sum_sq = 0; +for (size_t i = 0; i < vectorLength; i++) +{ + sum_sq += (uint32_t)(srcVector[i] * srcVector[i]); // Prevent overflow +} +uint32_t mean_sq = sum_sq / vectorLength; +uint32_t rms = sqrt(mean_sq+1); +for (size_t i = 0; i < vectorLength; i++) +{ + dst[i] = srcVector[i] / (rms + 1); // Prevent division by zero +} +} + +/** + * @brief Main function. + */ +int main(int argc, char **argv) +{ + // Parse input parameters + Params params = parseParams(argc, argv); + uint64_t vectorLength = params.vectorLength; + + // Initialize vectors + getVector(vectorLength, A); + B.resize(vectorLength); + std::cout << "Done initialization." << std::endl; + + auto start = chrono::high_resolution_clock::now(); + + for (int32_t i = 0; i < WARMUP; i++) + { + rmsnorm(vectorLength, A, B); + } + + auto end = chrono::high_resolution_clock::now(); + + chrono::duration elapsedTime = (end - start) / WARMUP; + cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms." << endl; + + return 0; +} From e41cc3d5dbadd987bfb3481f5e317b150b1671af Mon Sep 17 00:00:00 2001 From: Khyati Kiyawat Date: Mon, 17 Mar 2025 19:59:10 -0400 Subject: [PATCH 2/3] Assignment files, not to make public Revert "added rmsnorm v1" This reverts commit 5d4bebd1a24e9c37cc66f7b30dc0a5e8e8b5738d. 
--- PIMbench/rmsnorm/Makefile | 16 -- PIMbench/rmsnorm/PIM/Makefile | 24 -- PIMbench/rmsnorm/PIM/rmsnorm.cpp | 250 --------------------- PIMbench/rmsnorm/PIM/run_rmsnorm.sh | 1 - PIMbench/rmsnorm/README.md | 96 -------- PIMbench/rmsnorm/baselines/CPU/Makefile | 24 -- PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp | 112 --------- 7 files changed, 523 deletions(-) delete mode 100644 PIMbench/rmsnorm/Makefile delete mode 100644 PIMbench/rmsnorm/PIM/Makefile delete mode 100644 PIMbench/rmsnorm/PIM/rmsnorm.cpp delete mode 100755 PIMbench/rmsnorm/PIM/run_rmsnorm.sh delete mode 100644 PIMbench/rmsnorm/README.md delete mode 100644 PIMbench/rmsnorm/baselines/CPU/Makefile delete mode 100644 PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp diff --git a/PIMbench/rmsnorm/Makefile b/PIMbench/rmsnorm/Makefile deleted file mode 100644 index b3e4ce58..00000000 --- a/PIMbench/rmsnorm/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# Makefile: C++ version of matrix vector multiplication -# Copyright (c) 2024 University of Virginia -# This file is licensed under the MIT License. -# See the LICENSE file in the root of this repository for more details. - -SUBDIRS := PIM - -.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) -.DEFAULT_GOAL := perf - -USE_OPENMP ?= 0 - -debug perf dramsim3_integ clean: $(SUBDIRS) - -$(SUBDIRS): - $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) diff --git a/PIMbench/rmsnorm/PIM/Makefile b/PIMbench/rmsnorm/PIM/Makefile deleted file mode 100644 index 53225bf0..00000000 --- a/PIMbench/rmsnorm/PIM/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -# Makefile: C++ version of matrix vector multiplication -# Copyright (c) 2024 University of Virginia -# This file is licensed under the MIT License. -# See the LICENSE file in the root of this repository for more details. - -PROJ_ROOT = ../../.. 
-include ${PROJ_ROOT}/Makefile.common - -# make USE_OPENMP=1 -USE_OPENMP ?= 0 -ifeq ($(USE_OPENMP),1) - CXXFLAGS += -fopenmp -endif - -EXEC := rmsnorm.out -SRC := rmsnorm.cpp - -debug perf dramsim3_integ: $(EXEC) - -$(EXEC): $(SRC) $(DEPS) - $(CXX) $< $(CXXFLAGS) -o $@ - -clean: - rm -rf $(EXEC) *.dSYM diff --git a/PIMbench/rmsnorm/PIM/rmsnorm.cpp b/PIMbench/rmsnorm/PIM/rmsnorm.cpp deleted file mode 100644 index 07a09547..00000000 --- a/PIMbench/rmsnorm/PIM/rmsnorm.cpp +++ /dev/null @@ -1,250 +0,0 @@ -// Test: C++ version of matrix vector multiplication -// Copyright (c) 2024 University of Virginia -// This file is licensed under the MIT License. -// See the LICENSE file in the root of this repository for more details. - -#include -#include -#include -#include -#include -#if defined(_OPENMP) -#include -#endif - -#include "util.h" -#include "libpimeval.h" -#include -#include - -std::chrono::duration hostElapsedTime = std::chrono::duration::zero(); - -// Params --------------------------------------------------------------------- -typedef struct Params -{ - uint64_t vectorLength; - char *configFile; - char *inputFile; - bool shouldVerify; -} Params; - -void usage() -{ - fprintf(stderr, - "\nUsage: ./rmsnorm.out [options]" - "\n" - "\n -l vectorLength (default=128 elements)" - "\n -c dramsim config file" - "\n -i input file containing two vectors (default=generates vector with random numbers)" - "\n -v t = verifies PIM output with host output. 
(default=false)" - "\n"); -} - -struct Params getInputParams(int argc, char **argv) -{ - struct Params p; - p.vectorLength = 128; - p.configFile = nullptr; - p.inputFile = nullptr; - p.shouldVerify = false; - - int opt; - while ((opt = getopt(argc, argv, "h:l:c:i:v:")) >= 0) - { - switch (opt) - { - case 'h': - usage(); - exit(0); - break; - case 'l': - p.vectorLength = strtoull(optarg, NULL, 0); - break; - case 'c': - p.configFile = optarg; - break; - case 'i': - p.inputFile = optarg; - break; - case 'v': - p.shouldVerify = (*optarg == 't') ? true : false; - break; - default: - fprintf(stderr, "\nUnrecognized option!\n"); - usage(); - exit(0); - } - } - return p; -} - -// Newton-Raphson iterative integer square root -uint32_t newton_sqrt(uint32_t x) { - if (x == 0) return 0; // Handle zero case - - uint32_t guess = x; // Initial guess - uint32_t prev_guess = 0; - - while (guess != prev_guess) { // Continue until convergence - prev_guess = guess; - guess = (guess + x / guess) / 2; // Newton-Raphson iteration - } - - //std::cout << "newton sqrt: " << guess << std::endl; - return guess; -} - -void rmsnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) -{ - PimObjId srcObj1 = pimAlloc(PIM_ALLOC_AUTO, vectorLength, PIM_INT32); - if (srcObj1 == -1) - { - std::cout << "Abort" << std::endl; - return; - } - - PimObjId dstObj = pimAllocAssociated(srcObj1, PIM_INT32); - if (dstObj == -1) - { - std::cout << "Abort" << std::endl; - return; - } - - PimStatus status; - - status = pimCopyHostToDevice((void *)srcVector.data(), srcObj1); - if (status != PIM_OK) - { - std::cout << "Abort" << std::endl; - return; - } - - // Square the element of the vector - status = pimMul(srcObj1, srcObj1, dstObj); //TODO: How to take care of overflow? 
- if (status != PIM_OK) - { - std::cout << "Abort" << std::endl; - return; - } - - - // Sum of the squared elements - reduction - uint32_t sum = 0; - status = pimRedSum(dstObj, static_cast(&sum), 0, vectorLength); - if (status != PIM_OK) - { - std::cout << "Abort" << std::endl; - return; - } - - - auto start_cpu = std::chrono::high_resolution_clock::now(); - // divide to get mean - uint32_t mean = sum/vectorLength; - - // Compute RMS using Newton-Raphson square root - uint32_t rms = newton_sqrt(mean + 1); // +1 to prevent division by zero - //uint32_t rms = 0; - auto stop_cpu = std::chrono::high_resolution_clock::now(); - hostElapsedTime += (stop_cpu - start_cpu); - - // Scale srcVector - status = pimDivScalar(srcObj1, dstObj, rms+1); - if (status != PIM_OK) - { - std::cout << "Abort" << std::endl; - return; - } - - dst.resize(vectorLength); - status = pimCopyDeviceToHost(dstObj, (void *)dst.data()); - if (status != PIM_OK) - { - std::cout << "Abort" << std::endl; - } - pimFree(srcObj1); - pimFree(dstObj); -} - -int main(int argc, char *argv[]) -{ - struct Params params = getInputParams(argc, argv); - std::cout << "Running RMSNORM for vector of size: " << params.vectorLength << std::endl; - - std::vector srcVector (params.vectorLength, 1), resultVector; - - if (params.shouldVerify) { - if (params.inputFile == nullptr) - { - getVector(params.vectorLength, srcVector); - } - else - { - std::cout << "Reading from input file is not implemented yet." << std::endl; - return 1; - } - } - - if (!createDevice(params.configFile)) - { - return 1; - } - - // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration. 
- rmsnorm(params.vectorLength, srcVector, resultVector); - - if (params.shouldVerify) - { - bool shouldBreak = false; // shared flag variable - - // verify result - - std::vector result (params.vectorLength, 0); - - //rms norm - uint32_t sum_sq = 0; - - // Compute sum of squares - for (size_t i = 0; i < params.vectorLength; i++) { - sum_sq += (uint32_t)(srcVector[i] * srcVector[i]); // Prevent overflow - } - - // Compute mean squared value - uint32_t mean_sq = sum_sq / params.vectorLength; // Integer division - - // Compute RMS using Newton-Raphson square root - //uint32_t rms = newton_sqrt(mean_sq + 1); // +1 to prevent division by zero - uint32_t rms = sqrt(mean_sq+1); - //std::cout << "sqrt(): " << rms << std::endl; - - // Normalize each element: Y[i] = X[i] / RMS - for (size_t i = 0; i < params.vectorLength; i++) { - result[i] = srcVector[i] / (rms + 1); // Prevent division by zero - } - - for (size_t i = 0; i < params.vectorLength; i++) - { - if (result[i] != resultVector[i]) - { - #pragma omp critical - { - if (!shouldBreak) - { // check the flag again in a critical section - std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl; - shouldBreak = true; // set the flag to true - } - } - } - } - - - if (!shouldBreak) { - std::cout << "\n\nCorrect Answer!!\n\n"; - } - } - - pimShowStats(); - std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." 
<< endl; - - return 0; -} diff --git a/PIMbench/rmsnorm/PIM/run_rmsnorm.sh b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh deleted file mode 100755 index 3d3d8b46..00000000 --- a/PIMbench/rmsnorm/PIM/run_rmsnorm.sh +++ /dev/null @@ -1 +0,0 @@ -./rmsnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 12000 diff --git a/PIMbench/rmsnorm/README.md b/PIMbench/rmsnorm/README.md deleted file mode 100644 index 6698b958..00000000 --- a/PIMbench/rmsnorm/README.md +++ /dev/null @@ -1,96 +0,0 @@ -#TO BE UPDATED FOR RMSNORM -# General Matrix Vector Multiplication (GEMV) - -The RMSNorm is a normalization function mostly used in AI models - - -For a detailed description of RMSNorm, you can refer to the [torch.nn.RMSNorm](https://pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html) or the [paper](https://dl.acm.org/doi/pdf/10.5555/3454287.3455397) - -## Directory Structure - -``` -rmsnorm/ -├── PIM/ -│ ├── Makefile -│ ├── rmsnorm.cpp -├── baselines/ -│ ├── CPU/ -│ │ ├── Makefile -│ │ ├── rmsnorm.cpp -│ ├── GPU/ **TODO** -│ │ ├── Makefile -│ │ ├── rmsnorm.cu -├── README.md -├── Makefile -``` - -## Implementation Description - -This repository contains three different implementations of the RMSNORM benchmark: - -1. CPU -2. GPU **TODO** -3. PIM - -### Baseline Implementation - -CPU and GPU have been used as baselines. - -#### CPU - -The CPU variant of GEMV has been implemented using standard C++ library and OpenBLAS for parallel execution. - -#### GPU - -The GPU variant (**TODO** Try torch rmsnorm) - -### PIM Implementation - -The PIM variant is implemented using C++ and three different PIM architectures can be tested with this. - -## Compilation Instructions for Specific Variants - -### CPU Variant - -To compile for the CPU variant, use: - -```bash -cd baselines/CPU -make -``` - -### GPU Variant - -To compile for the GPU variant, use: - -```bash -cd baselines/GPU -make -``` - -*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. 
To run it on a different GPU, please manually change this in the makefile. - -### PIM Variant - -To compile for the PIM variant, use: - -```bash -cd PIM -make -j USE_OPENMP=1 -``` - -## Execution Instructions - -### Running the Executable - -After compiling, run the each executable with the following command that will run it for default parameters: - -```bash -./rmsnorm.out -``` - -To see help text on all usages and how to modify any of the input parameters, use following command: - -```bash -./rmsnorm.out -h -``` diff --git a/PIMbench/rmsnorm/baselines/CPU/Makefile b/PIMbench/rmsnorm/baselines/CPU/Makefile deleted file mode 100644 index ac1a058c..00000000 --- a/PIMbench/rmsnorm/baselines/CPU/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -# Compiler -CXX := g++ - -# Compiler flags -CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp -LDFLAGS = -lopenblas - -# Executable name -EXEC := rmsnorm.out - -# Source files -SRC_FILES := $(wildcard *.cpp) - - -.PHONY: all clean - -all: $(EXEC) - -$(EXEC): $(SRC_FILES) | - $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDFLAGS) - -clean: - rm -rf $(EXEC) - diff --git a/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp b/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp deleted file mode 100644 index 9fff1878..00000000 --- a/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/** - * @file rmsnorm.cpp - * @brief RMSNORM. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../../../../util/utilBaselines.h" - -using namespace std; - -// Global Vectors -vector A; -vector B; - -// Params --------------------------------------------------------------------- -typedef struct Params -{ - uint64_t vectorLength; -} Params; - -void usage() -{ - fprintf(stderr, - "\nUsage: ./gemv.out [options]" - "\n" - "\n -l vector size (default=128 elements)" - "\n -v t = verifies PIM output with host output. 
(default=false)" - "\n"); -} - -/** - * @brief Parses command line input parameters - * @param argc Number of command line arguments - * @param argv Array of command line arguments - * @return Parsed parameters - */ -struct Params parseParams(int argc, char **argv) -{ - struct Params p; - p.vectorLength = 128; - - int opt; - while ((opt = getopt(argc, argv, ":l:h:")) >= 0) - { - switch (opt) - { - case 'h': - usage(); - exit(0); - case 'l': - p.vectorLength = stoull(optarg); - break; - default: - cerr << "\nUnrecognized option: " << opt << "\n"; - usage(); - exit(1); - } - } - return p; -} - -void rmsnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) -{ -uint32_t sum_sq = 0; -for (size_t i = 0; i < vectorLength; i++) -{ - sum_sq += (uint32_t)(srcVector[i] * srcVector[i]); // Prevent overflow -} -uint32_t mean_sq = sum_sq / vectorLength; -uint32_t rms = sqrt(mean_sq+1); -for (size_t i = 0; i < vectorLength; i++) -{ - dst[i] = srcVector[i] / (rms + 1); // Prevent division by zero -} -} - -/** - * @brief Main function. - */ -int main(int argc, char **argv) -{ - // Parse input parameters - Params params = parseParams(argc, argv); - uint64_t vectorLength = params.vectorLength; - - // Initialize vectors - getVector(vectorLength, A); - B.resize(vectorLength); - std::cout << "Done initialization." << std::endl; - - auto start = chrono::high_resolution_clock::now(); - - for (int32_t i = 0; i < WARMUP; i++) - { - rmsnorm(vectorLength, A, B); - } - - auto end = chrono::high_resolution_clock::now(); - - chrono::duration elapsedTime = (end - start) / WARMUP; - cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms." 
<< endl; - - return 0; -} From 434c540871d12fee133d6c05c5b1e3e1fcfbc164 Mon Sep 17 00:00:00 2001 From: Khyati Kiyawat Date: Mon, 23 Jun 2025 17:31:18 -0400 Subject: [PATCH 3/3] added lnorm and rmsnorm, GPU baseline is not implemented --- PIMbench/lnorm/Makefile | 16 ++ PIMbench/lnorm/PIM/Makefile | 24 ++ PIMbench/lnorm/PIM/lnorm.cpp | 291 +++++++++++++++++++++ PIMbench/lnorm/PIM/run_lnorm.sh | 1 + PIMbench/lnorm/README.md | 95 +++++++ PIMbench/lnorm/baselines/CPU/Makefile | 24 ++ PIMbench/lnorm/baselines/CPU/lnorm.cpp | 135 ++++++++++ PIMbench/rmsnorm/Makefile | 16 ++ PIMbench/rmsnorm/PIM/Makefile | 24 ++ PIMbench/rmsnorm/PIM/rmsnorm.cpp | 250 ++++++++++++++++++ PIMbench/rmsnorm/PIM/run_rmsnorm.sh | 1 + PIMbench/rmsnorm/README.md | 95 +++++++ PIMbench/rmsnorm/baselines/CPU/Makefile | 24 ++ PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp | 112 ++++++++ 14 files changed, 1108 insertions(+) create mode 100644 PIMbench/lnorm/Makefile create mode 100644 PIMbench/lnorm/PIM/Makefile create mode 100644 PIMbench/lnorm/PIM/lnorm.cpp create mode 100755 PIMbench/lnorm/PIM/run_lnorm.sh create mode 100644 PIMbench/lnorm/README.md create mode 100644 PIMbench/lnorm/baselines/CPU/Makefile create mode 100644 PIMbench/lnorm/baselines/CPU/lnorm.cpp create mode 100644 PIMbench/rmsnorm/Makefile create mode 100644 PIMbench/rmsnorm/PIM/Makefile create mode 100644 PIMbench/rmsnorm/PIM/rmsnorm.cpp create mode 100755 PIMbench/rmsnorm/PIM/run_rmsnorm.sh create mode 100644 PIMbench/rmsnorm/README.md create mode 100644 PIMbench/rmsnorm/baselines/CPU/Makefile create mode 100644 PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp diff --git a/PIMbench/lnorm/Makefile b/PIMbench/lnorm/Makefile new file mode 100644 index 00000000..b3e4ce58 --- /dev/null +++ b/PIMbench/lnorm/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of matrix vector multiplication +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. 
+# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) diff --git a/PIMbench/lnorm/PIM/Makefile b/PIMbench/lnorm/PIM/Makefile new file mode 100644 index 00000000..e637f51d --- /dev/null +++ b/PIMbench/lnorm/PIM/Makefile @@ -0,0 +1,24 @@ +# Makefile: C++ version of matrix vector multiplication +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. +include ${PROJ_ROOT}/Makefile.common + +# make USE_OPENMP=1 +USE_OPENMP ?= 0 +ifeq ($(USE_OPENMP),1) + CXXFLAGS += -fopenmp +endif + +EXEC := lnorm.out +SRC := lnorm.cpp + +debug perf dramsim3_integ: $(EXEC) + +$(EXEC): $(SRC) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + +clean: + rm -rf $(EXEC) *.dSYM diff --git a/PIMbench/lnorm/PIM/lnorm.cpp b/PIMbench/lnorm/PIM/lnorm.cpp new file mode 100644 index 00000000..75b7375f --- /dev/null +++ b/PIMbench/lnorm/PIM/lnorm.cpp @@ -0,0 +1,291 @@ +// Test: C++ version of matrix vector multiplication +// Copyright (c) 2024 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. 
+ +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" +#include +#include + +std::chrono::duration hostElapsedTime = std::chrono::duration::zero(); +//auto start_cpu, stop_cpu; + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t vectorLength; + char *configFile; + char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./lnorm.out [options]" + "\n" + "\n -l vectorLength (default=128 elements)" + "\n -c dramsim config file" + "\n -i input file containing two vectors (default=generates vector with random numbers)" + "\n -v t = verifies PIM output with host output. (default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.vectorLength = 128; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:l:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'l': + p.vectorLength = strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't') ? 
// Integer square root: returns floor(sqrt(x)) using Newton-Raphson iteration.
//
// Starting from an over-estimate, every Newton step strictly lowers the guess
// until it reaches floor(sqrt(x)); the following step no longer lowers it.
// The loop therefore stops at the first non-decreasing step. Waiting for two
// equal consecutive guesses is not safe: for x = k*k - 1 (3, 8, 15, ...) the
// integer iteration alternates between k - 1 and k forever.
uint32_t newton_sqrt(uint32_t x) {
    if (x < 2) return x; // 0 and 1 are their own square roots

    // 64-bit intermediates: guess + x / guess does not fit in 32 bits when x
    // is close to UINT32_MAX.
    const uint64_t value = x;
    uint64_t guess = value;                       // initial over-estimate
    uint64_t next = (guess + value / guess) / 2;  // first Newton step

    while (next < guess) { // continue only while the estimate keeps shrinking
        guess = next;
        next = (guess + value / guess) / 2;
    }

    return static_cast<uint32_t>(guess);
}
getVector(params.vectorLength, srcVector); + } + else + { + std::cout << "Reading from input file is not implemented yet." << std::endl; + return 1; + } + } + + + if (!createDevice(params.configFile)) + { + return 1; + } + + // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration. + lnorm(params.vectorLength, srcVector, resultVector); + + if (params.shouldVerify) + { + bool shouldBreak = false; // shared flag variable + + // verify result + + std::vector result (params.vectorLength, 0); + std::vector src_minus_mean (params.vectorLength, 0); + std::vector sq_src_minus_mean (params.vectorLength, 0); + + int32_t sum = 0; + + for (size_t i = 0; i < params.vectorLength; i++) { + sum += srcVector[i]; + } + + int32_t mean = sum / params.vectorLength; + + for (size_t i = 0; i < params.vectorLength; i++) { + src_minus_mean[i] = srcVector[i] - mean; + } + + for (size_t i = 0; i < params.vectorLength; i++) { + sq_src_minus_mean[i] = (int32_t)(src_minus_mean[i]*src_minus_mean[i]); + } + + int32_t sum2 = 0; + for (size_t i = 0; i < params.vectorLength; i++) { + sum2 += sq_src_minus_mean[i]; + } + + int32_t var = sum2/params.vectorLength; + + int32_t sqrt_var = newton_sqrt(var+1); + if(sqrt_var==0){ + sqrt_var = 1; + } + + // layer norm + for (size_t i = 0; i < params.vectorLength; i++) { + result[i] = src_minus_mean[i] / (sqrt_var); // Prevent division by zero + } + + for (size_t i = 0; i < params.vectorLength; i++) + { + if (result[i] != resultVector[i]) + { + #pragma omp critical + { + if (!shouldBreak) + { // check the flag again in a critical section + std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl; + shouldBreak = true; // set the flag to true + } + } + } + } + + + if (!shouldBreak) { + std::cout << "\n\nCorrect Answer!!\n\n"; + } + } + + pimShowStats(); + std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." 
<< endl; + + return 0; +} diff --git a/PIMbench/lnorm/PIM/run_lnorm.sh b/PIMbench/lnorm/PIM/run_lnorm.sh new file mode 100755 index 00000000..e7aaba83 --- /dev/null +++ b/PIMbench/lnorm/PIM/run_lnorm.sh @@ -0,0 +1 @@ +./lnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 128 diff --git a/PIMbench/lnorm/README.md b/PIMbench/lnorm/README.md new file mode 100644 index 00000000..6c9f4303 --- /dev/null +++ b/PIMbench/lnorm/README.md @@ -0,0 +1,95 @@ +# Layer Normalization (LNORM) + +The LayerNorm is a normalization function mostly used in AI models. + + +For a detailed description of LayerNorm, you can refer to the [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) or the [paper](https://arxiv.org/pdf/1607.06450) + +## Directory Structure + +``` +lnorm/ +├── PIM/ +│ ├── Makefile +│ ├── lnorm.cpp +├── baselines/ +│ ├── CPU/ +│ │ ├── Makefile +│ │ ├── lnorm.cpp +│ ├── GPU/ **TODO** +│ │ ├── Makefile +│ │ ├── lnorm.cu +├── README.md +├── Makefile +``` + +## Implementation Description + +This repository contains three different implementations of the LNORM benchmark: + +1. CPU +2. GPU **TODO** +3. PIM + +### Baseline Implementation + +CPU and GPU have been used as baselines. + +#### CPU + +The CPU variant ... + +#### GPU + +The GPU variant (**TODO** Try torch LayerNorm) + +### PIM Implementation + +The PIM variant is implemented using C++ and three different PIM architectures can be tested with this. + +## Compilation Instructions for Specific Variants + +### CPU Variant + +To compile for the CPU variant, use: + +```bash +cd baselines/CPU +make +``` + +### GPU Variant + +To compile for the GPU variant, use: + +```bash +cd baselines/GPU +make +``` + +*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile. 
# Makefile: CPU baseline for the layer normalization (lnorm) benchmark

# Compiler
CXX := g++

# Compiler flags
CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp
LDFLAGS = -lopenblas

# Executable name
EXEC := lnorm.out

# Source files
SRC_FILES := $(wildcard *.cpp)

.PHONY: all clean

all: $(EXEC)

# Build the executable from every source file in this directory.
# LDFLAGS is passed exactly once, after the inputs, so the libraries are
# searched for symbols referenced by the sources that precede them.
$(EXEC): $(SRC_FILES)
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(EXEC)
/**
 * @brief Integer layer normalization of the first vectorLength elements of srcVector.
 *
 * Computes dst[i] = (src[i] - mean) / (floor(sqrt(variance + 1)) + 1), where mean and
 * variance are the truncating integer mean and population variance of the input.
 * The "+ 1" terms keep the divisor non-zero.
 *
 * @param vectorLength Number of elements to process; srcVector must hold at least this many.
 * @param srcVector    Input values (not modified).
 * @param dst          Output values; grown to vectorLength elements if it is shorter.
 */
void lnorm(uint64_t vectorLength, std::vector<int32_t> &srcVector, std::vector<int32_t> &dst)
{
    if (vectorLength == 0)
    {
        return; // nothing to normalize, and the divisions below would divide by zero
    }
    if (dst.size() < vectorLength)
    {
        dst.resize(vectorLength);
    }

    // Signed element count: dividing a signed sum by the unsigned uint64_t length
    // converts a negative sum to a huge unsigned value before the division.
    const int64_t count = static_cast<int64_t>(vectorLength);

    // 64-bit accumulators so the sums do not overflow a 32-bit signed integer.
    int64_t sum = 0;
    for (size_t i = 0; i < vectorLength; i++)
    {
        sum += srcVector[i];
    }
    const int64_t mean = sum / count;

    int64_t sumSqDev = 0;
    for (size_t i = 0; i < vectorLength; i++)
    {
        const int64_t deviation = srcVector[i] - mean;
        sumSqDev += deviation * deviation;
    }
    const int64_t variance = sumSqDev / count;

    // Truncated square root of (variance + 1).
    const int64_t sqrtVar = static_cast<int64_t>(sqrt(static_cast<double>(variance + 1)));

    // layer norm
    for (size_t i = 0; i < vectorLength; i++)
    {
        dst[i] = static_cast<int32_t>((srcVector[i] - mean) / (sqrtVar + 1)); // +1 prevents division by zero
    }
}
# Makefile: top-level build for the RMSNorm benchmark
# Copyright (c) 2024 University of Virginia
# This file is licensed under the MIT License.
# See the LICENSE file in the root of this repository for more details.

# Subdirectories that are built by this Makefile.
SUBDIRS := PIM

.PHONY: debug perf dramsim3_integ clean $(SUBDIRS)
.DEFAULT_GOAL := perf

USE_OPENMP ?= 0

# Every goal is carried out by building the subdirectories.
debug perf dramsim3_integ clean: $(SUBDIRS)

# Run the requested goal(s) in each subdirectory, passing USE_OPENMP through.
$(SUBDIRS):
	$(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP)
// Params ---------------------------------------------------------------------
// Command-line options of the benchmark, as filled in by getInputParams().
typedef struct Params
{
  uint64_t vectorLength; // -l: number of vector elements (default 128)
  char *configFile;      // -c: dramsim config file; nullptr when the option is absent
  char *inputFile;       // -i: input file; nullptr when the option is absent
  bool shouldVerify;     // -v t: compare the PIM result against a host computation
} Params;
// Integer square root: returns floor(sqrt(x)) using Newton-Raphson iteration.
//
// Starting from an over-estimate, every Newton step strictly lowers the guess
// until it reaches floor(sqrt(x)); the following step no longer lowers it.
// The loop therefore stops at the first non-decreasing step. Waiting for two
// equal consecutive guesses is not safe: for x = k*k - 1 (3, 8, 15, ...) the
// integer iteration alternates between k - 1 and k forever.
uint32_t newton_sqrt(uint32_t x) {
    if (x < 2) return x; // 0 and 1 are their own square roots

    // 64-bit intermediates: guess + x / guess does not fit in 32 bits when x
    // is close to UINT32_MAX.
    const uint64_t value = x;
    uint64_t guess = value;                       // initial over-estimate
    uint64_t next = (guess + value / guess) / 2;  // first Newton step

    while (next < guess) { // continue only while the estimate keeps shrinking
        guess = next;
        next = (guess + value / guess) / 2;
    }

    return static_cast<uint32_t>(guess);
}
+ if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + + // Sum of the squared elements - reduction + uint32_t sum = 0; + status = pimRedSum(dstObj, static_cast(&sum), 0, vectorLength); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + + auto start_cpu = std::chrono::high_resolution_clock::now(); + // divide to get mean + uint32_t mean = sum/vectorLength; + + // Compute RMS using Newton-Raphson square root + uint32_t rms = newton_sqrt(mean + 1); // +1 to prevent division by zero + //uint32_t rms = 0; + auto stop_cpu = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu - start_cpu); + + // Scale srcVector + status = pimDivScalar(srcObj1, dstObj, rms+1); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + dst.resize(vectorLength); + status = pimCopyDeviceToHost(dstObj, (void *)dst.data()); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + } + pimFree(srcObj1); + pimFree(dstObj); +} + +int main(int argc, char *argv[]) +{ + struct Params params = getInputParams(argc, argv); + std::cout << "Running RMSNORM for vector of size: " << params.vectorLength << std::endl; + + std::vector srcVector (params.vectorLength, 1), resultVector; + + if (params.shouldVerify) { + if (params.inputFile == nullptr) + { + getVector(params.vectorLength, srcVector); + } + else + { + std::cout << "Reading from input file is not implemented yet." << std::endl; + return 1; + } + } + + if (!createDevice(params.configFile)) + { + return 1; + } + + // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration. 
+ rmsnorm(params.vectorLength, srcVector, resultVector); + + if (params.shouldVerify) + { + bool shouldBreak = false; // shared flag variable + + // verify result + + std::vector result (params.vectorLength, 0); + + //rms norm + uint32_t sum_sq = 0; + + // Compute sum of squares + for (size_t i = 0; i < params.vectorLength; i++) { + sum_sq += (uint32_t)(srcVector[i] * srcVector[i]); // Prevent overflow + } + + // Compute mean squared value + uint32_t mean_sq = sum_sq / params.vectorLength; // Integer division + + // Compute RMS using Newton-Raphson square root + //uint32_t rms = newton_sqrt(mean_sq + 1); // +1 to prevent division by zero + uint32_t rms = sqrt(mean_sq+1); + //std::cout << "sqrt(): " << rms << std::endl; + + // Normalize each element: Y[i] = X[i] / RMS + for (size_t i = 0; i < params.vectorLength; i++) { + result[i] = srcVector[i] / (rms + 1); // Prevent division by zero + } + + for (size_t i = 0; i < params.vectorLength; i++) + { + if (result[i] != resultVector[i]) + { + #pragma omp critical + { + if (!shouldBreak) + { // check the flag again in a critical section + std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl; + shouldBreak = true; // set the flag to true + } + } + } + } + + + if (!shouldBreak) { + std::cout << "\n\nCorrect Answer!!\n\n"; + } + } + + pimShowStats(); + std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." 
<< endl; + + return 0; +} diff --git a/PIMbench/rmsnorm/PIM/run_rmsnorm.sh b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh new file mode 100755 index 00000000..3d3d8b46 --- /dev/null +++ b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh @@ -0,0 +1 @@ +./rmsnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 12000 diff --git a/PIMbench/rmsnorm/README.md b/PIMbench/rmsnorm/README.md new file mode 100644 index 00000000..63c8cc12 --- /dev/null +++ b/PIMbench/rmsnorm/README.md @@ -0,0 +1,95 @@ +# Root Mean Square Normalization (RMSNorm) + +The RMSNorm is a normalization function mostly used in AI models + + +For a detailed description of RMSNorm, you can refer to the [torch.nn.RMSNorm](https://pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html) or the [paper](https://dl.acm.org/doi/pdf/10.5555/3454287.3455397) + +## Directory Structure + +``` +rmsnorm/ +├── PIM/ +│ ├── Makefile +│ ├── rmsnorm.cpp +├── baselines/ +│ ├── CPU/ +│ │ ├── Makefile +│ │ ├── rmsnorm.cpp +│ ├── GPU/ **TODO** +│ │ ├── Makefile +│ │ ├── rmsnorm.cu +├── README.md +├── Makefile +``` + +## Implementation Description + +This repository contains three different implementations of the RMSNORM benchmark: + +1. CPU +2. GPU **TODO** +3. PIM + +### Baseline Implementation + +CPU and GPU have been used as baselines. + +#### CPU + +The CPU variant ... + +#### GPU + +The GPU variant (**TODO** Try torch rmsnorm) + +### PIM Implementation + +The PIM variant is implemented using C++ and three different PIM architectures can be tested with this. + +## Compilation Instructions for Specific Variants + +### CPU Variant + +To compile for the CPU variant, use: + +```bash +cd baselines/CPU +make +``` + +### GPU Variant + +To compile for the GPU variant, use: + +```bash +cd baselines/GPU +make +``` + +*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile. 
# Makefile: CPU baseline for the RMSNorm benchmark

# Compiler
CXX := g++

# Compiler flags
CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp
LDFLAGS = -lopenblas

# Executable name
EXEC := rmsnorm.out

# Source files
SRC_FILES := $(wildcard *.cpp)

.PHONY: all clean

all: $(EXEC)

# Build the executable from every source file in this directory.
# LDFLAGS is passed exactly once, after the inputs, so the libraries are
# searched for symbols referenced by the sources that precede them.
$(EXEC): $(SRC_FILES)
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(EXEC)
/**
 * @brief Integer RMS normalization of the first vectorLength elements of srcVector.
 *
 * Computes dst[i] = src[i] / (floor(sqrt(mean(src^2) + 1)) + 1) with truncating integer
 * division. The sum of squares is accumulated in 32-bit unsigned arithmetic (wrapping on
 * overflow), and the "+ 1" terms keep the divisor non-zero.
 *
 * @param vectorLength Number of elements to process; srcVector must hold at least this many.
 * @param srcVector    Input values (not modified).
 * @param dst          Output values; grown to vectorLength elements if it is shorter.
 */
void rmsnorm(uint64_t vectorLength, std::vector<int32_t> &srcVector, std::vector<int32_t> &dst)
{
    if (vectorLength == 0)
    {
        return; // nothing to normalize, and the mean below would divide by zero
    }
    if (dst.size() < vectorLength)
    {
        dst.resize(vectorLength);
    }

    uint32_t sumSq = 0;
    for (size_t i = 0; i < vectorLength; i++)
    {
        // Convert before multiplying: unsigned multiplication wraps, whereas the
        // signed product overflows (undefined behavior) for large magnitudes.
        const uint32_t value = static_cast<uint32_t>(srcVector[i]);
        sumSq += value * value;
    }

    const uint32_t meanSq = static_cast<uint32_t>(sumSq / vectorLength);
    const uint32_t rms = static_cast<uint32_t>(sqrt(static_cast<double>(meanSq) + 1.0));

    // Signed divisor: dividing an int32_t by a uint32_t converts negative inputs to
    // huge unsigned values. rms is at most 65536, so rms + 1 fits in int32_t.
    const int32_t divisor = static_cast<int32_t>(rms) + 1; // +1 prevents division by zero

    for (size_t i = 0; i < vectorLength; i++)
    {
        dst[i] = srcVector[i] / divisor;
    }
}