From 9cc455a1937f089b64d93408d6e458019b4bf1da Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Fri, 28 Mar 2025 14:10:21 -0400 Subject: [PATCH 01/31] add PIM stencil --- misc-bench/stencil/PIM/Makefile | 24 ++ misc-bench/stencil/PIM/stencil.cpp | 351 +++++++++++++++++++++++++++++ 2 files changed, 375 insertions(+) create mode 100644 misc-bench/stencil/PIM/Makefile create mode 100644 misc-bench/stencil/PIM/stencil.cpp diff --git a/misc-bench/stencil/PIM/Makefile b/misc-bench/stencil/PIM/Makefile new file mode 100644 index 00000000..54481573 --- /dev/null +++ b/misc-bench/stencil/PIM/Makefile @@ -0,0 +1,24 @@ +# Makefile: C++ version of stencil +# Copyright (c) 2025 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. +include ${PROJ_ROOT}/Makefile.common + +# make USE_OPENMP=1 +USE_OPENMP ?= 0 +ifeq ($(USE_OPENMP),1) + CXXFLAGS += -fopenmp +endif + +EXEC := stencil.out +SRC := stencil.cpp + +debug perf dramsim3_integ: $(EXEC) + +$(EXEC): $(SRC) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + +clean: + rm -rf $(EXEC) *.dSYM \ No newline at end of file diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp new file mode 100644 index 00000000..a09899ec --- /dev/null +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -0,0 +1,351 @@ +// Test: C++ version of the stencil +// Copyright (c) 2025 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t gridWidth; + uint64_t gridHeight; + uint64_t stencilWidth; + uint64_t stencilHeight; + uint64_t numLeft; + uint64_t numAbove; + const char *configFile; + const char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./stencil.out [options]" + "\n" + "\n -x grid width (default=2048 elements)" + "\n -y grid height (default=2048 elements)" + "\n -w horizontal stencil size (default=3)" + "\n -d vertical stencil size (default=3)" + "\n -l number of elements to the left of the output element for the stencil pattern, must be less than the horizontal stencil size (default=1)" + "\n -a number of elements above the output element for the stencil pattern, must be less than the vertical stencil size (default=1)" + "\n -c dramsim config file" + "\n -i input file containing a 2d array (default=random)" + "\n -v t = verifies PIM output with host output. (default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.gridWidth = 2048; + p.gridHeight = 2048; + p.stencilWidth = 3; + p.stencilHeight = 3; + p.numLeft = 1; + p.numAbove = 1; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:x:y:w:d:l:a:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'x': + p.gridWidth = strtoull(optarg, NULL, 0); + break; + case 'y': + p.gridHeight = strtoull(optarg, NULL, 0); + break; + case 'w': + p.stencilWidth = strtoull(optarg, NULL, 0); + break; + case 'd': + p.stencilHeight = strtoull(optarg, NULL, 0); + break; + case 'l': + p.numLeft = strtoull(optarg, NULL, 0); + break; + case 'a': + p.numAbove = strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't'); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + return p; +} + +//! @brief Shifts the elements of the input row so that necessary elements are vertically aligned +//! @param[in] src The vector to shift +//! @param[in] stencilWidth The horizontal width of the stencil +//! @param[in] numLeft The number of elements to the left of the output element in the stencil pattern +//! @param[in] toAssociate A PIM Object to associate the added data with +//! @return The shifted row as a list of PIM objects +std::vector createShiftedStencilRows(const std::vector &src, const uint64_t stencilWidth, + const uint64_t numLeft, const PimObjId toAssociate) { + PimStatus status; + + std::vector result(stencilWidth); + + for(uint64_t i=0; i0; --i) { + status = pimCopyObjectToObject(result[i], result[i-1]); + assert (status == PIM_OK); + + status = pimShiftElementsRight(result[i-1]); + assert (status == PIM_OK); + } + + for(uint64_t i=numLeft+1; i> &srcHost, std::vector> &dstHost, + const std::vector> &stencilPattern, const uint64_t numLeft, const uint64_t numAbove) { + PimStatus status; + + assert(!srcHost.empty()); + assert(!srcHost[0].empty()); + assert(srcHost.size() == dstHost.size()); + assert(srcHost[0].size() == dstHost[0].size()); + assert(!stencilPattern.empty()); + assert(!stencilPattern[0].empty()); + assert(stencilPattern.size() > numAbove); + assert(stencilPattern[0].size() > numLeft); + + const uint64_t gridHeight = srcHost.size(); + const uint64_t gridWidth = srcHost[0].size(); + const uint64_t stencilHeight = stencilPattern.size(); + const uint64_t stencilWidth = stencilPattern[0].size(); + const uint64_t numBelow = stencilHeight - numAbove - 1; + + // PIM API only supports passing scalar values through uint64_t + std::vector> stencilPatternConverted(stencilHeight); + for(uint64_t y=0; y(tmp); + } + } + + PimObjId resultPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); + assert(resultPim != -1); + + PimObjId tempPim = pimAllocAssociated(resultPim, PIM_FP32); + assert(tempPim != -1); + + std::list> shiftedRows; + + for(uint64_t i=0; i &shiftedRow : shiftedRows) { + for(uint64_t stencilX = 0; stencilX < stencilWidth; ++stencilX) { + if(stencilY == 0 && stencilX == 0) { + status = pimMulScalar(shiftedRow[stencilX], resultPim, stencilPatternConverted[stencilY][stencilX]); + assert (status == PIM_OK); + } else { + status = pimMulScalar(shiftedRow[stencilX], tempPim, stencilPatternConverted[stencilY][stencilX]); + assert (status == PIM_OK); + + status = pimAdd(resultPim, tempPim, resultPim); + assert (status == PIM_OK); + } + } + ++stencilY; + } + + status = pimCopyDeviceToHost(resultPim, (void *) dstHost[row].data()); + assert (status == PIM_OK); + + for(PimObjId objToFree : shiftedRows.front()) { + pimFree(objToFree); + } + shiftedRows.pop_front(); + } + + while(!shiftedRows.empty()) { + for(PimObjId objToFree : shiftedRows.front()) { + pimFree(objToFree); + } + shiftedRows.pop_front(); + } +} + +int main(int argc, char* argv[]) +{ + struct Params params = getInputParams(argc, argv); + + std::cout << "Running PIM stencil for grid: " << params.gridHeight << "x" << params.gridWidth << std::endl; + std::cout << "Stencil Size: " << params.stencilHeight << "x" << params.stencilWidth << std::endl; + std::cout << "Num Above: " << params.numAbove << ", Num Left: " << params.numLeft << std::endl; + + std::vector> x, y; + std::vector> stencilPattern; + if (params.inputFile == nullptr) + { + // Fill in random grid + x.resize(params.gridHeight); + for(size_t i=0; i dist(0.0f, 10000.0f); + + #pragma omp for + for(size_t i=0; i dist(0.0f, 1.0f); + + #pragma omp for + for(size_t i=0; i Date: Mon, 31 Mar 2025 18:24:32 -0400 Subject: [PATCH 02/31] wip --- misc-bench/stencil/PIM/stencil.cpp | 226 +++++++++++++++++++---------- 1 file changed, 152 insertions(+), 74 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index a09899ec..f7c580e7 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -112,43 +112,109 @@ struct Params getInputParams(int argc, char **argv) return p; } -//! @brief Shifts the elements of the input row so that necessary elements are vertically aligned -//! @param[in] src The vector to shift +// //! @brief Shifts the elements of the input row so that necessary elements are vertically aligned +// //! @param[in] src The vector to shift +// //! @param[in] stencilWidth The horizontal width of the stencil +// //! @param[in] numLeft The number of elements to the left of the output element in the stencil pattern +// //! @param[in] toAssociate A PIM Object to associate the added data with +// //! @return The shifted row as a list of PIM objects +// std::vector createShiftedStencilRows(const std::vector &src, const uint64_t stencilWidth, +// const uint64_t numLeft, const PimObjId toAssociate) { +// PimStatus status; + +// std::vector result(stencilWidth); + +// for(uint64_t i=0; i0; --i) { +// status = pimCopyObjectToObject(result[i], result[i-1]); +// assert (status == PIM_OK); + +// status = pimShiftElementsRight(result[i-1]); +// assert (status == PIM_OK); +// } + +// for(uint64_t i=numLeft+1; i createShiftedStencilRows(const std::vector &src, const uint64_t stencilWidth, - const uint64_t numLeft, const PimObjId toAssociate) { +//! @return The sumed PIM row +PimObjId sumStencilRow(const std::vector &src, const uint64_t stencilWidth, const uint64_t numLeft, const PimObjId toAssociate) { PimStatus status; - std::vector result(stencilWidth); + PimObjId mid = pimAllocAssociated(toAssociate, PIM_FP32); + assert(mid != -1); - for(uint64_t i=0; i0; --i) { - status = pimCopyObjectToObject(result[i], result[i-1]); + for(uint64_t shiftIter=1; shiftIter createShiftedStencilRows(const std::vector &src, co //! @param[in] numLeft The number of elements to the left of the output element in the stencil pattern //! @param[in] numAbove The number of elements above the output element in the stencil pattern void stencil(const std::vector> &srcHost, std::vector> &dstHost, - const std::vector> &stencilPattern, const uint64_t numLeft, const uint64_t numAbove) { + const uint64_t stencilWidth, const uint64_t stencilHeight, const uint64_t numLeft, const uint64_t numAbove) { PimStatus status; assert(!srcHost.empty()); assert(!srcHost[0].empty()); assert(srcHost.size() == dstHost.size()); assert(srcHost[0].size() == dstHost[0].size()); - assert(!stencilPattern.empty()); - assert(!stencilPattern[0].empty()); - assert(stencilPattern.size() > numAbove); - assert(stencilPattern[0].size() > numLeft); + assert(numLeft < stencilWidth); + assert(numAbove < stencilHeight); const uint64_t gridHeight = srcHost.size(); const uint64_t gridWidth = srcHost[0].size(); - const uint64_t stencilHeight = stencilPattern.size(); - const uint64_t stencilWidth = stencilPattern[0].size(); const uint64_t numBelow = stencilHeight - numAbove - 1; - - // PIM API only supports passing scalar values through uint64_t - std::vector> stencilPatternConverted(stencilHeight); - for(uint64_t y=0; y(tmp); - } - } + + const uint64_t stencilAreaInt = stencilHeight * stencilWidth; + const float stencilAreaFloat = 1.0f / static_cast(stencilAreaInt); + uint32_t tmp; + std::memcpy(&tmp, &stencilAreaFloat, sizeof(float)); + const uint64_t stencilAreaToMultiply = static_cast(tmp); PimObjId resultPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); assert(resultPim != -1); - PimObjId tempPim = pimAllocAssociated(resultPim, PIM_FP32); - assert(tempPim != -1); + // Handle special case + if(stencilHeight == 1) { + for(size_t i=0; i> shiftedRows; + status = pimCopyDeviceToHost(resultPim, dstHost[i].data()); + assert (status == PIM_OK); - for(uint64_t i=0; i &shiftedRow : shiftedRows) { - for(uint64_t stencilX = 0; stencilX < stencilWidth; ++stencilX) { - if(stencilY == 0 && stencilX == 0) { - status = pimMulScalar(shiftedRow[stencilX], resultPim, stencilPatternConverted[stencilY][stencilX]); - assert (status == PIM_OK); - } else { - status = pimMulScalar(shiftedRow[stencilX], tempPim, stencilPatternConverted[stencilY][stencilX]); - assert (status == PIM_OK); + // PimObjId tempPim = pimAllocAssociated(resultPim, PIM_FP32); + // assert(tempPim != -1); - status = pimAdd(resultPim, tempPim, resultPim); - assert (status == PIM_OK); - } - } - ++stencilY; - } + // std::list> shiftedRows; - status = pimCopyDeviceToHost(resultPim, (void *) dstHost[row].data()); - assert (status == PIM_OK); - - for(PimObjId objToFree : shiftedRows.front()) { - pimFree(objToFree); - } - shiftedRows.pop_front(); - } + // for(uint64_t i=0; i &shiftedRow : shiftedRows) { + // for(uint64_t stencilX = 0; stencilX < stencilWidth; ++stencilX) { + // if(stencilY == 0 && stencilX == 0) { + // status = pimMulScalar(shiftedRow[stencilX], resultPim, stencilPatternConverted[stencilY][stencilX]); + // assert (status == PIM_OK); + // } else { + // status = pimMulScalar(shiftedRow[stencilX], tempPim, stencilPatternConverted[stencilY][stencilX]); + // assert (status == PIM_OK); + + // status = pimAdd(resultPim, tempPim, resultPim); + // assert (status == PIM_OK); + // } + // } + // ++stencilY; + // } + + // status = pimCopyDeviceToHost(resultPim, (void *) dstHost[row].data()); + // assert (status == PIM_OK); + + // for(PimObjId objToFree : shiftedRows.front()) { + // pimFree(objToFree); + // } + // shiftedRows.pop_front(); + // } + + // while(!shiftedRows.empty()) { + // for(PimObjId objToFree : shiftedRows.front()) { + // pimFree(objToFree); + // } + // shiftedRows.pop_front(); + // } } int main(int argc, char* argv[]) From 31785bdc13042aa1cb67bdbac768af5a5933a351 Mon Sep 17 00:00:00 2001 From: Arleee1 Date: Tue, 1 Apr 2025 12:10:54 -0400 Subject: [PATCH 03/31] PIM stencil mostly working, TODO numLeft not working --- misc-bench/stencil/PIM/stencil.cpp | 113 ++++++++++++----------------- 1 file changed, 45 insertions(+), 68 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index f7c580e7..d142af50 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -268,54 +268,46 @@ void stencil(const std::vector> &srcHost, std::vector rowsInSum; + rowsInSum.push_back(sumStencilRow(srcHost[0], stencilWidth, numLeft, resultPim)); + rowsInSum.push_back(sumStencilRow(srcHost[1], stencilWidth, numLeft, resultPim)); + status = pimAdd(rowsInSum.front(), rowsInSum.back(), runningSum); + assert (status == PIM_OK); - // PimObjId tempPim = pimAllocAssociated(resultPim, PIM_FP32); - // assert(tempPim != -1); - - // std::list> shiftedRows; - - // for(uint64_t i=0; i &shiftedRow : shiftedRows) { - // for(uint64_t stencilX = 0; stencilX < stencilWidth; ++stencilX) { - // if(stencilY == 0 && stencilX == 0) { - // status = pimMulScalar(shiftedRow[stencilX], resultPim, stencilPatternConverted[stencilY][stencilX]); - // assert (status == PIM_OK); - // } else { - // status = pimMulScalar(shiftedRow[stencilX], tempPim, stencilPatternConverted[stencilY][stencilX]); - // assert (status == PIM_OK); + for(uint64_t row=numAbove; row> x, y; - std::vector> stencilPattern; + if (params.inputFile == nullptr) { // Fill in random grid @@ -349,26 +341,6 @@ int main(int argc, char* argv[]) } } } - - // Fill in random stencil pattern - stencilPattern.resize(params.stencilHeight); - for(size_t i=0; i dist(0.0f, 1.0f); - - #pragma omp for - for(size_t i=0; i(params.stencilWidth * params.stencilHeight)); + + constexpr float acceptableDelta = 0.1f; + if (std::abs(resCPU - y[gridY][gridX]) > acceptableDelta) { #pragma omp critical { - std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << resCPU << ")" << std::endl; + std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << resCPU << ") at position (" << gridX << ", " << gridY << ")" << std::endl; ok = false; } } From 1b08ee5660bf5cd6232f8c6267a61846965ccc47 Mon Sep 17 00:00:00 2001 From: Arleee1 Date: Tue, 1 Apr 2025 12:14:25 -0400 Subject: [PATCH 04/31] PIM stencil (seemingly) working --- misc-bench/stencil/PIM/stencil.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index d142af50..d8f6721e 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -168,7 +168,7 @@ PimObjId sumStencilRow(const std::vector &src, const uint64_t stencilWidt const uint64_t numRight = stencilWidth - numLeft - 1; - if(numLeft == 0 || numRight == 0) { + if(numLeft == 0 && numRight == 0) { return mid; } From 1b138ff06d24d0d5eb32d03b5ec0433ae85e8aad Mon Sep 17 00:00:00 2001 From: Arleee1 Date: Tue, 1 Apr 2025 12:18:18 -0400 Subject: [PATCH 05/31] add stencil makefile --- misc-bench/stencil/Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 misc-bench/stencil/Makefile diff --git a/misc-bench/stencil/Makefile b/misc-bench/stencil/Makefile new file mode 100644 index 00000000..c4cdb2e3 --- /dev/null +++ b/misc-bench/stencil/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of stencil +# Copyright (c) 2025 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) \ No newline at end of file From 6fb1dd895c0f6beca5271111fd5669c45233f283 Mon Sep 17 00:00:00 2001 From: Arleee1 Date: Tue, 1 Apr 2025 12:35:24 -0400 Subject: [PATCH 06/31] add stencil readme --- misc-bench/stencil/README.MD | 96 ++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 misc-bench/stencil/README.MD diff --git a/misc-bench/stencil/README.MD b/misc-bench/stencil/README.MD new file mode 100644 index 00000000..dfd4cc62 --- /dev/null +++ b/misc-bench/stencil/README.MD @@ -0,0 +1,96 @@ +# Stencil + +Stencil computing takes a 2D array and sets each element to the average of its neighbors and itself. By default, the average is taken of each 3x3 block of elements, however this can be adjusted using the runtime parameters. For this benchmark, elements with neighbors outside of the input range are not computed. Additionally, the location of the output element within the stencil grid can be changed via the paramemters, meaning each output element could be the average of the 3x3 block to its bottom right etc., rather than being just in the center. For example, averaging a 3x3 grid with a 3x3 stencil pattern, with num left and num above both set to 1: + +- Input: [[2.0, 2.0, 2.0], [1.0, 1.0, 1.0], [2.0, 2.0, 2.0]] +- Output: [[#, #, #], [#, 1.66, #], [#, #, #]] +- \# represents an undefined element + +## Directory Structure + +``` +stencil/ +├── PIM/ +│ ├── Makefile +│ ├── stencil.cpp +├── baselines/ +│ ├── CPU/ +│ │ ├── Makefile +│ │ ├── stencil.cpp +│ ├── GPU/ +│ │ ├── Makefile +│ │ ├── stencil.cu +├── README.md +├── Makefile +``` + +## Implementation Description + +This repository contains three different implementations of the stencil benchmark: + +1. CPU +2. GPU +3. PIM + +### Baseline Implementation + +CPU and GPU have been used as baselines. + +#### CPU + +The CPU variant of stencil has been implemented using the standard C++ library, as well as with parallelization from OpenMP. + +#### GPU + +The GPU variant leverages CUDA C++ to parallelize the stencil benchmark on an NVIDIA GPU. + +### PIM Implementation + +The PIM variant is implemented using C++ with some speedup from OpenMP. Three different PIM architectures can be tested with this. + +## Compilation Instructions for Specific Variants + +### CPU Variant + +To compile for the CPU variant, use: + +```bash +cd baselines/CPU +make +``` + +### GPU Variant + +To compile for the GPU variant, use: + +```bash +cd baselines/GPU +make +``` + +*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile. + +### PIM Variant + +To compile for the PIM variant, use: + +```bash +cd PIM +make +``` + +## Execution Instructions + +### Running the Executable + +After compiling, run the each executable with the following command that will run it for default parameters: + +```bash +./stencil.out +``` + +To see help text on all usages and how to modify any of the input parameters, use the following command: + +```bash +./stencil.out -h +``` From cb767a71465bb81e49ce7e33a0866550ef6d308d Mon Sep 17 00:00:00 2001 From: Arleee1 Date: Tue, 1 Apr 2025 12:44:40 -0400 Subject: [PATCH 07/31] fix readme typo --- misc-bench/stencil/README.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc-bench/stencil/README.MD b/misc-bench/stencil/README.MD index dfd4cc62..f4dea72c 100644 --- a/misc-bench/stencil/README.MD +++ b/misc-bench/stencil/README.MD @@ -83,7 +83,7 @@ make ### Running the Executable -After compiling, run the each executable with the following command that will run it for default parameters: +After compiling, run each executable with the following command that will run it for default parameters: ```bash ./stencil.out From 0af845bfb1ab3f95346b116cb8b01fe70eb05892 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Sun, 27 Apr 2025 22:32:31 -0400 Subject: [PATCH 08/31] wip stencil chunking --- misc-bench/stencil/PIM/stencil.cpp | 262 ++++++++++++++++++----------- 1 file changed, 168 insertions(+), 94 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index d8f6721e..dd338bcc 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -26,6 +26,7 @@ // Params --------------------------------------------------------------------- typedef struct Params { + uint64_t iterations; uint64_t gridWidth; uint64_t gridHeight; uint64_t stencilWidth; @@ -42,6 +43,7 @@ void usage() fprintf(stderr, "\nUsage: ./stencil.out [options]" "\n" + "\n -n iterations (default=10 iterations)" "\n -x grid width (default=2048 elements)" "\n -y grid height (default=2048 elements)" "\n -w horizontal stencil size (default=3)" @@ -57,6 +59,7 @@ void usage() struct Params getInputParams(int argc, char **argv) { struct Params p; + p.iterations = 10; p.gridWidth = 2048; p.gridHeight = 2048; p.stencilWidth = 3; @@ -68,7 +71,7 @@ struct Params getInputParams(int argc, char **argv) p.shouldVerify = false; int opt; - while ((opt = getopt(argc, argv, "h:x:y:w:d:l:a:c:i:v:")) >= 0) + while ((opt = getopt(argc, argv, "h:n:x:y:w:d:l:a:c:i:v:")) >= 0) { switch (opt) { @@ -76,6 +79,9 @@ struct Params getInputParams(int argc, char **argv) usage(); exit(0); break; + case 'n': + p.iterations = strtoull(optarg, NULL, 0); + break; case 'x': p.gridWidth = strtoull(optarg, NULL, 0); break; @@ -157,64 +163,80 @@ struct Params getInputParams(int argc, char **argv) //! @param[in] numLeft The number of elements to the left of the output element in the stencil pattern //! @param[in] toAssociate A PIM Object to associate the added data with //! @return The sumed PIM row -PimObjId sumStencilRow(const std::vector &src, const uint64_t stencilWidth, const uint64_t numLeft, const PimObjId toAssociate) { +void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const uint64_t radius) { PimStatus status; - PimObjId mid = pimAllocAssociated(toAssociate, PIM_FP32); - assert(mid != -1); - - status = pimCopyHostToDevice((void *)src.data(), mid); - assert (status == PIM_OK); - - const uint64_t numRight = stencilWidth - numLeft - 1; - - if(numLeft == 0 && numRight == 0) { - return mid; + if(radius == 0) { + return; } - - PimObjId pimRowSum = pimAllocAssociated(toAssociate, PIM_FP32); // Result, is the sum of the neighbors in the row - assert(pimRowSum != -1); - - PimObjId shiftBackup = pimAllocAssociated(toAssociate, PIM_FP32); // Used after mid is shifted to the left, is shifted to the right - assert(shiftBackup != -1); status = pimCopyObjectToObject(mid, shiftBackup); assert (status == PIM_OK); - uint64_t leftShiftIterStart = 0; - PimObjId toShiftLeft = mid; - if(numLeft != 0) { + + status = pimShiftElementsRight(shiftBackup); + assert (status == PIM_OK); + + status = pimAdd(mid, shiftBackup, pimRowSum); + assert (status == PIM_OK); + + for(uint64_t shiftIter=1; shiftIter& workingPimMemory, std::vector& rowsInSumCircularQueue, PimObjId tmpPim, PimObjId runningSum, const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) { + PimStatus status; + uint64_t circularQueueTop = 0; + uint64_t circularQueueBot = 0; + + sumStencilRow(workingPimMemory[0], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + sumStencilRow(workingPimMemory[1], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum); + assert (status == PIM_OK); - status = pimAdd(pimRowSum, toShiftLeft, pimRowSum); + for(uint64_t i=2; i<2*radius; ++i) { + sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + status = pimAdd(runningSum, rowsInSumCircularQueue[i], runningSum); assert (status == PIM_OK); + ++circularQueueTop; } - pimFree(mid); - pimFree(shiftBackup); + uint64_t nextRowToAdd = 2*radius; - return pimRowSum; + for(uint64_t row=radius; row &src, const uint64_t stencilWidt //! @param[in] stencilPattern The stencil pattern to apply //! @param[in] numLeft The number of elements to the left of the output element in the stencil pattern //! @param[in] numAbove The number of elements above the output element in the stencil pattern -void stencil(const std::vector> &srcHost, std::vector> &dstHost, - const uint64_t stencilWidth, const uint64_t stencilHeight, const uint64_t numLeft, const uint64_t numAbove) { +void stencil(const std::vector> &srcHost, std::vector> &dstHost, const uint64_t numRows, + const uint64_t iterations, const uint64_t radius) { PimStatus status; assert(!srcHost.empty()); assert(!srcHost[0].empty()); assert(srcHost.size() == dstHost.size()); assert(srcHost[0].size() == dstHost[0].size()); - assert(numLeft < stencilWidth); - assert(numAbove < stencilHeight); const uint64_t gridHeight = srcHost.size(); const uint64_t gridWidth = srcHost[0].size(); - const uint64_t numBelow = stencilHeight - numAbove - 1; - const uint64_t stencilAreaInt = stencilHeight * stencilWidth; + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); const float stencilAreaFloat = 1.0f / static_cast(stencilAreaInt); uint32_t tmp; std::memcpy(&tmp, &stencilAreaFloat, sizeof(float)); - const uint64_t stencilAreaToMultiply = static_cast(tmp); + const uint64_t stencilAreaToMultiplyPim = static_cast(tmp); + constexpr uint64_t numIterationsPerPim = 5; + + PimObjId tmpPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); + assert(tmpPim != -1); + PimObjId runningSum = pimAllocAssociated(tmpPim, PIM_FP32); + assert(runningSum != -1); - PimObjId resultPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); - assert(resultPim != -1); + std::vector rowsInSumCircularQueue(2*radius+1); + for(uint64_t i=0; i workingPimMemory(20); // TODO: Set to a better number, num associable - num used other + for(uint64_t i=0; i= srcHost.size()) { + break; + } + const uint64_t totalRowsThisIter = min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; + uint64_t workingPimMemoryIdx = 0; + for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) { + status = pimCopyHostToDevice((void*) srcHost[srcHostRow].data(), workingPimMemory[workingPimMemoryIdx]); assert (status == PIM_OK); + ++workingPimMemoryIdx; + } + // computeStencilChunkIteration x currIters + for(uint64_t iterNum = 0; iterNum < currIterations; ++iterNum) { + computeStencilChunkIteration(workingPimMemory, rowsInSumCircularQueue, tmpPim, runningSum, stencilAreaToMultiplyPim, radius); + } + // copy range wpm [invalidResultsTop, (used wpm size)-invalidResultsTop) into dstHost [firstRowUsable, firstRowUsable+rowsThisIter) + // firstRowUsable += rowsOnIter + const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsTop; + firstRowSrc += usableRowsThisIter; + } - status = pimCopyDeviceToHost(resultPim, dstHost[i].data()); - assert (status == PIM_OK); + - pimFree(summedRow); - } + // PimObjId resultPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); + // assert(resultPim != -1); - pimFree(resultPim); - return; - } + // // Handle special case + // if(stencilHeight == 1) { + // for(size_t i=0; i rowsInSum; - rowsInSum.push_back(sumStencilRow(srcHost[0], stencilWidth, numLeft, resultPim)); - rowsInSum.push_back(sumStencilRow(srcHost[1], stencilWidth, numLeft, resultPim)); - status = pimAdd(rowsInSum.front(), rowsInSum.back(), runningSum); - assert (status == PIM_OK); + // status = pimCopyDeviceToHost(resultPim, dstHost[i].data()); + // assert (status == PIM_OK); - for(uint64_t i=2; i rowsInSum; + // rowsInSum.push_back(sumStencilRow(srcHost[0], stencilWidth, numLeft, resultPim)); + // rowsInSum.push_back(sumStencilRow(srcHost[1], stencilWidth, numLeft, resultPim)); + // status = pimAdd(rowsInSum.front(), rowsInSum.back(), runningSum); + // assert (status == PIM_OK); - status = pimCopyDeviceToHost(resultPim, (void *) dstHost[row].data()); - assert (status == PIM_OK); - - if(row+1> cpuY; + cpuY.resize(y.size(), std::vector(y[0].size(), 0)); + bool ok = true; // Only compute when stencil is fully in range From 2a0d540b7a3a26a35f15cd78a5b0b8d71ba8878b Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Mon, 28 Apr 2025 14:25:50 -0400 Subject: [PATCH 09/31] vertical chunking starting to work --- misc-bench/stencil/PIM/stencil.cpp | 61 ++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index dd338bcc..3a0c527c 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -179,6 +179,15 @@ void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const status = pimAdd(mid, shiftBackup, pimRowSum); assert (status == PIM_OK); + // status = pimShiftElementsLeft(shiftBackup); + // assert (status == PIM_OK); + + // status = pimShiftElementsLeft(shiftBackup); + // assert (status == PIM_OK); + + // status = pimAdd(pimRowSum, shiftBackup, pimRowSum); + // assert (status == PIM_OK); + for(uint64_t shiftIter=1; shiftIter vec(len); + PimStatus status = pimCopyDeviceToHost(obj, (void*) vec.data()); + assert (status == PIM_OK); + + for(float f : vec) { + std::cout << f << ", "; + } + std::cout << std::endl; +} + void computeStencilChunkIteration(std::vector& workingPimMemory, std::vector& rowsInSumCircularQueue, PimObjId tmpPim, PimObjId runningSum, const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) { PimStatus status; uint64_t circularQueueTop = 0; @@ -210,7 +230,7 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: ++circularQueueTop; status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum); assert (status == PIM_OK); - + // std::cout << "radius " << radius; for(uint64_t i=2; i<2*radius; ++i) { sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); status = pimAdd(runningSum, rowsInSumCircularQueue[i], runningSum); @@ -224,6 +244,10 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: // rowsInSum.push_back(sumStencilRow(srcHost[nextRowToAdd], stencilWidth, numLeft, resultPim)); sumStencilRow(workingPimMemory[nextRowToAdd], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum); + // std::cout << "row: " << row << ", added from queue: "; + // print_pim(rowsInSumCircularQueue[circularQueueTop], 10); + // std::cout << "running sum: "; + // print_pim(runningSum, 10); assert (status == PIM_OK); circularQueueTop = (1+circularQueueTop) % rowsInSumCircularQueue.size(); ++nextRowToAdd; @@ -253,6 +277,7 @@ void stencil(const std::vector> &srcHost, std::vector> &srcHost, std::vector> &srcHost, std::vector= srcHost.size()) { + if(firstRowUsableSrc + invalidResultsTop >= srcHost.size()) { break; } const uint64_t totalRowsThisIter = min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; + const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsTop; uint64_t workingPimMemoryIdx = 0; for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) { status = pimCopyHostToDevice((void*) srcHost[srcHostRow].data(), workingPimMemory[workingPimMemoryIdx]); @@ -305,8 +346,13 @@ void stencil(const std::vector> &srcHost, std::vector> &srcHost, std::vector> &dstHost, const uint64_t numRows, + // const uint64_t iterations, const uint64_t radius) + stencil(x, y, 2 * deviceProp.numRowPerSubarray, params.iterations, 1); if (params.shouldVerify) { - std::vector> cpuY; - cpuY.resize(y.size(), std::vector(y[0].size(), 0)); + // std::vector> cpuY; + // cpuY.resize(y.size(), std::vector(y[0].size(), 0)); bool ok = true; From 4ec58a11be3d2fa20f63ef0ac9c9bf6422b48707 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Mon, 28 Apr 2025 14:47:50 -0400 Subject: [PATCH 10/31] working for multiple iterations --- misc-bench/stencil/PIM/stencil.cpp | 54 ++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 3a0c527c..22c4e515 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -321,7 +321,7 @@ void stencil(const std::vector> &srcHost, std::vector> &srcHost, std::vector>& src, std::vector>& dst, const uint64_t iterations, const uint64_t radius) { + // Only compute when stencil is fully in range + const uint64_t startY = radius; + const uint64_t endY = src.size() - radius; + const uint64_t startX = radius; + const uint64_t endX = src[0].size() - radius; + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); + const float stencilAreaInverseFloat = 1.0f / static_cast(stencilAreaInt); + + for(uint64_t iter=0; iter> cpuY; - // cpuY.resize(y.size(), std::vector(y[0].size(), 0)); + std::vector> cpuY; + cpuY.resize(y.size(), std::vector(y[0].size(), 0)); + stencilCpu(x, cpuY, params.iterations, 1); bool ok = true; // Only compute when stencil is fully in range - const uint64_t startY = params.numAbove; - const uint64_t endY = params.gridHeight - (params.stencilHeight - params.numAbove - 1); - const uint64_t startX = params.numLeft; - const uint64_t endX = params.gridWidth - (params.stencilWidth - params.numLeft - 1); + const uint64_t startY = 1 + params.iterations - 1; + const uint64_t endY = params.gridHeight - (1 + params.iterations - 1); + const uint64_t startX = 1 + params.iterations - 1; + const uint64_t endX = params.gridWidth - (1 + params.iterations - 1); std::cout << std::fixed << std::setprecision(10); #pragma omp parallel for collapse(2) for(uint64_t gridY=startY; gridY(params.stencilWidth * params.stencilHeight)); - constexpr float acceptableDelta = 0.1f; - if (std::abs(resCPU - y[gridY][gridX]) > acceptableDelta) + if (std::abs(cpuY[gridY][gridX] - y[gridY][gridX]) > acceptableDelta) { #pragma omp critical { - std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << resCPU << ") at position (" << gridX << ", " << gridY << ")" << std::endl; + std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << cpuY[gridY][gridX] << ") at position (" << gridX << ", " << gridY << ")" << std::endl; ok = false; } } From d5d22dacdd855fa9b0b0897148edf9b4b1afd36e Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Wed, 30 Apr 2025 19:40:50 -0400 Subject: [PATCH 11/31] correct for radius and iterations both >1 --- misc-bench/stencil/PIM/stencil.cpp | 95 ++++++++++++------------------ 1 file changed, 38 insertions(+), 57 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 22c4e515..0a462e4f 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -29,10 +29,7 @@ typedef struct Params uint64_t iterations; uint64_t gridWidth; uint64_t gridHeight; - uint64_t stencilWidth; - uint64_t stencilHeight; - uint64_t numLeft; - uint64_t numAbove; + uint64_t radius; const char *configFile; const char *inputFile; bool shouldVerify; @@ -46,10 +43,7 @@ void usage() "\n -n iterations (default=10 iterations)" "\n -x grid width (default=2048 elements)" "\n -y grid height (default=2048 elements)" - "\n -w horizontal stencil size (default=3)" - "\n -d vertical stencil size (default=3)" - "\n -l number of elements to the left of the output element for the stencil pattern, must be less than the horizontal stencil size (default=1)" - "\n -a number of elements above the output element for the stencil pattern, must be less than the vertical stencil size (default=1)" + "\n -r stencil radius (default=1)" "\n -c dramsim config file" "\n -i input file containing a 2d array (default=random)" "\n -v t = verifies PIM output with host output. (default=false)" @@ -59,19 +53,16 @@ void usage() struct Params getInputParams(int argc, char **argv) { struct Params p; - p.iterations = 10; - p.gridWidth = 2048; - p.gridHeight = 2048; - p.stencilWidth = 3; - p.stencilHeight = 3; - p.numLeft = 1; - p.numAbove = 1; + p.iterations = 2; + p.gridWidth = 10; + p.gridHeight = 20; + p.radius = 2; p.configFile = nullptr; p.inputFile = nullptr; - p.shouldVerify = false; + p.shouldVerify = true; int opt; - while ((opt = getopt(argc, argv, "h:n:x:y:w:d:l:a:c:i:v:")) >= 0) + while ((opt = getopt(argc, argv, "h:n:x:y:r:c:i:v:")) >= 0) { switch (opt) { @@ -88,17 +79,8 @@ struct Params getInputParams(int argc, char **argv) case 'y': p.gridHeight = strtoull(optarg, NULL, 0); break; - case 'w': - p.stencilWidth = strtoull(optarg, NULL, 0); - break; - case 'd': - p.stencilHeight = strtoull(optarg, NULL, 0); - break; - case 'l': - p.numLeft = strtoull(optarg, NULL, 0); - break; - case 'a': - p.numAbove = strtoull(optarg, NULL, 0); + case 'r': + p.radius= strtoull(optarg, NULL, 0); break; case 'c': p.configFile = optarg; @@ -208,6 +190,13 @@ void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const } } +std::vector get_pim(PimObjId obj, uint64_t len) { + std::vector vec(len); + PimStatus status = pimCopyDeviceToHost(obj, (void*) vec.data()); + assert (status == PIM_OK); + return vec; +} + void print_pim(PimObjId obj, uint64_t len) { std::vector vec(len); PimStatus status = pimCopyDeviceToHost(obj, (void*) vec.data()); @@ -233,7 +222,7 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: // std::cout << "radius " << radius; for(uint64_t i=2; i<2*radius; ++i) { sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); - status = pimAdd(runningSum, rowsInSumCircularQueue[i], runningSum); + status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum); assert (status == PIM_OK); ++circularQueueTop; } @@ -321,8 +310,8 @@ void stencil(const std::vector> &srcHost, std::vector> &srcHost, std::vector>& src, std::vector>& dst, const uint64_t iterations, const uint64_t radius) { - // Only compute when stencil is fully in range - const uint64_t startY = radius; - const uint64_t endY = src.size() - radius; - const uint64_t startX = radius; - const uint64_t endX = src[0].size() - radius; const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); const float stencilAreaInverseFloat = 1.0f / static_cast(stencilAreaInt); - for(uint64_t iter=0; iter> x, y; if (params.inputFile == nullptr) { // Fill in random grid - x.resize(params.gridHeight); - for(size_t i=0; i(params.gridWidth)); #pragma omp parallel { @@ -494,31 +479,27 @@ int main(int argc, char* argv[]) return 1; } - y.resize(x.size()); - for(size_t i=0; i(x[0].size())); PimDeviceProperties deviceProp; PimStatus status = pimGetDeviceProperties(&deviceProp); assert(status == PIM_OK); // void stencil(const std::vector> &srcHost, std::vector> &dstHost, const uint64_t numRows, // const uint64_t iterations, const uint64_t radius) - stencil(x, y, 2 * deviceProp.numRowPerSubarray, params.iterations, 1); + stencil(x, y, 2 * deviceProp.numRowPerSubarray, params.iterations, params.radius); if (params.shouldVerify) { - std::vector> cpuY; - cpuY.resize(y.size(), std::vector(y[0].size(), 0)); - stencilCpu(x, cpuY, params.iterations, 1); + std::vector> cpuY(y.size(), std::vector(y[0].size())); + stencilCpu(x, cpuY, params.iterations, params.radius); bool ok = true; // Only compute when stencil is fully in range - const uint64_t startY = 1 + params.iterations - 1; - const uint64_t endY = params.gridHeight - (1 + params.iterations - 1); - const uint64_t startX = 1 + params.iterations - 1; - const uint64_t endX = params.gridWidth - (1 + params.iterations - 1); + const uint64_t startY = params.radius * params.iterations; + const uint64_t endY = params.gridHeight - startY; + const uint64_t startX = params.radius * params.iterations; + const uint64_t endX = params.gridWidth - startX; std::cout << std::fixed << std::setprecision(10); From da1f01cd3854bd6d0108ee84b8501d4479736873 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Wed, 30 Apr 2025 19:49:12 -0400 Subject: [PATCH 12/31] cleanup --- misc-bench/stencil/PIM/stencil.cpp | 203 ++++------------------------- 1 file changed, 23 insertions(+), 180 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 0a462e4f..c086ec0e 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -53,10 +53,10 @@ void usage() struct Params getInputParams(int argc, char **argv) { struct Params p; - p.iterations = 2; - p.gridWidth = 10; - p.gridHeight = 20; - p.radius = 2; + p.iterations = 10; + p.gridWidth = 2048; + p.gridHeight = 2048; + p.radius = 1; p.configFile = nullptr; p.inputFile = nullptr; p.shouldVerify = true; @@ -100,51 +100,11 @@ struct Params getInputParams(int argc, char **argv) return p; } -// //! @brief Shifts the elements of the input row so that necessary elements are vertically aligned -// //! @param[in] src The vector to shift -// //! @param[in] stencilWidth The horizontal width of the stencil -// //! @param[in] numLeft The number of elements to the left of the output element in the stencil pattern -// //! @param[in] toAssociate A PIM Object to associate the added data with -// //! @return The shifted row as a list of PIM objects -// std::vector createShiftedStencilRows(const std::vector &src, const uint64_t stencilWidth, -// const uint64_t numLeft, const PimObjId toAssociate) { -// PimStatus status; - -// std::vector result(stencilWidth); - -// for(uint64_t i=0; i0; --i) { -// status = pimCopyObjectToObject(result[i], result[i-1]); -// assert (status == PIM_OK); - -// status = pimShiftElementsRight(result[i-1]); -// assert (status == PIM_OK); -// } - -// for(uint64_t i=numLeft+1; i get_pim(PimObjId obj, uint64_t len) { - std::vector vec(len); - PimStatus status = pimCopyDeviceToHost(obj, (void*) vec.data()); - assert (status == PIM_OK); - return vec; -} - -void print_pim(PimObjId obj, uint64_t len) { - std::vector vec(len); - PimStatus status = pimCopyDeviceToHost(obj, (void*) vec.data()); - assert (status == PIM_OK); - - for(float f : vec) { - std::cout << f << ", "; - } - std::cout << std::endl; -} - void computeStencilChunkIteration(std::vector& workingPimMemory, std::vector& rowsInSumCircularQueue, PimObjId tmpPim, PimObjId runningSum, const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) { PimStatus status; uint64_t circularQueueTop = 0; @@ -219,7 +152,7 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: ++circularQueueTop; status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum); assert (status == PIM_OK); - // std::cout << "radius " << radius; + for(uint64_t i=2; i<2*radius; ++i) { sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum); @@ -230,14 +163,11 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: uint64_t nextRowToAdd = 2*radius; for(uint64_t row=radius; row& workingPimMemory, std:: //! @brief Computes a stencil pattern over a 2d array //! @param[in] srcHost The input stencil grid -//! @param[in] dstHost The resultant stencil grid -//! @param[in] stencilPattern The stencil pattern to apply -//! @param[in] numLeft The number of elements to the left of the output element in the stencil pattern -//! @param[in] numAbove The number of elements above the output element in the stencil pattern -void stencil(const std::vector> &srcHost, std::vector> &dstHost, const uint64_t numRows, - const uint64_t iterations, const uint64_t radius) { +//! @param[out] dstHost The resultant stencil grid +//! @param[in] numRows Number of PIM rows that objects can be associated within +//! @param[in] iterations Number of iterations to run the stencil pattern for +//! @param[in] radius The radius of the stencil pattern +void stencil(const std::vector> &srcHost, std::vector> &dstHost, + const uint64_t numRows, const uint64_t iterations, const uint64_t radius) { PimStatus status; assert(!srcHost.empty()); assert(!srcHost[0].empty()); assert(srcHost.size() == dstHost.size()); assert(srcHost[0].size() == dstHost[0].size()); - // assert(srcHost.size() <= 20); - const uint64_t gridHeight = srcHost.size(); const uint64_t gridWidth = srcHost[0].size(); const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); @@ -295,29 +223,12 @@ void stencil(const std::vector> &srcHost, std::vector= srcHost.size()) { break; @@ -330,87 +241,20 @@ void stencil(const std::vector> &srcHost, std::vector rowsInSum; - // rowsInSum.push_back(sumStencilRow(srcHost[0], stencilWidth, numLeft, resultPim)); - // rowsInSum.push_back(sumStencilRow(srcHost[1], stencilWidth, numLeft, resultPim)); - // status = pimAdd(rowsInSum.front(), rowsInSum.back(), runningSum); - // assert (status == PIM_OK); - - // for(uint64_t i=2; i>& src, std::vector>& dst, const uint64_t iterations, const uint64_t radius) { @@ -484,8 +328,7 @@ int main(int argc, char* argv[]) PimDeviceProperties deviceProp; PimStatus status = pimGetDeviceProperties(&deviceProp); assert(status == PIM_OK); - // void stencil(const std::vector> &srcHost, std::vector> &dstHost, const uint64_t numRows, - // const uint64_t iterations, const uint64_t radius) + stencil(x, y, 2 * deviceProp.numRowPerSubarray, params.iterations, params.radius); if (params.shouldVerify) From f6f89e69d46a3e37a9ae0aa3388847777ae5eb69 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Wed, 30 Apr 2025 20:36:23 -0400 Subject: [PATCH 13/31] works for many iterations --- misc-bench/stencil/PIM/stencil.cpp | 70 +++++++++++++++++------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index c086ec0e..01d26fe9 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -59,7 +59,7 @@ struct Params getInputParams(int argc, char **argv) p.radius = 1; p.configFile = nullptr; p.inputFile = nullptr; - p.shouldVerify = true; + p.shouldVerify = false; int opt; while ((opt = getopt(argc, argv, "h:n:x:y:r:c:i:v:")) >= 0) @@ -197,6 +197,9 @@ void stencil(const std::vector> &srcHost, std::vector> tmpGrid; + tmpGrid.resize(srcHost.size(), std::vector(srcHost[0].size())); + const uint64_t gridWidth = srcHost[0].size(); const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); @@ -204,7 +207,7 @@ void stencil(const std::vector> &srcHost, std::vector(tmp); - constexpr uint64_t numIterationsPerPim = 5; + constexpr uint64_t maxIterationsPerPim = 2; PimObjId tmpPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); assert(tmpPim != -1); @@ -223,38 +226,47 @@ void stencil(const std::vector> &srcHost, std::vector= srcHost.size()) { + break; + } + const uint64_t totalRowsThisIter = min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; + const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsTop; + uint64_t workingPimMemoryIdx = 0; + for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) { + if(iter == 0) { + status = pimCopyHostToDevice((void*) srcHost[srcHostRow].data(), workingPimMemory[workingPimMemoryIdx]); + } else { + status = pimCopyHostToDevice((void*) tmpGrid[srcHostRow].data(), workingPimMemory[workingPimMemoryIdx]); + } + assert (status == PIM_OK); + ++workingPimMemoryIdx; + } - uint64_t firstRowSrc = 0; - for(;;) { - const uint64_t firstRowUsableSrc = firstRowSrc + invalidResultsTop; - if(firstRowUsableSrc + invalidResultsTop >= srcHost.size()) { - break; - } - const uint64_t totalRowsThisIter = min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; - const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsTop; - uint64_t workingPimMemoryIdx = 0; - for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) { - status = pimCopyHostToDevice((void*) srcHost[srcHostRow].data(), workingPimMemory[workingPimMemoryIdx]); - assert (status == PIM_OK); - ++workingPimMemoryIdx; - } + for(uint64_t iterNum = 0; iterNum < currIterations; ++iterNum) { + computeStencilChunkIteration(workingPimMemory, rowsInSumCircularQueue, tmpPim, runningSum, stencilAreaToMultiplyPim, radius); + } - for(uint64_t iterNum = 0; iterNum < currIterations; ++iterNum) { - computeStencilChunkIteration(workingPimMemory, rowsInSumCircularQueue, tmpPim, runningSum, stencilAreaToMultiplyPim, radius); - } + workingPimMemoryIdx = invalidResultsTop; + for(uint64_t srcHostRow = firstRowUsableSrc; srcHostRow < firstRowUsableSrc + usableRowsThisIter; ++srcHostRow) { + status = pimCopyDeviceToHost(workingPimMemory[workingPimMemoryIdx], (void*) dstHost[srcHostRow].data()); + assert (status == PIM_OK); + ++workingPimMemoryIdx; + } - workingPimMemoryIdx = invalidResultsTop; - for(uint64_t srcHostRow = firstRowUsableSrc; srcHostRow < firstRowUsableSrc + usableRowsThisIter; ++srcHostRow) { - status = pimCopyDeviceToHost(workingPimMemory[workingPimMemoryIdx], (void*) dstHost[srcHostRow].data()); - assert (status == PIM_OK); - ++workingPimMemoryIdx; + firstRowSrc += usableRowsThisIter; } - - firstRowSrc += usableRowsThisIter; + std::swap(tmpGrid, dstHost); } + std::swap(tmpGrid, dstHost); } void stencilCpu(std::vector>& src, std::vector>& dst, const uint64_t iterations, const uint64_t radius) { From 9ea640c8fd7fa709516717edf6630a83ebed4ea4 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Wed, 30 Apr 2025 21:00:30 -0400 Subject: [PATCH 14/31] fix workingPimMemory size --- misc-bench/stencil/PIM/stencil.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 01d26fe9..3bb4667b 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -185,11 +185,11 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: //! @brief Computes a stencil pattern over a 2d array //! @param[in] srcHost The input stencil grid //! @param[out] dstHost The resultant stencil grid -//! @param[in] numRows Number of PIM rows that objects can be associated within +//! @param[in] numAssociable Number of float 32 PIM objects that can be associated with each other //! @param[in] iterations Number of iterations to run the stencil pattern for //! @param[in] radius The radius of the stencil pattern void stencil(const std::vector> &srcHost, std::vector> &dstHost, - const uint64_t numRows, const uint64_t iterations, const uint64_t radius) { + const uint64_t numAssociable, const uint64_t iterations, const uint64_t radius) { PimStatus status; assert(!srcHost.empty()); @@ -220,7 +220,7 @@ void stencil(const std::vector> &srcHost, std::vector workingPimMemory(20); // TODO: Set to a better number, num associable - num used other + std::vector workingPimMemory(numAssociable - (rowsInSumCircularQueue.size() + 2)); for(uint64_t i=0; i Date: Wed, 30 Apr 2025 21:02:59 -0400 Subject: [PATCH 15/31] cleanup comments, add TODO --- misc-bench/stencil/PIM/stencil.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 3bb4667b..2b9ddaf7 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -207,7 +207,7 @@ void stencil(const std::vector> &srcHost, std::vector(tmp); - constexpr uint64_t maxIterationsPerPim = 2; + constexpr uint64_t maxIterationsPerPim = 2; // TODO: what should this number be? PimObjId tmpPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); assert(tmpPim != -1); @@ -228,7 +228,6 @@ void stencil(const std::vector> &srcHost, std::vector Date: Sat, 3 May 2025 23:22:30 -0400 Subject: [PATCH 16/31] chunked stencil working --- misc-bench/stencil/PIM/stencil.cpp | 96 ++++++++++++++++++++++++++---- 1 file changed, 84 insertions(+), 12 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 2b9ddaf7..af98f430 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -23,6 +23,8 @@ #include "util.h" #include "libpimeval.h" +constexpr bool isHorizontallyChunked = true; + // Params --------------------------------------------------------------------- typedef struct Params { @@ -182,15 +184,58 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: } } +void copyChunkedVectorPim(std::vector &vec, PimObjId pimObj, const uint64_t pimObjLen, const uint64_t numInvalid, const uint64_t numElementsHorizontal, const bool isToPim) { + PimStatus status; + if constexpr (!isHorizontallyChunked) { + if(isToPim) { + status = pimCopyHostToDevice((void*) vec.data(), pimObj); + } else { + status = pimCopyDeviceToHost(pimObj, (void*) vec.data()); + } + assert (status == PIM_OK); + } else { + const uint64_t totalValid = vec.size() - 2*numInvalid; + const uint64_t maxUsable = numElementsHorizontal - 2*numInvalid; + const uint64_t numChunks = (totalValid + maxUsable - 1) / maxUsable; + if(isToPim) { + uint64_t hostStartIdx = 0; + uint64_t pimStartIdx = 0; + for(uint64_t i=0; i> &srcHost, std::vector> &dstHost, - const uint64_t numAssociable, const uint64_t iterations, const uint64_t radius) { - PimStatus status; +void stencil(const std::vector> &srcHost, std::vector> &dstHost, const uint64_t numAssociable, + const uint64_t numElementsHorizontal, const uint64_t iterations, const uint64_t radius) { assert(!srcHost.empty()); assert(!srcHost[0].empty()); @@ -209,7 +254,17 @@ void stencil(const std::vector> &srcHost, std::vector(tmp); constexpr uint64_t maxIterationsPerPim = 2; // TODO: what should this number be? - PimObjId tmpPim = pimAlloc(PIM_ALLOC_AUTO, gridWidth, PIM_FP32); + uint64_t pimAllocWidth; + if constexpr (isHorizontallyChunked) { + const uint64_t maxInvalidHorizontal = radius * std::min(maxIterationsPerPim, iterations); + const uint64_t maxUsableHorizontal = numElementsHorizontal - 2*maxInvalidHorizontal; + const uint64_t maxChunksHorizontal = (gridWidth + maxUsableHorizontal - 1) / maxUsableHorizontal; + pimAllocWidth = numElementsHorizontal * maxChunksHorizontal; + } else { + pimAllocWidth = gridWidth; + } + + PimObjId tmpPim = pimAlloc(PIM_ALLOC_AUTO, pimAllocWidth, PIM_FP32); assert(tmpPim != -1); PimObjId runningSum = pimAllocAssociated(tmpPim, PIM_FP32); assert(runningSum != -1); @@ -237,16 +292,15 @@ void stencil(const std::vector> &srcHost, std::vector= srcHost.size()) { break; } - const uint64_t totalRowsThisIter = min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; + const uint64_t totalRowsThisIter = std::min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsTop; uint64_t workingPimMemoryIdx = 0; for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) { if(iter == 0) { - status = pimCopyHostToDevice((void*) srcHost[srcHostRow].data(), workingPimMemory[workingPimMemoryIdx]); + copyChunkedVectorPim(const_cast&>(srcHost[srcHostRow]), workingPimMemory[workingPimMemoryIdx], pimAllocWidth, invalidResultsTop, numElementsHorizontal, true); } else { - status = pimCopyHostToDevice((void*) tmpGrid[srcHostRow].data(), workingPimMemory[workingPimMemoryIdx]); + copyChunkedVectorPim(tmpGrid[srcHostRow], workingPimMemory[workingPimMemoryIdx], pimAllocWidth, invalidResultsTop, numElementsHorizontal, true); } - assert (status == PIM_OK); ++workingPimMemoryIdx; } @@ -256,8 +310,7 @@ void stencil(const std::vector> &srcHost, std::vector Date: Mon, 5 May 2025 14:06:51 -0400 Subject: [PATCH 17/31] add api support for chunked shift --- libpimeval/src/libpimeval.cpp | 8 ++-- libpimeval/src/libpimeval.h | 4 +- libpimeval/src/pimCmd.cpp | 52 ++++++++++++----------- libpimeval/src/pimCmd.h | 5 ++- libpimeval/src/pimPerfEnergyAquabolt.cpp | 2 +- libpimeval/src/pimPerfEnergyAquabolt.h | 2 +- libpimeval/src/pimPerfEnergyBankLevel.cpp | 12 +++--- libpimeval/src/pimPerfEnergyBankLevel.h | 2 +- libpimeval/src/pimPerfEnergyBase.cpp | 2 +- libpimeval/src/pimPerfEnergyBase.h | 2 +- libpimeval/src/pimPerfEnergyBitSerial.cpp | 20 ++++++--- libpimeval/src/pimPerfEnergyBitSerial.h | 2 +- libpimeval/src/pimPerfEnergyFulcrum.cpp | 12 +++--- libpimeval/src/pimPerfEnergyFulcrum.h | 2 +- libpimeval/src/pimSim.cpp | 12 +++--- libpimeval/src/pimSim.h | 4 +- misc-bench/stencil/PIM/stencil.cpp | 6 +-- 17 files changed, 81 insertions(+), 68 deletions(-) diff --git a/libpimeval/src/libpimeval.cpp b/libpimeval/src/libpimeval.cpp index a400e22c..4c4201f5 100644 --- a/libpimeval/src/libpimeval.cpp +++ b/libpimeval/src/libpimeval.cpp @@ -501,17 +501,17 @@ pimRotateElementsLeft(PimObjId src) //! @brief Shift elements of an obj by one step to the right and fill zero PimStatus -pimShiftElementsRight(PimObjId src) +pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication) { - bool ok = pimSim::get()->pimShiftElementsRight(src); + bool ok = pimSim::get()->pimShiftElementsRight(src, useCrossRegionCommunication); return ok ? PIM_OK : PIM_ERROR; } //! @brief Shift elements of an obj by one step to the left and fill zero PimStatus -pimShiftElementsLeft(PimObjId src) +pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication) { - bool ok = pimSim::get()->pimShiftElementsLeft(src); + bool ok = pimSim::get()->pimShiftElementsLeft(src, useCrossRegionCommunication); return ok ? PIM_OK : PIM_ERROR; } diff --git a/libpimeval/src/libpimeval.h b/libpimeval/src/libpimeval.h index 5275bb6c..def2f530 100644 --- a/libpimeval/src/libpimeval.h +++ b/libpimeval/src/libpimeval.h @@ -191,8 +191,8 @@ PimStatus pimBroadcastUInt(PimObjId dest, uint64_t value); PimStatus pimBroadcastFP(PimObjId dest, float value); PimStatus pimRotateElementsRight(PimObjId src); PimStatus pimRotateElementsLeft(PimObjId src); -PimStatus pimShiftElementsRight(PimObjId src); -PimStatus pimShiftElementsLeft(PimObjId src); +PimStatus pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication = true); +PimStatus pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication = true); PimStatus pimShiftBitsRight(PimObjId src, PimObjId dest, unsigned shiftAmount); PimStatus pimShiftBitsLeft(PimObjId src, PimObjId dest, unsigned shiftAmount); diff --git a/libpimeval/src/pimCmd.cpp b/libpimeval/src/pimCmd.cpp index 6f6c49af..cf101c05 100644 --- a/libpimeval/src/pimCmd.cpp +++ b/libpimeval/src/pimCmd.cpp @@ -1205,33 +1205,35 @@ pimCmdRotate::execute() computeAllRegions(numRegions); // handle region boundaries - if (m_cmdType == PimCmdEnum::ROTATE_ELEM_R || m_cmdType == PimCmdEnum::SHIFT_ELEM_R) { - for (unsigned i = 0; i < numRegions; ++i) { - const pimRegion &srcRegion = objSrc.getRegions()[i]; - uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); - uint64_t val = 0; - if (i == 0 && m_cmdType == PimCmdEnum::ROTATE_ELEM_R) { - val = m_regionBoundary[numRegions - 1]; - } else if (i > 0) { - val = m_regionBoundary[i - 1]; + if(m_useCrossRegionCommunication) { + if (m_cmdType == PimCmdEnum::ROTATE_ELEM_R || m_cmdType == PimCmdEnum::SHIFT_ELEM_R) { + for (unsigned i = 0; i < numRegions; ++i) { + const pimRegion &srcRegion = objSrc.getRegions()[i]; + uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); + uint64_t val = 0; + if (i == 0 && m_cmdType == PimCmdEnum::ROTATE_ELEM_R) { + val = m_regionBoundary[numRegions - 1]; + } else if (i > 0) { + val = m_regionBoundary[i - 1]; + } + objSrc.setElement(elemIdxBegin, val); } - objSrc.setElement(elemIdxBegin, val); - } - } else if (m_cmdType == PimCmdEnum::ROTATE_ELEM_L || m_cmdType == PimCmdEnum::SHIFT_ELEM_L) { - for (unsigned i = 0; i < numRegions; ++i) { - const pimRegion &srcRegion = objSrc.getRegions()[i]; - unsigned numElementsInRegion = srcRegion.getNumElemInRegion(); - uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); - uint64_t val = 0; - if (i == numRegions - 1 && m_cmdType == PimCmdEnum::ROTATE_ELEM_L) { - val = m_regionBoundary[0]; - } else if (i < numRegions - 1) { - val = m_regionBoundary[i + 1]; + } else if (m_cmdType == PimCmdEnum::ROTATE_ELEM_L || m_cmdType == PimCmdEnum::SHIFT_ELEM_L) { + for (unsigned i = 0; i < numRegions; ++i) { + const pimRegion &srcRegion = objSrc.getRegions()[i]; + unsigned numElementsInRegion = srcRegion.getNumElemInRegion(); + uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); + uint64_t val = 0; + if (i == numRegions - 1 && m_cmdType == PimCmdEnum::ROTATE_ELEM_L) { + val = m_regionBoundary[0]; + } else if (i < numRegions - 1) { + val = m_regionBoundary[i + 1]; + } + objSrc.setElement(elemIdxBegin + numElementsInRegion - 1, val); } - objSrc.setElement(elemIdxBegin + numElementsInRegion - 1, val); + } else { + assert(0); } - } else { - assert(0); } if (pimSim::get()->getDeviceType() != PIM_FUNCTIONAL) { @@ -1306,7 +1308,7 @@ pimCmdRotate::updateStats() const PimDataType dataType = objSrc.getDataType(); bool isVLayout = objSrc.isVLayout(); - pimeval::perfEnergy mPerfEnergy = pimSim::get()->getPerfEnergyModel()->getPerfEnergyForRotate(m_cmdType, objSrc); + pimeval::perfEnergy mPerfEnergy = pimSim::get()->getPerfEnergyModel()->getPerfEnergyForRotate(m_cmdType, objSrc, m_useCrossRegionCommunication); pimSim::get()->getStatsMgr()->recordCmd(getName(dataType, isVLayout), mPerfEnergy); return true; } diff --git a/libpimeval/src/pimCmd.h b/libpimeval/src/pimCmd.h index 0b05fd1f..afcde024 100644 --- a/libpimeval/src/pimCmd.h +++ b/libpimeval/src/pimCmd.h @@ -506,8 +506,8 @@ class pimCmdBroadcast : public pimCmd class pimCmdRotate : public pimCmd { public: - pimCmdRotate(PimCmdEnum cmdType, PimObjId src) - : pimCmd(cmdType), m_src(src) + pimCmdRotate(PimCmdEnum cmdType, PimObjId src, bool useCrossRegionCommunication) + : pimCmd(cmdType), m_src(src), m_useCrossRegionCommunication(useCrossRegionCommunication) { assert(cmdType == PimCmdEnum::ROTATE_ELEM_R || cmdType == PimCmdEnum::ROTATE_ELEM_L || cmdType == PimCmdEnum::SHIFT_ELEM_R || cmdType == PimCmdEnum::SHIFT_ELEM_L); @@ -520,6 +520,7 @@ class pimCmdRotate : public pimCmd protected: PimObjId m_src; std::vector m_regionBoundary; + bool m_useCrossRegionCommunication; }; //! @class pimCmdReadRowToSa diff --git a/libpimeval/src/pimPerfEnergyAquabolt.cpp b/libpimeval/src/pimPerfEnergyAquabolt.cpp index c1f4ff4f..c780e126 100644 --- a/libpimeval/src/pimPerfEnergyAquabolt.cpp +++ b/libpimeval/src/pimPerfEnergyAquabolt.cpp @@ -237,7 +237,7 @@ pimPerfEnergyAquabolt::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimOb //! @brief Perf energy model of aquabolt PIM for rotate pimeval::perfEnergy -pimPerfEnergyAquabolt::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyAquabolt::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; diff --git a/libpimeval/src/pimPerfEnergyAquabolt.h b/libpimeval/src/pimPerfEnergyAquabolt.h index d3f10b39..7645853d 100644 --- a/libpimeval/src/pimPerfEnergyAquabolt.h +++ b/libpimeval/src/pimPerfEnergyAquabolt.h @@ -26,7 +26,7 @@ class pimPerfEnergyAquabolt : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; protected: unsigned m_aquaboltFPUBitWidth = 16; diff --git a/libpimeval/src/pimPerfEnergyBankLevel.cpp b/libpimeval/src/pimPerfEnergyBankLevel.cpp index 3da368eb..20733026 100644 --- a/libpimeval/src/pimPerfEnergyBankLevel.cpp +++ b/libpimeval/src/pimPerfEnergyBankLevel.cpp @@ -297,7 +297,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO // TODO: This needs to be revisited //! @brief Perf energy model of bank-level PIM for rotate pimeval::perfEnergy -pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; @@ -308,8 +308,6 @@ pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL); unsigned numRegions = obj.getRegions().size(); uint64_t totalOp = 0; - // boundary handling - assume two times copying between device and host for boundary elements - pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); // rotate within subarray: // For every bit: Read row to SA; move SA to R1; Shift R1 by N steps; Move R1 to SA; Write SA to row @@ -319,8 +317,12 @@ pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI msRuntime = (m_tR + (bitsPerElement + 2) * m_tL + m_tW); // for one pass msRuntime *= numPass; mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass; - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + } std::cout << "PIM-Warning: Perf energy model is not precise for PIM command " << pimCmd::getName(cmdType, "") << std::endl; return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp); diff --git a/libpimeval/src/pimPerfEnergyBankLevel.h b/libpimeval/src/pimPerfEnergyBankLevel.h index b0482f44..034b643b 100644 --- a/libpimeval/src/pimPerfEnergyBankLevel.h +++ b/libpimeval/src/pimPerfEnergyBankLevel.h @@ -26,7 +26,7 @@ class pimPerfEnergyBankLevel : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; protected: double m_blimpCoreOriginalLatency = 0.000005; // ms; 200 MHz. Reference: BLIMP paper diff --git a/libpimeval/src/pimPerfEnergyBase.cpp b/libpimeval/src/pimPerfEnergyBase.cpp index 529a6572..d881871b 100644 --- a/libpimeval/src/pimPerfEnergyBase.cpp +++ b/libpimeval/src/pimPerfEnergyBase.cpp @@ -156,7 +156,7 @@ pimPerfEnergyBase::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInf //! @brief Perf energy model of base class for rotate (placeholder) pimeval::perfEnergy -pimPerfEnergyBase::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyBase::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 1e10; double mjEnergy = 999999999.9; diff --git a/libpimeval/src/pimPerfEnergyBase.h b/libpimeval/src/pimPerfEnergyBase.h index 652cb49e..3aa1b47f 100644 --- a/libpimeval/src/pimPerfEnergyBase.h +++ b/libpimeval/src/pimPerfEnergyBase.h @@ -70,7 +70,7 @@ class pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const; protected: PimDeviceEnum m_simTarget; diff --git a/libpimeval/src/pimPerfEnergyBitSerial.cpp b/libpimeval/src/pimPerfEnergyBitSerial.cpp index a52561f7..4b405d5c 100644 --- a/libpimeval/src/pimPerfEnergyBitSerial.cpp +++ b/libpimeval/src/pimPerfEnergyBitSerial.cpp @@ -441,7 +441,7 @@ pimPerfEnergyBitSerial::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO //! @brief Perf energy model of bit-serial PIM for rotate pimeval::perfEnergy -pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; @@ -453,8 +453,6 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL); unsigned numRegions = obj.getRegions().size(); unsigned numCore = obj.getNumCoreAvailable(); - // boundary handling - assume two times copying between device and host for boundary elements - pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); switch (m_simTarget) { case PIM_DEVICE_BITSIMD_V: @@ -467,8 +465,12 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI totalOp += 3 * bitsPerElement * numPass * numCore; msRuntime = msRead + msWrite + msCompute; mjEnergy = (m_eAP + 3 * m_eL) * bitsPerElement * numPass; // for one pass - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + } break; case PIM_DEVICE_SIMDRAM: // todo @@ -483,8 +485,12 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI msRuntime = (m_tR + (bitsPerElement + 2) * m_tL + m_tW); // for one pass msRuntime *= numPass; mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass; - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + } break; default: assert(0); diff --git a/libpimeval/src/pimPerfEnergyBitSerial.h b/libpimeval/src/pimPerfEnergyBitSerial.h index def51983..c26e4db9 100644 --- a/libpimeval/src/pimPerfEnergyBitSerial.h +++ b/libpimeval/src/pimPerfEnergyBitSerial.h @@ -26,7 +26,7 @@ class pimPerfEnergyBitSerial : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; protected: pimeval::perfEnergy getPerfEnergyBitSerial(PimDeviceEnum deviceType, PimCmdEnum cmdType, unsigned numPass, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const; diff --git a/libpimeval/src/pimPerfEnergyFulcrum.cpp b/libpimeval/src/pimPerfEnergyFulcrum.cpp index 55034757..81c9e890 100644 --- a/libpimeval/src/pimPerfEnergyFulcrum.cpp +++ b/libpimeval/src/pimPerfEnergyFulcrum.cpp @@ -301,7 +301,7 @@ pimPerfEnergyFulcrum::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObj //! @brief Perf energy model of Fulcrum for rotate pimeval::perfEnergy -pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; @@ -312,8 +312,6 @@ pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInf unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL); unsigned numRegions = obj.getRegions().size(); uint64_t totalOp = 0; - // boundary handling - assume two times copying between device and host for boundary elements - pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); // rotate within subarray: // For every bit: Read row to SA; move SA to R1; Shift R1 by N steps; Move R1 to SA; Write SA to row @@ -324,8 +322,12 @@ pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInf msWrite = m_tW * numPass; msRuntime = msRead + msWrite + msCompute; mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass; - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + } std::cout << "PIM-Warning: Perf energy model is not precise for PIM command " << pimCmd::getName(cmdType, "") << std::endl; return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp); diff --git a/libpimeval/src/pimPerfEnergyFulcrum.h b/libpimeval/src/pimPerfEnergyFulcrum.h index aa28f3c9..e7ae0951 100644 --- a/libpimeval/src/pimPerfEnergyFulcrum.h +++ b/libpimeval/src/pimPerfEnergyFulcrum.h @@ -26,7 +26,7 @@ class pimPerfEnergyFulcrum : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; protected: double m_fulcrumMulLatency = 0.00000609; // 6.09ns diff --git a/libpimeval/src/pimSim.cpp b/libpimeval/src/pimSim.cpp index 63174a3f..ae10a79f 100644 --- a/libpimeval/src/pimSim.cpp +++ b/libpimeval/src/pimSim.cpp @@ -825,7 +825,7 @@ pimSim::pimRotateElementsRight(PimObjId src) { pimPerfMon perfMon("pimRotateElementsRight"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_R, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_R, src, true); return m_device->executeCmd(std::move(cmd)); } @@ -834,25 +834,25 @@ pimSim::pimRotateElementsLeft(PimObjId src) { pimPerfMon perfMon("pimRotateElementsLeft"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_L, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_L, src, true); return m_device->executeCmd(std::move(cmd)); } bool -pimSim::pimShiftElementsRight(PimObjId src) +pimSim::pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication) { pimPerfMon perfMon("pimShiftElementsRight"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_R, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_R, src, useCrossRegionCommunication); return m_device->executeCmd(std::move(cmd)); } bool -pimSim::pimShiftElementsLeft(PimObjId src) +pimSim::pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication) { pimPerfMon perfMon("pimShiftElementsLeft"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_L, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_L, src, useCrossRegionCommunication); return m_device->executeCmd(std::move(cmd)); } diff --git a/libpimeval/src/pimSim.h b/libpimeval/src/pimSim.h index d2aedc06..4c1bff34 100644 --- a/libpimeval/src/pimSim.h +++ b/libpimeval/src/pimSim.h @@ -120,8 +120,8 @@ class pimSim template bool pimBroadcast(PimObjId dest, T value); bool pimRotateElementsRight(PimObjId src); bool pimRotateElementsLeft(PimObjId src); - bool pimShiftElementsRight(PimObjId src); - bool pimShiftElementsLeft(PimObjId src); + bool pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication); + bool pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication); bool pimShiftBitsRight(PimObjId src, PimObjId dest, unsigned shiftAmount); bool pimShiftBitsLeft(PimObjId src, PimObjId dest, unsigned shiftAmount); diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index af98f430..db2f3e91 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -117,14 +117,14 @@ void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const status = pimCopyObjectToObject(mid, shiftBackup); assert (status == PIM_OK); - status = pimShiftElementsRight(shiftBackup); + status = pimShiftElementsRight(shiftBackup, !isHorizontallyChunked); assert (status == PIM_OK); status = pimAdd(mid, shiftBackup, pimRowSum); assert (status == PIM_OK); for(uint64_t shiftIter=1; shiftIter Date: Mon, 5 May 2025 14:23:31 -0400 Subject: [PATCH 18/31] typo --- misc-bench/stencil/PIM/stencil.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index db2f3e91..c61ee68a 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -410,7 +410,7 @@ int main(int argc, char* argv[]) numElementsHorizontal = deviceProp.numSubarrayPerBank * deviceProp.numColPerSubarray / bitsPerElement; break; default: - std::cerr << "Stencil unimplmented for simulation target: " << deviceProp.simTarget << std::endl; + std::cerr << "Stencil unimplemented for simulation target: " << deviceProp.simTarget << std::endl; std::exit(1); } } else { From e7ba0e3d257358f503592a486adf61f50bbcd304 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Tue, 6 May 2025 14:47:36 -0400 Subject: [PATCH 19/31] minor stencil updates --- misc-bench/stencil/PIM/stencil.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index c61ee68a..1368891d 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -353,7 +353,12 @@ int main(int argc, char* argv[]) struct Params params = getInputParams(argc, argv); std::cout << "Running PIM stencil for grid: " << params.gridHeight << "x" << params.gridWidth << std::endl; - std::cout << "Stencil Radius: " << params.radius << std::endl; + std::cout << "Stencil Radius: " << params.radius << ", Number of Iterations: " << params.iterations << std::endl; + if constexpr(isHorizontallyChunked) { + std::cout << "Stencil does not use cross region communication" << std::endl; + } else { + std::cout << "Stencil uses cross region communication" << std::endl; + } std::vector> x, y; @@ -437,7 +442,7 @@ int main(int argc, char* argv[]) #pragma omp parallel for collapse(2) for(uint64_t gridY=startY; gridY acceptableDelta) { #pragma omp critical From e4fb1eac9789e9a974405de20032174641805275 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Tue, 6 May 2025 14:56:07 -0400 Subject: [PATCH 20/31] fix readme --- misc-bench/stencil/README.MD | 49 +++--------------------------------- 1 file changed, 4 insertions(+), 45 deletions(-) diff --git a/misc-bench/stencil/README.MD b/misc-bench/stencil/README.MD index f4dea72c..55a2fd18 100644 --- a/misc-bench/stencil/README.MD +++ b/misc-bench/stencil/README.MD @@ -1,8 +1,8 @@ # Stencil -Stencil computing takes a 2D array and sets each element to the average of its neighbors and itself. By default, the average is taken of each 3x3 block of elements, however this can be adjusted using the runtime parameters. For this benchmark, elements with neighbors outside of the input range are not computed. Additionally, the location of the output element within the stencil grid can be changed via the paramemters, meaning each output element could be the average of the 3x3 block to its bottom right etc., rather than being just in the center. For example, averaging a 3x3 grid with a 3x3 stencil pattern, with num left and num above both set to 1: +Iterative stencil loops consist of repeatedly applying a stencil pattern to a grid. This benchmark iterates a stencil average pattern on a 2D array, with both the number of iterations and the size of the stencil pattern parameterized. An example is below: -- Input: [[2.0, 2.0, 2.0], [1.0, 1.0, 1.0], [2.0, 2.0, 2.0]] +- Input: [[2.0, 2.0, 2.0], [1.0, 1.0, 1.0], [2.0, 2.0, 2.0]], radius=1, iterations=1 - Output: [[#, #, #], [#, 1.66, #], [#, #, #]] - \# represents an undefined element @@ -13,36 +13,15 @@ stencil/ ├── PIM/ │ ├── Makefile │ ├── stencil.cpp -├── baselines/ -│ ├── CPU/ -│ │ ├── Makefile -│ │ ├── stencil.cpp -│ ├── GPU/ -│ │ ├── Makefile -│ │ ├── stencil.cu ├── README.md ├── Makefile ``` ## Implementation Description -This repository contains three different implementations of the stencil benchmark: +This repository contains one implementation of the stencil benchmark: -1. CPU -2. GPU -3. PIM - -### Baseline Implementation - -CPU and GPU have been used as baselines. - -#### CPU - -The CPU variant of stencil has been implemented using the standard C++ library, as well as with parallelization from OpenMP. - -#### GPU - -The GPU variant leverages CUDA C++ to parallelize the stencil benchmark on an NVIDIA GPU. +1. PIM ### PIM Implementation @@ -50,26 +29,6 @@ The PIM variant is implemented using C++ with some speedup from OpenMP. Three di ## Compilation Instructions for Specific Variants -### CPU Variant - -To compile for the CPU variant, use: - -```bash -cd baselines/CPU -make -``` - -### GPU Variant - -To compile for the GPU variant, use: - -```bash -cd baselines/GPU -make -``` - -*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile. - ### PIM Variant To compile for the PIM variant, use: From de8173fe5b48c3380349e53d7825ca57ffadaa0d Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Wed, 7 May 2025 13:39:35 -0400 Subject: [PATCH 21/31] reset delta to 0.1 from 0.01, and set bank level region as subarray --- misc-bench/stencil/PIM/stencil.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 1368891d..a73e6e87 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -411,9 +411,12 @@ int main(int argc, char* argv[]) case PIM_DEVICE_FULCRUM: numElementsHorizontal = deviceProp.numColPerSubarray / bitsPerElement; break; - case PIM_DEVICE_BANK_LEVEL: - numElementsHorizontal = deviceProp.numSubarrayPerBank * deviceProp.numColPerSubarray / bitsPerElement; - break; + case PIM_DEVICE_BANK_LEVEL: { + // numElementsHorizontal = deviceProp.numSubarrayPerBank * deviceProp.numColPerSubarray / bitsPerElement; + // TODO: Are bank level regions subarrays or banks? + numElementsHorizontal = deviceProp.numColPerSubarray / bitsPerElement; + break; + } default: std::cerr << "Stencil unimplemented for simulation target: " << deviceProp.simTarget << std::endl; std::exit(1); @@ -442,7 +445,7 @@ int main(int argc, char* argv[]) #pragma omp parallel for collapse(2) for(uint64_t gridY=startY; gridY acceptableDelta) { #pragma omp critical From 73c3c972b19a6195942cc8f96981325919a1b111 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Sun, 14 Sep 2025 20:01:27 -0400 Subject: [PATCH 22/31] cleanup stencil comments --- misc-bench/stencil/PIM/stencil.cpp | 82 +++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 13 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index a73e6e87..177feb8c 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -102,7 +102,12 @@ struct Params getInputParams(int argc, char **argv) return p; } -//! @brief Sums the neighbors of each element in a stencil row to compute the horizontal stencil sum +//! @brief Sums the neighbors of each element in a stencil row to compute the horizontal stencil sum +//! +//! Sums radius number of elemements to the left and right of center element, including center element +//! Puts each result pimRowSum[i] where i is the center index +//! Formula: pimRowSum[i] = Σ (j ∈ [i-radius, i+radius]) mid[j] +//! Works by shifting mid to the left and right and adding shifted versions //! @param[in] mid PIM row to be summed //! @param[out] pimRowSum The resultant PIM object to place the sum into //! @param[in,out] shiftBackup Temporary PIM object used for calculations @@ -143,10 +148,25 @@ void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const } } +//! @brief Computes one iteration of one chunk of the stencil +//! +//! Uses circular queue to compute window sums +//! Adds the next row to the front of the queue and to the sum +//! Takes the sum (divided by the stencil area) as the result from the row +//! Subtracts the back of the queue from the sum +//! Pops from the queue back of the queue +//! Repeats until done +//! @param[in] workingPimMemory PIM rows in the stencil chunk +//! @param[in] rowsInSumCircularQueue Queue used for keeping track of running sum of rows vertically +//! @param[in,out] tmpPim Temporary PIM object used for calculations +//! @param[in,out] runningSum Temporary PIM object used for keeping track of the current running (vertical) sum +//! @param[in] stencilAreaToMultiplyPim This algorithm computes stencil average, thus each element in the result must be divided by the stencil area. This is done by multiplying by the inverse. +//! @param[in] radius The stencil radius void computeStencilChunkIteration(std::vector& workingPimMemory, std::vector& rowsInSumCircularQueue, PimObjId tmpPim, PimObjId runningSum, const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) { PimStatus status; - uint64_t circularQueueTop = 0; + uint64_t circularQueueBot = 0; + uint64_t circularQueueTop = 0; sumStencilRow(workingPimMemory[0], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); ++circularQueueTop; @@ -155,6 +175,13 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum); assert (status == PIM_OK); + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2 + // rowsInSumCircularQueue[0] = workingPimMemory[0] horizontally summed + // rowsInSumCircularQueue[1] = workingPimMemory[1] horizontally summed + // runningSum = sum of first two rows horizontally summed + for(uint64_t i=2; i<2*radius; ++i) { sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum); @@ -162,7 +189,20 @@ void computeStencilChunkIteration(std::vector& workingPimMemory, std:: ++circularQueueTop; } - uint64_t nextRowToAdd = 2*radius; + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2*radius + // rowsInSumCircularQueue[0...2*radius] are occupied with workingPimMemory[0...2*radius] horizontally summed + // runningSum = sum of rows [0...2*radius] horizontally summed + + uint64_t nextRowToAdd = 2*radius; // The index of the next row to add to the queue and to the running sum + + // Loops over the rest of the rows in the current chunk, vertically + // Each iteration, finds horizontal sum of the next row (nextRowToAdd) + // Places this horizontal sum at the front of the queue (at position circularQueueTop) + // Adds the horizontal sum to the runningSum + // Places runningSum/stencilArea into the workingPimMemory as the final result for the row + // If neccessary, subtracts the row from the back of the queue from the runningSum for(uint64_t row=radius; row& workingPimMemory, std:: } } -void copyChunkedVectorPim(std::vector &vec, PimObjId pimObj, const uint64_t pimObjLen, const uint64_t numInvalid, const uint64_t numElementsHorizontal, const bool isToPim) { + +//! @brief Copies data to/from PIM, accounting for chunking in necessary +//! @param[in] vec Host vector to copy to PIM +//! @param[in] pimObj PIM Object to copy vector into +//! @param[in] numInvalid Number of elements on each side (left/right) of row that will not be included in final result +//! @param[in] numElementsHorizontal Number of PIM fp32 elements that can be placed in a row without shifting issues. If stencil is horizontally chunked, this means that elements cannot shift accross these boundaries, e.g. if numElementsHorizontal=100, then pimShiftElementsLeft cannot shift element 100 into position 99. +//! @param[in] isToPim Direction to copy. If true, then copy from host to PIM, if false then PIM to host +void copyChunkedVectorPim(std::vector &vec, PimObjId pimObj, const uint64_t numInvalid, const uint64_t numElementsHorizontal, const bool isToPim) { PimStatus status; if constexpr (!isHorizontallyChunked) { if(isToPim) { @@ -194,8 +241,11 @@ void copyChunkedVectorPim(std::vector &vec, PimObjId pimObj, const uint64 } assert (status == PIM_OK); } else { + //! @brief Total number of usable elements in final result const uint64_t totalValid = vec.size() - 2*numInvalid; + //! @brief Maximum number of usable elements in a horizontal chunk, will be the number usable for all except for (possibly) the last chunk const uint64_t maxUsable = numElementsHorizontal - 2*numInvalid; + //! @brief Total number of horizontal chunks const uint64_t numChunks = (totalValid + maxUsable - 1) / maxUsable; if(isToPim) { uint64_t hostStartIdx = 0; @@ -252,10 +302,16 @@ void stencil(const std::vector> &srcHost, std::vector(tmp); - constexpr uint64_t maxIterationsPerPim = 2; // TODO: what should this number be? + + // Model assumes that only a finite number of stencil iterations can be computed on the PIM device before transferring back to the host + // In chunked stencil implementations (with cross region computations) this limit is both vertical and horizontal + // In non-chunked stencil implementations, this limit is purely vertical + // TODO: Figure out what to make this number + constexpr uint64_t maxIterationsPerPim = 5; // TODO: what should this number be? uint64_t pimAllocWidth; if constexpr (isHorizontallyChunked) { + // Represents the number of elements on the left/right that aren't part of the final result for a horizontally chunked implementation. Without data movement, each iteration causes number of elements on each side to no longer be valid. const uint64_t maxInvalidHorizontal = radius * std::min(maxIterationsPerPim, iterations); const uint64_t maxUsableHorizontal = numElementsHorizontal - 2*maxInvalidHorizontal; const uint64_t maxChunksHorizontal = (gridWidth + maxUsableHorizontal - 1) / maxUsableHorizontal; @@ -284,22 +340,22 @@ void stencil(const std::vector> &srcHost, std::vector= srcHost.size()) { + const uint64_t firstRowUsableSrc = firstRowSrc + invalidResultsEachSide; + if(firstRowUsableSrc + invalidResultsEachSide >= srcHost.size()) { break; } const uint64_t totalRowsThisIter = std::min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; - const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsTop; + const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsEachSide; uint64_t workingPimMemoryIdx = 0; for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) { if(iter == 0) { - copyChunkedVectorPim(const_cast&>(srcHost[srcHostRow]), workingPimMemory[workingPimMemoryIdx], pimAllocWidth, invalidResultsTop, numElementsHorizontal, true); + copyChunkedVectorPim(const_cast&>(srcHost[srcHostRow]), workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, true); } else { - copyChunkedVectorPim(tmpGrid[srcHostRow], workingPimMemory[workingPimMemoryIdx], pimAllocWidth, invalidResultsTop, numElementsHorizontal, true); + copyChunkedVectorPim(tmpGrid[srcHostRow], workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, true); } ++workingPimMemoryIdx; } @@ -308,9 +364,9 @@ void stencil(const std::vector> &srcHost, std::vector Date: Sun, 14 Sep 2025 20:26:43 -0400 Subject: [PATCH 23/31] add cross region communication option for aim --- libpimeval/src/pimPerfEnergyAim.cpp | 2 +- libpimeval/src/pimPerfEnergyAim.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libpimeval/src/pimPerfEnergyAim.cpp b/libpimeval/src/pimPerfEnergyAim.cpp index 1dc73fc4..00190e94 100644 --- a/libpimeval/src/pimPerfEnergyAim.cpp +++ b/libpimeval/src/pimPerfEnergyAim.cpp @@ -132,7 +132,7 @@ pimPerfEnergyAim::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo //! @brief Perf energy model of aim for rotate pimeval::perfEnergy -pimPerfEnergyAim::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyAim::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; diff --git a/libpimeval/src/pimPerfEnergyAim.h b/libpimeval/src/pimPerfEnergyAim.h index bb00cd25..6b35750a 100644 --- a/libpimeval/src/pimPerfEnergyAim.h +++ b/libpimeval/src/pimPerfEnergyAim.h @@ -26,7 +26,7 @@ class pimPerfEnergyAim : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; virtual pimeval::perfEnergy getPerfEnergyForMac(PimCmdEnum cmdType, const pimObjInfo& obj) const override; protected: From 954e8f7e5b367b336044a7189f1bf01a185a7420 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Tue, 30 Sep 2025 20:38:28 -0400 Subject: [PATCH 24/31] stencil: fix issue with switch case --- misc-bench/stencil/PIM/stencil.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp index 177feb8c..5ec25dc2 100644 --- a/misc-bench/stencil/PIM/stencil.cpp +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -465,14 +465,9 @@ int main(int argc, char* argv[]) if(deviceProp.isHLayoutDevice) { switch(deviceProp.simTarget) { case PIM_DEVICE_FULCRUM: + case PIM_DEVICE_BANK_LEVEL: numElementsHorizontal = deviceProp.numColPerSubarray / bitsPerElement; break; - case PIM_DEVICE_BANK_LEVEL: { - // numElementsHorizontal = deviceProp.numSubarrayPerBank * deviceProp.numColPerSubarray / bitsPerElement; - // TODO: Are bank level regions subarrays or banks? - numElementsHorizontal = deviceProp.numColPerSubarray / bitsPerElement; - break; - } default: std::cerr << "Stencil unimplemented for simulation target: " << deviceProp.simTarget << std::endl; std::exit(1); From 731fb7c5b2453b2ef071cbb68d3ee4b7f0e34219 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Mon, 20 Oct 2025 01:47:52 -0400 Subject: [PATCH 25/31] add optimizer for stencil layout --- misc-bench/stencil/extra/optimizer.py | 90 +++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 misc-bench/stencil/extra/optimizer.py diff --git a/misc-bench/stencil/extra/optimizer.py b/misc-bench/stencil/extra/optimizer.py new file mode 100644 index 00000000..5230dabc --- /dev/null +++ b/misc-bench/stencil/extra/optimizer.py @@ -0,0 +1,90 @@ +# Finds the optimal layout for stencil assuming intra memory-layer transfer cost is consistent +# ie., all subarray to subarray transfers are equivalent in cost (within a bank) +# Note after running for some test values: layout does change based on transfer cost parameters + +subarray_block_width = 100 +subarray_block_height = 100 +subarrays_per_bank = 16 +banks_per_rank = 16 +ranks = 16 +transfer_cost_subarray_to_subarray = 1 +transfer_cost_bank_to_bank = 20 +transfer_cost_rank_to_rank = 100 + + +def get_stats(num_blocks, grid_width, block_width, block_height): + + if num_blocks % grid_width != 0: + raise ValueError("num blocks must be divisible by grid width") + + grid_height = num_blocks/grid_width + to_move_horizontal = (2 * grid_height * (grid_width - 1)) * (block_width - 2) + to_move_vertical = (2 * grid_width * (grid_height - 1)) * (block_height - 2) + to_move_diagonal = (4 * (grid_width-1) * (grid_height - 1)) + to_move_total = to_move_horizontal + to_move_vertical + to_move_diagonal + width_next = grid_width * block_width + height_next = grid_height * block_height + return to_move_total, width_next, height_next + +def total_move_cost(subarray_grid_width, bank_grid_width, rank_grid_width): + to_move_s2s, bank_block_width, bank_block_height = get_stats(subarrays_per_bank, subarray_grid_width, subarray_block_width, subarray_block_height) + to_move_b2b, rank_block_width, rank_block_height = get_stats(banks_per_rank, bank_grid_width, bank_block_width, bank_block_height) + to_move_r2r, final_block_width, final_block_height = get_stats(ranks, rank_grid_width, rank_block_width, rank_block_height) + cost = transfer_cost_subarray_to_subarray*to_move_s2s + cost += transfer_cost_bank_to_bank*to_move_b2b + cost += transfer_cost_rank_to_rank*to_move_r2r + return cost + +def get_divisors(n): + """Get all divisors of n""" + divisors = [] + for i in range(1, int(n**0.5) + 1): + if n % i == 0: + divisors.append(i) + if i != n // i: + divisors.append(n // i) + return sorted(divisors) + + +# Find all valid divisors +subarray_divisors = get_divisors(subarrays_per_bank) +bank_divisors = get_divisors(banks_per_rank) +rank_divisors = get_divisors(ranks) + +print("Valid divisors:") +print(f" subarrays_per_bank ({subarrays_per_bank}): {subarray_divisors}") +print(f" banks_per_rank ({banks_per_rank}): {bank_divisors}") +print(f" ranks ({ranks}): {rank_divisors}") +print() + +# Find optimal configuration +min_cost = float('inf') +best_config = None + +for sgw in subarray_divisors: + for bgw in bank_divisors: + for rgw in rank_divisors: + cost = total_move_cost(sgw, bgw, rgw) + if cost < min_cost: + min_cost = cost + best_config = (sgw, bgw, rgw) + +print("OPTIMAL CONFIGURATION:") +print(f" subarray_grid_width = {best_config[0]}") +print(f" bank_grid_width = {best_config[1]}") +print(f" rank_grid_width = {best_config[2]}") +print(f" Total move cost = {min_cost:,.0f}") +print() + +# Show top 10 configurations +print("Top 10 configurations:") +results = [] +for sgw in subarray_divisors: + for bgw in bank_divisors: + for rgw in rank_divisors: + cost = total_move_cost(sgw, bgw, rgw) + results.append((cost, sgw, bgw, rgw)) + +results.sort() +for i, (cost, sgw, bgw, rgw) in enumerate(results[:10], 1): + print(f"{i:2}. Cost={cost:12,.0f} subarray={sgw:2}, bank={bgw:2}, rank={rgw:2}") \ No newline at end of file From d08b6f3fe3e410794fa9f9e3d2ca6199e3ea05c3 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Tue, 4 Nov 2025 20:22:59 -0500 Subject: [PATCH 26/31] fix failed 'make ' from root dir due to no makefile targets for string matching data generators --- .../hamming-string-match/hamming-data-generator/Makefile | 4 ++-- misc-bench/string-match/data-generator/Makefile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/misc-bench/hamming-string-match/hamming-data-generator/Makefile b/misc-bench/hamming-string-match/hamming-data-generator/Makefile index 6426cf53..b4dff31d 100644 --- a/misc-bench/hamming-string-match/hamming-data-generator/Makefile +++ b/misc-bench/hamming-string-match/hamming-data-generator/Makefile @@ -24,9 +24,9 @@ ifeq ($(USE_OPENMP),1) CXXFLAGS += -fopenmp endif -.PHONY: all clean +.PHONY: all clean debug perf dramsim3_integ -all: $(EXEC) +all debug perf dramsim3_integ: $(EXEC) # Note: Need to avoid feeding .h files to clang command line $(EXEC): $(SRC_FILES) $(HEADER_DEPS) diff --git a/misc-bench/string-match/data-generator/Makefile b/misc-bench/string-match/data-generator/Makefile index 984a170a..524e7c52 100644 --- a/misc-bench/string-match/data-generator/Makefile +++ b/misc-bench/string-match/data-generator/Makefile @@ -24,9 +24,9 @@ ifeq ($(USE_OPENMP),1) CXXFLAGS += -fopenmp endif -.PHONY: all clean +.PHONY: all clean debug perf dramsim3_integ -all: $(EXEC) +all debug perf dramsim3_integ: $(EXEC) # Note: Need to avoid feeding .h files to clang command line $(EXEC): $(SRC_FILES) $(HEADER_DEPS) From 1c3495eb62096bd5f95626165bcec1efbfd69836 Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Thu, 6 Nov 2025 18:49:16 -0500 Subject: [PATCH 27/31] add alt stencil impl --- misc-bench/stencil/PIM/Makefile | 10 +- misc-bench/stencil/PIM/stencil_alt.cpp | 498 +++++++++++++++++++++++++ 2 files changed, 506 insertions(+), 2 deletions(-) create mode 100644 misc-bench/stencil/PIM/stencil_alt.cpp diff --git a/misc-bench/stencil/PIM/Makefile b/misc-bench/stencil/PIM/Makefile index 54481573..cbeb7aa0 100644 --- a/misc-bench/stencil/PIM/Makefile +++ b/misc-bench/stencil/PIM/Makefile @@ -15,10 +15,16 @@ endif EXEC := stencil.out SRC := stencil.cpp -debug perf dramsim3_integ: $(EXEC) +EXEC2 := stencil_alt.out +SRC2 := stencil_alt.cpp + +debug perf dramsim3_integ: $(EXEC) $(EXEC2) $(EXEC): $(SRC) $(DEPS) $(CXX) $< $(CXXFLAGS) -o $@ +$(EXEC2): $(SRC2) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + clean: - rm -rf $(EXEC) *.dSYM \ No newline at end of file + rm -rf $(EXEC) $(EXEC2) *.dSYM \ No newline at end of file diff --git a/misc-bench/stencil/PIM/stencil_alt.cpp b/misc-bench/stencil/PIM/stencil_alt.cpp new file mode 100644 index 00000000..2324cdeb --- /dev/null +++ b/misc-bench/stencil/PIM/stencil_alt.cpp @@ -0,0 +1,498 @@ +// Test: C++ version of the stencil +// Copyright (c) 2025 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t iterations; + uint64_t gridWidth; + uint64_t gridHeight; + uint64_t radius; + const char *configFile; + const char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./stencil.out [options]" + "\n" + "\n -n iterations (default=10 iterations)" + "\n -x grid width (default=2048 elements)" + "\n -y grid height (default=2048 elements)" + "\n -r stencil radius (default=1)" + "\n -c dramsim config file" + "\n -i input file containing a 2d array (default=random)" + "\n -v t = verifies PIM output with host output. (default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.iterations = 10; + p.gridWidth = 2048; + p.gridHeight = 2048; + p.radius = 1; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:n:x:y:r:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'n': + p.iterations = strtoull(optarg, NULL, 0); + break; + case 'x': + p.gridWidth = strtoull(optarg, NULL, 0); + break; + case 'y': + p.gridHeight = strtoull(optarg, NULL, 0); + break; + case 'r': + p.radius= strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't'); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + return p; +} + +//! @brief Sums the neighbors of each element in a stencil row to compute the horizontal stencil sum +//! +//! Sums radius number of elemements to the left and right of center element, including center element +//! Puts each result pimRowSum[i] where i is the center index +//! Formula: pimRowSum[i] = Σ (j ∈ [i-radius, i+radius]) mid[j] +//! Works by shifting mid to the left and right and adding shifted versions +//! @param[in] mid PIM row to be summed +//! @param[out] pimRowSum The resultant PIM object to place the sum into +//! @param[in,out] shiftBackup Temporary PIM object used for calculations +//! @param[in] radius The stencil radius +void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const uint64_t radius) { + PimStatus status; + + if(radius == 0) { + return; + } + + status = pimCopyObjectToObject(mid, shiftBackup); + assert (status == PIM_OK); + + status = pimShiftElementsRight(shiftBackup, true); + assert (status == PIM_OK); + + status = pimAdd(mid, shiftBackup, pimRowSum); + assert (status == PIM_OK); + + for(uint64_t shiftIter=1; shiftIter rowsInSumCircularQueue; + std::vector workingPimMemory; + uint64_t firstRowIdxSrc; + PimObjId tmpPim; + PimObjId runningSum; + + VertChunkPim(uint64_t pimAllocWidth, uint64_t radius, uint64_t firstRowIdxSrc, uint64_t numRows) + : firstRowIdxSrc(firstRowIdxSrc) { + + tmpPim = pimAlloc(PIM_ALLOC_AUTO, pimAllocWidth, PIM_FP32); + assert(tmpPim != -1); + runningSum = pimAllocAssociated(tmpPim, PIM_FP32); + assert(runningSum != -1); + + rowsInSumCircularQueue.resize(2*radius+1); + for(uint64_t i=0; i> &srcHost) { + for(uint64_t idx = 0; idx < workingPimMemory.size(); ++idx) { + PimStatus status = pimCopyHostToDevice((void*) srcHost[firstRowIdxSrc + idx].data(), workingPimMemory[idx]); + assert (status == PIM_OK); + } + } + + void copyFromPim(std::vector> &dstHost, const uint64_t numOverlap) { + for(uint64_t idx = numOverlap; idx < workingPimMemory.size() - numOverlap; ++idx) { + PimStatus status = pimCopyDeviceToHost(workingPimMemory[idx], (void*) dstHost[firstRowIdxSrc + idx].data()); + assert (status == PIM_OK); + } + } + + //! @brief Computes one iteration of one chunk of the stencil + //! + //! Uses circular queue to compute window sums + //! Adds the next row to the front of the queue and to the sum + //! Takes the sum (divided by the stencil area) as the result from the row + //! Subtracts the back of the queue from the sum + //! Pops from the queue back of the queue + //! Repeats until done + //! @param[in] workingPimMemory PIM rows in the stencil chunk + //! @param[in] rowsInSumCircularQueue Queue used for keeping track of running sum of rows vertically + //! @param[in,out] tmpPim Temporary PIM object used for calculations + //! @param[in,out] runningSum Temporary PIM object used for keeping track of the current running (vertical) sum + //! @param[in] stencilAreaToMultiplyPim This algorithm computes stencil average, thus each element in the result must be divided by the stencil area. This is done by multiplying by the inverse. + //! @param[in] radius The stencil radius + void computeStencilIteration(const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) { + PimStatus status; + + uint64_t circularQueueBot = 0; + uint64_t circularQueueTop = 0; + + sumStencilRow(workingPimMemory[0], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + sumStencilRow(workingPimMemory[1], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum); + assert (status == PIM_OK); + + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2 + // rowsInSumCircularQueue[0] = workingPimMemory[0] horizontally summed + // rowsInSumCircularQueue[1] = workingPimMemory[1] horizontally summed + // runningSum = sum of first two rows horizontally summed + + for(uint64_t i=2; i<2*radius; ++i) { + sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum); + assert (status == PIM_OK); + ++circularQueueTop; + } + + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2*radius + // rowsInSumCircularQueue[0...2*radius] are occupied with workingPimMemory[0...2*radius] horizontally summed + // runningSum = sum of rows [0...2*radius] horizontally summed + + uint64_t nextRowToAdd = 2*radius; // The index of the next row to add to the queue and to the running sum + + // Loops over the rest of the rows in the current chunk, vertically + // Each iteration, finds horizontal sum of the next row (nextRowToAdd) + // Places this horizontal sum at the front of the queue (at position circularQueueTop) + // Adds the horizontal sum to the runningSum + // Places runningSum/stencilArea into the workingPimMemory as the final result for the row + // If neccessary, subtracts the row from the back of the queue from the runningSum + + for(uint64_t row=radius; row& hostTmpRow, PimObjId pimSrc, PimObjId pimDst) { + PimStatus status = pimCopyDeviceToHost(pimSrc, hostTmpRow.data()); + assert(status == PIM_OK); + status = pimCopyHostToDevice(hostTmpRow.data(), pimDst); + assert(status == PIM_OK); +} + +//! @brief Computes a stencil pattern over a 2d array +//! @param[in] srcHost The input stencil grid +//! @param[out] dstHost The resultant stencil grid +//! @param[in] numAssociable Number of float 32 PIM objects that can be associated with each other +//! @param[in] numElementsHorizontal Number of float 32 PIM objects that can be placed in a PIM row without creating shifting issues +//! @param[in] iterations Number of iterations to run the stencil pattern for +//! @param[in] radius The radius of the stencil pattern +void stencil(const std::vector> &srcHost, std::vector> &dstHost, const uint64_t numAssociable, + const uint64_t numElementsHorizontal, const uint64_t iterations, const uint64_t radius) { + + assert(!srcHost.empty()); + assert(!srcHost[0].empty()); + assert(srcHost.size() == dstHost.size()); + assert(srcHost[0].size() == dstHost[0].size()); + + std::vector> tmpGrid; + tmpGrid.resize(srcHost.size(), std::vector(srcHost[0].size())); + + const uint64_t gridWidth = srcHost[0].size(); + + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); + const float stencilAreaFloat = 1.0f / static_cast(stencilAreaInt); + uint32_t tmp; + std::memcpy(&tmp, &stencilAreaFloat, sizeof(float)); + const uint64_t stencilAreaToMultiplyPim = static_cast(tmp); + const uint64_t pimAllocWidth = gridWidth; + + const uint64_t maxRowsPerVertChunk = numAssociable - (2*radius + 1) - 2; + const uint64_t numOverlap = radius; + uint64_t numVertChunks; + if (srcHost.size() <= maxRowsPerVertChunk) { + numVertChunks = 1; + } else if (srcHost.size() <= 2*(maxRowsPerVertChunk - numOverlap)) { + numVertChunks = 2; + } else { + const uint64_t firstAndLastChunkRows = 2 * (maxRowsPerVertChunk - numOverlap); + const uint64_t remainingRows = srcHost.size() - firstAndLastChunkRows; + const uint64_t middleChunkSize = maxRowsPerVertChunk - 2*numOverlap; + const uint64_t numMiddleChunks = (remainingRows + middleChunkSize - 1) / middleChunkSize; + numVertChunks = 2 + numMiddleChunks; + } + + std::vector vertChunks; + vertChunks.reserve(numVertChunks); + + for(uint64_t chunkIdx=0; chunkIdx hostTmpRow(gridWidth, 0.0f); + + for(uint64_t iter=0; iter& above = vertChunks[chunkIdx].workingPimMemory; + std::vector& below = vertChunks[chunkIdx+1].workingPimMemory; + + for(uint64_t row=0; row>& src, std::vector>& dst, const uint64_t iterations, const uint64_t radius) { + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); + const float stencilAreaInverseFloat = 1.0f / static_cast(stencilAreaInt); + + for(uint64_t iter=1; iter<=iterations; ++iter) { + // Only compute when stencil is fully in range + const uint64_t startY = radius*iter; + const uint64_t endY = src.size() - startY; + const uint64_t startX = radius*iter; + const uint64_t endX = src[0].size() - startX; + #pragma omp parallel for collapse(2) + for(uint64_t gridY=startY; gridY> x, y; + + if (params.inputFile == nullptr) + { + // Fill in random grid + x.resize(params.gridHeight, std::vector(params.gridWidth)); + + #pragma omp parallel + { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(0.0f, 10000.0f); + + #pragma omp for + for(size_t i=0; i(x[0].size())); + + PimDeviceProperties deviceProp; + PimStatus status = pimGetDeviceProperties(&deviceProp); + assert(status == PIM_OK); + + constexpr uint64_t bitsPerElement = 32; + + uint64_t numAssociable = 2 * deviceProp.numRowPerSubarray; + if(!deviceProp.isHLayoutDevice) { + numAssociable /= bitsPerElement; + } + + uint64_t numElementsHorizontal; + if(deviceProp.isHLayoutDevice) { + switch(deviceProp.simTarget) { + case PIM_DEVICE_FULCRUM: + case PIM_DEVICE_BANK_LEVEL: + numElementsHorizontal = deviceProp.numColPerSubarray / bitsPerElement; + break; + default: + std::cerr << "Stencil unimplemented for simulation target: " << deviceProp.simTarget << std::endl; + std::exit(1); + } + } else { + numElementsHorizontal = deviceProp.numColPerSubarray; + } + + stencil(x, y, numAssociable, numElementsHorizontal, params.iterations, params.radius); + + if (params.shouldVerify) + { + std::vector> cpuY(y.size(), std::vector(y[0].size())); + stencilCpu(x, cpuY, params.iterations, params.radius); + + bool ok = true; + + // Only compute when stencil is fully in range + const uint64_t startY = params.radius * params.iterations; + const uint64_t endY = params.gridHeight - startY; + const uint64_t startX = params.radius * params.iterations; + const uint64_t endX = params.gridWidth - startX; + + std::cout << std::fixed << std::setprecision(10); + + #pragma omp parallel for collapse(2) + for(uint64_t gridY=startY; gridY acceptableDelta) + { + #pragma omp critical + { + std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << cpuY[gridY][gridX] << ") at position (" << gridX << ", " << gridY << ")" << std::endl; + ok = false; + } + } + } + } + if(ok) { + std::cout << "Correct for stencil!" << std::endl; + } + } + + pimShowStats(); + + return 0; +} \ No newline at end of file From 0cd037e8908084d27d1a5e4efc5ea37438ec097f Mon Sep 17 00:00:00 2001 From: Ethan Ermovick Date: Mon, 1 Dec 2025 16:57:45 -0500 Subject: [PATCH 28/31] start horizontal chunking, todo: data copy, chunk layout --- misc-bench/stencil/PIM/stencil_alt.cpp | 28 ++++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil_alt.cpp b/misc-bench/stencil/PIM/stencil_alt.cpp index 2324cdeb..bf178f55 100644 --- a/misc-bench/stencil/PIM/stencil_alt.cpp +++ b/misc-bench/stencil/PIM/stencil_alt.cpp @@ -146,17 +146,19 @@ void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const } } -struct VertChunkPim { +struct StencilTilePim { std::vector rowsInSumCircularQueue; std::vector workingPimMemory; - uint64_t firstRowIdxSrc; + uint64_t srcStartX; + uint64_t srcStartY; + uint64_t numX; PimObjId tmpPim; PimObjId runningSum; - VertChunkPim(uint64_t pimAllocWidth, uint64_t radius, uint64_t firstRowIdxSrc, uint64_t numRows) - : firstRowIdxSrc(firstRowIdxSrc) { + StencilTilePim(uint64_t radius, uint64_t srcStartY, uint64_t numY, uint64_t srcStartX, uint64_t numX) + : srcStartX(srcStartX), srcStartY(srcStartY), numX(numX) { - tmpPim = pimAlloc(PIM_ALLOC_AUTO, pimAllocWidth, PIM_FP32); + tmpPim = pimAlloc(PIM_ALLOC_AUTO, numX, PIM_FP32); assert(tmpPim != -1); runningSum = pimAllocAssociated(tmpPim, PIM_FP32); assert(runningSum != -1); @@ -167,7 +169,7 @@ struct VertChunkPim { assert(rowsInSumCircularQueue[i] != -1); } - workingPimMemory.resize(numRows); + workingPimMemory.resize(numY); for(uint64_t i=0; i> &srcHost) { for(uint64_t idx = 0; idx < workingPimMemory.size(); ++idx) { - PimStatus status = pimCopyHostToDevice((void*) srcHost[firstRowIdxSrc + idx].data(), workingPimMemory[idx]); + PimStatus status = pimCopyHostToDevice((void*) (srcHost[srcStartY + idx].data() + srcStartX), workingPimMemory[idx], 0, numX); assert (status == PIM_OK); } } void copyFromPim(std::vector> &dstHost, const uint64_t numOverlap) { for(uint64_t idx = numOverlap; idx < workingPimMemory.size() - numOverlap; ++idx) { - PimStatus status = pimCopyDeviceToHost(workingPimMemory[idx], (void*) dstHost[firstRowIdxSrc + idx].data()); + PimStatus status = pimCopyDeviceToHost(workingPimMemory[idx], (void*) dstHost[srcStartY + idx].data()); assert (status == PIM_OK); } } @@ -314,14 +316,14 @@ void stencil(const std::vector> &srcHost, std::vector vertChunks; + std::vector vertChunks; vertChunks.reserve(numVertChunks); for(uint64_t chunkIdx=0; chunkIdx Date: Mon, 1 Dec 2025 19:09:11 -0500 Subject: [PATCH 29/31] stencil progress, todo: wrong output --- libpimeval/src/pimPerfEnergyBankLevel.cpp | 2 +- libpimeval/src/pimPerfEnergyFulcrum.cpp | 2 +- misc-bench/stencil/PIM/stencil_alt.cpp | 189 +++++++++++++++++----- 3 files changed, 147 insertions(+), 46 deletions(-) diff --git a/libpimeval/src/pimPerfEnergyBankLevel.cpp b/libpimeval/src/pimPerfEnergyBankLevel.cpp index 733fb4da..88824e8c 100644 --- a/libpimeval/src/pimPerfEnergyBankLevel.cpp +++ b/libpimeval/src/pimPerfEnergyBankLevel.cpp @@ -381,7 +381,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); msRuntime += 2 * perfEnergyBT.m_msRuntime; mjEnergy += 2 * perfEnergyBT.m_mjEnergy; - printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); + // printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); } return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp); diff --git a/libpimeval/src/pimPerfEnergyFulcrum.cpp b/libpimeval/src/pimPerfEnergyFulcrum.cpp index b221b1e6..b74a46cd 100644 --- a/libpimeval/src/pimPerfEnergyFulcrum.cpp +++ b/libpimeval/src/pimPerfEnergyFulcrum.cpp @@ -334,7 +334,7 @@ pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInf pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); msRuntime += 2 * perfEnergyBT.m_msRuntime; mjEnergy += 2 * perfEnergyBT.m_mjEnergy; - printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); + // printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); } return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp); diff --git a/misc-bench/stencil/PIM/stencil_alt.cpp b/misc-bench/stencil/PIM/stencil_alt.cpp index bf178f55..cba65439 100644 --- a/misc-bench/stencil/PIM/stencil_alt.cpp +++ b/misc-bench/stencil/PIM/stencil_alt.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -152,11 +153,12 @@ struct StencilTilePim { uint64_t srcStartX; uint64_t srcStartY; uint64_t numX; + uint64_t numY; PimObjId tmpPim; PimObjId runningSum; StencilTilePim(uint64_t radius, uint64_t srcStartY, uint64_t numY, uint64_t srcStartX, uint64_t numX) - : srcStartX(srcStartX), srcStartY(srcStartY), numX(numX) { + : srcStartX(srcStartX), srcStartY(srcStartY), numX(numX), numY(numY) { tmpPim = pimAlloc(PIM_ALLOC_AUTO, numX, PIM_FP32); assert(tmpPim != -1); @@ -185,7 +187,7 @@ struct StencilTilePim { void copyFromPim(std::vector> &dstHost, const uint64_t numOverlap) { for(uint64_t idx = numOverlap; idx < workingPimMemory.size() - numOverlap; ++idx) { - PimStatus status = pimCopyDeviceToHost(workingPimMemory[idx], (void*) dstHost[srcStartY + idx].data()); + PimStatus status = pimCopyDeviceToHost(workingPimMemory[idx], (void*) (dstHost[srcStartY + idx].data() + srcStartX + numOverlap), numOverlap, numX - numOverlap); assert (status == PIM_OK); } } @@ -267,13 +269,27 @@ struct StencilTilePim { } }; -void pimMove(std::vector& hostTmpRow, PimObjId pimSrc, PimObjId pimDst) { - PimStatus status = pimCopyDeviceToHost(pimSrc, hostTmpRow.data()); +void pimMove(std::vector& hostTmpRow, PimObjId pimSrc, PimObjId pimDst, uint64_t srcIdx, uint64_t dstIdx, uint64_t num) { + PimStatus status = pimCopyDeviceToHost(pimSrc, hostTmpRow.data(), srcIdx, srcIdx + num); assert(status == PIM_OK); - status = pimCopyHostToDevice(hostTmpRow.data(), pimDst); + status = pimCopyHostToDevice(hostTmpRow.data(), pimDst, dstIdx, dstIdx + num); assert(status == PIM_OK); } +uint64_t getNumTiles(const uint64_t totalSize, const uint64_t maxChunkSize, const uint64_t numOverlap) { + if (totalSize <= maxChunkSize) { + return 1; + } else if (totalSize <= 2*(maxChunkSize - numOverlap)) { + return 2; + } else { + const uint64_t firstAndLastChunkRows = 2 * (maxChunkSize - numOverlap); + const uint64_t remainingRows = totalSize - firstAndLastChunkRows; + const uint64_t middleChunkSize = maxChunkSize - 2*numOverlap; + const uint64_t numMiddleChunks = (remainingRows + middleChunkSize - 1) / middleChunkSize; + return 2 + numMiddleChunks; + } +} + //! @brief Computes a stencil pattern over a 2d array //! @param[in] srcHost The input stencil grid //! @param[out] dstHost The resultant stencil grid @@ -301,61 +317,146 @@ void stencil(const std::vector> &srcHost, std::vector(tmp); const uint64_t pimAllocWidth = gridWidth; - const uint64_t maxRowsPerVertChunk = numAssociable - (2*radius + 1) - 2; + const uint64_t maxElemChunkY = numAssociable - (2*radius + 1) - 2; + const uint64_t maxElemChunkX = numElementsHorizontal; const uint64_t numOverlap = radius; - uint64_t numVertChunks; - if (srcHost.size() <= maxRowsPerVertChunk) { - numVertChunks = 1; - } else if (srcHost.size() <= 2*(maxRowsPerVertChunk - numOverlap)) { - numVertChunks = 2; - } else { - const uint64_t firstAndLastChunkRows = 2 * (maxRowsPerVertChunk - numOverlap); - const uint64_t remainingRows = srcHost.size() - firstAndLastChunkRows; - const uint64_t middleChunkSize = maxRowsPerVertChunk - 2*numOverlap; - const uint64_t numMiddleChunks = (remainingRows + middleChunkSize - 1) / middleChunkSize; - numVertChunks = 2 + numMiddleChunks; - } + const uint64_t numTileX = getNumTiles(srcHost[0].size(), maxElemChunkX, numOverlap); + const uint64_t numTileY = getNumTiles(srcHost.size(), maxElemChunkY, numOverlap); - std::vector vertChunks; - vertChunks.reserve(numVertChunks); + std::vector> stenTilesPim(numTileY); + for (auto& row : stenTilesPim) { + row.reserve(numTileX); + } - for(uint64_t chunkIdx=0; chunkIdx hostTmpRow(gridWidth, 0.0f); for(uint64_t iter=0; iter& above = vertChunks[chunkIdx].workingPimMemory; - std::vector& below = vertChunks[chunkIdx+1].workingPimMemory; + for(uint64_t tileIdxY=0; tileIdxY& above = tile.workingPimMemory; + std::vector& below = tileBelow.workingPimMemory; + + // only exchange rows with valid data + uint64_t startIdxX = tileIdxX == 0 ? 0 : numOverlap; + uint64_t endIdxX = tileIdxX == numTileX - 1 ? tile.numX : tile.numX - numOverlap; // exclusive + for(uint64_t row=0; row& left = tile.workingPimMemory; + std::vector& right = tileRight.workingPimMemory; + + // only exchange rows with valid data + uint64_t startIdxY = tileIdxY == 0 ? 0 : numOverlap; + uint64_t endIdxY = tileIdxY == numTileY - 1 ? tile.numY : tile.numY - numOverlap; // exclusive + for(uint64_t row=startIdxY; row& topLeft = tile.workingPimMemory; + std::vector& topRight = stenTilesPim[tileIdxY][tileIdxX+1].workingPimMemory; + std::vector& bottomLeft = stenTilesPim[tileIdxY+1][tileIdxX].workingPimMemory; + std::vector& bottomRight = tileRightBelow.workingPimMemory; + for(uint64_t row=0; row& above = vertChunks[chunkIdx].workingPimMemory; + // std::vector& below = vertChunks[chunkIdx+1].workingPimMemory; + + // for(uint64_t row=0; row Date: Tue, 2 Dec 2025 02:06:27 -0500 Subject: [PATCH 30/31] fix stencil bug --- misc-bench/stencil/PIM/stencil_alt.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil_alt.cpp b/misc-bench/stencil/PIM/stencil_alt.cpp index cba65439..d7655f66 100644 --- a/misc-bench/stencil/PIM/stencil_alt.cpp +++ b/misc-bench/stencil/PIM/stencil_alt.cpp @@ -315,7 +315,6 @@ void stencil(const std::vector> &srcHost, std::vector(tmp); - const uint64_t pimAllocWidth = gridWidth; const uint64_t maxElemChunkY = numAssociable - (2*radius + 1) - 2; const uint64_t maxElemChunkX = numElementsHorizontal; @@ -401,11 +400,10 @@ void stencil(const std::vector> &srcHost, std::vector& topLeft = tile.workingPimMemory; std::vector& topRight = stenTilesPim[tileIdxY][tileIdxX+1].workingPimMemory; std::vector& bottomLeft = stenTilesPim[tileIdxY+1][tileIdxX].workingPimMemory; - std::vector& bottomRight = tileRightBelow.workingPimMemory; + std::vector& bottomRight = stenTilesPim[tileIdxY+1][tileIdxX+1].workingPimMemory; for(uint64_t row=0; row> &srcHost, std::vector Date: Tue, 2 Dec 2025 02:11:48 -0500 Subject: [PATCH 31/31] stencil cleanup --- misc-bench/stencil/PIM/stencil_alt.cpp | 38 ++------------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/misc-bench/stencil/PIM/stencil_alt.cpp b/misc-bench/stencil/PIM/stencil_alt.cpp index d7655f66..9543e637 100644 --- a/misc-bench/stencil/PIM/stencil_alt.cpp +++ b/misc-bench/stencil/PIM/stencil_alt.cpp @@ -431,23 +431,6 @@ void stencil(const std::vector> &srcHost, std::vector& above = vertChunks[chunkIdx].workingPimMemory; - // std::vector& below = vertChunks[chunkIdx+1].workingPimMemory; - - // for(uint64_t row=0; row