Skip to content

Commit 9c8a046

Browse files
committed
tRAS + more fine-grain bank-level modeling
1 parent f116ea2 commit 9c8a046

9 files changed

Lines changed: 439 additions & 426 deletions

libpimeval/src/pimParamsDDRDram.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class pimParamsDDRDram : public pimParamsDram
4545
double gettCCD_L() const override { return m_tCCD_L; }
4646
double gettCCD_S() const override { return m_tCCD_S; }
4747
double gettCK() const override { return m_tCK; }
48+
double gettRAS() const override { return m_tRAS; }
4849

4950
private:
5051
// [dram_structure]

libpimeval/src/pimParamsDram.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class pimParamsDram
5050
virtual double gettRP() const = 0;
5151
virtual double gettCCD_L() const = 0;
5252
virtual double gettCCD_S() const = 0;
53+
virtual double gettRAS() const = 0;
5354
virtual double gettCK() const = 0;
5455
};
5556

libpimeval/src/pimParamsGDDRDram.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class pimParamsGDDRDram : public pimParamsDram
4545
double gettCCD_L() const override { return m_tCCD_L; }
4646
double gettCCD_S() const override { return m_tCCD_S; }
4747
double gettCK() const override { return m_tCK; }
48+
double gettRAS() const override { return m_tRAS; }
4849

4950
private:
5051
// [dram_structure]

libpimeval/src/pimParamsHBMDram.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class pimParamsHBMDram : public pimParamsDram
4545
double gettCCD_L() const override { return m_tCCD_L; }
4646
double gettCCD_S() const override { return m_tCCD_S; }
4747
double gettCK() const override { return m_tCK; }
48+
double gettRAS() const override { return m_tRAS; }
4849

4950
private:
5051
// [dram_structure]

libpimeval/src/pimParamsLPDDRDram.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class pimParamsLPDDRDram : public pimParamsDram
4545
double gettCCD_L() const override { return m_tCCD_L; }
4646
double gettCCD_S() const override { return m_tCCD_S; }
4747
double gettCK() const override { return m_tCK; }
48+
double gettRAS() const override { return m_tRAS; }
4849

4950
private:
5051
// [dram_structure]

libpimeval/src/pimPerfEnergyBankLevel.cpp

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,14 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
3535
unsigned maxGDLItr = std::ceil(maxElementsPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3636
unsigned minGDLItr = std::ceil(minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3737
unsigned numBankPerChip = numCores / m_numChipsPerRank;
38+
double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
3839
// for scalar operations an extra read is required to read the scalar value
3940
switch (cmdType)
4041
{
4142
case PimCmdEnum::COPY_O2O:
4243
{
43-
msRead = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
44-
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
44+
msRead = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
45+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
4546
msCompute = 0;
4647
msRuntime = msRead + msWrite + msCompute;
4748
mjEnergy = numPass * numCores * (m_eACT + m_ePRE) * 2;
@@ -64,8 +65,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
6465
// numberOfOperationPerElement *= 5; // 2 shifts, 1 not, 1 and, 1 or
6566
}
6667
// Refer to fulcrum documentation
67-
msRead = (m_tACT + m_tPRE) * numPass;
68-
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
68+
msRead = (m_tACT + m_tPRE) * (numPass - 1) + (activateMS + m_tPRE);
69+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
6970
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
7071
msRuntime = msRead + msWrite + msCompute;
7172
mjEnergy = ((m_eACT + m_ePRE) * 2 + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1);
@@ -81,8 +82,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
8182
case PimCmdEnum::MUL_SCALAR:
8283
case PimCmdEnum::DIV_SCALAR:
8384
{
84-
msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
85-
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
85+
msRead = (m_tACT + m_tPRE) * (numPass - 1) + (activateMS + m_tPRE) + m_tR + m_tGDL;
86+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
8687
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
8788
msRuntime = msRead + msWrite + msCompute;
8889
mjEnergy = ((m_eACT + m_ePRE) * 2 + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1);
@@ -104,8 +105,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
104105
case PimCmdEnum::MIN_SCALAR:
105106
case PimCmdEnum::MAX_SCALAR:
106107
{
107-
msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
108-
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
108+
msRead = (m_tACT + m_tPRE) * (numPass - 1) + m_tR + m_tGDL + activateMS + m_tPRE;
109+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
109110
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
110111
msRuntime = msRead + msWrite + msCompute;
111112
mjEnergy = (((m_eACT + m_ePRE) * 2) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1);
@@ -119,8 +120,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
119120
case PimCmdEnum::SHIFT_BITS_L:
120121
case PimCmdEnum::SHIFT_BITS_R:
121122
{
122-
msRead = (m_tACT + m_tPRE) * numPass;
123-
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
123+
msRead = (m_tACT + m_tPRE) * (numPass - 1) + (activateMS + m_tPRE);
124+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
124125
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
125126
msRuntime = msRead + msWrite + msCompute;
126127
mjEnergy = (((m_eACT + m_ePRE) * 2) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1);
@@ -145,8 +146,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
145146
// corresponds to one logical LUT access, and we assume that this access is not vectorized across multiple inputs
146147
// within a single PE execution. In other words, we model the cost at the granularity of one element per operation.
147148
numberOfOperationPerElement = 1;
148-
msRead = (m_tACT + m_tPRE) * numPass;
149-
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tW + (minGDLItr * m_tGDL));
149+
msRead = (m_tACT + m_tPRE) * (numPass - 1) + (activateMS + m_tPRE);
150+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
150151
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
151152
msRuntime = msRead + msWrite + msCompute;
152153
mjEnergy = ((m_eAP * 2) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1);
@@ -186,6 +187,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
186187
unsigned minGDLItr = std::ceil(minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
187188
uint64_t totalOp = 0;
188189
unsigned numBankPerChip = numCoresUsed / m_numChipsPerRank;
190+
double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
189191

190192
switch (cmdType)
191193
{
@@ -194,8 +196,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
194196
case PimCmdEnum::MUL:
195197
case PimCmdEnum::DIV:
196198
{
197-
msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((2 * (m_tACT + m_tPRE)) + (minGDLItr * m_tGDL));
198-
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
199+
msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((2 * (activateMS + m_tPRE)) + (minGDLItr * m_tGDL));
200+
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
199201
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
200202
msRuntime = msRead + msWrite + msCompute;
201203
mjEnergy = (((m_eACT + m_ePRE) * 3) + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCoresUsed * (numPass - 1);
@@ -222,8 +224,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
222224
*
223225
* As a result, only one read operation is necessary for the entire pass.
224226
*/
225-
msRead = ((m_tACT + m_tPRE) * 2) * numPass + (m_tR + m_tGDL);
226-
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
227+
msRead = ((m_tACT + m_tPRE) * 2) * (numPass - 1) + (m_tR + m_tGDL) + (activateMS + m_tPRE);
228+
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
227229
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * 2 * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement * 2);
228230
msRuntime = msRead + msWrite + msCompute;
229231
mjEnergy = (((m_eACT + m_ePRE) * 3) + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement * 2)) * numCoresUsed * (numPass - 1);
@@ -248,8 +250,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
248250
case PimCmdEnum::COND_SELECT:
249251
case PimCmdEnum::COND_SELECT_SCALAR:
250252
{
251-
msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((2 * (m_tACT + m_tPRE)) + (minGDLItr * m_tGDL));
252-
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
253+
msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((2 * (activateMS + m_tPRE)) + (minGDLItr * m_tGDL));
254+
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
253255
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
254256
msRuntime = msRead + msWrite + msCompute;
255257
mjEnergy = (((m_eACT + m_ePRE) * 3) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCoresUsed * (numPass - 1);
@@ -286,6 +288,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForReduction(PimCmdEnum cmdType, const pimO
286288
unsigned minGDLItr = std::ceil(minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
287289
uint64_t totalOp = 0;
288290
unsigned numBankPerChip = numCore / m_numChipsPerRank;
291+
double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
289292

290293
switch (cmdType) {
291294
case PimCmdEnum::REDSUM:
@@ -297,7 +300,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForReduction(PimCmdEnum cmdType, const pimO
297300
{
298301
// How many iterations are required to read / write the max elements per region
299302
double numberOfOperationPerElement = ((double)bitsPerElement / m_blimpCoreBitWidth);
300-
msRead = (m_tACT + m_tPRE) * numPass;
303+
msRead = (m_tACT + m_tPRE) * (numPass - 1) + (activateMS + m_tPRE);
301304
// reduction for all regions assuming 16 core AMD EPYC 9124
302305
double aggregateMs = static_cast<double>(obj.getNumCoresUsed()) / 2300000;
303306
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement) + aggregateMs;
@@ -338,8 +341,10 @@ pimPerfEnergyBankLevel::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO
338341
unsigned maxGDLItr = std::ceil(maxElementsPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
339342
unsigned minGDLItr = std::ceil(minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
340343
unsigned numBankPerChip = numCore / m_numChipsPerRank;
344+
double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
341345
uint64_t totalOp = 0;
342-
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
346+
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
347+
343348
msRuntime = msRead + msWrite + msCompute;
344349
mjEnergy = (m_eACT + m_ePRE) * numPass * numCore;
345350
mjEnergy += (m_eW * maxGDLItr * (numPass-1) + m_eW * minGDLItr) * numBankPerChip;
@@ -399,6 +404,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimO
399404
unsigned minGDLItr = std::ceil(minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
400405
uint64_t totalOp = 0;
401406
unsigned numBankPerChip = numCore / m_numChipsPerRank;
407+
double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
402408
switch (cmdType) {
403409
case PimCmdEnum::PREFIX_SUM:
404410
{
@@ -428,8 +434,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimO
428434

429435
// How many iterations are required to read / write the max elements per region
430436
double numberOfOperationPerElement = ((double)bitsPerElement / m_blimpCoreBitWidth);
431-
msRead = 2 * numPass * (m_tACT + m_tPRE);
432-
msWrite = 2 * numPass * (m_tACT + m_tPRE);
437+
msRead = (2 * numPass - 1) * (m_tACT + m_tPRE) + 2 * (activateMS + m_tPRE);
438+
msWrite = (2 * numPass - 1) * (m_tACT + m_tPRE) + 2 *(activateMS + m_tPRE);
433439

434440
// reduction for all regions assuming 16 core AMD EPYC 9124
435441
double aggregateMs = static_cast<double>(obj.getNumCoresUsed()) / 2300000;

libpimeval/src/pimPerfEnergyBase.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ pimPerfEnergyBase::pimPerfEnergyBase(const pimPerfEnergyModelParams& params)
7474
m_tRCD = m_paramsDram.gettRCD();
7575
m_tRP = m_paramsDram.gettRP();
7676
m_tCAS = m_paramsDram.getNsTCAS() / m_nano_to_milli; // Convert ns to ms
77+
m_tRAS = m_paramsDram.gettRAS();
7778
}
7879

7980
//! @brief Perf energy model of data transfer between CPU memory and PIM memory

libpimeval/src/pimPerfEnergyBase.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,11 @@ class pimPerfEnergyBase
101101
double m_pBCore; // background power for each core in W
102102
double m_pBChip; // background power for each core in W
103103
double m_tCK; // Clock cycle time in ms
104-
unsigned m_tCCD_S; // Short command delay in ms
105-
unsigned m_tCCD_L; // Long command delay in ms
106-
unsigned m_tRCD; // RCD time in ms
107-
unsigned m_tRP; // RP time in ms
104+
unsigned m_tCCD_S; // Short command delay in cycles
105+
unsigned m_tCCD_L; // Long command delay in cycles
106+
unsigned m_tRCD; // RCD in cycles
107+
unsigned m_tRP; // RP in cycles
108+
unsigned m_tRAS; // RAS in cycles
108109
};
109110

110111
#endif

0 commit comments

Comments
 (0)