Skip to content

Commit f116ea2

Browse files
authored
Bank-level scalar operation overhead (#312)
2 parents 19b1bcf + 817d4b1 commit f116ea2

2 files changed

Lines changed: 197 additions & 171 deletions

File tree

libpimeval/src/pimPerfEnergyBankLevel.cpp

Lines changed: 33 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
3535
unsigned maxGDLItr = std::ceil(maxElementsPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3636
unsigned minGDLItr = std::ceil(minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3737
unsigned numBankPerChip = numCores / m_numChipsPerRank;
38-
38+
// for scalar operations an extra read is required to read the scalar value
3939
switch (cmdType)
4040
{
4141
case PimCmdEnum::COPY_O2O:
@@ -52,10 +52,6 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
5252
}
5353
case PimCmdEnum::POPCOUNT:
5454
case PimCmdEnum::ABS:
55-
case PimCmdEnum::ADD_SCALAR:
56-
case PimCmdEnum::SUB_SCALAR:
57-
case PimCmdEnum::MUL_SCALAR:
58-
case PimCmdEnum::DIV_SCALAR:
5955
case PimCmdEnum::BIT_SLICE_EXTRACT:
6056
case PimCmdEnum::BIT_SLICE_INSERT:
6157
case PimCmdEnum::CONVERT_TYPE:
@@ -80,6 +76,23 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
8076
totalOp = obj.getNumElements();
8177
break;
8278
}
79+
case PimCmdEnum::ADD_SCALAR:
80+
case PimCmdEnum::SUB_SCALAR:
81+
case PimCmdEnum::MUL_SCALAR:
82+
case PimCmdEnum::DIV_SCALAR:
83+
{
84+
msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
85+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
86+
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
87+
msRuntime = msRead + msWrite + msCompute;
88+
mjEnergy = ((m_eACT + m_ePRE) * 2 + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1);
89+
mjEnergy += ((m_eACT + m_ePRE) * 2 + (minElementPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores;
90+
mjEnergy += (m_eR * maxGDLItr * (numPass-1) * numBankPerChip * m_numRanks + (m_eR * minGDLItr * numBankPerChip * m_numRanks)) + (m_eAP * numCores + m_eR * numBankPerChip * m_numRanks);
91+
mjEnergy += (m_eW * maxGDLItr * (numPass-1) * numBankPerChip * m_numRanks + (m_eW * minGDLItr * numBankPerChip * m_numRanks));
92+
mjEnergy += m_pBChip * m_numChipsPerRank * m_numRanks * msRuntime;
93+
totalOp = obj.getNumElements();
94+
break;
95+
}
8396
case PimCmdEnum::AND_SCALAR:
8497
case PimCmdEnum::OR_SCALAR:
8598
case PimCmdEnum::XOR_SCALAR:
@@ -90,6 +103,19 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
90103
case PimCmdEnum::NE_SCALAR:
91104
case PimCmdEnum::MIN_SCALAR:
92105
case PimCmdEnum::MAX_SCALAR:
106+
{
107+
msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
108+
msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1)) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
109+
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
110+
msRuntime = msRead + msWrite + msCompute;
111+
mjEnergy = (((m_eACT + m_ePRE) * 2) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1);
112+
mjEnergy += (((m_eACT + m_ePRE) * 2) + (minElementPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores;
113+
mjEnergy += (m_eR * maxGDLItr * (numPass-1) * numBankPerChip * m_numRanks + (m_eR * minGDLItr * numBankPerChip * m_numRanks)) + (m_eAP * numCores + m_eR * numBankPerChip * m_numRanks);
114+
mjEnergy += (m_eW * maxGDLItr * (numPass-1) * numBankPerChip * m_numRanks + (m_eW * minGDLItr * numBankPerChip * m_numRanks));
115+
mjEnergy += m_pBChip * m_numChipsPerRank * m_numRanks * msRuntime;
116+
totalOp = obj.getNumElements();
117+
break;
118+
}
93119
case PimCmdEnum::SHIFT_BITS_L:
94120
case PimCmdEnum::SHIFT_BITS_R:
95121
{
@@ -196,13 +222,13 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
196222
*
197223
* As a result, only one read operation is necessary for the entire pass.
198224
*/
199-
msRead = ((m_tACT + m_tPRE) * 2) * numPass;
225+
msRead = ((m_tACT + m_tPRE) * 2) * numPass + (m_tR + m_tGDL);
200226
msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
201227
msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * 2 * (numPass - 1)) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement * 2);
202228
msRuntime = msRead + msWrite + msCompute;
203229
mjEnergy = (((m_eACT + m_ePRE) * 3) + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement * 2)) * numCoresUsed * (numPass - 1);
204230
mjEnergy += (((m_eACT + m_ePRE) * 3) + (minElementPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement * 2)) * numCoresUsed;
205-
mjEnergy += ((m_eR * 2 * maxGDLItr * (numPass-1)) + (m_eR * 2 * minGDLItr)) * numBankPerChip * m_numRanks;
231+
mjEnergy += ((m_eR * 2 * maxGDLItr * (numPass-1)) + (m_eR * 2 * minGDLItr)) * numBankPerChip * m_numRanks + (m_eAP * numCoresUsed + m_eR * numBankPerChip * m_numRanks);
206232
mjEnergy += ((m_eW * maxGDLItr * (numPass-1)) + (m_eW * minGDLItr)) * numBankPerChip * m_numRanks;
207233
mjEnergy += m_pBChip * m_numChipsPerRank * m_numRanks * msRuntime;
208234
totalOp = obj.getNumElements() * 2;

0 commit comments

Comments
 (0)