@@ -35,7 +35,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
3535 unsigned maxGDLItr = std::ceil (maxElementsPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3636 unsigned minGDLItr = std::ceil (minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3737 unsigned numBankPerChip = numCores / m_numChipsPerRank;
38-
38+ // for scalar operations an extra read is required to read the scalar value
3939 switch (cmdType)
4040 {
4141 case PimCmdEnum::COPY_O2O:
@@ -52,10 +52,6 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
5252 }
5353 case PimCmdEnum::POPCOUNT:
5454 case PimCmdEnum::ABS:
55- case PimCmdEnum::ADD_SCALAR:
56- case PimCmdEnum::SUB_SCALAR:
57- case PimCmdEnum::MUL_SCALAR:
58- case PimCmdEnum::DIV_SCALAR:
5955 case PimCmdEnum::BIT_SLICE_EXTRACT:
6056 case PimCmdEnum::BIT_SLICE_INSERT:
6157 case PimCmdEnum::CONVERT_TYPE:
@@ -80,6 +76,23 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
8076 totalOp = obj.getNumElements ();
8177 break ;
8278 }
79+ case PimCmdEnum::ADD_SCALAR:
80+ case PimCmdEnum::SUB_SCALAR:
81+ case PimCmdEnum::MUL_SCALAR:
82+ case PimCmdEnum::DIV_SCALAR:
83+ {
84+ msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
85+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
86+ msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
87+ msRuntime = msRead + msWrite + msCompute;
88+ mjEnergy = ((m_eACT + m_ePRE) * 2 + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1 );
89+ mjEnergy += ((m_eACT + m_ePRE) * 2 + (minElementPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores;
90+ mjEnergy += (m_eR * maxGDLItr * (numPass-1 ) * numBankPerChip * m_numRanks + (m_eR * minGDLItr * numBankPerChip * m_numRanks)) + (m_eAP * numCores + m_eR * numBankPerChip * m_numRanks);
91+ mjEnergy += (m_eW * maxGDLItr * (numPass-1 ) * numBankPerChip * m_numRanks + (m_eW * minGDLItr * numBankPerChip * m_numRanks));
92+ mjEnergy += m_pBChip * m_numChipsPerRank * m_numRanks * msRuntime;
93+ totalOp = obj.getNumElements ();
94+ break ;
95+ }
8396 case PimCmdEnum::AND_SCALAR:
8497 case PimCmdEnum::OR_SCALAR:
8598 case PimCmdEnum::XOR_SCALAR:
@@ -90,6 +103,19 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
90103 case PimCmdEnum::NE_SCALAR:
91104 case PimCmdEnum::MIN_SCALAR:
92105 case PimCmdEnum::MAX_SCALAR:
106+ {
107+ msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
108+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
109+ msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
110+ msRuntime = msRead + msWrite + msCompute;
111+ mjEnergy = (((m_eACT + m_ePRE) * 2 ) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1 );
112+ mjEnergy += (((m_eACT + m_ePRE) * 2 ) + (minElementPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores;
113+ mjEnergy += (m_eR * maxGDLItr * (numPass-1 ) * numBankPerChip * m_numRanks + (m_eR * minGDLItr * numBankPerChip * m_numRanks)) + (m_eAP * numCores + m_eR * numBankPerChip * m_numRanks);
114+ mjEnergy += (m_eW * maxGDLItr * (numPass-1 ) * numBankPerChip * m_numRanks + (m_eW * minGDLItr * numBankPerChip * m_numRanks));
115+ mjEnergy += m_pBChip * m_numChipsPerRank * m_numRanks * msRuntime;
116+ totalOp = obj.getNumElements ();
117+ break ;
118+ }
93119 case PimCmdEnum::SHIFT_BITS_L:
94120 case PimCmdEnum::SHIFT_BITS_R:
95121 {
@@ -196,13 +222,13 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
196222 *
197223 * As a result, only one read operation is necessary for the entire pass.
198224 */
199- msRead = ((m_tACT + m_tPRE) * 2 ) * numPass;
225+ msRead = ((m_tACT + m_tPRE) * 2 ) * numPass + (m_tR + m_tGDL) ;
200226 msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
201227 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * 2 * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement * 2 );
202228 msRuntime = msRead + msWrite + msCompute;
203229 mjEnergy = (((m_eACT + m_ePRE) * 3 ) + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement * 2 )) * numCoresUsed * (numPass - 1 );
204230 mjEnergy += (((m_eACT + m_ePRE) * 3 ) + (minElementPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement * 2 )) * numCoresUsed;
205- mjEnergy += ((m_eR * 2 * maxGDLItr * (numPass-1 )) + (m_eR * 2 * minGDLItr)) * numBankPerChip * m_numRanks;
231+ mjEnergy += ((m_eR * 2 * maxGDLItr * (numPass-1 )) + (m_eR * 2 * minGDLItr)) * numBankPerChip * m_numRanks + (m_eAP * numCoresUsed + m_eR * numBankPerChip * m_numRanks) ;
206232 mjEnergy += ((m_eW * maxGDLItr * (numPass-1 )) + (m_eW * minGDLItr)) * numBankPerChip * m_numRanks;
207233 mjEnergy += m_pBChip * m_numChipsPerRank * m_numRanks * msRuntime;
208234 totalOp = obj.getNumElements () * 2 ;
0 commit comments