@@ -35,13 +35,14 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
3535 unsigned maxGDLItr = std::ceil (maxElementsPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3636 unsigned minGDLItr = std::ceil (minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
3737 unsigned numBankPerChip = numCores / m_numChipsPerRank;
38+ double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
3839 // for scalar operations an extra read is required to read the scalar value
3940 switch (cmdType)
4041 {
4142 case PimCmdEnum::COPY_O2O:
4243 {
43- msRead = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
44- msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
44+ msRead = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
45+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
4546 msCompute = 0 ;
4647 msRuntime = msRead + msWrite + msCompute;
4748 mjEnergy = numPass * numCores * (m_eACT + m_ePRE) * 2 ;
@@ -64,8 +65,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
6465 // numberOfOperationPerElement *= 5; // 2 shifts, 1 not, 1 and, 1 or
6566 }
6667 // Refer to fulcrum documentation
67- msRead = (m_tACT + m_tPRE) * numPass;
68- msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
68+ msRead = (m_tACT + m_tPRE) * ( numPass - 1 ) + (activateMS + m_tPRE) ;
69+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
6970 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
7071 msRuntime = msRead + msWrite + msCompute;
7172 mjEnergy = ((m_eACT + m_ePRE) * 2 + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1 );
@@ -81,8 +82,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
8182 case PimCmdEnum::MUL_SCALAR:
8283 case PimCmdEnum::DIV_SCALAR:
8384 {
84- msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
85- msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
85+ msRead = (m_tACT + m_tPRE) * ( numPass - 1 ) + (activateMS + m_tPRE) + m_tR + m_tGDL;
86+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
8687 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
8788 msRuntime = msRead + msWrite + msCompute;
8889 mjEnergy = ((m_eACT + m_ePRE) * 2 + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1 );
@@ -104,8 +105,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
104105 case PimCmdEnum::MIN_SCALAR:
105106 case PimCmdEnum::MAX_SCALAR:
106107 {
107- msRead = (m_tACT + m_tPRE) * numPass + m_tR + m_tGDL;
108- msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
108+ msRead = (m_tACT + m_tPRE) * ( numPass - 1 ) + m_tR + m_tGDL + activateMS + m_tPRE ;
109+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
109110 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
110111 msRuntime = msRead + msWrite + msCompute;
111112 mjEnergy = (((m_eACT + m_ePRE) * 2 ) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1 );
@@ -119,8 +120,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
119120 case PimCmdEnum::SHIFT_BITS_L:
120121 case PimCmdEnum::SHIFT_BITS_R:
121122 {
122- msRead = (m_tACT + m_tPRE) * numPass;
123- msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tACT + m_tPRE + (minGDLItr * m_tGDL));
123+ msRead = (m_tACT + m_tPRE) * ( numPass - 1 ) + (activateMS + m_tPRE) ;
124+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
124125 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
125126 msRuntime = msRead + msWrite + msCompute;
126127 mjEnergy = (((m_eACT + m_ePRE) * 2 ) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1 );
@@ -145,8 +146,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc1(PimCmdEnum cmdType, const pimObjIn
145146 // corresponds to one logical LUT access, and we assume that this access is not vectorized across multiple inputs
146147 // within a single PE execution. In other words, we model the cost at the granularity of one element per operation.
147148 numberOfOperationPerElement = 1 ;
148- msRead = (m_tACT + m_tPRE) * numPass;
149- msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (m_tW + (minGDLItr * m_tGDL));
149+ msRead = (m_tACT + m_tPRE) * ( numPass - 1 ) + (activateMS + m_tPRE) ;
150+ msWrite = ((m_tACT + m_tPRE + maxGDLItr * m_tGDL) * (numPass - 1 )) + (activateMS + m_tPRE + (minGDLItr * m_tGDL));
150151 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
151152 msRuntime = msRead + msWrite + msCompute;
152153 mjEnergy = ((m_eAP * 2 ) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCores * (numPass - 1 );
@@ -186,6 +187,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
186187 unsigned minGDLItr = std::ceil (minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
187188 uint64_t totalOp = 0 ;
188189 unsigned numBankPerChip = numCoresUsed / m_numChipsPerRank;
190+ double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
189191
190192 switch (cmdType)
191193 {
@@ -194,8 +196,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
194196 case PimCmdEnum::MUL:
195197 case PimCmdEnum::DIV:
196198 {
197- msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((2 * (m_tACT + m_tPRE)) + (minGDLItr * m_tGDL));
198- msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
199+ msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((2 * (activateMS + m_tPRE)) + (minGDLItr * m_tGDL));
200+ msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
199201 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
200202 msRuntime = msRead + msWrite + msCompute;
201203 mjEnergy = (((m_eACT + m_ePRE) * 3 ) + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement)) * numCoresUsed * (numPass - 1 );
@@ -222,8 +224,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
222224 *
223225 * As a result, only one read operation is necessary for the entire pass.
224226 */
225- msRead = ((m_tACT + m_tPRE) * 2 ) * numPass + (m_tR + m_tGDL);
226- msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
227+ msRead = ((m_tACT + m_tPRE) * 2 ) * ( numPass - 1 ) + (m_tR + m_tGDL) + (activateMS + m_tPRE );
228+ msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
227229 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * 2 * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement * 2 );
228230 msRuntime = msRead + msWrite + msCompute;
229231 mjEnergy = (((m_eACT + m_ePRE) * 3 ) + (maxElementsPerRegion * m_blimpArithmeticEnergy * numberOfOperationPerElement * 2 )) * numCoresUsed * (numPass - 1 );
@@ -248,8 +250,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjIn
248250 case PimCmdEnum::COND_SELECT:
249251 case PimCmdEnum::COND_SELECT_SCALAR:
250252 {
251- msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((2 * (m_tACT + m_tPRE)) + (minGDLItr * m_tGDL));
252- msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
253+ msRead = ((2 * (m_tACT + m_tPRE)) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((2 * (activateMS + m_tPRE)) + (minGDLItr * m_tGDL));
254+ msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
253255 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement);
254256 msRuntime = msRead + msWrite + msCompute;
255257 mjEnergy = (((m_eACT + m_ePRE) * 3 ) + (maxElementsPerRegion * m_blimpLogicalEnergy * numberOfOperationPerElement)) * numCoresUsed * (numPass - 1 );
@@ -286,6 +288,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForReduction(PimCmdEnum cmdType, const pimO
286288 unsigned minGDLItr = std::ceil (minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
287289 uint64_t totalOp = 0 ;
288290 unsigned numBankPerChip = numCore / m_numChipsPerRank;
291+ double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
289292
290293 switch (cmdType) {
291294 case PimCmdEnum::REDSUM:
@@ -297,7 +300,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForReduction(PimCmdEnum cmdType, const pimO
297300 {
298301 // How many iteration require to read / write max elements per region
299302 double numberOfOperationPerElement = ((double )bitsPerElement / m_blimpCoreBitWidth);
300- msRead = (m_tACT + m_tPRE) * numPass;
303+ msRead = (m_tACT + m_tPRE) * ( numPass - 1 ) + (activateMS + m_tPRE) ;
301304 // reduction for all regions assuming 16 core AMD EPYC 9124
302305 double aggregateMs = static_cast <double >(obj.getNumCoresUsed ()) / 2300000 ;
303306 msCompute = (maxElementsPerRegion * m_blimpLatency * numberOfOperationPerElement * (numPass - 1 )) + (minElementPerRegion * m_blimpLatency * numberOfOperationPerElement) + aggregateMs;
@@ -338,8 +341,10 @@ pimPerfEnergyBankLevel::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO
338341 unsigned maxGDLItr = std::ceil (maxElementsPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
339342 unsigned minGDLItr = std::ceil (minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
340343 unsigned numBankPerChip = numCore / m_numChipsPerRank;
344+ double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
341345 uint64_t totalOp = 0 ;
342- msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((m_tACT + m_tPRE) + (minGDLItr * m_tGDL));
346+ msWrite = ((m_tACT + m_tPRE) + (maxGDLItr * m_tGDL)) * (numPass - 1 ) + ((activateMS + m_tPRE) + (minGDLItr * m_tGDL));
347+
343348 msRuntime = msRead + msWrite + msCompute;
344349 mjEnergy = (m_eACT + m_ePRE) * numPass * numCore;
345350 mjEnergy += (m_eW * maxGDLItr * (numPass-1 ) + m_eW * minGDLItr) * numBankPerChip;
@@ -399,6 +404,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimO
399404 unsigned minGDLItr = std::ceil (minElementPerRegion * bitsPerElement * 1.0 / m_GDLWidth);
400405 uint64_t totalOp = 0 ;
401406 unsigned numBankPerChip = numCore / m_numChipsPerRank;
407+ double activateMS = minGDLItr * m_tGDL < m_tRAS * m_tCK ? m_tRAS * m_tCK : m_tACT; // Use tRAS if GDL is less than tRAS
402408 switch (cmdType) {
403409 case PimCmdEnum::PREFIX_SUM:
404410 {
@@ -428,8 +434,8 @@ pimPerfEnergyBankLevel::getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimO
428434
429435 // How many iteration require to read / write max elements per region
430436 double numberOfOperationPerElement = ((double )bitsPerElement / m_blimpCoreBitWidth);
431- msRead = 2 * numPass * (m_tACT + m_tPRE);
432- msWrite = 2 * numPass * (m_tACT + m_tPRE);
437+ msRead = ( 2 * numPass - 1 ) * (m_tACT + m_tPRE) + 2 * (activateMS + m_tPRE);
438+ msWrite = ( 2 * numPass - 1 ) * (m_tACT + m_tPRE) + 2 *(activateMS + m_tPRE);
433439
434440 // reduction for all regions assuming 16 core AMD EPYC 9124
435441 double aggregateMs = static_cast <double >(obj.getNumCoresUsed ()) / 2300000 ;
0 commit comments