Skip to content

Commit 0b61499

Browse files
committed
Add microbenchmark to codepass
1 parent a437c2e commit 0b61499

8 files changed

Lines changed: 370 additions & 34 deletions

File tree

Deeploy/Targets/PULPOpen/Bindings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
PULPSynchCoresPass(),
104104
ForkClosure(writeback = False, generateStruct = True),
105105
TilingVariableReplacementUpdate("L1"),
106-
PULPClusterTiling("L2", "L1", MchanDma()),
106+
PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters
107107
ArgumentStructGeneration(),
108108
MemoryManagementGeneration("L1"),
109109
TilingVariableReplacement("L2"),
@@ -121,7 +121,7 @@
121121
TilingVariableReplacement("L1"),
122122
TilingCallClosure(writeback = False, generateStruct = True),
123123
TilingVariableReplacementUpdate("L1"),
124-
PULPClusterTiling("L2", "L1", MchanDma()),
124+
PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters
125125
ArgumentStructGeneration(),
126126
MemoryManagementGeneration("L1"),
127127
TilingVariableReplacement("L2"),

Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
88
from Deeploy.TilingExtension.AsyncDma import AsyncDma
99
from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \
10-
DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn
10+
DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn
1111
from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \
12-
ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
12+
PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
1313

1414

1515
class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration):
@@ -28,24 +28,55 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration
2828
pass
2929

3030

31+
class PerfCounterPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, PerfCounterSingleBufferingTilingMixIn):
32+
"""Single buffering with performance counter profiling"""
33+
pass
34+
35+
36+
class PerfCounterPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn):
37+
"""Double buffering with performance counter profiling"""
38+
pass
39+
40+
41+
class CombinedProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn, PerfCounterSingleBufferingTilingMixIn):
42+
"""Single buffering with both cycle profiling and performance counter profiling"""
43+
pass
44+
45+
46+
class CombinedProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn, PerfCounterDoubleBufferingTilingMixIn):
47+
"""Double buffering with both cycle profiling and performance counter profiling"""
48+
pass
49+
50+
3151
class PULPClusterTiling(CodeTransformationPass):
3252

33-
def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
53+
def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, usePerfCounters: bool = False):
54+
self.usePerfCounters = usePerfCounters
3455
self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
3556
self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
57+
self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
58+
self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
3659
self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
3760
self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
61+
self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
62+
self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
3863

3964
def apply(self,
4065
ctxt: NetworkContext,
4166
executionBlock: ExecutionBlock,
4267
name: str,
4368
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
4469

45-
if verbose.tilingProfiling:
70+
if self.usePerfCounters and verbose.tilingProfiling:
71+
# Use combined profiling: cycle measurements + performance counter stats
72+
ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name)
73+
ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name)
74+
elif verbose.tilingProfiling:
75+
# Use cycle profiling only (basic cycle measurements)
4676
ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
4777
ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
4878
else:
79+
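# No profiling. NOTE(review): usePerfCounters=True without verbose.tilingProfiling also lands here, and self.perfCounterSB / self.perfCounterDB are never applied in this method - confirm whether a perf-counter-only branch is intended.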
4980
ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
5081
ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)
5182

Deeploy/Targets/PULPOpen/Platform.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ class PULPStructBuffer(StructBuffer):
248248

249249
# SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't...
250250
_includeList = [
251-
"pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h"
251+
"pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", "perf_utils.h"
252252
]
253253

254254

Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future
1212
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
1313
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
14-
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
15-
PrototypeTilingMixIn, TilingMetaInfo
14+
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
15+
ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
1616
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
1717
from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape
1818

@@ -364,3 +364,38 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
364364
executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
365365
_egressDMAStatements, closeLoopStatements)
366366
return executionBlock
367+
368+
class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
369+
"""
370+
Double buffering tiling with performance counter profiling.
371+
Provides detailed instruction-level statistics for the whole tiling loop of a node (counters are started in setup and stopped in teardown, not per tile).
372+
"""
373+
374+
@classmethod
375+
def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
376+
setupStatements: List[CodeSnippet],
377+
teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
378+
379+
executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
380+
teardownStatements)
381+
382+
# Inject performance counter initialization in setup (only once, not per-tile)
383+
executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
384+
385+
# Inject performance counter stop and print in teardown (only once, not per-tile)
386+
executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
387+
388+
return executionBlock
389+
390+
@classmethod
391+
def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
392+
openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
393+
egressDMAStatements: List[CodeSnippet],
394+
closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
395+
396+
# Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
397+
# executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
398+
399+
executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
400+
egressDMAStatements, closeLoopStatements)
401+
return executionBlock

Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future
1111
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
1212
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
13-
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
14-
PrototypeTilingMixIn, TilingMetaInfo
13+
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
14+
ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
1515
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint
1616
from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme
1717

@@ -191,3 +191,39 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
191191
executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
192192
_egressDMAStatements, closeLoopStatements)
193193
return executionBlock
194+
195+
196+
class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
197+
"""
198+
Single buffering tiling with performance counter profiling.
199+
Provides detailed instruction-level statistics for the whole tiling loop of a node (counters are started in setup and stopped in teardown, not per tile).
200+
"""
201+
202+
@classmethod
203+
def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
204+
setupStatements: List[CodeSnippet],
205+
teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
206+
207+
executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
208+
teardownStatements)
209+
210+
# Inject performance counter initialization in setup (only once, not per-tile)
211+
executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
212+
213+
# Inject performance counter stop and print in teardown (only once, not per-tile)
214+
executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
215+
216+
return executionBlock
217+
218+
@classmethod
219+
def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
220+
openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
221+
egressDMAStatements: List[CodeSnippet],
222+
closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
223+
224+
# Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
225+
# executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
226+
227+
executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
228+
egressDMAStatements, closeLoopStatements)
229+
return executionBlock

Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,105 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM
6464
return executionBlock
6565

6666

67+
class PerfCounterProfilingMixIn(ABC):
68+
"""
69+
MixIn for injecting performance counter profiling code.
70+
Provides detailed instruction-level statistics using CSR performance counters.
71+
"""
72+
73+
_perfCounterInit = NodeTemplate("""
74+
perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total;
75+
if (pi_core_id() == 0) {
76+
perf_bench_init();
77+
perf_bench_start();
78+
perf_bench_read(&${nodeName}_perf_start);
79+
}
80+
""")
81+
82+
_perfCounterStop = NodeTemplate("""
83+
if (pi_core_id() == 0) {
84+
perf_bench_stop();
85+
perf_bench_read(&${nodeName}_perf_end);
86+
perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start);
87+
perf_bench_print("${nodeName}", &${nodeName}_perf_total);
88+
}
89+
""")
90+
91+
_perfCounterKernelStart = NodeTemplate("""
92+
if (pi_core_id() == 0) {
93+
perf_bench_start();
94+
perf_bench_read(&${nodeName}_perf_kernel_start);
95+
}
96+
""")
97+
98+
_perfCounterKernelEnd = NodeTemplate("""
99+
if (pi_core_id() == 0) {
100+
perf_bench_stop();
101+
perf_bench_read(&${nodeName}_perf_kernel_end);
102+
perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start);
103+
perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total);
104+
}
105+
""")
106+
107+
_perfCounterKernelDecl = NodeTemplate("""
108+
perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total;
109+
""")
110+
111+
@classmethod
112+
def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
113+
"""
114+
Inject performance counter initialization at the beginning of the node execution.
115+
This should be called in the setup phase.
116+
"""
117+
nodeName = metaInfo.nodeName
118+
119+
executionBlock.addLeft(cls._perfCounterInit, {
120+
"nodeName": nodeName,
121+
})
122+
123+
return executionBlock
124+
125+
@classmethod
126+
def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
127+
"""
128+
Inject performance counter stop and print at the end of the node execution.
129+
This should be called in the teardown phase.
130+
"""
131+
nodeName = metaInfo.nodeName
132+
133+
executionBlock.addRight(cls._perfCounterStop, {
134+
"nodeName": nodeName,
135+
})
136+
137+
return executionBlock
138+
139+
@classmethod
140+
def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
141+
"""
142+
Wrap the kernel execution with performance counter measurements.
143+
This provides detailed statistics for just the kernel computation (excluding DMA).
144+
"""
145+
nodeName = metaInfo.nodeName
146+
147+
if metaInfo.kernelLevelTiling:
148+
# Add declaration at the beginning
149+
executionBlock.addLeft(cls._perfCounterKernelDecl, {
150+
"nodeName": nodeName,
151+
})
152+
153+
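# Add start measurement before kernel. NOTE(review): this second addLeft presumably places the start snippet ahead of the declaration added above, so the perf_kernel_* variables would be used before they are declared - confirm addLeft ordering.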
154+
executionBlock.addLeft(cls._perfCounterKernelStart, {
155+
"nodeName": nodeName,
156+
})
157+
158+
# Add stop and print after kernel
159+
executionBlock.addRight(cls._perfCounterKernelEnd, {
160+
"nodeName": nodeName,
161+
})
162+
163+
return executionBlock
164+
165+
67166
class ProfilingPrototypeMixIn(ABC):
68167
_measureCycles = NodeTemplate("""
69168
${measurements}[${tileIdxVar}] = getCycles();

0 commit comments

Comments
 (0)