Skip to content

Commit e7dd555

Browse files
committed
Update pro microbenchmark codetransformation
1 parent 0b61499 commit e7dd555

10 files changed

Lines changed: 72 additions & 209 deletions

File tree

Deeploy/DeeployTypes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class CodeGenVerbosity:
5353

5454
tilingProfiling: Optional[bool] = False # Specifies if we should profile the tiling code
5555
untiledProfiling: Optional[bool] = None # Specifies if we should profile the untiled code
56+
microbenchmarkProfiling: Optional[bool] = False # Wrap each layer with PULP perf-counter microbenchmark
5657

5758

5859
_NoVerbosity = CodeGenVerbosity(None)

Deeploy/Targets/PULPOpen/Bindings.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
2525
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
2626
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
27+
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark import PULPMicrobenchmark
2728
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled
2829
from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
2930
from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack
@@ -103,7 +104,7 @@
103104
PULPSynchCoresPass(),
104105
ForkClosure(writeback = False, generateStruct = True),
105106
TilingVariableReplacementUpdate("L1"),
106-
PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters
107+
PULPClusterTiling("L2", "L1", MchanDma()),
107108
ArgumentStructGeneration(),
108109
MemoryManagementGeneration("L1"),
109110
TilingVariableReplacement("L2"),
@@ -115,13 +116,14 @@
115116
MemoryManagementGeneration("L2"),
116117
MemoryManagementGeneration("L3.*"),
117118
MemoryManagementGeneration(),
119+
PULPMicrobenchmark(),
118120
])
119121

120122
ClusterTransformer = CodeTransformation([
121123
TilingVariableReplacement("L1"),
122124
TilingCallClosure(writeback = False, generateStruct = True),
123125
TilingVariableReplacementUpdate("L1"),
124-
PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters
126+
PULPClusterTiling("L2", "L1", MchanDma()),
125127
ArgumentStructGeneration(),
126128
MemoryManagementGeneration("L1"),
127129
TilingVariableReplacement("L2"),
@@ -133,6 +135,7 @@
133135
MemoryManagementGeneration("L2"),
134136
MemoryManagementGeneration("L3.*"),
135137
MemoryManagementGeneration(),
138+
PULPMicrobenchmark(),
136139
])
137140

138141
SimpleTransformer = CodeTransformation([

Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py

Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
88
from Deeploy.TilingExtension.AsyncDma import AsyncDma
99
from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \
10-
DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn
10+
DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn
1111
from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \
12-
PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
12+
ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
1313

1414

1515
class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration):
@@ -28,55 +28,24 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration
2828
pass
2929

3030

31-
class PerfCounterPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, PerfCounterSingleBufferingTilingMixIn):
32-
"""Single buffering with performance counter profiling"""
33-
pass
34-
35-
36-
class PerfCounterPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn):
37-
"""Double buffering with performance counter profiling"""
38-
pass
39-
40-
41-
class CombinedProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn, PerfCounterSingleBufferingTilingMixIn):
42-
"""Single buffering with both cycle profiling and performance counter profiling"""
43-
pass
44-
45-
46-
class CombinedProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn, PerfCounterDoubleBufferingTilingMixIn):
47-
"""Double buffering with both cycle profiling and performance counter profiling"""
48-
pass
49-
50-
5131
class PULPClusterTiling(CodeTransformationPass):
5232

53-
def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, usePerfCounters: bool = False):
54-
self.usePerfCounters = usePerfCounters
33+
def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
5534
self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
5635
self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
57-
self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
58-
self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
5936
self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
6037
self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
61-
self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
62-
self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
6338

6439
def apply(self,
6540
ctxt: NetworkContext,
6641
executionBlock: ExecutionBlock,
6742
name: str,
6843
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
6944

70-
if self.usePerfCounters and verbose.tilingProfiling:
71-
# Use combined profiling: cycle measurements + performance counter stats
72-
ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name)
73-
ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name)
74-
elif verbose.tilingProfiling:
75-
# Use cycle profiling only (basic cycle measurements)
45+
if verbose.tilingProfiling:
7646
ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
7747
ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
7848
else:
79-
# No profiling
8049
ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
8150
ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)
8251

Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Tuple
6+
7+
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \
8+
NodeTemplate, _NoVerbosity
9+
10+
11+
# Code-transformation pass that brackets an execution block with perf-counter
# microbenchmark snippets. It does nothing unless
# verbose.microbenchmarkProfiling is truthy.
class PULPMicrobenchmark(CodeTransformationPass):
12+
13+
# Prologue snippet: declares three perf_stats_t variables prefixed with the
# "op" substitution, then, only where pi_core_id() == 0, calls
# perf_bench_init(), perf_bench_start() and reads a first snapshot into
# <op>_perf_start.
# NOTE(review): "op" is spliced into C identifiers, so this assumes the name
# handed to apply() is a valid C identifier and unique per emitted scope --
# confirm against the callers.
_preTemplate = NodeTemplate("""
14+
perf_stats_t ${op}_perf_start, ${op}_perf_end, ${op}_perf_total;
15+
if (pi_core_id() == 0) {
16+
perf_bench_init();
17+
perf_bench_start();
18+
perf_bench_read(&${op}_perf_start);
19+
}
20+
""")
21+
22+
# Epilogue snippet: only where pi_core_id() == 0, calls perf_bench_stop(),
# reads a second snapshot into <op>_perf_end, hands total/end/start to
# perf_bench_diff() and passes the "op" label plus the total to
# perf_bench_print().
# NOTE(review): the perf_bench_* helpers are not defined in this file;
# presumably perf_bench_diff stores (end - start) in its first argument --
# verify against the runtime header.
_postTemplate = NodeTemplate("""
23+
if (pi_core_id() == 0) {
24+
perf_bench_stop();
25+
perf_bench_read(&${op}_perf_end);
26+
perf_bench_diff(&${op}_perf_total, &${op}_perf_end, &${op}_perf_start);
27+
perf_bench_print("${op}", &${op}_perf_total);
28+
}
29+
""")
30+
31+
# Adds the prologue to the left and the epilogue to the right of
# executionBlock, substituting "op" with `name` in both templates.
# Returns the (ctxt, executionBlock) pair; ctxt is passed through unchanged.
def apply(self,
32+
ctxt: NetworkContext,
33+
executionBlock: ExecutionBlock,
34+
name: str,
35+
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
36+
37+
# Instrumentation is opt-in: without the flag, hand both objects back untouched.
if not verbose.microbenchmarkProfiling:
38+
return ctxt, executionBlock
39+
40+
# Presumably addLeft/addRight place the snippet before/after the code already
# in the block, so the counters span everything added by earlier passes --
# TODO confirm against ExecutionBlock.
executionBlock.addLeft(self._preTemplate, {"op": name})
41+
executionBlock.addRight(self._postTemplate, {"op": name})
42+
return ctxt, executionBlock

Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py

Lines changed: 2 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future
1212
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
1313
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
14-
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
15-
ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
14+
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
15+
PrototypeTilingMixIn, TilingMetaInfo
1616
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
1717
from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape
1818

@@ -364,38 +364,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
364364
executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
365365
_egressDMAStatements, closeLoopStatements)
366366
return executionBlock
367-
368-
class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
369-
"""
370-
Double buffering tiling with performance counter profiling.
371-
Provides detailed instruction-level statistics for each tile.
372-
"""
373-
374-
@classmethod
375-
def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
376-
setupStatements: List[CodeSnippet],
377-
teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
378-
379-
executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
380-
teardownStatements)
381-
382-
# Inject performance counter initialization in setup (only once, not per-tile)
383-
executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
384-
385-
# Inject performance counter stop and print in teardown (only once, not per-tile)
386-
executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
387-
388-
return executionBlock
389-
390-
@classmethod
391-
def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
392-
openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
393-
egressDMAStatements: List[CodeSnippet],
394-
closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
395-
396-
# Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
397-
# executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
398-
399-
executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
400-
egressDMAStatements, closeLoopStatements)
401-
return executionBlock

Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future
1111
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
1212
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
13-
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
14-
ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
13+
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
14+
PrototypeTilingMixIn, TilingMetaInfo
1515
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint
1616
from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme
1717

@@ -193,37 +193,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
193193
return executionBlock
194194

195195

196-
class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
197-
"""
198-
Single buffering tiling with performance counter profiling.
199-
Provides detailed instruction-level statistics for each tile.
200-
"""
201-
202-
@classmethod
203-
def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
204-
setupStatements: List[CodeSnippet],
205-
teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
206-
207-
executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
208-
teardownStatements)
209-
210-
# Inject performance counter initialization in setup (only once, not per-tile)
211-
executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
212-
213-
# Inject performance counter stop and print in teardown (only once, not per-tile)
214-
executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
215-
216-
return executionBlock
217-
218-
@classmethod
219-
def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
220-
openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
221-
egressDMAStatements: List[CodeSnippet],
222-
closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
223-
224-
# Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
225-
# executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
226-
227-
executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
228-
egressDMAStatements, closeLoopStatements)
229-
return executionBlock

Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py

Lines changed: 0 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -64,105 +64,6 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM
6464
return executionBlock
6565

6666

67-
class PerfCounterProfilingMixIn(ABC):
68-
"""
69-
MixIn for injecting performance counter profiling code.
70-
Provides detailed instruction-level statistics using CSR performance counters.
71-
"""
72-
73-
_perfCounterInit = NodeTemplate("""
74-
perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total;
75-
if (pi_core_id() == 0) {
76-
perf_bench_init();
77-
perf_bench_start();
78-
perf_bench_read(&${nodeName}_perf_start);
79-
}
80-
""")
81-
82-
_perfCounterStop = NodeTemplate("""
83-
if (pi_core_id() == 0) {
84-
perf_bench_stop();
85-
perf_bench_read(&${nodeName}_perf_end);
86-
perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start);
87-
perf_bench_print("${nodeName}", &${nodeName}_perf_total);
88-
}
89-
""")
90-
91-
_perfCounterKernelStart = NodeTemplate("""
92-
if (pi_core_id() == 0) {
93-
perf_bench_start();
94-
perf_bench_read(&${nodeName}_perf_kernel_start);
95-
}
96-
""")
97-
98-
_perfCounterKernelEnd = NodeTemplate("""
99-
if (pi_core_id() == 0) {
100-
perf_bench_stop();
101-
perf_bench_read(&${nodeName}_perf_kernel_end);
102-
perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start);
103-
perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total);
104-
}
105-
""")
106-
107-
_perfCounterKernelDecl = NodeTemplate("""
108-
perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total;
109-
""")
110-
111-
@classmethod
112-
def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
113-
"""
114-
Inject performance counter initialization at the beginning of the node execution.
115-
This should be called in the setup phase.
116-
"""
117-
nodeName = metaInfo.nodeName
118-
119-
executionBlock.addLeft(cls._perfCounterInit, {
120-
"nodeName": nodeName,
121-
})
122-
123-
return executionBlock
124-
125-
@classmethod
126-
def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
127-
"""
128-
Inject performance counter stop and print at the end of the node execution.
129-
This should be called in the teardown phase.
130-
"""
131-
nodeName = metaInfo.nodeName
132-
133-
executionBlock.addRight(cls._perfCounterStop, {
134-
"nodeName": nodeName,
135-
})
136-
137-
return executionBlock
138-
139-
@classmethod
140-
def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
141-
"""
142-
Wrap the kernel execution with performance counter measurements.
143-
This provides detailed statistics for just the kernel computation (excluding DMA).
144-
"""
145-
nodeName = metaInfo.nodeName
146-
147-
if metaInfo.kernelLevelTiling:
148-
# Add declaration at the beginning
149-
executionBlock.addLeft(cls._perfCounterKernelDecl, {
150-
"nodeName": nodeName,
151-
})
152-
153-
# Add start measurement before kernel
154-
executionBlock.addLeft(cls._perfCounterKernelStart, {
155-
"nodeName": nodeName,
156-
})
157-
158-
# Add stop and print after kernel
159-
executionBlock.addRight(cls._perfCounterKernelEnd, {
160-
"nodeName": nodeName,
161-
})
162-
163-
return executionBlock
164-
165-
16667
class ProfilingPrototypeMixIn(ABC):
16768
_measureCycles = NodeTemplate("""
16869
${measurements}[${tileIdxVar}] = getCycles();

DeeployTest/testMVP.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
203203
- min: Initalize all variables at their minimal value.
204204
""")
205205
parser.add_argument('--profileTiling', action = "store_true")
206+
parser.add_argument('--profileMicrobenchmark',
207+
action = "store_true",
208+
help = 'Wrap each layer with PULP perf-counter microbenchmark instrumentation')
206209
parser.add_argument('--plotMemAlloc',
207210
action = 'store_true',
208211
help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n')
@@ -224,6 +227,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
224227
if args.profileTiling:
225228
verbosityCfg.tilingProfiling = True
226229

230+
if args.profileMicrobenchmark:
231+
verbosityCfg.microbenchmarkProfiling = True
232+
227233
onnx_graph = onnx.load_model(f'{args.dir}/network.onnx')
228234
graph = gs.import_onnx(onnx_graph)
229235

0 commit comments

Comments
 (0)