Skip to content

Commit 1bc65d2

Browse files
committed
fix(tiling): emit cumulative numTiles for L1 when only one tiling schedule
When there is a single tiling schedule (len(tilingSchedules) == 1), L1 is the outermost tile loop and is called exactly once. The prior code always emitted the per-tile numTiles layout {0, 1, ..., N} at L1, so only tile 0 ran, causing failures when a node tiles into multiple L1 slices with no L2 spatial loop wrapping the L1 closure. Switch to the cumulative layout {0, total} when len(tilingSchedules) == 1 so that the single call walks all tiles; keep the per-tile layout when len(tilingSchedules) > 1, where the L2 spatial loop drives L1 once per step. Also add a profilingNodes field to CodeGenVerbosity and honour it in PULPClusterTiling / PULPL3Tiling to restrict cycle-counter instrumentation to nodes whose names contain any of the given substrings.
1 parent 3166af7 commit 1bc65d2

4 files changed

Lines changed: 18 additions & 5 deletions

File tree

Deeploy/DeeployTypes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ class CodeGenVerbosity:
5353

5454
tilingProfiling: Optional[bool] = False # Specifies if we should profile the tiling code
5555
untiledProfiling: Optional[bool] = None # Specifies if we should profile the untiled code
56+
profilingNodes: Optional[
57+
list] = None # Restrict tiling profiling to nodes whose name contains any of these substrings; None means all nodes
5658

5759

5860
_NoVerbosity = CodeGenVerbosity(None)

Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ def apply(self,
4242
name: str,
4343
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
4444

45-
if verbose.tilingProfiling:
45+
nodes = verbose.profilingNodes
46+
profileThis = verbose.tilingProfiling and (nodes is None or any(s in name for s in nodes))
47+
if profileThis:
4648
ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
4749
ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
4850
else:

Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ def apply(self,
4242
name: str,
4343
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
4444

45-
if verbose.tilingProfiling:
45+
nodes = verbose.profilingNodes
46+
profileThis = verbose.tilingProfiling and (nodes is None or any(s in name for s in nodes))
47+
if profileThis:
4648
ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
4749
ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
4850
else:

Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,14 +80,21 @@ def _hoistTileNumAndIdxPtr(self, ctxt: NetworkContext,
8080

8181
# Core extension: at the innermost memory level (L1), emit a per-tile
8282
# boundary so each invocation of the inner closure processes exactly one
83-
# tile. The outer-level (L3→L2) closure iterates N_outer times and calls
84-
# inner once per iter; with the baseline cumulative layout
83+
# tile. The outer-level (L2 spatial) closure iterates N_outer times and
84+
# calls inner once per iter; with the baseline cumulative layout
8585
# `{0, N1, N1+N2, ...}` inner would process many tiles per call and only
8686
# tolerate `len(tilingSchedules)` outer iters before reading numTiles
8787
# OOB. Per-tile layout `{0,1,2,...,total}` keeps
8888
# outer_iters == inner_calls == total_tiles. Outer memory levels keep
8989
# cumulative layout to iterate per L2 tile.
90-
if self.memory == "L1":
90+
#
91+
# Detect whether an outer driver will call L1 multiple times:
92+
# `len(tilingSchedules) > 1` means there are multiple L2 spatial steps,
93+
# so an L2 outer loop drives L1 once per step → per-tile.
94+
# When len == 1 (L1 is the single outermost tile loop, called once from
95+
# RunNetwork / the L3 closure), emit cumulative {0, total} so the single
96+
# call walks all tiles.
97+
if self.memory == "L1" and len(tilingSchedules) > 1:
9198
total = sum(stepsNumTiles)
9299
cumulativeNumTiles = list(range(total + 1))
93100
else:

0 commit comments

Comments
 (0)