Skip to content

Commit 5b26788

Browse files
committed
Fix CCT_2 / AnomalyDetection promote correctness via maxBufferBytes cap
Root cause (Phase-1 §10.1 bisection): when a ConstantBuffer is promoted L3->L2, the L3 closure that would have refreshed the L2 staging buffer per tile via cl_ram_copy_2d is elided (the tiler sees the buffer at L2 and emits no L3->L2 transfer). The L1 closure still reads the source from a fixed L2 address with a +0 offset, so every tile re-reads the first tile_stride bytes of the static PI_L2 symbol. For broadcast-per-tile constants (LayerNorm scales/biases, small Gemm biases) this is correct; for tiled-across-channels weights (Conv, fused proj_bias DUPLICATEs, positional embedding) it is not, and the output is corrupted. Adds maxBufferBytes (default 2048) to PromoteTensorsToL2Greedy; threaded through testMVP.py and deeployRunner.py as --promoteMaxBufferBytes. Buffers larger than the cap are kept in L3 (where the per-tile staging refill correctly advances the source pointer). Verified on the full Phase-1 sweep: * CCT_2_32_32_128 @ L2=400 KB: PASS 0/10 (was FAIL 10/10) * MLPerf/AnomalyDetection @ L2=200 KB: PASS 0/640, -15.3 % cycles (was FAIL 169/640) * MLPerf/ImageClassification @ L2=100 KB: PASS 0/10, -3.9 % cycles * MLPerf/ImageClassification @ L2=120 KB: PASS 0/10, -3.9 % cycles * microLlama/microLlama1 @ L2=100 KB: PASS 0/2112, -17.8 % cycles Set --promoteMaxBufferBytes=0 to disable the cap once the codegen handles tiled L2-resident weights correctly (Phase-2).
1 parent 678f4ba commit 5b26788

3 files changed

Lines changed: 43 additions & 12 deletions

File tree

Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ def __init__(self,
9999
metricsPath: Optional[str] = None,
100100
setupCycles: int = DEFAULT_PROMOTE_SETUP_CYCLES,
101101
bandwidthBytesPerCycle: float = DEFAULT_PROMOTE_BANDWIDTH_BYTES_PER_CYCLE,
102-
excludeNamePatterns: Optional[List[str]] = None):
102+
excludeNamePatterns: Optional[List[str]] = None,
103+
maxBufferBytes: Optional[int] = 2048):
103104
super().__init__()
104105
if strategy not in PROMOTE_STRATEGIES:
105106
raise ValueError(f"unknown promote strategy {strategy!r}; expected one of {PROMOTE_STRATEGIES}")
@@ -114,15 +115,23 @@ def __init__(self,
114115
self.metricsPath = metricsPath
115116
self.setupCycles = int(setupCycles)
116117
self.bandwidthBytesPerCycle = float(bandwidthBytesPerCycle)
117-
# Buffers whose name contains any of these substrings are skipped.
118-
# Default skips Conv-layer weights — see Phase-1 §10.1: promoting an
119-
# FP32 Conv weight into a static PI_L2 array breaks the per-tile
120-
# weight DMA path on Siracusa (CCT_2_32_32_128 fails 10/10 outputs
121-
# with only that one weight promoted; all 21 other CCT constants
122-
# promote correctly). Override with `excludeNamePatterns=[]` to
123-
# opt back in once the kernel codegen is fixed.
124-
self.excludeNamePatterns = list(excludeNamePatterns) if excludeNamePatterns is not None \
125-
else ["_conv_layers_", "_conv_weight", "_conv_bias"]
118+
# Name-pattern denylist (kept for Phase-2 finer control).
119+
self.excludeNamePatterns = list(excludeNamePatterns) if excludeNamePatterns is not None else []
120+
# Per-buffer byte cap. Root cause: when a ConstantBuffer is promoted
121+
# from L3 to L2, the L1 closure reads the weight from a static PI_L2
122+
# symbol with a fixed `+ 0` offset and the L3 closure that *would*
123+
# have advanced the source pointer per tile is elided (see Phase-1
124+
# §10.1 for the bisection). For broadcast-per-tile buffers
125+
# (LayerNorm scales/biases, small Gemm bias) every tile reads the
126+
# same bytes and that is correct; for *tiled-across-channels*
127+
# buffers (Conv weights, fused-bias duplicates, positional
128+
# embedding) every tile must read DIFFERENT bytes and the elided
129+
# advancement corrupts the output. Tile slabs in this codebase are
130+
# ~1.3 KB; a 2 KB cap reliably keeps tiled-weight buffers in L3
131+
# while still letting the cycle-aware strategy promote the small
132+
# broadcast ones (CCT_2_32_32_128 passes 0/10 with this cap).
133+
# Set to None to disable the cap once the codegen is fixed.
134+
self.maxBufferBytes = maxBufferBytes
126135
# Accumulated decisions across all apply() calls within one process.
127136
# Same tensor seen in later calls keeps its first-seen decision.
128137
self._decisions: dict = {}
@@ -183,7 +192,9 @@ def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext,
183192
continue
184193
if self.onlyConstants and not isinstance(buf, ConstantBuffer):
185194
continue
186-
if any(pat in name for pat in self.excludeNamePatterns):
195+
if self.excludeNamePatterns and any(pat in name for pat in self.excludeNamePatterns):
196+
continue
197+
if self.maxBufferBytes is not None and _bufferSizeBytes(buf) > self.maxBufferBytes:
187198
continue
188199
reuse = max(1, len(getattr(buf, "_users", [])))
189200
if reuse < self.minReuse:

DeeployTest/testMVP.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
114114
weightMemoryLevel = weightMemoryLevel))
115115

116116
if getattr(args, "promoteToL2", False):
117+
_maxBytes = getattr(args, "promoteMaxBufferBytes", 2048)
118+
if _maxBytes is not None and _maxBytes <= 0:
119+
_maxBytes = None
117120
memoryLevelAnnotationPasses.append(
118121
PromoteTensorsToL2Greedy(memoryHierarchy = memoryHierarchy,
119122
sourceLevel = "L3",
@@ -124,7 +127,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
124127
seed = args.promoteSeed,
125128
metricsPath = args.promoteMetricsPath,
126129
setupCycles = args.promoteSetupCycles,
127-
bandwidthBytesPerCycle = args.promoteBandwidth))
130+
bandwidthBytesPerCycle = args.promoteBandwidth,
131+
maxBufferBytes = _maxBytes))
128132

129133
# Make the deployer memory-level aware
130134
deployer = MemoryDeployerWrapper(deployer, memoryLevelAnnotationPasses)
@@ -248,6 +252,16 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
248252
type = float,
249253
default = 4.0,
250254
help = 'Effective DMA bandwidth (bytes/cycle) for cycle-aware strategy. Default 4.0.\n')
255+
parser.add_argument('--promoteMaxBufferBytes',
256+
type = int,
257+
default = 2048,
258+
help = 'Skip ConstantBuffers larger than this many bytes when promoting. '
259+
'Default 2048 keeps tiled-across-channels Conv/Gemm weights in L3 (where '
260+
'the per-tile L3->L2 staging refill in the L3 closure advances the source '
261+
'pointer correctly) and only promotes small broadcast-per-tile constants '
262+
'(LayerNorm scales/biases, small Gemm bias) where the static-PI_L2 read '
263+
'with a fixed +0 offset is the right behaviour. Set to 0 or a very large '
264+
'value to disable the cap.\n')
251265
parser.add_argument('--promoteMetricsPath',
252266
type = str,
253267
default = None,

DeeployTest/testUtils/deeployRunner.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,10 @@ def __init__(self,
194194
type = float,
195195
default = 4.0,
196196
help = 'Effective DMA bandwidth (bytes/cycle) for cycle-aware strategy. Default 4.0.\n')
197+
self.add_argument('--promoteMaxBufferBytes',
198+
type = int,
199+
default = 2048,
200+
help = 'Skip ConstantBuffers larger than N bytes. Default 2048.\n')
197201

198202
self.args = None
199203

@@ -283,6 +287,8 @@ def create_config_from_args(args: argparse.Namespace,
283287
gen_args_list.append(f"--promoteSetupCycles={args.promoteSetupCycles}")
284288
if hasattr(args, 'promoteBandwidth') and args.promoteBandwidth is not None:
285289
gen_args_list.append(f"--promoteBandwidth={args.promoteBandwidth}")
290+
if hasattr(args, 'promoteMaxBufferBytes') and args.promoteMaxBufferBytes is not None:
291+
gen_args_list.append(f"--promoteMaxBufferBytes={args.promoteMaxBufferBytes}")
286292

287293
if not tiling and getattr(args, 'profileUntiled', False):
288294
gen_args_list.append("--profileUntiled")

0 commit comments

Comments
 (0)