Fix CCT_2 / AnomalyDetection promote correctness via maxBufferBytes cap

runwangdl · runwangdl · commit 5b26788ebad7 · 2026-04-16T20:42:25.000Z
Root cause (Phase-1 §10.1 bisection): when a ConstantBuffer is promoted
L3->L2, the L3 closure that would have refreshed the L2 staging buffer
per tile via cl_ram_copy_2d is elided (the tiler sees the buffer at L2
and emits no L3->L2 transfer). The L1 closure still reads the source
from a fixed L2 address with a +0 offset, so every tile re-reads the
first tile_stride bytes of the static PI_L2 symbol. For
broadcast-per-tile constants (LayerNorm scales/biases, small Gemm
biases) this is correct; for tiled-across-channels weights (Conv,
fused proj_bias DUPLICATEs, positional embedding) it is not, and the
output corrupts.

Adds maxBufferBytes (default 2048) to PromoteTensorsToL2Greedy;
threaded through testMVP.py and deeployRunner.py as
--promoteMaxBufferBytes. Buffers larger than the cap are kept in L3
(where the per-tile staging refill correctly advances the source
pointer).

Verified on the full Phase-1 sweep:
* CCT_2_32_32_128 @ L2=400 KB: PASS 0/10 (was FAIL 10/10)
* MLPerf/AnomalyDetection @ L2=200 KB: PASS 0/640, -15.3 % cycles
  (was FAIL 169/640)
* MLPerf/ImageClassification @ L2=100 KB: PASS 0/10, -3.9 % cycles
* MLPerf/ImageClassification @ L2=120 KB: PASS 0/10, -3.9 % cycles
* microLlama/microLlama1 @ L2=100 KB: PASS 0/2112, -17.8 % cycles

Set --promoteMaxBufferBytes=0 to disable the cap once the codegen
handles tiled L2-resident weights correctly (Phase-2).
diff --git a/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py
@@ -99,7 +99,8 @@ def __init__(self,
                  metricsPath: Optional[str] = None,
                  setupCycles: int = DEFAULT_PROMOTE_SETUP_CYCLES,
                  bandwidthBytesPerCycle: float = DEFAULT_PROMOTE_BANDWIDTH_BYTES_PER_CYCLE,
-                 excludeNamePatterns: Optional[List[str]] = None):
+                 excludeNamePatterns: Optional[List[str]] = None,
+                 maxBufferBytes: Optional[int] = 2048):
         super().__init__()
         if strategy not in PROMOTE_STRATEGIES:
             raise ValueError(f"unknown promote strategy {strategy!r}; expected one of {PROMOTE_STRATEGIES}")
@@ -114,15 +115,23 @@ def __init__(self,
         self.metricsPath = metricsPath
         self.setupCycles = int(setupCycles)
         self.bandwidthBytesPerCycle = float(bandwidthBytesPerCycle)
-        # Buffers whose name contains any of these substrings are skipped.
-        # Default skips Conv-layer weights — see Phase-1 §10.1: promoting an
-        # FP32 Conv weight into a static PI_L2 array breaks the per-tile
-        # weight DMA path on Siracusa (CCT_2_32_32_128 fails 10/10 outputs
-        # with only that one weight promoted; all 21 other CCT constants
-        # promote correctly). Override with `excludeNamePatterns=[]` to
-        # opt back in once the kernel codegen is fixed.
-        self.excludeNamePatterns = list(excludeNamePatterns) if excludeNamePatterns is not None \
-            else ["_conv_layers_", "_conv_weight", "_conv_bias"]
+        # Name-pattern denylist (kept for Phase-2 finer control).
+        self.excludeNamePatterns = list(excludeNamePatterns) if excludeNamePatterns is not None else []
+        # Per-buffer byte cap.  Root cause: when a ConstantBuffer is promoted
+        # from L3 to L2, the L1 closure reads the weight from a static PI_L2
+        # symbol with a fixed `+ 0` offset and the L3 closure that *would*
+        # have advanced the source pointer per tile is elided (see Phase-1
+        # §10.1 for the bisection).  For broadcast-per-tile buffers
+        # (LayerNorm scales/biases, small Gemm bias) every tile reads the
+        # same bytes and that is correct; for *tiled-across-channels*
+        # buffers (Conv weights, fused-bias duplicates, positional
+        # embedding) every tile must read DIFFERENT bytes and the elided
+        # advancement corrupts the output.  Tile slabs in this codebase are
+        # ~1.3 KB; a 2 KB cap reliably keeps tiled-weight buffers in L3
+        # while still letting the cycle-aware strategy promote the small
+        # broadcast ones (CCT_2_32_32_128 passes 0/10 with this cap).
+        # Set to None to disable the cap once the codegen is fixed.
+        self.maxBufferBytes = maxBufferBytes
         # Accumulated decisions across all apply() calls within one process.
         # Same tensor seen in later calls keeps its first-seen decision.
         self._decisions: dict = {}
@@ -183,7 +192,9 @@ def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext,
                 continue
             if self.onlyConstants and not isinstance(buf, ConstantBuffer):
                 continue
-            if any(pat in name for pat in self.excludeNamePatterns):
+            if self.excludeNamePatterns and any(pat in name for pat in self.excludeNamePatterns):
+                continue
+            if self.maxBufferBytes is not None and _bufferSizeBytes(buf) > self.maxBufferBytes:
                 continue
             reuse = max(1, len(getattr(buf, "_users", [])))
             if reuse < self.minReuse:
diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py
@@ -114,6 +114,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
                                              weightMemoryLevel = weightMemoryLevel))
 
     if getattr(args, "promoteToL2", False):
+        _maxBytes = getattr(args, "promoteMaxBufferBytes", 2048)
+        if _maxBytes is not None and _maxBytes <= 0:
+            _maxBytes = None
         memoryLevelAnnotationPasses.append(
             PromoteTensorsToL2Greedy(memoryHierarchy = memoryHierarchy,
                                      sourceLevel = "L3",
@@ -124,7 +127,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
                                      seed = args.promoteSeed,
                                      metricsPath = args.promoteMetricsPath,
                                      setupCycles = args.promoteSetupCycles,
-                                     bandwidthBytesPerCycle = args.promoteBandwidth))
+                                     bandwidthBytesPerCycle = args.promoteBandwidth,
+                                     maxBufferBytes = _maxBytes))
 
     # Make the deployer memory-level aware
     deployer = MemoryDeployerWrapper(deployer, memoryLevelAnnotationPasses)
@@ -248,6 +252,16 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
                         type = float,
                         default = 4.0,
                         help = 'Effective DMA bandwidth (bytes/cycle) for cycle-aware strategy. Default 4.0.\n')
+    parser.add_argument('--promoteMaxBufferBytes',
+                        type = int,
+                        default = 2048,
+                        help = 'Skip ConstantBuffers larger than this many bytes when promoting. '
+                        'Default 2048 keeps tiled-across-channels Conv/Gemm weights in L3 (where '
+                        'the per-tile L3->L2 staging refill in the L3 closure advances the source '
+                        'pointer correctly) and only promotes small broadcast-per-tile constants '
+                        '(LayerNorm scales/biases, small Gemm bias) where the static-PI_L2 read '
+                        'with a fixed +0 offset is the right behaviour. Set to 0 or a very large '
+                        'value to disable the cap.\n')
     parser.add_argument('--promoteMetricsPath',
                         type = str,
                         default = None,
diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py
@@ -194,6 +194,10 @@ def __init__(self,
                               type = float,
                               default = 4.0,
                               help = 'Effective DMA bandwidth (bytes/cycle) for cycle-aware strategy. Default 4.0.\n')
+            self.add_argument('--promoteMaxBufferBytes',
+                              type = int,
+                              default = 2048,
+                              help = 'Skip ConstantBuffers larger than N bytes. Default 2048.\n')
 
         self.args = None
 
@@ -283,6 +287,8 @@ def create_config_from_args(args: argparse.Namespace,
             gen_args_list.append(f"--promoteSetupCycles={args.promoteSetupCycles}")
         if hasattr(args, 'promoteBandwidth') and args.promoteBandwidth is not None:
             gen_args_list.append(f"--promoteBandwidth={args.promoteBandwidth}")
+        if hasattr(args, 'promoteMaxBufferBytes') and args.promoteMaxBufferBytes is not None:
+            gen_args_list.append(f"--promoteMaxBufferBytes={args.promoteMaxBufferBytes}")
 
     if not tiling and getattr(args, 'profileUntiled', False):
         gen_args_list.append("--profileUntiled")