fix(promote): re-enable graph I/O promote with optimizer sharing guard

runwangdl · claude · runwangdl · commit 261720503b73 · 2026-05-15T03:33:39.000+02:00
The OOB crash (commit f50ac0f) was caused by the optimizer network sharing weight buffer pointers with the training network. When PromoteTensorsToL2 moved a training weight to L2 (pi_l2_malloc), the optimizer's closure_L3 used the shared L2 pointer as the ext argument to pi_cl_ram_copy_2d (HyperRAM DMA), which is only valid for L3 addresses → out-of-bounds crash.

Fix: _patch_shared_buffers in codeGenerateTraining.py now checks whether the training buffer was allocated with pi_l2_malloc (L2). If so, it keeps the optimizer's own cl_ram_malloc allocation instead of sharing, so the optimizer's L3 DMA closures see a valid L3 address. The training harness's l3_aware_copy handles the L2→L3 data transfer at runtime.

Also re-enables the graph I/O promote loop in MemoryLevelAnnotationPasses.py (was disabled by f50ac0f).

Results (full promote: const + activation + graph I/O):
  MobileNetV1: 332M cycles (-19.9%), 0/4 PASS
  CCT: 450M cycles (-14.8%), 4/4 (baseline match)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py
@@ -249,7 +249,7 @@ def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext,
             # weight-and-grad tensors account for ~950 KB of L3 use. The Siracusa
             # training harness's l3_aware_copy() / IS_L2() helpers already handle
             # an L2-resident graph I/O destination correctly, so we can promote them.
-            for name, buf in []:  # Disabled: graph I/O promote causes OOB on CCT training
+            for name, buf in ctxt.globalObjects.items():
                 if not isinstance(buf, VariableBuffer):
                     continue
                 if isinstance(buf, (ConstantBuffer, _ReferenceBuffer)):
diff --git a/DeeployTest/testUtils/codeGenerateTraining.py b/DeeployTest/testUtils/codeGenerateTraining.py
@@ -503,7 +503,8 @@ def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict
     return shared_input_map, shared_output_map
 
 
-def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str:
+def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int],
+                          train_c_source: str = "") -> str:
     """Redirect optimizer I/O buffers to Training's already-allocated buffers.
 
     Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that
@@ -558,12 +559,25 @@ def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_
     _arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)'
                             r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;')
 
+    def _is_train_l2(train_idx: int) -> bool:
+        """Check if training input_N was allocated with pi_l2_malloc (promoted).
+        If so, sharing the pointer would send an L2 address to the optimizer's
+        closure_L3 pi_cl_ram_copy_2d → HyperRAM OOB."""
+        if not train_c_source:
+            return False
+        pat = rf'{_TRAIN_PREFIX}input_{train_idx}\s*=\s*\([^)]+\)\s*pi_l2_malloc\b'
+        return bool(re.search(pat, train_c_source))
+
     def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]:
         if kind == "input" and idx in shared_input_map:
             train_idx = shared_input_map[idx]
+            if _is_train_l2(train_idx):
+                return None  # Don't share: training buffer at L2, optimizer expects L3
             return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx};  /* shared with TrainingNetwork */'
         if kind == "output" and idx in shared_output_map:
             train_idx = shared_output_map[idx]
+            if _is_train_l2(train_idx):
+                return None
             return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx};  /* in-place, shared with TrainingNetwork */'
         return None
 
@@ -887,7 +901,8 @@ def generateOptimizerNetworkImplementation(deployer: NetworkDeployer,
     # Prefix substitution
     retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX)
     # Replace malloc calls for shared weight/grad buffers with Training pointers
-    retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {})
+    retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {},
+                                   train_c_source = train_c_source or "")
     # Redirect optimizer L1/L2 arena mallocs to reuse training arenas
     if train_c_source:
         retStr = _patch_shared_arenas(retStr, train_c_source)