Skip to content

Commit 2617205

Browse files
runwangdl and claude committed
fix(promote): re-enable graph I/O promote with optimizer sharing guard
The OOB crash (commit f50ac0f) was caused by the optimizer network sharing weight buffer pointers with the training network. When PromoteTensorsToL2 moved a training weight to L2 (pi_l2_malloc), the optimizer's closure_L3 used the shared L2 pointer as the ext argument to pi_cl_ram_copy_2d (HyperRAM DMA), which is only valid for L3 addresses → out-of-bounds crash.

Fix: _patch_shared_buffers in codeGenerateTraining.py now checks whether the training buffer was allocated with pi_l2_malloc (L2). If so, it keeps the optimizer's own cl_ram_malloc allocation instead of sharing, so the optimizer's L3 DMA closures see a valid L3 address. The training harness's l3_aware_copy handles the L2→L3 data transfer at runtime.

Also re-enables the graph I/O promote loop in MemoryLevelAnnotationPasses.py (was disabled by f50ac0f).

Results (full promote: const + activation + graph I/O):
- MobileNetV1: 332M cycles (-19.9%), 0/4 PASS
- CCT: 450M cycles (-14.8%), 4/4 (baseline match)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f50ac0f commit 2617205

2 files changed

Lines changed: 18 additions & 3 deletions

File tree

Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext,
249249
# weight-and-grad tensors account for ~950 KB of L3 use. The Siracusa
250250
# training harness's l3_aware_copy() / IS_L2() helpers already handle
251251
# an L2-resident graph I/O destination correctly, so we can promote them.
252-
for name, buf in []: # Disabled: graph I/O promote causes OOB on CCT training
252+
for name, buf in ctxt.globalObjects.items():
253253
if not isinstance(buf, VariableBuffer):
254254
continue
255255
if isinstance(buf, (ConstantBuffer, _ReferenceBuffer)):

DeeployTest/testUtils/codeGenerateTraining.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -503,7 +503,8 @@ def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict
503503
return shared_input_map, shared_output_map
504504

505505

506-
def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str:
506+
def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int],
507+
train_c_source: str = "") -> str:
507508
"""Redirect optimizer I/O buffers to Training's already-allocated buffers.
508509
509510
Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that
@@ -558,12 +559,25 @@ def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_
558559
_arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)'
559560
r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;')
560561

562+
def _is_train_l2(train_idx: int) -> bool:
563+
"""Check if training input_N was allocated with pi_l2_malloc (promoted).
564+
If so, sharing the pointer would send an L2 address to the optimizer's
565+
closure_L3 pi_cl_ram_copy_2d → HyperRAM OOB."""
566+
if not train_c_source:
567+
return False
568+
pat = rf'{_TRAIN_PREFIX}input_{train_idx}\s*=\s*\([^)]+\)\s*pi_l2_malloc\b'
569+
return bool(re.search(pat, train_c_source))
570+
561571
def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]:
562572
if kind == "input" and idx in shared_input_map:
563573
train_idx = shared_input_map[idx]
574+
if _is_train_l2(train_idx):
575+
return None # Don't share: training buffer at L2, optimizer expects L3
564576
return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* shared with TrainingNetwork */'
565577
if kind == "output" and idx in shared_output_map:
566578
train_idx = shared_output_map[idx]
579+
if _is_train_l2(train_idx):
580+
return None
567581
return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* in-place, shared with TrainingNetwork */'
568582
return None
569583

@@ -887,7 +901,8 @@ def generateOptimizerNetworkImplementation(deployer: NetworkDeployer,
887901
# Prefix substitution
888902
retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX)
889903
# Replace malloc calls for shared weight/grad buffers with Training pointers
890-
retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {})
904+
retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {},
905+
train_c_source = train_c_source or "")
891906
# Redirect optimizer L1/L2 arena mallocs to reuse training arenas
892907
if train_c_source:
893908
retStr = _patch_shared_arenas(retStr, train_c_source)

0 commit comments

Comments
 (0)