@@ -127,7 +127,7 @@ def generateTrainingTestInputsHeader(deployer: NetworkDeployer,
127127 paddingElements = (pad_bytes * 8 + typeWidth - 1 ) // typeWidth
128128 list_str += ", " + ", " .join ("0" for _ in range (paddingElements ))
129129
130- retStr += f" { typeName } { buf_name } [] = {{{ list_str } }};\n "
130+ retStr += f'__attribute__((section(".weightmem_sram"))) { typeName } { buf_name } [] = {{{ list_str } }};\n '
131131
132132 # Emit the row pointer array for this mini-batch
133133 row_name = f"testDataRow{ mb } "
@@ -503,7 +503,10 @@ def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict
503503 return shared_input_map , shared_output_map
504504
505505
506- def _patch_shared_buffers (retStr : str , shared_input_map : Dict [int , int ], shared_output_map : Dict [int , int ]) -> str :
506+ def _patch_shared_buffers (retStr : str ,
507+ shared_input_map : Dict [int , int ],
508+ shared_output_map : Dict [int , int ],
509+ train_c_source : str = "" ) -> str :
507510 """Redirect optimizer I/O buffers to Training's already-allocated buffers.
508511
509512 Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that
@@ -558,12 +561,25 @@ def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_
558561 _arena_pat = re .compile (r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)'
559562 r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;' )
560563
564+ def _is_train_l2 (train_idx : int ) -> bool :
565+ """Check if training input_N was allocated with pi_l2_malloc (promoted).
566+ If so, sharing the pointer would send an L2 address to the optimizer's
567+ closure_L3 pi_cl_ram_copy_2d → HyperRAM OOB."""
568+ if not train_c_source :
569+ return False
570+ pat = rf'{ _TRAIN_PREFIX } input_{ train_idx } \s*=\s*\([^)]+\)\s*pi_l2_malloc\b'
571+ return bool (re .search (pat , train_c_source ))
572+
561573 def _make_replacement (symbol : str , kind : str , idx : int ) -> Optional [str ]:
562574 if kind == "input" and idx in shared_input_map :
563575 train_idx = shared_input_map [idx ]
576+ if _is_train_l2 (train_idx ):
577+ return None # Don't share: training buffer at L2, optimizer expects L3
564578 return f'{ symbol } = (float32_t *){ _TRAIN_PREFIX } input_{ train_idx } ; /* shared with TrainingNetwork */'
565579 if kind == "output" and idx in shared_output_map :
566580 train_idx = shared_output_map [idx ]
581+ if _is_train_l2 (train_idx ):
582+ return None
567583 return f'{ symbol } = (float32_t *){ _TRAIN_PREFIX } input_{ train_idx } ; /* in-place, shared with TrainingNetwork */'
568584 return None
569585
@@ -574,6 +590,41 @@ def _replace(m: re.Match) -> str:
574590 retStr = _malloc_pat .sub (_replace , retStr )
575591 retStr = _arena_pat .sub (_replace , retStr )
576592
593+ # ------------------------------------------------------------------
594+ # Drop load_file_to_ram() for shared I/O buffers.
595+ #
596+ # InitOptimizerNetwork() emits one line per input:
597+ #
598+ # load_file_to_ram(DeeployOptNetwork_input_N, "N.hex");
599+ #
600+ # which expands to cl_ram_write(addr, ...). cl_ram_write expects
601+ # `addr` to be a hyperram (L3) offset; the underlying DMA engine
602+ # masks it to the hyperram address range. For a shared input that
603+ # has been redirected (above) to a TrainingNetwork buffer, the
604+ # destination address is whatever level that buffer lives in -- and
605+ # once PromoteTensorsToL2 starts hoisting training inputs to L2,
606+ # that pointer is an L2 address. Stripping it to a hyperram offset
607+ # yields nonsense (e.g. 0x10800000 -> 0x800000) which GVSoC reports
608+ # as `/ram out-of-bound request (addr 0x800000, ram_size 0x800000)`
609+ # and the simulation aborts.
610+ #
611+ # These loads are also dead code: the test harness re-initialises
612+ # every shared input via l3_aware_copy(testInitWeights[]) after both
613+ # InitTrainingNetwork() and InitOptimizerNetwork() return, and that
614+ # helper picks the right L2/L3 writer per buffer.
615+ _load_pat = re .compile (r'[^\n]*load_file_to_ram\s*\(\s*DeeployOptNetwork_(input|output)_(\d+)\s*,[^;]+\);\s*\n' )
616+
617+ def _maybe_drop_load (m : re .Match ) -> str :
618+ kind = m .group (1 )
619+ idx = int (m .group (2 ))
620+ if kind == "input" and idx in shared_input_map :
621+ return ''
622+ if kind == "output" and idx in shared_output_map :
623+ return ''
624+ return m .group (0 )
625+
626+ retStr = _load_pat .sub (_maybe_drop_load , retStr )
627+
577628 # ------------------------------------------------------------------
578629 # Arena elimination: if a MEMORYARENA_Lx is no longer used for any
579630 # pointer arithmetic after the redirects, its malloc is dead and can
@@ -852,7 +903,10 @@ def generateOptimizerNetworkImplementation(deployer: NetworkDeployer,
852903 # Prefix substitution
853904 retStr = retStr .replace (_TRAIN_PREFIX , _OPT_PREFIX )
854905 # Replace malloc calls for shared weight/grad buffers with Training pointers
855- retStr = _patch_shared_buffers (retStr , shared_input_map or {}, shared_output_map or {})
906+ retStr = _patch_shared_buffers (retStr ,
907+ shared_input_map or {},
908+ shared_output_map or {},
909+ train_c_source = train_c_source or "" )
856910 # Redirect optimizer L1/L2 arena mallocs to reuse training arenas
857911 if train_c_source :
858912 retStr = _patch_shared_arenas (retStr , train_c_source )
0 commit comments