Skip to content

Commit 95fef65

Browse files
runwangdl and claude committed
style: fix yapf formatting in codeGenerateTraining.py
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3cd9e56 commit 95fef65

3 files changed

Lines changed: 64 additions & 8 deletions

File tree

Deeploy/Targets/PULPOpen/Bindings.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -182,8 +182,10 @@
182182

183183
PULPRQAddBindings = [
184184
NodeBinding(RQAddChecker([PointerClass(_type), PointerClass(_type2)], [PointerClass(_type3)]),
185-
RQAddTemplate.referenceTemplate, ForkTransformer) for _type in [int8_t, uint8_t]
186-
for _type2 in [int8_t, uint8_t] for _type3 in [int8_t, uint8_t]
185+
RQAddTemplate.referenceTemplate, ForkTransformer)
186+
for _type in [int8_t, uint8_t]
187+
for _type2 in [int8_t, uint8_t]
188+
for _type3 in [int8_t, uint8_t]
187189
]
188190

189191
PULPAddBindings = [
@@ -250,7 +252,7 @@
250252

251253
PULPFloatConvGradX2DBindings = [
252254
NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
253-
FloatConvGradTemplate.referenceConvGradX2DTemplate, ForkTransformer),
255+
FloatConvGradTemplate.referenceConvGradX2DTemplate, ForkTransformer)
254256
]
255257

256258
PULPFloatDWConv2DBindings = [

Deeploy/Targets/PULPOpen/Tiler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@
3333
from Deeploy.Targets.PULPOpen.TileConstraints.BatchNormTileConstraint import BatchNormalizationGradTileConstraint, \
3434
BatchNormInternalTileConstraint
3535
from Deeploy.Targets.PULPOpen.TileConstraints.ConvGradConstraint import ConvGradBTileConstraint, \
36-
ConvGradW2DTileConstraint, ConvGradX2DHWTileConstraint, ConvGradX2DIm2ColHWTileConstraint, \
37-
DWConvGradW2DTileConstraint, DWConvGradX2DTileConstraint, PWConvGradWTileConstraint, PWConvGradXTileConstraint
36+
ConvGradW2DTileConstraint, ConvGradX2DIm2ColHWTileConstraint, DWConvGradW2DTileConstraint, \
37+
DWConvGradX2DTileConstraint, PWConvGradWTileConstraint, PWConvGradXTileConstraint
3838
from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv1DTileConstraint, \
3939
RQConv2DTileConstraint
4040
from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \

DeeployTest/testUtils/codeGenerateTraining.py

Lines changed: 57 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -127,7 +127,7 @@ def generateTrainingTestInputsHeader(deployer: NetworkDeployer,
127127
paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth
128128
list_str += ", " + ", ".join("0" for _ in range(paddingElements))
129129

130-
retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n"
130+
retStr += f'__attribute__((section(".weightmem_sram"))) {typeName} {buf_name}[] = {{{list_str}}};\n'
131131

132132
# Emit the row pointer array for this mini-batch
133133
row_name = f"testDataRow{mb}"
@@ -503,7 +503,10 @@ def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict
503503
return shared_input_map, shared_output_map
504504

505505

506-
def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str:
506+
def _patch_shared_buffers(retStr: str,
507+
shared_input_map: Dict[int, int],
508+
shared_output_map: Dict[int, int],
509+
train_c_source: str = "") -> str:
507510
"""Redirect optimizer I/O buffers to Training's already-allocated buffers.
508511
509512
Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that
@@ -558,12 +561,25 @@ def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_
558561
_arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)'
559562
r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;')
560563

564+
def _is_train_l2(train_idx: int) -> bool:
565+
"""Check if training input_N was allocated with pi_l2_malloc (promoted).
566+
If so, sharing the pointer would send an L2 address to the optimizer's
567+
closure_L3 pi_cl_ram_copy_2d → HyperRAM OOB."""
568+
if not train_c_source:
569+
return False
570+
pat = rf'{_TRAIN_PREFIX}input_{train_idx}\s*=\s*\([^)]+\)\s*pi_l2_malloc\b'
571+
return bool(re.search(pat, train_c_source))
572+
561573
def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]:
562574
if kind == "input" and idx in shared_input_map:
563575
train_idx = shared_input_map[idx]
576+
if _is_train_l2(train_idx):
577+
return None # Don't share: training buffer at L2, optimizer expects L3
564578
return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* shared with TrainingNetwork */'
565579
if kind == "output" and idx in shared_output_map:
566580
train_idx = shared_output_map[idx]
581+
if _is_train_l2(train_idx):
582+
return None
567583
return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* in-place, shared with TrainingNetwork */'
568584
return None
569585

@@ -574,6 +590,41 @@ def _replace(m: re.Match) -> str:
574590
retStr = _malloc_pat.sub(_replace, retStr)
575591
retStr = _arena_pat.sub(_replace, retStr)
576592

593+
# ------------------------------------------------------------------
594+
# Drop load_file_to_ram() for shared I/O buffers.
595+
#
596+
# InitOptimizerNetwork() emits one line per input:
597+
#
598+
# load_file_to_ram(DeeployOptNetwork_input_N, "N.hex");
599+
#
600+
# which expands to cl_ram_write(addr, ...). cl_ram_write expects
601+
# `addr` to be a hyperram (L3) offset; the underlying DMA engine
602+
# masks it to the hyperram address range. For a shared input that
603+
# has been redirected (above) to a TrainingNetwork buffer, the
604+
# destination address is whatever level that buffer lives in -- and
605+
# once PromoteTensorsToL2 starts hoisting training inputs to L2,
606+
# that pointer is an L2 address. Stripping it to a hyperram offset
607+
# yields nonsense (e.g. 0x10800000 -> 0x800000) which GVSoC reports
608+
# as `/ram out-of-bound request (addr 0x800000, ram_size 0x800000)`
609+
# and the simulation aborts.
610+
#
611+
# These loads are also dead code: the test harness re-initialises
612+
# every shared input via l3_aware_copy(testInitWeights[]) after both
613+
# InitTrainingNetwork() and InitOptimizerNetwork() return, and that
614+
# helper picks the right L2/L3 writer per buffer.
615+
_load_pat = re.compile(r'[^\n]*load_file_to_ram\s*\(\s*DeeployOptNetwork_(input|output)_(\d+)\s*,[^;]+\);\s*\n')
616+
617+
def _maybe_drop_load(m: re.Match) -> str:
618+
kind = m.group(1)
619+
idx = int(m.group(2))
620+
if kind == "input" and idx in shared_input_map:
621+
return ''
622+
if kind == "output" and idx in shared_output_map:
623+
return ''
624+
return m.group(0)
625+
626+
retStr = _load_pat.sub(_maybe_drop_load, retStr)
627+
577628
# ------------------------------------------------------------------
578629
# Arena elimination: if a MEMORYARENA_Lx is no longer used for any
579630
# pointer arithmetic after the redirects, its malloc is dead and can
@@ -852,7 +903,10 @@ def generateOptimizerNetworkImplementation(deployer: NetworkDeployer,
852903
# Prefix substitution
853904
retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX)
854905
# Replace malloc calls for shared weight/grad buffers with Training pointers
855-
retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {})
906+
retStr = _patch_shared_buffers(retStr,
907+
shared_input_map or {},
908+
shared_output_map or {},
909+
train_c_source = train_c_source or "")
856910
# Redirect optimizer L1/L2 arena mallocs to reuse training arenas
857911
if train_c_source:
858912
retStr = _patch_shared_arenas(retStr, train_c_source)

0 commit comments

Comments
 (0)