Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Deeploy/Targets/Generic/Layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def __init__(self, maps: List[NodeMapper]):
def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
                  channels_first) -> Tuple[Shape, Shape]:
    """Normalise the third input shape, then return both shape lists.

    When exactly three input shapes are supplied, the third entry is
    replaced in place by a 1-tuple holding the first dimension of the
    second input's shape. Otherwise the shapes pass through unchanged.

    Args:
        inputShapes: Mutable sequence of input shapes; may be modified in place.
        outputShapes: Output shapes; returned unchanged.
        operatorRepresentation: Not read by this method.
        channels_first: Not read by this method.

    Returns:
        The tuple ``(inputShapes, outputShapes)``.
    """
    if len(inputShapes) == 3:
        # Store a 1-tuple, not a bare int, so the entry is itself a shape.
        # The earlier bare-int assignment was immediately overwritten by this
        # one and has been dropped as a dead store.
        inputShapes[2] = (inputShapes[1][0],)
    return (inputShapes, outputShapes)

def computeOps(self):
Expand Down
32 changes: 30 additions & 2 deletions Deeploy/Targets/PULPOpen/Templates/FloatConvGradTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@
# the template always sees the real name even when a Closure pass renders
# the kernel body before TilingCodeGeneration runs.
_TILE_IDX_SYMBOL_FMT = "TILING_CODEGEN_L1_{node}_tileIdxPtr"
# Per-execution "dW already zeroed" flag. Reset to 0 on every backward pass
# (every RunTrainingNetwork entry) and set to 1 once the first spatial tile has
# zeroed grad_weight. This is the CORRECT first-tile signal for the H/W-tiled
# memset: tileIdxPtr is a per-EXECUTION index (constant across an execution's
# spatial tiles), so guarding the memset on `*tileIdxPtr == 0` ran the memset
# on EVERY tile and wiped the cross-tile mm_add weight-gradient accumulation.
_DW_ZERO_FLAG_SYMBOL_FMT = "TILING_CODEGEN_L1_{node}_dwZeroFlagPtr"


def _is_tiled_expr(val: Any) -> bool:
Expand Down Expand Up @@ -69,6 +76,25 @@ def alignToContext(self, ctxt: NetworkContext,
"${type.referencedType.typeName}* ${name} = &bu_${name};")

operatorRepresentation['tileIdxPtr'] = symbol

# Hoist a per-execution dW-zeroing flag (same stack-var pattern as
# tileIdxPtr: re-initialised to 0 on every backward pass). Used by the
# H/W-tiled memset guard so grad_weight is zeroed exactly once per
# execution (on the first spatial tile) and then accumulated across the
# remaining tiles via the kernel's mm_add.
flagSymbol = _DW_ZERO_FLAG_SYMBOL_FMT.format(node = node)
if not ctxt.is_buffer(flagSymbol):
from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import uint32_t
dwZeroFlag = ctxt.VariableBuffer(flagSymbol, shape = [1])
ctxt.add(dwZeroFlag, "local")
dwZeroFlag._type = PointerClass(uint32_t)
dwZeroFlag._instance = dwZeroFlag._type(dwZeroFlag.name, ctxt)
dwZeroFlag.allocTemplate = NodeTemplate("")
dwZeroFlag.deallocTemplate = NodeTemplate("")
dwZeroFlag.initTemplate = NodeTemplate("${type.referencedType.typeName} bu_${name} = 0;\n"
"${type.referencedType.typeName}* ${name} = &bu_${name};")
operatorRepresentation['dwZeroFlagPtr'] = flagSymbol
return ctxt, operatorRepresentation, []


Expand Down Expand Up @@ -213,8 +239,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext,
## Tiled template vars render as '*..._ref' pointer-deref strings; untiled
## vars render as literal ints/identifiers — see _is_tiled_expr.
% if (isinstance(dim_im_out_x, str) or isinstance(dim_im_out_y, str) or isinstance(dim_im_in_x, str) or isinstance(dim_im_in_y, str)) and not isinstance(ch_im_out, str):
if ((uint32_t)*${tileIdxPtr} == 0u) {
if ((uint32_t)*${dwZeroFlagPtr} == 0u) {
memset(${grad_weight}, 0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName}));
*${dwZeroFlagPtr} = 1u;
}
% else:
memset(${grad_weight}, 0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName}));
Expand Down Expand Up @@ -252,8 +279,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext,
## Tiled template vars render as '*..._ref' pointer-deref strings; untiled
## vars render as literal ints/identifiers — see _is_tiled_expr.
% if (isinstance(dim_im_out_x, str) or isinstance(dim_im_out_y, str) or isinstance(dim_im_in_x, str) or isinstance(dim_im_in_y, str)) and not isinstance(ch_im_out, str):
if ((uint32_t)*${tileIdxPtr} == 0u) {
if ((uint32_t)*${dwZeroFlagPtr} == 0u) {
memset(${grad_weight}, 0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName}));
*${dwZeroFlagPtr} = 1u;
}
% else:
memset(${grad_weight}, 0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName}));
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions DeeployTest/test_siracusa_tiled_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@
"Models/Training/SimpleMLP/simplemlp_train": [64000],
"Models/Training/Autoencoder/autoencoder_train": [128000],
"Models/Training/DSCNN/dscnn_train": [128000, 64000],
"Models/Training/SpeechNet/speechnet_train": [128000],
}

# Training-enabled tiled models that need L3 spill (weights/activations don't
Expand Down
Loading
Loading