Skip to content

Commit 5f6813a

Browse files
committed
[CNNTraining] Convdw gradw
1 parent ce5491e commit 5f6813a

10 files changed

Lines changed: 245 additions & 9 deletions

File tree

Deeploy/Targets/PULPOpen/Bindings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,13 @@
272272
ForkTransformer)
273273
]
274274

275+
# Bindings for the float32 depthwise ConvGradW node: a single binding that pairs a
# ConvChecker over two float32 pointers and one float32 pointer (presumably the
# two inputs and the single output - confirm against ConvChecker) with
# FloatConvTemplate.referenceDWConvGradW2DTemplate and the ForkTransformer.
PULPFloatDWConvGradW2DBindings = [
276+
NodeBinding(
277+
ConvChecker([PointerClass(float32_t), PointerClass(float32_t)],
278+
[PointerClass(float32_t)]), FloatConvTemplate.referenceDWConvGradW2DTemplate,
279+
ForkTransformer)
280+
]
281+
275282
PULPFloatConvGradB2DBindings = [
276283
NodeBinding(
277284
PULPConvGradBChecker([PointerClass(float32_t)], # Only one input: output_grad

Deeploy/Targets/PULPOpen/Parsers.py

Lines changed: 128 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -597,11 +597,38 @@ def parseNodeCtxt(self,
597597
return super().parseNodeCtxt(ctxt, node, channels_first)
598598

599599

600-
class PULPConvGradW2DParser(PULPFPConv2DParser):
600+
class PULPConvGradW2DParser(Conv2DParser):
601+
"""Parser for standard ConvGradW (non-grouped)"""
601602

602603
# Constructor: forwards noBiasHoisting (default True) unchanged to the parent
# class initializer; no additional state is set up here.
def __init__(self, noBiasHoisting = True):
603604
super().__init__(noBiasHoisting)
604605

606+
def parseNode(self, node: gs.Node) -> bool:
    """Accept a non-grouped ConvGradW node and record its padding.

    Returns False when the generic Conv2D checks fail, when the node is
    grouped (group > 1), or when it does not have exactly two inputs
    (output gradient and input data). On success the four entries of
    'pads' are stored as integer 'padding_*' fields.
    """
    # Invoke Conv2DParser.parseNode explicitly so that only the generic
    # 2D-convolution checks run.
    if not Conv2DParser.parseNode(self, node):
        return False

    opRep = self.operatorRepresentation

    # Grouped weight gradients are left to the depthwise ConvGradW parser.
    if opRep.get('group', 1) > 1:
        return False

    # Exactly two inputs are expected: output_grad and input_data.
    if len(node.inputs) != 2:
        return False

    # 'pads' is read in the order top, left, bottom, right.
    padFields = ('padding_y_top', 'padding_x_left', 'padding_y_bottom', 'padding_x_right')
    for position, field in enumerate(padFields):
        opRep[field] = int(opRep['pads'][position])

    return True
631+
605632
def parseNodeCtxt(self,
606633
ctxt: NetworkContext,
607634
node: gs.Node,
@@ -712,4 +739,103 @@ def parseNodeCtxt(self,
712739
self.operatorRepresentation['bias'] = node.outputs[0].name
713740
self.operatorRepresentation['bias_type'] = output_grad_tensor._type # Same type as grad_out
714741

715-
return ctxt, True
742+
return ctxt, True
743+
744+
745+
class PULPDWConvGradW2DParser(Conv2DParser):
    """Parser for the weight gradient of a depthwise 2D convolution (ConvGradW, group > 1).

    The node is expected to have two inputs (output gradient, input data) and
    one output (weight gradient). Only configurations that the bound kernel
    PULP_DWConvGradW2d_fp32_fp32_fp32_NCHW can compute are accepted: that
    kernel derives the channels-per-group as C_in / C_out, which is only
    correct when group == C_out.
    """

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:
        """Check node attributes; return True only for a grouped, two-input ConvGradW.

        On success the four entries of 'pads' are stored as integer
        'padding_*' fields of the operator representation.
        """
        # Run only the generic Conv2D attribute checks.
        if not Conv2DParser.parseNode(self, node):
            return False

        opRep = self.operatorRepresentation

        # A missing 'group' attribute or group <= 1 is not handled by this parser.
        if 'group' not in opRep or opRep['group'] <= 1:
            return False

        # ConvGradW has 2 inputs: output_grad and input_data
        if len(node.inputs) != 2:
            return False

        # 'pads' is read in the order top, left, bottom, right.
        opRep['padding_y_top'] = int(opRep['pads'][0])
        opRep['padding_x_left'] = int(opRep['pads'][1])
        opRep['padding_y_bottom'] = int(opRep['pads'][2])
        opRep['padding_x_right'] = int(opRep['pads'][3])

        return True

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        """Resolve tensors and fill the operator representation for the template.

        Args:
            ctxt: Network context used to look up the node's input buffers.
            node: The ConvGradW node being parsed.
            channels_first: Not consulted; shapes are always read as NCHW.

        Returns:
            (ctxt, True) when the node is supported, (ctxt, False) otherwise.
            Unsupported means: parseNode rejects the node, an input is not
            4-dimensional, group != C_out, or C_in is not divisible by group.
        """
        if not self.parseNode(node):
            return ctxt, False

        gradOutBuffer = ctxt.lookup(node.inputs[0].name)
        dataInBuffer = ctxt.lookup(node.inputs[1].name)

        # The indexing below assumes 4D NCHW tensors; reject anything else
        # instead of failing with an IndexError.
        if len(gradOutBuffer.shape) != 4 or len(dataInBuffer.shape) != 4:
            return ctxt, False

        batch, C_out, H_out, W_out = gradOutBuffer.shape
        _, C_in, H_in, W_in = dataInBuffer.shape

        group = self.operatorRepresentation['group']

        # The kernel treats every output channel as its own group
        # (C_in_per_group = C_in / C_out), so any other grouping would be
        # computed incorrectly. Reject it here rather than emit wrong code.
        # Explicit checks are used because asserts vanish under `python -O`.
        if group != C_out or C_in % group != 0:
            return ctxt, False

        self.operatorRepresentation.update({
            'batch': batch,
            # Output-gradient and input geometry
            'ch_im_out': C_out,
            'dim_im_out_x': W_out,
            'dim_im_out_y': H_out,
            'ch_im_in': C_in,
            'dim_im_in_x': W_in,
            'dim_im_in_y': H_in,
            # Kernel size and strides, both stored as (y, x)
            'dim_kernel_y': self.operatorRepresentation['kernel_shape'][0],
            'dim_kernel_x': self.operatorRepresentation['kernel_shape'][1],
            'stride_y': self.operatorRepresentation['strides'][0],
            'stride_x': self.operatorRepresentation['strides'][1],
            # Tensor names and types consumed by the template
            'grad_out': node.inputs[0].name,
            'grad_out_type': gradOutBuffer._type,
            'data_in': node.inputs[1].name,
            'data_in_type': dataInBuffer._type,
            'weight': node.outputs[0].name,
            # NOTE(review): the weight gradient reuses the grad_out type rather
            # than looking up the output buffer's own type - confirm intended.
            'weight_type': gradOutBuffer._type,
            # No bias for ConvGradW
            'has_bias': 'false',
            'bias': 'NULL',
        })

        return ctxt, True

Deeploy/Targets/PULPOpen/Platform.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@
3434
from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer
3535
from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \
3636
PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \
37-
PULPTallGEMMParser, PULPConvTrans2DParser, PULPConvGradW2DParser, PULPConvGradB2DParser, PULPDWConvTrans2DParser
37+
PULPTallGEMMParser, PULPConvTrans2DParser, PULPConvGradW2DParser, PULPConvGradB2DParser, PULPDWConvTrans2DParser, \
38+
PULPDWConvGradW2DParser
3839
from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate
3940
from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \
4041
PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \
@@ -49,7 +50,8 @@
4950
PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \
5051
PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \
5152
PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings, PULPAveragePool2DTilingReadyBindings, \
52-
PULPAveragePoolGrad2DTilingReadyBindings, PULPConvTrans2DTilingReadyBindings, PULPConvGradW2DTilingReadyBindings, PULPConvGradB2DTilingReadyBindings, PULPDWConvTrans2DTilingReadyBindings
53+
PULPAveragePoolGrad2DTilingReadyBindings, PULPConvTrans2DTilingReadyBindings, PULPConvGradW2DTilingReadyBindings, PULPConvGradB2DTilingReadyBindings, PULPDWConvTrans2DTilingReadyBindings, \
54+
PULPDWConvGradW2DTilingReadyBindings
5355
from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \
5456
PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass
5557

@@ -82,6 +84,7 @@
8284
ConvGradXMapper = NodeMapper(PULPConvTrans2DParser(), PULPConvTrans2DTilingReadyBindings)
8385
DwConvGradxMapper = NodeMapper(PULPDWConvTrans2DParser(), PULPDWConvTrans2DTilingReadyBindings)
8486
ConvGradWMapper = NodeMapper(PULPConvGradW2DParser(), PULPConvGradW2DTilingReadyBindings)
87+
# Mapper for the depthwise ConvGradW path: pairs PULPDWConvGradW2DParser with its tiling-ready bindings.
DwConvGradWMapper = NodeMapper(PULPDWConvGradW2DParser(), PULPDWConvGradW2DTilingReadyBindings)
8588
ConvGradBMapper = NodeMapper(PULPConvGradB2DParser(), PULPConvGradB2DTilingReadyBindings)
8689

8790
Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings)
@@ -121,7 +124,7 @@
121124
PULPMapping = {
122125
'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]),
123126
'ConvGradX': ConvLayer([ConvGradXMapper, DwConvGradxMapper]),
124-
'ConvGradW': ConvLayer([ConvGradWMapper]),
127+
'ConvGradW': ConvLayer([ConvGradWMapper, DwConvGradWMapper]),
125128
'ConvGradB': ConvLayer([ConvGradBMapper]),
126129
'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]),
127130
'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]),

Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,28 @@ def hoistTransientBuffers(self, ctxt: NetworkContext,
9494
ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
9595
}
9696
""")
97+
# Code template for the depthwise ConvGradW node (fp, NCHW).
# The generated C loops ${batch} times; every iteration calls
# PULP_DWConvGradW2d_fp*_fp*_fp*_NCHW for one sample and then advances the
# grad_out and data_in pointers by one sample. The weight-gradient pointer is
# never advanced, so every call writes to the same buffer.
# NOTE(review): the kernel in Convolution_fp32.c stores its result with '='
# (it does not accumulate), so for batch > 1 only the last sample's gradient
# appears to survive - confirm whether accumulation over the batch is intended.
referenceDWConvGradW2DTemplate = NodeTemplate("""
98+
// 2D FP DW ConvGradW NCHW (Name: ${nodeName}, Op: ${nodeOp})
99+
${grad_out_type.typeName} ref_${weight}_${grad_out} = ${grad_out};
100+
${data_in_type.typeName} ref_${weight}_${data_in} = ${data_in};
101+
${weight_type.typeName} ref_${weight}_out = ${weight};
102+
103+
for (uint32_t n=0; n<${batch}; ++n) {
104+
PULP_DWConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_NCHW(
105+
ref_${weight}_${grad_out},
106+
${dim_im_out_y}, ${dim_im_out_x}, ${ch_im_out},
107+
ref_${weight}_${data_in},
108+
${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in},
109+
${dim_kernel_y}, ${dim_kernel_x},
110+
${stride_y}, ${stride_x},
111+
ref_${weight}_out,
112+
${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}
113+
);
97114
115+
ref_${weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
116+
ref_${weight}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y};
117+
}
118+
""")
98119
reference2DIm2ColTemplate = PULP2DFloatConvIm2ColTemplate("""
99120
// 2D FP Conv HWC with Im2Col and ChannelOout parallelism (Name: ${nodeName}, Op: ${nodeOp})
100121

Deeploy/Targets/PULPOpen/Tiler.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
PULPSliceBindings, PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, \
2525
PULPAveragePool2DBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \
2626
PULPTransposeBindings, PULPUniformRQSBindings, PULPAveragePoolGrad2DBindings, PULPFloatConvTrans2DBindings, \
27-
PULPFloatConvGradW2DBindings, PULPFloatConvGradB2DBindings, PULPFloatDWConvTrans2DBindings
27+
PULPFloatConvGradW2DBindings, PULPFloatConvGradB2DBindings, PULPFloatDWConvTrans2DBindings, \
28+
PULPFloatDWConvGradW2DBindings
2829
from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint
2930
from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \
3031
RQDWConv2DTileConstraint
@@ -175,5 +176,8 @@
175176
PULPConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvGradW2DBindings,
176177
tileConstraint = ConvGradW2DTileConstraint())
177178

179+
# Tiling-ready bindings for the depthwise ConvGradW node.
# NOTE(review): this reuses ConvGradW2DTileConstraint, the same constraint used
# for the non-depthwise ConvGradW bindings - confirm it is valid for the
# depthwise weight layout.
PULPDWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConvGradW2DBindings,
180+
tileConstraint = ConvGradW2DTileConstraint())
181+
178182
PULPConvGradB2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvGradB2DBindings,
179183
tileConstraint = ConvGradW2DTileConstraint())
992 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.

TargetLibraries/PULPOpen/inc/kernel/Conv.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,4 +68,15 @@ void PULP_DWConvTrans2d_fp32_fp32_fp32_HWC(
6868
uint32_t pad_top, uint32_t pad_bottom,
6969
uint32_t pad_left, uint32_t pad_right);
7070

71+
/*
 * Weight gradient of a depthwise 2D convolution (fp32, NCHW), one sample per call.
 *
 * pGradOut     output gradient, indexed as [C_out][H_out][W_out]
 * pInput       forward input, indexed as [C_in][H_in][W_in]
 * P, Q         kernel height and width
 * SP, SQ       stride along height and width
 * pGradWeight  result, indexed as [C_out][C_in / C_out][P][Q]; overwritten, not accumulated
 * pad_top/pad_left    leading padding used to map output to input coordinates
 * pad_bottom/pad_right  accepted but not read by the implementation
 */
void PULP_DWConvGradW2d_fp32_fp32_fp32_NCHW(
72+
const float *__restrict__ pGradOut,
73+
uint32_t H_out, uint32_t W_out, uint32_t C_out,
74+
const float *__restrict__ pInput,
75+
uint32_t H_in, uint32_t W_in, uint32_t C_in,
76+
uint32_t P, uint32_t Q,
77+
uint32_t SP, uint32_t SQ,
78+
float *__restrict__ pGradWeight,
79+
uint32_t pad_top, uint32_t pad_bottom,
80+
uint32_t pad_left, uint32_t pad_right);
81+
7182
#endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_

TargetLibraries/PULPOpen/src/Convolution_fp32.c

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ void PULP_DWConvTrans2d_fp32_fp32_fp32_HWC(
339339
// Workaround for GCC/RISC-V compiler optimization bug
340340
// Without this printf, the compiler generates incorrect pointer arithmetic
341341
// causing wrong results at specific indices (w=0,1 positions)
342-
342+
printf("hello");
343343
pGradIn[gi_idx] += pGradOut[go_idx] * w_val;
344344
}
345345
}
@@ -419,8 +419,6 @@ void PULP_ConvGradB2d_fp32_fp32_NCHW(const float *__restrict__ pGradOut,
419419
return;
420420
}
421421

422-
// Compute bias gradients
423-
// For each output channel, sum all gradients across batch, height, and width
424422
for (uint32_t oc = ch_start; oc < ch_stop; ++oc) {
425423
float grad_sum = 0.0f;
426424

@@ -435,4 +433,70 @@ void PULP_ConvGradB2d_fp32_fp32_NCHW(const float *__restrict__ pGradOut,
435433

436434
pGradBias[oc] = grad_sum;
437435
}
436+
}
437+
438+
439+
void PULP_DWConvGradW2d_fp32_fp32_fp32_NCHW(
440+
const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out,
441+
uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in,
442+
uint32_t W_in, uint32_t C_in, uint32_t P, uint32_t Q, uint32_t SP,
443+
uint32_t SQ, float *__restrict__ pGradWeight, uint32_t pad_top,
444+
uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right) {
445+
446+
int8_t core_id = pi_core_id();
447+
int8_t log2Core = LOG2(NUM_CORES);
448+
449+
uint16_t ch_chunk = (C_out >> log2Core) + ((C_out & (NUM_CORES - 1)) != 0);
450+
uint16_t ch_start = MIN(ch_chunk * core_id, C_out);
451+
uint16_t ch_stop = MIN(ch_start + ch_chunk, C_out);
452+
453+
if (ch_start >= ch_stop) {
454+
return;
455+
}
456+
457+
uint32_t C_in_per_group = C_in / C_out;
458+
459+
for (uint32_t oc = ch_start; oc < ch_stop; ++oc) {
460+
uint32_t ic_start = oc * C_in_per_group;
461+
462+
for (uint32_t ic_idx = 0; ic_idx < C_in_per_group; ++ic_idx) {
463+
uint32_t ic = ic_start + ic_idx;
464+
465+
for (uint32_t kh = 0; kh < P; ++kh) {
466+
for (uint32_t kw = 0; kw < Q; ++kw) {
467+
468+
float grad_sum = 0.0f;
469+
470+
for (uint32_t oh = 0; oh < H_out; ++oh) {
471+
for (uint32_t ow = 0; ow < W_out; ++ow) {
472+
473+
int32_t ih = (int32_t)oh * (int32_t)SP +
474+
(int32_t)kh - (int32_t)pad_top;
475+
int32_t iw = (int32_t)ow * (int32_t)SQ +
476+
(int32_t)kw - (int32_t)pad_left;
477+
478+
if (ih >= 0 && ih < (int32_t)H_in &&
479+
iw >= 0 && iw < (int32_t)W_in) {
480+
481+
uint32_t go_idx = (oc * H_out + oh) * W_out + ow;
482+
float gy = pGradOut[go_idx];
483+
484+
uint32_t in_idx =
485+
(ic * H_in + (uint32_t)ih) * W_in + (uint32_t)iw;
486+
487+
grad_sum += gy * pInput[in_idx];
488+
printf("hello");
489+
}
490+
}
491+
}
492+
493+
uint32_t gw_idx =
494+
((oc * C_in_per_group + ic_idx) * P + kh) * Q + kw;
495+
pGradWeight[gw_idx] = grad_sum;
496+
497+
498+
}
499+
}
500+
}
501+
}
438502
}

0 commit comments

Comments
 (0)