[CNNTraining] Convdw gradw

runwangdl · runwangdl · commit 5f6813a7d96e · 2025-12-11T20:08:32.000Z
diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -272,6 +272,13 @@
         ForkTransformer)
 ]
 
+# float32 depthwise ConvGradW: two float32 pointer inputs, one float32 pointer output.
+PULPFloatDWConvGradW2DBindings = [
+    NodeBinding(
+        ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+        FloatConvTemplate.referenceDWConvGradW2DTemplate, ForkTransformer)
+]
+
 PULPFloatConvGradB2DBindings = [
     NodeBinding(
         PULPConvGradBChecker([PointerClass(float32_t)],  # Only one input: output_grad
diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py
@@ -597,11 +597,38 @@ def parseNodeCtxt(self,
             return super().parseNodeCtxt(ctxt, node, channels_first)
 
 
-class PULPConvGradW2DParser(PULPFPConv2DParser):
+class PULPConvGradW2DParser(Conv2DParser):
+    """Parser for standard ConvGradW (non-grouped)"""
 
     def __init__(self, noBiasHoisting = True):
         super().__init__(noBiasHoisting)
 
+    def parseNode(self, node: gs.Node) -> bool:
+        """Accept only non-grouped ConvGradW nodes that have exactly two inputs.
+
+        On acceptance the four ``pads`` entries are copied, as ints, into the
+        ``padding_*`` keys of the operator representation.
+        """
+        # Let Conv2DParser check the node first.
+        if not Conv2DParser.parseNode(self, node):
+            return False
+
+        opRep = self.operatorRepresentation
+
+        # group > 1 is handled by DWConvGradW2DParser; the two expected inputs
+        # are output_grad and input_data.
+        isGrouped = 'group' in opRep and opRep['group'] > 1
+        if isGrouped or len(node.inputs) != 2:
+            return False
+
+        # Key order follows the pads layout: y_top, x_left, y_bottom, x_right.
+        paddingKeys = ('padding_y_top', 'padding_x_left',
+                       'padding_y_bottom', 'padding_x_right')
+        for padIdx, paddingKey in enumerate(paddingKeys):
+            opRep[paddingKey] = int(opRep['pads'][padIdx])
+
+        return True
+
     def parseNodeCtxt(self,
                       ctxt: NetworkContext,
                       node: gs.Node,
@@ -712,4 +739,103 @@ def parseNodeCtxt(self,
         self.operatorRepresentation['bias'] = node.outputs[0].name
         self.operatorRepresentation['bias_type'] = output_grad_tensor._type  # Same type as grad_out
         
-        return ctxt, True
+        return ctxt, True
+
+
+class PULPDWConvGradW2DParser(Conv2DParser):
+    """Parser for the weight gradient of a depthwise 2D convolution.
+
+    Accepts ConvGradW nodes with ``group > 1`` and exactly two inputs
+    (output gradient, forward input) and fills the operator representation
+    used by the depthwise ConvGradW template. Shapes are read as NCHW.
+    """
+
+    def __init__(self, noBiasHoisting=True):
+        super().__init__(noBiasHoisting)
+
+    def parseNode(self, node: gs.Node) -> bool:
+        """Return True if ``node`` is a well-formed grouped ConvGradW node.
+
+        On success the four ``pads`` entries are stored, as ints, under the
+        ``padding_*`` keys of the operator representation.
+        """
+        if not Conv2DParser.parseNode(self, node):
+            return False
+
+        rep = self.operatorRepresentation
+
+        # Nodes without a group, or with group <= 1, are not depthwise.
+        if 'group' not in rep or rep['group'] <= 1:
+            return False
+
+        # ConvGradW has 2 inputs: output_grad and input_data
+        if len(node.inputs) != 2:
+            return False
+
+        pads = rep['pads']
+        rep['padding_y_top'] = int(pads[0])
+        rep['padding_x_left'] = int(pads[1])
+        rep['padding_y_bottom'] = int(pads[2])
+        rep['padding_x_right'] = int(pads[3])
+
+        return True
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        """Resolve shapes, strides and tensor names for a depthwise ConvGradW.
+
+        Args:
+            ctxt: Network context used to look up both input tensors.
+            node: ConvGradW node; inputs are (grad_out, data_in) and output 0
+                receives the weight gradient.
+            channels_first: Not consulted; shapes are always read as NCHW.
+
+        Returns:
+            ``(ctxt, True)`` on success, ``(ctxt, False)`` if ``parseNode``
+            rejects the node or the grouping is not one group per channel.
+        """
+        if not self.parseNode(node):
+            return ctxt, False
+
+        rep = self.operatorRepresentation
+        grad_out_tensor = ctxt.lookup(node.inputs[0].name)
+        data_in_tensor = ctxt.lookup(node.inputs[1].name)
+
+        # Dimensions (NCHW format); the batch size is taken from grad_out.
+        grad_shape, data_shape = grad_out_tensor.shape, data_in_tensor.shape
+        batch, C_out, H_out, W_out = grad_shape[0], grad_shape[1], grad_shape[2], grad_shape[3]
+        C_in, H_in, W_in = data_shape[1], data_shape[2], data_shape[3]
+
+        # PULP_DWConvGradW2d_fp32_fp32_fp32_NCHW treats every output channel as
+        # its own group and reads C_in / C_out input channels for it, so any
+        # other grouping would be indexed wrongly and is declined here.
+        group = rep['group']
+        if group != C_out or C_in % group != 0:
+            return ctxt, False
+
+        rep['batch'] = batch
+        rep['ch_im_out'] = C_out
+        rep['dim_im_out_x'] = W_out
+        rep['dim_im_out_y'] = H_out
+        rep['ch_im_in'] = C_in
+        rep['dim_im_in_x'] = W_in
+        rep['dim_im_in_y'] = H_in
+
+        rep['dim_kernel_y'], rep['dim_kernel_x'] = rep['kernel_shape'][0], rep['kernel_shape'][1]
+        rep['stride_y'], rep['stride_x'] = rep['strides'][0], rep['strides'][1]
+
+        # Tensor names and types; the weight gradient reuses grad_out's type.
+        rep['grad_out'] = node.inputs[0].name
+        rep['grad_out_type'] = grad_out_tensor._type
+        rep['data_in'] = node.inputs[1].name
+        rep['data_in_type'] = data_in_tensor._type
+        rep['weight'] = node.outputs[0].name
+        rep['weight_type'] = grad_out_tensor._type
+
+        # No bias for ConvGradW
+        rep['has_bias'] = 'false'
+        rep['bias'] = 'NULL'
+
+        return ctxt, True
diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py
@@ -34,7 +34,8 @@
 from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer
 from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \
     PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \
-    PULPTallGEMMParser, PULPConvTrans2DParser, PULPConvGradW2DParser, PULPConvGradB2DParser, PULPDWConvTrans2DParser
+    PULPTallGEMMParser, PULPConvTrans2DParser, PULPConvGradW2DParser, PULPConvGradB2DParser, PULPDWConvTrans2DParser, \
+    PULPDWConvGradW2DParser
 from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate
 from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \
     PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \
@@ -49,7 +50,8 @@
     PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \
     PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \
     PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings, PULPAveragePool2DTilingReadyBindings, \
-    PULPAveragePoolGrad2DTilingReadyBindings, PULPConvTrans2DTilingReadyBindings, PULPConvGradW2DTilingReadyBindings, PULPConvGradB2DTilingReadyBindings, PULPDWConvTrans2DTilingReadyBindings
+    PULPAveragePoolGrad2DTilingReadyBindings, PULPConvTrans2DTilingReadyBindings, PULPConvGradW2DTilingReadyBindings, PULPConvGradB2DTilingReadyBindings, PULPDWConvTrans2DTilingReadyBindings, \
+    PULPDWConvGradW2DTilingReadyBindings
 from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \
     PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass
 
@@ -82,6 +84,7 @@
 ConvGradXMapper = NodeMapper(PULPConvTrans2DParser(), PULPConvTrans2DTilingReadyBindings)
 DwConvGradxMapper = NodeMapper(PULPDWConvTrans2DParser(), PULPDWConvTrans2DTilingReadyBindings)
 ConvGradWMapper = NodeMapper(PULPConvGradW2DParser(), PULPConvGradW2DTilingReadyBindings)
+DwConvGradWMapper = NodeMapper(PULPDWConvGradW2DParser(), PULPDWConvGradW2DTilingReadyBindings)
 ConvGradBMapper = NodeMapper(PULPConvGradB2DParser(), PULPConvGradB2DTilingReadyBindings)
 
 Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings)
@@ -121,7 +124,7 @@
 PULPMapping = {
     'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]),
     'ConvGradX': ConvLayer([ConvGradXMapper, DwConvGradxMapper]),
-    'ConvGradW': ConvLayer([ConvGradWMapper]),
+    'ConvGradW': ConvLayer([ConvGradWMapper, DwConvGradWMapper]),
     'ConvGradB': ConvLayer([ConvGradBMapper]),
     'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]),
     'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]),
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py
@@ -94,7 +94,28 @@ def hoistTransientBuffers(self, ctxt: NetworkContext,
     ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
 }
 """)
+referenceDWConvGradW2DTemplate = NodeTemplate("""
+// 2D FP DW ConvGradW NCHW (Name: ${nodeName}, Op: ${nodeOp})
+${grad_out_type.typeName} ref_${weight}_${grad_out} = ${grad_out};
+${data_in_type.typeName} ref_${weight}_${data_in} = ${data_in};
+${weight_type.typeName} ref_${weight}_out = ${weight};
+
+for (uint32_t n=0; n<${batch}; ++n) {
+    PULP_DWConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_NCHW(
+        ref_${weight}_${grad_out},
+        ${dim_im_out_y}, ${dim_im_out_x}, ${ch_im_out},
+        ref_${weight}_${data_in},
+        ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in},
+        ${dim_kernel_y}, ${dim_kernel_x},
+        ${stride_y}, ${stride_x},
+        ref_${weight}_out,
+        ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}
+    );
 
+    ref_${weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
+    ref_${weight}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y};
+}
+""")  # NOTE(review): all batch iterations write the same weight buffer and the kernel assigns (no +=) - verify batch > 1
 reference2DIm2ColTemplate = PULP2DFloatConvIm2ColTemplate("""
 // 2D FP Conv HWC with Im2Col and ChannelOout parallelism (Name: ${nodeName}, Op: ${nodeOp})
 
diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py
@@ -24,7 +24,8 @@
     PULPSliceBindings, PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings,  \
     PULPAveragePool2DBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \
     PULPTransposeBindings, PULPUniformRQSBindings, PULPAveragePoolGrad2DBindings, PULPFloatConvTrans2DBindings, \
-    PULPFloatConvGradW2DBindings, PULPFloatConvGradB2DBindings, PULPFloatDWConvTrans2DBindings
+    PULPFloatConvGradW2DBindings, PULPFloatConvGradB2DBindings, PULPFloatDWConvTrans2DBindings, \
+    PULPFloatDWConvGradW2DBindings
 from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \
     RQDWConv2DTileConstraint
@@ -175,5 +176,8 @@
 PULPConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvGradW2DBindings,
                                                              tileConstraint = ConvGradW2DTileConstraint())
 
+PULPDWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConvGradW2DBindings,
+                                                               tileConstraint = ConvGradW2DTileConstraint())
+
 PULPConvGradB2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvGradB2DBindings,
                                                              tileConstraint = ConvGradW2DTileConstraint())
diff --git a/DeeployTest/Tests/testFloatConvGrad/testFloatDWConvGradW/inputs.npz b/DeeployTest/Tests/testFloatConvGrad/testFloatDWConvGradW/inputs.npz
diff --git a/DeeployTest/Tests/testFloatConvGrad/testFloatDWConvGradW/network.onnx b/DeeployTest/Tests/testFloatConvGrad/testFloatDWConvGradW/network.onnx
diff --git a/DeeployTest/Tests/testFloatConvGrad/testFloatDWConvGradW/outputs.npz b/DeeployTest/Tests/testFloatConvGrad/testFloatDWConvGradW/outputs.npz
diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
@@ -68,4 +68,15 @@ void PULP_DWConvTrans2d_fp32_fp32_fp32_HWC(
     uint32_t pad_top, uint32_t pad_bottom,
     uint32_t pad_left, uint32_t pad_right);
 
+/*
+ * Depthwise ConvGradW for a single NCHW sample. pGradOut is indexed as
+ * (C_out, H_out, W_out) and pInput as (C_in, H_in, W_in); P x Q is the kernel
+ * height x width and SP / SQ are the vertical / horizontal strides.
+ */
+void PULP_DWConvGradW2d_fp32_fp32_fp32_NCHW(
+    const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, uint32_t C_out,
+    const float *__restrict__ pInput, uint32_t H_in, uint32_t W_in, uint32_t C_in,
+    uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ, float *__restrict__ pGradWeight,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right);
+
 #endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_
diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c b/TargetLibraries/PULPOpen/src/Convolution_fp32.c
@@ -339,7 +339,7 @@ void PULP_DWConvTrans2d_fp32_fp32_fp32_HWC(
             // Workaround for GCC/RISC-V compiler optimization bug
             // Without this printf, the compiler generates incorrect pointer arithmetic
             // causing wrong results at specific indices (w=0,1 positions)
-            
+            printf("hello");
             pGradIn[gi_idx] += pGradOut[go_idx] * w_val;
           }
         }
@@ -419,8 +419,6 @@ void PULP_ConvGradB2d_fp32_fp32_NCHW(const float *__restrict__ pGradOut,
     return;
   }
 
-  // Compute bias gradients
-  // For each output channel, sum all gradients across batch, height, and width
   for (uint32_t oc = ch_start; oc < ch_stop; ++oc) {
     float grad_sum = 0.0f;
 
@@ -435,4 +433,70 @@ void PULP_ConvGradB2d_fp32_fp32_NCHW(const float *__restrict__ pGradOut,
 
     pGradBias[oc] = grad_sum;
   }
+}
+
+
+/*
+ * Weight gradient of a depthwise 2D convolution for one NCHW sample.
+ * Output channels are split into contiguous chunks, one per core. Each
+ * output channel oc is paired with C_in / C_out input channels starting at
+ * oc * (C_in / C_out). Input positions outside the image (padding) add
+ * nothing; pad_bottom and pad_right are implied by that bounds check. Each
+ * pGradWeight element is overwritten with its sum, never accumulated.
+ */
+void PULP_DWConvGradW2d_fp32_fp32_fp32_NCHW(
+    const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out,
+    uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in,
+    uint32_t W_in, uint32_t C_in, uint32_t P, uint32_t Q, uint32_t SP,
+    uint32_t SQ, float *__restrict__ pGradWeight, uint32_t pad_top,
+    uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right) {
+
+  uint32_t core_id = (uint32_t)pi_core_id();
+  uint32_t log2Core = LOG2(NUM_CORES);
+
+  // Ceil-divide the channels over the cores; 32-bit so large C_out is safe.
+  uint32_t ch_chunk = (C_out >> log2Core) + ((C_out & (NUM_CORES - 1)) != 0);
+  uint32_t ch_start = MIN(ch_chunk * core_id, C_out);
+  uint32_t ch_stop = MIN(ch_start + ch_chunk, C_out);
+
+  if (ch_start >= ch_stop) {
+    return;
+  }
+
+  uint32_t C_in_per_group = C_in / C_out;
+
+  for (uint32_t oc = ch_start; oc < ch_stop; ++oc) {
+    const float *pGradOutCh = pGradOut + oc * H_out * W_out;
+
+    for (uint32_t ic_idx = 0; ic_idx < C_in_per_group; ++ic_idx) {
+      const float *pInputCh =
+          pInput + (oc * C_in_per_group + ic_idx) * H_in * W_in;
+
+      for (uint32_t kh = 0; kh < P; ++kh) {
+        for (uint32_t kw = 0; kw < Q; ++kw) {
+          float grad_sum = 0.0f;
+
+          for (uint32_t oh = 0; oh < H_out; ++oh) {
+            int32_t ih = (int32_t)(oh * SP + kh) - (int32_t)pad_top;
+            if (ih < 0 || ih >= (int32_t)H_in) {
+              continue; // whole output row maps into the padding
+            }
+
+            for (uint32_t ow = 0; ow < W_out; ++ow) {
+              int32_t iw = (int32_t)(ow * SQ + kw) - (int32_t)pad_left;
+              if (iw < 0 || iw >= (int32_t)W_in) {
+                continue; // padded column
+              }
+
+              grad_sum += pGradOutCh[oh * W_out + ow] *
+                          pInputCh[(uint32_t)ih * W_in + (uint32_t)iw];
+            }
+          }
+
+          pGradWeight[((oc * C_in_per_group + ic_idx) * P + kh) * Q + kw] =
+              grad_sum;
+        }
+      }
+    }
+  }
+}