
Commit 022f045

perf(ConvGrad): GradWStrategy framework + ConvGradX/W speedups (#28)
* perf(ConvGradX): im2col+GEMM for stride=1 + loop reorder for stride>1

  Replace the naive 7-deep direct loop in
  PULP_ConvGradX2d_fp32_fp32_fp32_CHW_Im2Col_tiled with two optimized paths.

  stride=1: custom im2col + W transpose + GEMM with Cout blocking
  - Build the dY_col matrix by gathering dY patches (padding handled via
    boundary checks)
  - Transpose W[Cout,Cin,P,Q] -> W_flat[Cin, Cout*P*Q] for row-major GEMM
  - GEMM with M-dimension unroll-by-4 for cache-line-friendly B access
  - Internal Cout blocking when the im2col buffer is smaller than the full
    Cout*P*Q*Hin*Win
  - 8-core parallel over Cin (output rows), with barriers between steps
  (A NumPy reference of this stride=1 path follows this group of entries.)

  stride>1: loop reorder (ci outermost, co innermost)
  - Moves co to the innermost loop so W[co,...] and dY[co,...] are accessed
    with unit stride, improving cache behavior vs the original co-outermost
    order

  ResNet8 training results (4 steps, Siracusa 8-core, L3 mode):
    Original:  1,301,512,576 cycles
    Optimized:   844,186,047 cycles (1.54x speedup, -35%)

  ConvGradX step-0 breakdown:
    Original:  185.6 Mc (56.8% of step)
    Optimized:  73.3 Mc (34.2% of step) → 2.53x on ConvGradX alone

  Numerical accuracy verified: all 4 training steps within tolerance.

* perf(ConvGradW): tile Cin instead of Cout to reduce tile count

  Change the ConvGradW tiling policy from Cout+spatial tiling to Cin tiling:
  - Old: Cout split (47+17) x Hout x Wout (8x8) = 128 tiles, GEMM K=1
  - New: Cin split (22+22+20) = 3 tiles, GEMM K = Hout*Wout = 64

  For ResNet8 layer3_conv2 ConvGradW:
  - 128 tiles -> 3 tiles (71% of the DMA overhead eliminated)
  - GEMM producing dW[Cout, Cin_tile*P*Q] with K = Hout*Wout = 64 (a proper
    reduction) instead of K=1 (an outer product); a reference sketch follows
    this group of entries

  ResNet8 training (4 steps, Siracusa 8-core, L3 mode):
    Before (ConvGradX opt only):  844,186,047 cycles
    After (+ ConvGradW Cin tile): 546,778,941 cycles (1.54x additional)
  Total vs the original baseline: 2.38x speedup

* refactor(ConvGradW): introduce GradWStrategy framework + HWSlice strategy

  Extract the ConvGradW tiling policy / serialization into pluggable Strategy
  classes so each (layer-shape regime, hardware budget) pair can pick its own
  tiling.

  Strategy interface (a dispatch sketch follows this group of entries):
  - applies(): whether this strategy is feasible/preferred for a given layer
  - add_constraints(): policy constraints emitted to the OR-tools tiler
  - matches_solution(): recognize the strategy's signature in a tiler result
  - serialize(): emit the per-tile codegen schedule

  Strategies in this commit:
  - CinSliceStrategy: lifts the Step-2 behavior verbatim (Cout/dY full, Cin
    tiled). Applies when dy_bytes <= 32 KB; best for small-spatial /
    big-channel layers (ResNet8 deep convs).
  - HWSliceStrategy: new. dY tiled over H/W, dW kept full as the L1
    accumulation target, X derived as a halo. Required for layers whose dY
    exceeds L1 (e.g. the MobileNetV1 stem: dY = 16x96x96 = 576 KB > 128 KB L1).

  ConvGradWTileConstraintBase.addPolicyConstraint / serializeTilingSolution
  now dispatch through `cls.strategies` (an ordered priority list).
  Subclasses configure their strategy set:
  - ConvGradW2DTileConstraint: [CinSlice, HWSlice] (new: HW fallback)
  - PWConvGradWTileConstraint: [CinSlice] (inherits default)
  - DWConvGradW2DTileConstraint: [CinSlice] (inherits default)

  PW / DW keep their existing addPolicyConstraint / addGeometricalConstraint
  overrides (refactoring those is Commit B).

  Verification: ResNet8 must reproduce the 546M training cycles exactly
  (CinSlice is a verbatim lift of the base behavior at ee1288d).
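> Note: the following is a minimal NumPy reference of what the stride=1 ConvGradX path computes (im2col over dY, W transposed to [Cin, Cout*P*Q], one GEMM). It is a sketch of the math only; the function name, signature, and default padding are assumptions for illustration, and it omits the Cout blocking, M-unrolling, and 8-core split of the actual PULP kernel.

```python
import numpy as np

def conv_grad_x_stride1(dY, W, pad=(1, 1)):
    """Reference for the stride=1 ConvGradX path: im2col over dY,
    W transposed to [Cin, Cout*P*Q], then a single GEMM."""
    Cout, Hout, Wout = dY.shape
    _, Cin, P, Q = W.shape
    ph, pw = pad
    Hin, Win = Hout - 1 + P - 2 * ph, Wout - 1 + Q - 2 * pw  # stride=1 shape relation

    # dY_col[(co*P + p)*Q + q, hi*Win + wi] = dY[co, hi - p + ph, wi - q + pw],
    # zero when the index falls outside dY (the "boundary checks").
    dY_col = np.zeros((Cout * P * Q, Hin * Win), dtype=dY.dtype)
    for co in range(Cout):
        for p in range(P):
            for q in range(Q):
                row = (co * P + p) * Q + q
                for hi in range(Hin):
                    ho = hi - p + ph
                    if not 0 <= ho < Hout:
                        continue
                    for wi in range(Win):
                        wo = wi - q + pw
                        if 0 <= wo < Wout:
                            dY_col[row, hi * Win + wi] = dY[co, ho, wo]

    # W[Cout, Cin, P, Q] -> W_flat[Cin, Cout*P*Q], rows ordered like dY_col
    W_flat = W.transpose(1, 0, 2, 3).reshape(Cin, Cout * P * Q)

    # One GEMM: dX_flat[Cin, Hin*Win] = W_flat @ dY_col
    return (W_flat @ dY_col).reshape(Cin, Hin, Win)
```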
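> Note: likewise, a sketch of the Cin-tiled ConvGradW GEMM with K = Hout*Wout, as referenced in the perf(ConvGradW) entry above. Function name and signature are illustrative, not Deeploy's; the real kernel builds X_col per tile in L1.

```python
import numpy as np

def conv_grad_w_cin_slice(dY, X, P, Q, cin_slice, stride=1, pad=1):
    """One ConvGradW GEMM per Cin slice with K = Hout*Wout (a true reduction),
    instead of a K=1 outer product per spatial tile."""
    Cout, Hout, Wout = dY.shape
    n_cin = len(cin_slice)

    # X_col[ho*Wout + wo, (t*P + p)*Q + q] = X[ci, ho*stride + p - pad, wo*stride + q - pad]
    X_col = np.zeros((Hout * Wout, n_cin * P * Q), dtype=X.dtype)
    for t, ci in enumerate(cin_slice):
        for p in range(P):
            for q in range(Q):
                col = (t * P + p) * Q + q
                for ho in range(Hout):
                    hi = ho * stride + p - pad
                    if not 0 <= hi < X.shape[1]:
                        continue
                    for wo in range(Wout):
                        wi = wo * stride + q - pad
                        if 0 <= wi < X.shape[2]:
                            X_col[ho * Wout + wo, col] = X[ci, hi, wi]

    # GEMM with K = Hout*Wout: dW_slice[Cout, Cin_tile*P*Q]
    dW_slice = dY.reshape(Cout, Hout * Wout) @ X_col
    return dW_slice.reshape(Cout, n_cin, P, Q)
```

For the ResNet8 layer3_conv2 example above, the three Cin slices would be along the lines of range(0, 22), range(22, 44), range(44, 64), each producing one GEMM with K = 64.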
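> Note: a hypothetical sketch of the strategy dispatch described in the refactor(ConvGradW) entry: an ordered `strategies` list consulted by addPolicyConstraint / serializeTilingSolution. Class layout and method signatures are assumptions for illustration, not Deeploy's actual API.

```python
from abc import ABC, abstractmethod
from typing import List

class GradWStrategy(ABC):
    """One tiling policy for ConvGradW (illustrative interface)."""

    @abstractmethod
    def applies(self, layer) -> bool:
        """Is this strategy feasible / preferred for the given layer shapes?"""

    @abstractmethod
    def add_constraints(self, tiler_model, layer) -> None:
        """Emit this policy's constraints to the OR-tools tiler."""

    @abstractmethod
    def matches_solution(self, solution) -> bool:
        """Recognize this strategy's signature in a tiler result."""

    @abstractmethod
    def serialize(self, solution, layer):
        """Emit the per-tile codegen schedule."""


class ConvGradWTileConstraintBase:
    # Ordered priority list; subclasses override it. By the end of this PR the
    # regular Conv class uses [CinSlice, CoutHWSlice] and PW/DW use [CoutHWSlice].
    strategies: List[GradWStrategy] = []

    @classmethod
    def addPolicyConstraint(cls, tiler_model, layer):
        for strategy in cls.strategies:
            if strategy.applies(layer):
                return strategy.add_constraints(tiler_model, layer)
        raise RuntimeError("no applicable GradW strategy for this layer")

    @classmethod
    def serializeTilingSolution(cls, solution, layer):
        for strategy in cls.strategies:
            if strategy.matches_solution(solution):
                return strategy.serialize(solution, layer)
        raise RuntimeError("tiling solution matches no registered strategy")
```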
* refactor(ConvGradW): rename HWSlice -> CoutHWSlice, free Cout/HW like the devel default

  The previous HWSliceStrategy pinned dy_dim_1 (Cout) to full, which forced
  the forward Conv's output Y to keep the full Cout at every memory level.
  For the MobileNetV1 stem (dY = 16x96x96 = 576 KB) this made the joint
  tiling problem infeasible: the forward Conv couldn't fit Y in L1, and the
  backward pass couldn't agree on Y's tiling.

  Replace it with a CoutHWSliceStrategy that matches devel's pre-Cin-slice
  default policy:
  - X Cin (dim 1) full
  - dW Cin / kH / kW full, Cout (dim 0) free
  - dY HW free, dY Cout free

  The OR-tools tiler picks the optimal Cout / HW split per layer. dW slices
  are along Cout (disjoint per Cout slab); each (ho, wo) tile inside a Cout
  slab accumulates partial sums via mm_add. Serialization iterates Cout
  slabs in the outer loop and HW tiles in the inner loop (a port of devel's
  serializeTilingSolution; see the sketch after this group of entries).

  Strategy selection for ConvGradW2DTileConstraint (also sketched below):
  - dy_bytes <= 32 KB -> CinSlice (big GEMM K = Hout*Wout, ResNet8 deep convs)
  - dy_bytes > 32 KB -> CoutHWSlice (MobileNetV1 stem etc.; the tiler picks)

  PW / DW classes still inherit the [CinSlice] default; their fallback
  handling is Commit B.

* refactor(ConvGradW): default strategies include CoutHWSlice fallback

  PW and DW classes' addPolicyConstraint calls super(), which goes through
  the dispatch in the refactored base. The default strategy list was just
  [CinSlice]: for big-dY PW/DW layers (e.g. shallow MobileNetV1 PW layers)
  CinSlice doesn't apply, yet the dispatcher fell back to it anyway, yielding
  infeasible constraints (dY spatial must be full).

  Change the base default to [CinSlice, CoutHWSlice] so PW / DW and
  ConvGradW2D all share the same priority list:
  - dy_bytes <= 32 KB -> CinSlice (big GEMM K)
  - otherwise -> CoutHWSlice (free Cout / HW)

  PW's extra "dY spatial full" constraint still applies on top (its
  docstring rationale: forbid HW tiling for memset correctness). The
  combined effect for big-dY PW layers: only Cout tiling is allowed,
  matching devel's behavior.

  MobileNetV1 now passes the tiling solver and reaches simulation; an
  out-of-bound L1 bank access in sim remains to be debugged.

* fix(ConvGradW): restrict CinSlice to regular Conv; PW/DW use CoutHWSlice only

  CinSliceStrategy was being dispatched to PW (small-dY deep PW layers) and
  DW classes via the base default strategies, causing L1 bank out-of-bound
  accesses during MobileNetV1 simulation: 1000+ OOB warnings starting at
  training step 1.

  Root cause (verified by an isolation test):
  - CinSlice.serialize iterates dW Cin slices from
    absoluteOutputCubes[i].rectangle.dims[1], assuming the standard
    [Cout, Cin/group, P, Q] layout.
  - For DW: dW has layout [C, 1, P, Q]; dim_1 == 1 makes Cin slicing
    degenerate, and the X tile derivation breaks DW's channel semantics.
  - For PW: dW is [Cout, Cin, 1, 1]; CinSlice can split Cin, but PW's extra
    dy_spatial/x_spatial=full constraints combined with CinSlice's constraint
    set produce a tile configuration whose dW pointer reads past the L1 bank
    boundary in the per-tile mm kernel.

  Fix: explicit ``strategies = [CoutHWSliceStrategy]`` on the PW and DW
  subclasses. The regular ConvGradW2DTileConstraint keeps
  [CinSlice, CoutHWSlice] for the ResNet8-style speedup.

  Verification:
  - ResNet8: 511M cycles (the CinSlice path stays active for deep conv layers)
  - MobileNetV1: 950M cycles, 0 OOB, matches the devel baseline (948M)

  No MobileNetV1 speedup yet; that requires a per-subclass strategy designed
  for the PW/DW data layout (future commit).

* style: apply pre-commit (yapf + clang-format)

  Pure formatter output from the repo's pre-commit hooks; no logic changes.
  Brings PR #28 in line with the CI lint check.
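> Note: a sketch of the CoutHWSlice accumulation order described above: Cout slabs of dW are disjoint and iterated in the outer loop, while HW tiles inside a slab accumulate partial sums into the same slab, which is the role mm_add plays per tile in the generated code. Names and the flattened shapes (dY [Cout, Hout*Wout], X_col [Hout*Wout, Cin*P*Q]) are illustrative assumptions.

```python
import numpy as np

def grad_w_cout_hw_slices(dY, X_col, cout_slabs, hw_tiles):
    """dY: [Cout, Hout*Wout] flattened; X_col: [Hout*Wout, Cin*P*Q].
    Cout slabs are disjoint slices of dW (outer loop); HW tiles inside a
    slab accumulate partial sums into the same slab (the mm_add role)."""
    dW = np.zeros((dY.shape[0], X_col.shape[1]), dtype=dY.dtype)
    for co_lo, co_hi in cout_slabs:        # outer: disjoint Cout slabs of dW
        for hw_lo, hw_hi in hw_tiles:      # inner: HW tiles accumulate
            dW[co_lo:co_hi] += dY[co_lo:co_hi, hw_lo:hw_hi] @ X_col[hw_lo:hw_hi]
    return dW
```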
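> Note: the strategy selection rule quoted in the rename entry above, as a one-line sketch. The constant and helper names are illustrative, not Deeploy identifiers; only the 32 KB threshold comes from the commit message.

```python
CIN_SLICE_DY_BUDGET_BYTES = 32 * 1024  # dy_bytes threshold quoted above

def pick_gradw_strategy(dy_bytes: int) -> str:
    """Small dY: CinSlice (big GEMM K); otherwise CoutHWSlice (free Cout / HW)."""
    return "CinSlice" if dy_bytes <= CIN_SLICE_DY_BUDGET_BYTES else "CoutHWSlice"
```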
* ci: emit performance summary at end of pytest session

  pytest_runtest_logreport now scans each test's captured stdout/stderr for
  'Runtime: N cycles' (the harness already prints this; output_parser.py
  extracts it for TestResult). pytest_terminal_summary writes:
  - a 'Performance Summary' block to the terminal (sorted by nodeid)
  - a Markdown table to GITHUB_STEP_SUMMARY when running under GitHub
    Actions, so the cycle counts surface on the PR check page

  Tests without a Runtime line are silently skipped (e.g. lint-only jobs).
  xdist-safe: pytest_runtest_logreport fires on the master with all worker
  reports, so collection is single-process. (A conftest.py sketch of these
  hooks follows the last entry below.)

* perf(PWConvGradX): direct axpy kernel, drop Cin*Cout transpose scratch

  The old PW ConvGradX kernel did a W transpose + pulp-trainlib mm, which
  required a Cin*Cout*sizeof(float) transient buffer. On MobileNetV1 block
  6-10 PW layers (Cin=Cout=128) that buffer ate 64 KB of the 128 KB L1
  scratch, forcing the tiler to fragment Cin*H*W into 36 micro-tiles, each
  paying 95% of its wall time in L3<->L2 DMA / sync instead of compute.

  This commit replaces the kernel with a direct
  dX[ci, hw] = sum_co W[co, ci] * dY[co, hw] worker that parallelizes over
  Cin and streams W rows / dY rows contiguously (see the reference sketch
  after the last entry below). No scratch is required, so the template's
  transient-buffer hoist is removed too and the kernel signature drops
  pTransposeBuffer / transposeBufferSize.

  MobileNetV1 block 6-10 PW ConvGradX: 26.17M -> 0.79M cycles each (33x)
  End-to-end MobileNetV1 step: 245M -> 145M cycles before the matching tile
  policy change (~1.7x); see the follow-up commit for the full 2.24x.

* perf(PWConvGradX): pin HW=full in tile policy when the full dY fits

  Without this, the tiler's default cost model splits the PW ConvGradX
  spatial extent into single-pixel tiles for layers with small NHW. On
  MobileNetV1 block_11/12 (Cin/Cout=128-256, NHW=9) that produced schedules
  of 12 / 18 tiles in which the direct axpy's 9-iteration inner loop never
  amortised its overhead; block_12 alone cost 23.6M cycles, *worse* than the
  baseline mm-based kernel.

  Pinning H/W to full in the policy forces Cin to absorb all the tiling
  pressure. The constraint is conditional on the full dY fitting under
  HW_PIN_BUDGET_BYTES (24 KB), so early MobileNetV1 PW layers (NHW=2304,
  full dY = 144 KB) keep their HW tiling. (A sketch of this budget check
  follows the last entry below.)

  MobileNetV1 block_11 PW ConvGradX: 14.41M -> 0.56M cycles (25x)
  MobileNetV1 block_12 PW ConvGradX: 23.62M -> 1.08M cycles (22x)
  End-to-end MobileNetV1 step: 245M -> 109M cycles (2.24x)

* ci(perf-summary): also scrape BENCH train_cycles / opt_cycles for training tests

  The original perf-summary hook only matched ``Runtime: N cycles`` (the
  inference harness format). Training tests use a different banner:

      BENCH train_cycles=N opt_cycles=N weight_sram=N

  so the GITHUB_STEP_SUMMARY tables came out empty for the
  siracusa-training-tiled jobs.

  The hook now scrapes both formats, splits the Markdown summary into
  separate Training / Inference sections, and renders train / opt /
  weight_sram side by side (both formats are covered by the conftest sketch
  below). Verified locally with a dummy pytest module that emits both
  formats.

* fix(PWConvGradX): only accept stride=1; stride>1 falls back to the im2col path

  The PW ConvGradX kernel computes
  `dX[ci, hw] = sum_co W[co, ci] * dY[co, hw]` and indexes dX with `ci * HW`
  where `HW = H_out * W_out`, implicitly assuming dX and dY share the same
  spatial extent (stride=1, pad=0). For stride>1 1x1 convolutions (e.g.
  ResNet8 layer2/3 downsample shortcuts, shape (16, 32, 32) -> (32, 16, 16))
  the correct backward writes are sparse (see the reference sketch below):

      dX[ci, stride*y, stride*x] = sum_co W[co, ci] * dY[co, y, x]
      dX[ci, otherwise]          = 0

  The PW kernel ignores stride and writes dY-sized data into the first
  `H_out * W_out` slots of every dX channel, which mostly lands on the wrong
  spatial positions and leaves the rest at zero. The old transpose + mm
  kernel had the same bug; both happened to stay within tolerance
  historically thanks to lucky rounding accumulation, but the new direct
  axpy kernel pushed the residual past the 0.001 loss tolerance in CI
  (ResNet8 L3 single-buffer: loss[1..3] diff 0.025-0.042 vs TOL 0.001).

  Reject stride>1 in the PW parser so those layers go through the
  NodeMapper's next candidate, `ConvGradX2DIm2ColHWTileConstraint` with
  PULP_ConvGradX2d_*_Im2Col_tiled, which handles arbitrary stride.

  ResNet8 (Siracusa, L1=128KB, L3): losses now within TOL, 0 errors,
  BENCH train_cycles=513M (vs baseline 511M, ~+0.4% from the im2col fallback
  on the 2 tiny downsample shortcuts).
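> Note: a conftest.py sketch of the perf-summary hooks described in the two ci entries above, scraping both the `Runtime: N cycles` and the `BENCH ...` banners. The regexes, storage dict, and table layout are illustrative assumptions; only the pytest hook names, the banner formats, and the GITHUB_STEP_SUMMARY mechanism come from the commits.

```python
# conftest.py (sketch)
import os
import re

RUNTIME_RE = re.compile(r"Runtime:\s*(\d+)\s*cycles")  # inference harness banner
BENCH_RE = re.compile(r"BENCH train_cycles=(\d+) opt_cycles=(\d+) weight_sram=(\d+)")  # training banner

_perf = {}  # nodeid -> scraped metrics


def pytest_runtest_logreport(report):
    # Under pytest-xdist this hook fires on the master for every worker's
    # report, so collecting into a plain dict stays single-process.
    if report.when != "call":
        return
    text = report.capstdout + report.capstderr
    if m := RUNTIME_RE.search(text):
        _perf[report.nodeid] = {"runtime": int(m.group(1))}
    elif m := BENCH_RE.search(text):
        _perf[report.nodeid] = {
            "train_cycles": int(m.group(1)),
            "opt_cycles": int(m.group(2)),
            "weight_sram": int(m.group(3)),
        }


def pytest_terminal_summary(terminalreporter, exitstatus, config):
    if not _perf:
        return  # e.g. lint-only jobs: no Runtime/BENCH banner, stay silent
    terminalreporter.write_sep("=", "Performance Summary")
    for nodeid in sorted(_perf):
        terminalreporter.write_line(f"{nodeid}: {_perf[nodeid]}")
    # Surface the numbers on the PR check page when running under GitHub Actions.
    if summary_path := os.environ.get("GITHUB_STEP_SUMMARY"):
        with open(summary_path, "a") as f:
            f.write("| test | metrics |\n|---|---|\n")
            for nodeid in sorted(_perf):
                f.write(f"| {nodeid} | {_perf[nodeid]} |\n")
```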
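> Note: a NumPy reference for the PW ConvGradX semantics discussed above: the stride=1 direct form the new kernel computes, and the sparse-write form that stride>1 would require (which is why such layers now fall back to the im2col kernel). Function name and signature are illustrative, not the PULP worker's.

```python
import numpy as np

def pw_conv_grad_x(dY, W, H_in, W_in, stride=1):
    """1x1 (PW) ConvGradX reference. W layout: [Cout, Cin, 1, 1]."""
    Cout, H_out, W_out = dY.shape
    Cin = W.shape[1]
    W2d = W.reshape(Cout, Cin)

    dX = np.zeros((Cin, H_in, W_in), dtype=dY.dtype)
    if stride == 1:
        # Direct form: dX[ci, hw] = sum_co W[co, ci] * dY[co, hw]
        # (dX and dY share the same spatial extent)
        dX[:] = (W2d.T @ dY.reshape(Cout, H_out * W_out)).reshape(Cin, H_in, W_in)
    else:
        # Sparse writes: only the strided positions receive gradient,
        # everything else stays zero.
        for y in range(H_out):
            for x in range(W_out):
                dX[:, stride * y, stride * x] = W2d.T @ dY[:, y, x]
    return dX
```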
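> Note: finally, a sketch of the conditional HW=full pin: the spatial dims are only pinned when the whole dY fits under the 24 KB budget. The helper name is hypothetical; HW_PIN_BUDGET_BYTES is the constant named in the commit.

```python
HW_PIN_BUDGET_BYTES = 24 * 1024  # budget named in the commit message

def should_pin_hw_full(cout: int, h_out: int, w_out: int, elem_bytes: int = 4) -> bool:
    """Pin H/W to full (letting Cin absorb the tiling pressure) only when the
    full dY fits under the budget; big-spatial layers keep their HW tiling."""
    dy_bytes = cout * h_out * w_out * elem_bytes
    return dy_bytes <= HW_PIN_BUDGET_BYTES
```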
1 parent d88b620 commit 022f045

6 files changed

Lines changed: 942 additions & 316 deletions


Deeploy/Targets/PULPOpen/Parsers.py

Lines changed: 11 additions & 0 deletions
@@ -552,6 +552,17 @@ def parseNode(self, node: gs.Node) -> bool:
         if kernel_shape != [1, 1]:
             return False
 
+        # The PW ConvGradX kernel (direct dX[ci, hw] = sum_co W[co, ci] *
+        # dY[co, hw]) implicitly assumes dX and dY share the same spatial
+        # extent — i.e. stride == 1. Stride>1 1x1 convolutions (e.g.
+        # ResNet8 downsample shortcuts) need the sparse-write semantics
+        # `dX[ci, stride*y, stride*x] = ...` which only the im2col-tiled
+        # general kernel implements correctly, so reject them here and let
+        # them fall through to ConvGradX2DIm2ColHWTileConstraint.
+        strides = self.operatorRepresentation.get('strides', [1, 1])
+        if list(strides) != [1, 1]:
+            return False
+
         return wellFormed and True

Deeploy/Targets/PULPOpen/Templates/FloatConvGradTemplate.py

Lines changed: 10 additions & 26 deletions
@@ -353,34 +353,19 @@ def hoistTransientBuffers(self, ctxt: NetworkContext,
 
 
 class PULP2DFloatPWConvGradXTemplate(NodeTemplate):
+    """PW (1x1) ConvGradX template.
+
+    The direct PULP_PWConvGradX2d_fp32_fp32_fp32_CHW kernel parallelises over
+    Cin and streams W rows / dY rows contiguously, so no weight-transpose
+    scratch is required. Allocating a Cin*Cout transient (the old
+    transpose buffer) used to eat ~64 KB of L1 for the MobileNetV1
+    block 6-10 PW layers and forced the tiler to fragment Cin/H/W into
+    ~36 tiles per layer; removing it lets the tiler pick coarse tiles.
+    """
 
     def __init__(self, templateStr):
        super().__init__(templateStr)
 
-    @staticmethod
-    def computeTransientBuffersSize(
-            ctxt: NetworkContext,
-            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
-        # Transpose buffer for weight matrix transpose (C_out x C_in)
-        # For pointwise convolution, kernel size is 1x1
-        bt_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * \
-            operatorRepresentation['ch_im_in'] * operatorRepresentation['ch_im_out']
-
-        bt_name = operatorRepresentation['nodeName'] + "_transpose_buffer"
-
-        return [(bt_name, bt_dim)]
-
-    def hoistTransientBuffers(self, ctxt: NetworkContext,
-                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
-        bt_name, bt_dim = PULP2DFloatPWConvGradXTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
-
-        ctxt.hoistTransientBuffer(bt_name, bt_dim)
-
-        operatorRepresentation['transposeBuffer'] = bt_name
-        operatorRepresentation['transposeBufferSize'] = bt_dim
-
-        return ctxt, operatorRepresentation, [bt_name]
 
 
 referencePWConvGradW2DTemplate = _ConvGradWTemplate("""
 // 2D FP Pointwise ConvGradW (1x1) NCHW using pulp-trainlib pw interface (Name: ${nodeName}, Op: ${nodeOp})
@@ -424,8 +409,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext,
         ref_${grad_in}_${weight},
         ${ch_im_in},
         ref_${grad_in}_out,
-        ${dim_im_in_x}, ${dim_im_in_y},
-        ${transposeBuffer}, ${transposeBufferSize}
+        ${dim_im_in_x}, ${dim_im_in_y}
     );
 
     ref_${grad_in}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x};
