
Commit 1782a88

runwangdl and claude committed
fix(redmule+upstream-transpose): unblock CCT_train codegen end-to-end
This commit replaces 39bb8f1's experimental Gemm->MatMul lowering pass (which unblocked the original KeyError 'C' but exposed a deeper Transpose rank-mismatch bug downstream) with two smaller, locally-verified fixes:

1) Hoist a properly-shaped zero C tensor in GEMMRedmuleParser when an ONNX Gemm has only A and B (e.g. backward GradFusedMatMul rewrites in CCT_train). Fixes for the hoist path:

- GEMMRedmuleParser.__init__ used to set self.noBiasHoisting *before* calling super().__init__(), but MatMulParser.__init__ also writes self.noBiasHoisting from its own default of True, so the caller's flag was silently clobbered. Reverse the order and forward the kwarg.
- The hoist used to allocate a 1-element np.zeros((1)) scalar, which can never satisfy RedmuleGEMMTileConstraint's "C dim equals output dim" assertion. Allocate a zero array whose shape matches node.outputs[0].shape instead.
- Pass _type=PointerClass(float32_t) to ctxt.hoistConstant so the buffer is type-annotated up front. Without it, MemoryScheduler.getConstantTensorOffset later trips an AttributeError on the un-annotated buffer.
- Append the hoisted Constant to node.inputs so the tiler picks it up via its node.inputs + node.outputs walk, and register the Gemm as a user via newCtxt.addUser so the MemoryConstraintFlow kill-set assertion (which walks _users) finds a consumer.
- Engine.GEMMMRedmuleMapper now instantiates with noBiasHoisting=False so the hoist path is actually taken.

Drop the RedMuleBiaslessGemmToMatMulPass class (added in 39bb8f1) and its Deployer registration: the parser-side hoist is the smaller fix and side-steps the MatMul broadcasting issue entirely.

2) Fix Generic/TransposeTileConstraint and PULPOpen/TransposeTemplate to use a *spatial-view* interpretation of perm. When MatMulLayer.computeShapes broadens an already-existing tensor that is simultaneously a forward MatMul B input *and* an input to a downstream non-broadening consumer (Gemm/Transpose), data_in and data_out of that downstream Transpose can end up with different ranks. Both addGeometricalConstraint and serializeTilingSolution previously assumed len(perm) == data_in_rank == data_out_rank; they now offset their shape lookups by len(shape) - len(perm) so the perm targets the trailing spatial dims in either tensor. PULPTransposeTemplate's alignToContext gets the same treatment for its dimLen_<idx> lookup and parallelDim selection. Already-aligned cases (the existing kernel fixtures testFloatGEMM / testFloatGEMMtransB) compute offsets of 0 and behave exactly as before.

Verified locally on Models/Training/CCT/cct_train: testMVPTraining.py and testMVPOptimizer.py both exit 0 on Siracusa_w_redmule, producing a ~7.7 MB TrainingNetwork.c and a matching OptimizerNetwork.c. C compilation + GVSoC simulation still need to be validated on CI (the runwangdl/gvsoc fork can't be run locally in the agent container).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 39bb8f1 commit 1782a88

6 files changed

Lines changed: 65 additions & 97 deletions
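For orientation before the per-file diffs: the rank mismatch that fix 2 addresses can be sketched in a few lines of plain Python. All shapes here are invented for illustration and are not taken from the CCT graph.

# Hypothetical repro of the rank mismatch (shapes invented for illustration).
# A tensor T of shape (64, 96) feeds both a broadening MatMul (whose shape
# inference left-pads T to rank 3) and a non-broadening Transpose.
perm = [1, 0]
data_in_shape = [1, 64, 96]   # T after MatMulLayer-style left-padding
data_out_shape = [96, 64]     # Transpose output, inferred earlier at rank 2

# The pre-fix code assumed all three ranks agree; here they do not:
assert len(data_in_shape) != len(data_out_shape)
assert len(data_in_shape) != len(perm)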


Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py

Lines changed: 27 additions & 5 deletions
@@ -24,15 +24,34 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
         inputBufferName = parseDict['data_in']
         outputBufferName = parseDict['data_out']

+        inputShape = ctxt.lookup(inputBufferName).shape
+        outputShape = ctxt.lookup(outputBufferName).shape
+        perm = parseDict["perm"]
+
+        # Spatial-view interpretation of the perm: it operates on the last
+        # len(perm) dims of data_in and the last len(perm) dims of data_out.
+        # MatMulLayer.computeShapes can left-pad the rank of one side without
+        # touching the other when the same gs.Variable is shared between a
+        # broadening (MatMul) and a non-broadening (Gemm/Transpose) consumer,
+        # so the constraint indexing must offset by the per-side leading-batch
+        # depth rather than assume rank == len(perm) == rank_other. When all
+        # ranks already match, offsets are 0 and behavior is unchanged.
+        inputOffset = len(inputShape) - len(perm)
+        outputOffset = len(outputShape) - len(perm)
+        assert inputOffset >= 0 and outputOffset >= 0, (
+            f"Transpose perm {perm} is longer than tensor ranks "
+            f"data_in={inputShape}, data_out={outputShape}")
+
         # Add I/O dimensions to the model as variables
         for bufferName in [inputBufferName, outputBufferName]:
             tilerModel.addTensorDimToModel(ctxt, bufferName)

-        # Map output dims to inputs dims
-        for idx, perm_idx in enumerate(parseDict["perm"]):
+        # Map output spatial dims to input spatial dims via perm.
+        for idx, perm_idx in enumerate(perm):
             tilerModel.addConstraint(
-                tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) == tilerModel.getTensorDimVar(
-                    tensorName = inputBufferName, dimIdx = perm_idx))
+                tilerModel.getTensorDimVar(tensorName = outputBufferName,
+                                           dimIdx = outputOffset + idx) == tilerModel.getTensorDimVar(
+                    tensorName = inputBufferName, dimIdx = inputOffset + perm_idx))

         return tilerModel

@@ -50,7 +69,10 @@ def serializeTilingSolution(
         replacementTypes = {}
         replacements: Dict[str, List[int]] = {}

-        numDims = len(ctxt.lookup(operatorRepresentation['data_in']).shape)
+        # Match the spatial-view interpretation in addGeometricalConstraint:
+        # only the last len(perm) dims of data_in are actually transposed,
+        # so emit exactly len(perm) dimLen_<i> replacement variables.
+        numDims = len(operatorRepresentation['perm'])

         for dim in range(numDims):
             replacementTypes[f"dimLen_{dim}"] = PointerClass(uint16_t)

Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py

Lines changed: 14 additions & 3 deletions
@@ -65,16 +65,27 @@ def alignToContext(self, ctxt: NetworkContext,
         fRep['accessStr'] = accessStr
         fRep['data_out_shape'] = data_out_shape

-        parallelDims = [idx for idx, dim in enumerate(data_out_shape) if dim >= 8]
+        # Spatial-view: perm targets the last len(perm) dims of data_in. When
+        # data_in has been left-padded (e.g. by MatMulLayer.computeShapes
+        # broadening a shared upstream Transpose output), offset the
+        # data_in_shape lookup so dimLen_<idx> reflects the actual
+        # transposed dim rather than a leading batch placeholder. Same
+        # for data_out_shape -- parallelDim must index within the spatial
+        # view since the per-tile for-loop count comes from len(perm).
+        dataInOffset = len(data_in_shape) - len(perm)
+        dataOutOffset = len(data_out_shape) - len(perm)
+        spatialOutShape = list(data_out_shape[dataOutOffset:])
+
+        parallelDims = [idx for idx, dim in enumerate(spatialOutShape) if dim >= 8]
         if len(parallelDims) > 0:
             parallelDim = parallelDims[0]
         else:
-            parallelDim = data_out_shape.index(max(data_out_shape))
+            parallelDim = spatialOutShape.index(max(spatialOutShape))

         forLoops = []
         dimLenPtrs = []
         for idx, i in enumerate(perm):
-            operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[idx]
+            operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[dataInOffset + idx]
             dimLenPtrs.append(f"dimLen_{idx}")
             if idx != parallelDim:
                 forLoops.append(_forLoop.generate({"i": i, "dimLenPtr": f"dimLen_{i}"}))
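The same offsets traced in the template's terms, on another set of invented shapes (an illustrative trace, not the template's actual inputs):

# Trace of the spatial-view lookups with a left-padded data_in.
perm = [1, 0]
data_in_shape = [2, 32, 48]   # hypothetical: left-padded to rank 3
data_out_shape = [48, 32]     # output stayed at rank 2

dataInOffset = len(data_in_shape) - len(perm)           # 1
dataOutOffset = len(data_out_shape) - len(perm)         # 0
spatialOutShape = list(data_out_shape[dataOutOffset:])  # [48, 32]

# parallelDim is chosen within the spatial view: dim 0 (48) is the first
# one wide enough to split across 8 cores.
parallelDims = [idx for idx, dim in enumerate(spatialOutShape) if dim >= 8]
parallelDim = parallelDims[0] if parallelDims else spatialOutShape.index(max(spatialOutShape))

# dimLen_<idx> skips the leading batch placeholder of data_in:
dimLens = {f"dimLen_{idx}": data_in_shape[dataInOffset + idx] for idx in range(len(perm))}
assert parallelDim == 0 and dimLens == {"dimLen_0": 32, "dimLen_1": 48}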

Deeploy/Targets/Redmule/Deployer.py

Lines changed: 1 addition & 5 deletions
@@ -31,7 +31,7 @@
 from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
 from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
 from Deeploy.Targets.Redmule.TopologyOptimizationPasses.Passes import RedMuleAdjustWeightMemoryLayoutPass, \
-    RedMuleBiaslessGemmToMatMulPass, RedMuleGEMMTransposePass
+    RedMuleGEMMTransposePass


 class RedmuleDeployer(PULPDeployer):
@@ -51,9 +51,5 @@ def __init__(self,

         self.loweringOptimizer.passes += [
             RedMuleAdjustWeightMemoryLayoutPass("Redmule"),
-            # Lower bias-less Gemm (e.g. backward GradFusedMatMul nodes in CCT
-            # training) to MatMul before GEMMTransposePass touches them; the
-            # bias-required tile constraint would otherwise crash.
-            RedMuleBiaslessGemmToMatMulPass("Redmule"),
             RedMuleGEMMTransposePass("Redmule")
         ]

Deeploy/Targets/Redmule/Engine.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@

 MatMulRedmuleMapper = NodeMapper(MatMulParser(), RedmuleMatMulTilingReadyBindings)
 Conv2DRedmuleMapper = NodeMapper(PULPFPConv2DParser(), RedmuleConvTilingReadyBindings)
-GEMMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(), RedmuleGEMMTilingReadyBindings)
+GEMMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(noBiasHoisting = False), RedmuleGEMMTilingReadyBindings)

 RedmuleMapping = {
     'MatMul': MatMulLayer([MatMulRedmuleMapper]),
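This noBiasHoisting=False only reaches the parser because of the __init__ reordering in Parsers.py below; with the pre-fix call order it was silently reset. A minimal repro of that Python pitfall, using stand-in classes rather than the real parsers:

# Stand-in classes (not the real parsers) showing the clobbering pitfall.
class Base:  # plays the role of MatMulParser

    def __init__(self, noBiasHoisting = True):
        self.noBiasHoisting = noBiasHoisting  # writes its own default

class Broken(Base):  # pre-fix GEMMRedmuleParser order

    def __init__(self, noBiasHoisting = True):
        self.noBiasHoisting = noBiasHoisting
        super().__init__()  # runs last, resets the flag to Base's default

class Fixed(Base):  # post-fix order: super first, kwarg forwarded

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting = noBiasHoisting)
        self.noBiasHoisting = noBiasHoisting

assert Broken(noBiasHoisting = False).noBiasHoisting is True   # flag lost
assert Fixed(noBiasHoisting = False).noBiasHoisting is False   # flag kept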

Deeploy/Targets/Redmule/Parsers.py

Lines changed: 22 additions & 3 deletions
@@ -30,15 +30,20 @@
 import numpy as np
 import onnx_graphsurgeon as gs

+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import float32_t
 from Deeploy.DeeployTypes import NetworkContext
 from Deeploy.Targets.Generic.Parsers import MatMulParser


 class GEMMRedmuleParser(MatMulParser):

     def __init__(self, noBiasHoisting = True):
+        # Order matters: super().__init__() of MatMulParser also writes
+        # self.noBiasHoisting from its own default, so call super first and
+        # then overwrite, otherwise our flag gets clobbered to True.
+        super().__init__(noBiasHoisting = noBiasHoisting)
         self.noBiasHoisting = noBiasHoisting
-        super().__init__()

     def parseNode(self, node: gs.Node) -> (bool):

@@ -85,9 +90,23 @@ def parseNodeCtxt(self,
         if len(node.inputs) == 3:
             self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name
         elif not self.noBiasHoisting:
-            values = np.zeros((1))
+            # Hoist a zero C tensor whose shape matches the GEMM output, so
+            # the bias-required RedmuleGEMMTileConstraint and the existing
+            # 3-operand kernel template can run unchanged on bias-less
+            # Gemm nodes (e.g. backward GradFusedMatMul rewrites in CCT
+            # training graphs that emit Y = A @ B with no C).
+            outShape = node.outputs[0].shape
+            values = np.zeros(outShape, dtype = np.float32)
             zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values)
-            newCtxt.hoistConstant(zeroTensor)
+            newCtxt.hoistConstant(zeroTensor, _type = PointerClass(float32_t))
+            # Also wire the hoisted Constant into the gs.Node inputs so the
+            # tiler picks it up via its `node.inputs + node.outputs` walk,
+            # AND register the Gemm as a user of the new buffer so the
+            # MemoryConstraintFlow's kill-set analysis (which walks
+            # `_users`) can find a consumer for it. Without these the
+            # tiler / flow analyzer KeyError or assert on the C tensor.
+            node.inputs.append(zeroTensor)
+            newCtxt.addUser(f'{node.name}_C_Tensor', node)
             self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'

         self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape)
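In isolation, the shape requirement on the hoisted C can be checked with plain numpy (hypothetical operand shapes; the tile constraint's dimension check is paraphrased here as a shape comparison):

import numpy as np

# Hypothetical bias-less Gemm: Y = A @ B with A (64, 32), B (32, 96).
outShape = (64, 96)

old_C = np.zeros((1))  # pre-fix hoist: a 1-element scalar
assert old_C.shape != outShape  # can never pass a per-dim "C == output" check

new_C = np.zeros(outShape, dtype = np.float32)  # post-fix hoist
assert new_C.shape == outShape

# The shape-matched zeros are a mathematical no-op for Y = A @ B + C:
A = np.random.rand(64, 32).astype(np.float32)
B = np.random.rand(32, 96).astype(np.float32)
assert np.allclose(A @ B + new_C, A @ B)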

Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py

Lines changed: 0 additions & 80 deletions
@@ -149,83 +149,3 @@ def __init__(self, redmuleEngineName: str):
                          replacement_fn = _redmule_gemm_transpose_fun,
                          name = "_REDMULE_GEMM_TRANSPOSE_PASS")

-
-def _redmule_biasless_gemm_to_matmul_fun(graph: gs.Graph, match: Match, name: str):
-    """Rewrite a 2-input ONNX Gemm (no C / bias) into an equivalent MatMul.
-
-    Backward-pass codegen (e.g. the ``GradFusedMatMul`` rewrites that fall out
-    of the CCT training graph) emits ``Gemm`` nodes with only A and B and no
-    bias, which the ``GEMMRedmuleParser`` accepts but for which the
-    ``RedmuleGEMMTileConstraint`` then crashes (KeyError on ``parseDict['C']``).
-    A bias-less Gemm with alpha=1 is mathematically just a MatMul, and the
-    Redmule platform already maps ONNX ``MatMul`` to a kernel that doesn't
-    expect a C operand -- so we lower it here.
-
-    transA / transB are materialized as explicit ``Transpose`` nodes (or, for
-    constant operands, folded into the constant) before the op is rewritten,
-    because ``MatMul`` has no equivalent attributes.
-    """
-    gemm_node = list(match.nodes_map.values())[0]
-
-    # Pattern matcher may match Gemms with 3 inputs too; act only on the
-    # bias-less subset.
-    if len(gemm_node.inputs) != 2:
-        return graph
-
-    # Anything other than alpha=1 cannot be expressed as a plain MatMul.
-    if gemm_node.attrs.get('alpha', 1.0) != 1.0:
-        return graph
-
-    transA = gemm_node.attrs.get('transA', 0)
-    transB = gemm_node.attrs.get('transB', 0)
-
-    for inputIdx, transFlag in ((0, transA), (1, transB)):
-        if not transFlag:
-            continue
-        operand = gemm_node.inputs[inputIdx]
-        if isinstance(operand, gs.Constant):
-            if len(operand.values.shape) > 2:
-                perm = list(range(len(operand.values.shape)))
-                perm[-1], perm[-2] = perm[-2], perm[-1]
-                operand.values = np.transpose(operand.values, perm)
-            else:
-                operand.values = np.transpose(operand.values)
-        else:
-            perm = list(range(len(operand.shape)))
-            perm[-1], perm[-2] = perm[-2], perm[-1]
-            anchorTransposeNode = _appendTranspose(operand, gemm_node, perm)
-            graph.nodes.append(anchorTransposeNode)
-
-    gemm_node.op = "MatMul"
-    gemm_node.attrs.clear()
-
-    return graph
-
-
-@contextagnostic
-class RedMuleBiaslessGemmToMatMulPass(ReplaceSequentialPatternPass):
-    """Lower bias-less (2-input) ONNX Gemm nodes to MatMul on the Redmule path.
-
-    Must run before RedMuleGEMMTransposePass so the latter only sees the
-    real (3-input) Gemm nodes; otherwise its replacement_fn would write
-    ``transA`` / ``transB`` back to 0 on what is now a MatMul, and a stale
-    ``Gemm`` op type would still hit the bias-required tile constraint.
-    """
-
-    def __init__(self, redmuleEngineName: str):
-        pattern = gs.Graph()
-
-        input_a = gs.Variable(name = "input_a")
-        input_b = gs.Variable(name = "input_b")
-
-        gemm_output = pattern.layer(op = "Gemm",
-                                    name = "gemm_node",
-                                    inputs = [input_a, input_b],
-                                    outputs = ["gemm_output"])
-
-        pattern.inputs = [input_a, input_b]
-        pattern.outputs = [gemm_output]
-
-        super().__init__(pattern = pattern,
-                         replacement_fn = _redmule_biasless_gemm_to_matmul_fun,
-                         name = "_REDMULE_BIASLESS_GEMM_TO_MATMUL_PASS")
