Support GEMM without bias; fix "input"/"output" buffer-name matching in codeGenerate

runwangdl · runwangdl · commit 5c3f2871fae6 · 2025-06-19T14:09:06.000Z
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
@@ -22,9 +22,26 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the Licens
-from Deeploy.DeeployTypes import NodeTemplate
+from Deeploy.DeeployTypes import NodeTemplate, NetworkContext, OperatorRepresentation
+from Deeploy.AbstractDataTypes import float32_tPtr
+from typing import Tuple, Dict, List
 
-referenceTemplate = NodeTemplate("""
+class FloatGEMMTemplate(NodeTemplate):
+    """NodeTemplate subclass whose alignToContext fills in defaults when the GEMM has no bias operand 'C'."""
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+    
+    def alignToContext(self, ctxt: NetworkContext,
+                      operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        """If 'C' is absent or None, set 'C' to None and 'C_type' to float32_tPtr; return (ctxt, operatorRepresentation, [])."""
+        if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None:
+            # No bias: C stays None so the template's "% if C is not None" guard emits NULL; C_type is still rendered there
+            operatorRepresentation['C'] = None
+            operatorRepresentation['C_type'] = float32_tPtr  # Default to fp32 type
+        
+        return ctxt, operatorRepresentation, []
+
+referenceTemplate = FloatGEMMTemplate("""
 // GEMM (Name: ${nodeName}, Op: ${nodeOp})
 int8_t ${nodeName}_core_id = pi_core_id();
 int8_t ${nodeName}_log2Core = log2(NUM_CORES);
@@ -35,13 +52,21 @@
 
 ${A_type.typeName} ref_${data_out}_${A} = ${A};
 ${B_type.typeName} ref_${data_out}_${B} = ${B};
+% if C is not None:
 ${C_type.typeName} ref_${data_out}_${C} = ${C};
+% else:
+${C_type.typeName} ref_${data_out}_C = NULL;
+% endif
 ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
 
 for(uint32_t i=0; i<${batch}; i++) {
     ${A_type.typeName} batch_A = ref_${data_out}_${A} + i * ${M} * ${N};
     ${B_type.typeName} batch_B = ref_${data_out}_${B} + i * ${N} * ${O};
+% if C is not None:
     ${C_type.typeName} batch_C = ref_${data_out}_${C} + i * ${M} * ${O};
+% else:
+    ${C_type.typeName} batch_C = NULL;
+% endif
     ${data_out_type.typeName} batch_out = ref_${data_out}_${data_out} + i * ${M} * ${O};
     
     if (${nodeName}_M_size > 0) {
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py
@@ -226,16 +226,23 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
         # Get to-be-tiled tensor's buffers
         bufferA = ctxt.lookup(name = parseDict['A'])
         bufferB = ctxt.lookup(name = parseDict['B'])
-        bufferC = ctxt.lookup(name = parseDict['C'])
         outputBuffer = ctxt.lookup(name = parseDict['data_out'])
+        
+        has_bias = 'C' in parseDict and parseDict['C'] is not None
+        bufferC = None
+        if has_bias:
+            bufferC = ctxt.lookup(name = parseDict['C'])
 
         # Add I/O dimensions to the model as variables
-        for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]:
+        buffer_names = [bufferA.name, bufferB.name, outputBuffer.name]
+        if has_bias:
+            buffer_names.append(bufferC.name)
+            
+        for bufferName in buffer_names:
             tilerModel.addTensorDimToModel(ctxt, bufferName)
 
         dimOffsetA = len(bufferA.shape) - 2
         dimOffsetB = len(bufferB.shape) - 2
-        dimOffsetC = len(bufferC.shape) - 2
         dimOffsetOut = len(outputBuffer.shape) - 2
 
         AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA'])
@@ -254,10 +261,13 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
         # Add GEMM Geometrical constraints
         tilerModel.addConstraint(ASecondDimVar == BFirstDimVar)
 
-        addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC)
-        addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1)
-        tilerModel.addConstraint(outputFirstDimVar == addDimVar_1)
-        tilerModel.addConstraint(outputSecondDimVar == addDimVar_2)
+        # Add bias constraints only if bias is present
+        if has_bias:
+            dimOffsetC = len(bufferC.shape) - 2
+            addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC)
+            addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1)
+            tilerModel.addConstraint(outputFirstDimVar == addDimVar_1)
+            tilerModel.addConstraint(outputSecondDimVar == addDimVar_2)
 
         return tilerModel
 
@@ -295,7 +305,14 @@ def serializeTilingSolution(
             operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
         outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
 
-        addrNames = ['A', 'B', 'C', 'data_out']
+        # Check if C (bias) is present
+        has_bias = 'C' in operatorRepresentation and operatorRepresentation['C'] is not None
+        
+        # Build address names list based on whether bias is present
+        addrNames = ['A', 'B', 'data_out']
+        if has_bias:
+            addrNames.insert(2, 'C')  # Insert 'C' before 'data_out'
+
         inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                   operatorRepresentation, addrNames)
 
@@ -350,11 +367,13 @@ def serializeTilingSolution(
             else:
                 BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize))
 
-            CCube = HyperRectangle(cube.offset, cube.dims)
-
             inputACubes.append(ACube)
             inputBCubes.append(BCube)
-            inputAddCubes.append(CCube)
+            
+            # Only create C cubes if bias is present
+            if has_bias:
+                CCube = HyperRectangle(cube.offset, cube.dims)
+                inputAddCubes.append(CCube)
 
         inputLoadSchedule = []
         outputLoadSchedule = []
@@ -368,8 +387,13 @@ def serializeTilingSolution(
             "batch": PointerClass(uint8_t)
         }
 
-        for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
-            inputLoadSchedule.append({"A": a, "B": b, "C": c})
+        # Build input load schedule based on whether bias is present
+        if has_bias:
+            for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
+                inputLoadSchedule.append({"A": a, "B": b, "C": c})
+        else:
+            for a, b in zip(inputACubes, inputBCubes):
+                inputLoadSchedule.append({"A": a, "B": b})
 
         for out in outputCubes:
             outputLoadSchedule.append({"data_out": out})
diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py
@@ -269,11 +269,11 @@ def type2TypeStr(dataType) -> Tuple[str, int]:
 
     def dumpBuffer(buf: VariableBuffer, path: str):
 
-        if "input" in buf.name:
+        if not isinstance(buf, ConstantBuffer) and "input" in buf.name:
             idx = int(buf.name.split("_")[1])
             array = _shapeBroadcast(deployer.ctxt, test_inputs[idx], f"input_{idx}")
 
-        elif "output" in buf.name:
+        elif not isinstance(buf, ConstantBuffer) and "output" in buf.name:
             _list = buf.name.split("_")
             idx = int(_list[1])
             array = _shapeBroadcast(deployer.ctxt, test_outputs[idx], f"output_{idx}")
diff --git a/TargetLibraries/Generic/src/Gemm_fp32.c b/TargetLibraries/Generic/src/Gemm_fp32.c
@@ -29,25 +29,36 @@
 #include "DeeployBasicMath.h"
 
 void Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
-                               const float32_t *__restrict__ pSrcB,
-                               const float32_t *__restrict__ pDstC, 
-                               float32_t *__restrict__ pDstY,
-                               uint32_t M,
-                               uint32_t N, 
-                               uint32_t O,
-                               int32_t transA,
-                               int32_t transB
-                               ) {
-  for (uint32_t i = 0; i < M; ++i) {
-    for (uint32_t j = 0; j < O; ++j) {
-      float32_t sum = 0.0f;
-      for (uint32_t k = 0; k < N; ++k) {
-        uint32_t a_idx = transA ? (k * M + i) : (i * N + k);
-        uint32_t b_idx = transB ? (j * N + k) : (k * O + j);
-        
-        sum += pSrcA[a_idx] * pSrcB[b_idx];
+                              const float32_t *__restrict__ pSrcB,
+                              const float32_t *__restrict__ pDstC,
+                              float32_t *__restrict__ pDstY, uint32_t M,
+                              uint32_t N, uint32_t O, int32_t transA,
+                              int32_t transB) {
+  if (pDstC == NULL) { // no bias operand: each output is the plain dot product
+    for (uint32_t i = 0; i < M; ++i) {
+      for (uint32_t j = 0; j < O; ++j) {
+        float32_t sum = 0.0f;
+        for (uint32_t k = 0; k < N; ++k) {
+          uint32_t a_idx = transA ? (k * M + i) : (i * N + k);
+          uint32_t b_idx = transB ? (j * N + k) : (k * O + j);
+          // transA/transB swap the index order used to address A and B
+          sum += pSrcA[a_idx] * pSrcB[b_idx];
+        }
+        pDstY[i * O + j] = sum;
+      }
+    }
+  } else { // bias present: pDstC is read at the same index as the output
+    for (uint32_t i = 0; i < M; ++i) {
+      for (uint32_t j = 0; j < O; ++j) {
+        float32_t sum = 0.0f;
+        for (uint32_t k = 0; k < N; ++k) {
+          uint32_t a_idx = transA ? (k * M + i) : (i * N + k);
+          uint32_t b_idx = transB ? (j * N + k) : (k * O + j);
+          // same index mapping as the no-bias branch
+          sum += pSrcA[a_idx] * pSrcB[b_idx];
+        }
+        pDstY[i * O + j] = sum  + pDstC[i * O + j];
       }
-      pDstY[i * O + j] = sum + pDstC[i * O + j];
     }
   }
 }