Skip to content

Commit 5c3f287

Browse files
committed
Support GEMM without bias; fix codeGenerate treating constant buffers with "input"/"output" in their name as test I/O
1 parent 90689e2 commit 5c3f287

4 files changed

Lines changed: 95 additions & 35 deletions

File tree

Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,26 @@
2222
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2323
# See the License for the specific language governing permissions and
2424
# limitations under the License.
25-
from Deeploy.DeeployTypes import NodeTemplate
25+
from Deeploy.DeeployTypes import NodeTemplate, NetworkContext, OperatorRepresentation
26+
from Deeploy.AbstractDataTypes import float32_tPtr
27+
from typing import Tuple, Dict, List
2628

27-
referenceTemplate = NodeTemplate("""
29+
class FloatGEMMTemplate(NodeTemplate):
30+
31+
def __init__(self, templateStr):
32+
super().__init__(templateStr)
33+
34+
def alignToContext(self, ctxt: NetworkContext,
35+
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
36+
37+
if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None:
38+
# No bias case - set C to NULL and provide a default type
39+
operatorRepresentation['C'] = None
40+
operatorRepresentation['C_type'] = float32_tPtr # Default to fp32 type
41+
42+
return ctxt, operatorRepresentation, []
43+
44+
referenceTemplate = FloatGEMMTemplate("""
2845
// GEMM (Name: ${nodeName}, Op: ${nodeOp})
2946
int8_t ${nodeName}_core_id = pi_core_id();
3047
int8_t ${nodeName}_log2Core = log2(NUM_CORES);
@@ -35,13 +52,21 @@
3552
3653
${A_type.typeName} ref_${data_out}_${A} = ${A};
3754
${B_type.typeName} ref_${data_out}_${B} = ${B};
55+
% if C is not None:
3856
${C_type.typeName} ref_${data_out}_${C} = ${C};
57+
% else:
58+
${C_type.typeName} ref_${data_out}_C = NULL;
59+
% endif
3960
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
4061
4162
for(uint32_t i=0; i<${batch}; i++) {
4263
${A_type.typeName} batch_A = ref_${data_out}_${A} + i * ${M} * ${N};
4364
${B_type.typeName} batch_B = ref_${data_out}_${B} + i * ${N} * ${O};
65+
% if C is not None:
4466
${C_type.typeName} batch_C = ref_${data_out}_${C} + i * ${M} * ${O};
67+
% else:
68+
${C_type.typeName} batch_C = NULL;
69+
% endif
4570
${data_out_type.typeName} batch_out = ref_${data_out}_${data_out} + i * ${M} * ${O};
4671
4772
if (${nodeName}_M_size > 0) {

Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -226,16 +226,23 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
226226
# Get to-be-tiled tensor's buffers
227227
bufferA = ctxt.lookup(name = parseDict['A'])
228228
bufferB = ctxt.lookup(name = parseDict['B'])
229-
bufferC = ctxt.lookup(name = parseDict['C'])
230229
outputBuffer = ctxt.lookup(name = parseDict['data_out'])
230+
231+
has_bias = 'C' in parseDict and parseDict['C'] is not None
232+
bufferC = None
233+
if has_bias:
234+
bufferC = ctxt.lookup(name = parseDict['C'])
231235

232236
# Add I/O dimensions to the model as variables
233-
for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]:
237+
buffer_names = [bufferA.name, bufferB.name, outputBuffer.name]
238+
if has_bias:
239+
buffer_names.append(bufferC.name)
240+
241+
for bufferName in buffer_names:
234242
tilerModel.addTensorDimToModel(ctxt, bufferName)
235243

236244
dimOffsetA = len(bufferA.shape) - 2
237245
dimOffsetB = len(bufferB.shape) - 2
238-
dimOffsetC = len(bufferC.shape) - 2
239246
dimOffsetOut = len(outputBuffer.shape) - 2
240247

241248
AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA'])
@@ -254,10 +261,13 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
254261
# Add GEMM Geometrical constraints
255262
tilerModel.addConstraint(ASecondDimVar == BFirstDimVar)
256263

257-
addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC)
258-
addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1)
259-
tilerModel.addConstraint(outputFirstDimVar == addDimVar_1)
260-
tilerModel.addConstraint(outputSecondDimVar == addDimVar_2)
264+
# Add bias constraints only if bias is present
265+
if has_bias:
266+
dimOffsetC = len(bufferC.shape) - 2
267+
addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC)
268+
addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1)
269+
tilerModel.addConstraint(outputFirstDimVar == addDimVar_1)
270+
tilerModel.addConstraint(outputSecondDimVar == addDimVar_2)
261271

262272
return tilerModel
263273

@@ -295,7 +305,14 @@ def serializeTilingSolution(
295305
operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
296306
outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
297307

298-
addrNames = ['A', 'B', 'C', 'data_out']
308+
# Check if C (bias) is present
309+
has_bias = 'C' in operatorRepresentation and operatorRepresentation['C'] is not None
310+
311+
# Build address names list based on whether bias is present
312+
addrNames = ['A', 'B', 'data_out']
313+
if has_bias:
314+
addrNames.insert(2, 'C') # Insert 'C' before 'data_out'
315+
299316
inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
300317
operatorRepresentation, addrNames)
301318

@@ -350,11 +367,13 @@ def serializeTilingSolution(
350367
else:
351368
BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize))
352369

353-
CCube = HyperRectangle(cube.offset, cube.dims)
354-
355370
inputACubes.append(ACube)
356371
inputBCubes.append(BCube)
357-
inputAddCubes.append(CCube)
372+
373+
# Only create C cubes if bias is present
374+
if has_bias:
375+
CCube = HyperRectangle(cube.offset, cube.dims)
376+
inputAddCubes.append(CCube)
358377

359378
inputLoadSchedule = []
360379
outputLoadSchedule = []
@@ -368,8 +387,13 @@ def serializeTilingSolution(
368387
"batch": PointerClass(uint8_t)
369388
}
370389

371-
for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
372-
inputLoadSchedule.append({"A": a, "B": b, "C": c})
390+
# Build input load schedule based on whether bias is present
391+
if has_bias:
392+
for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
393+
inputLoadSchedule.append({"A": a, "B": b, "C": c})
394+
else:
395+
for a, b in zip(inputACubes, inputBCubes):
396+
inputLoadSchedule.append({"A": a, "B": b})
373397

374398
for out in outputCubes:
375399
outputLoadSchedule.append({"data_out": out})

DeeployTest/testUtils/codeGenerate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,11 +269,11 @@ def type2TypeStr(dataType) -> Tuple[str, int]:
269269

270270
def dumpBuffer(buf: VariableBuffer, path: str):
271271

272-
if "input" in buf.name:
272+
if not isinstance(buf, ConstantBuffer) and "input" in buf.name:
273273
idx = int(buf.name.split("_")[1])
274274
array = _shapeBroadcast(deployer.ctxt, test_inputs[idx], f"input_{idx}")
275275

276-
elif "output" in buf.name:
276+
elif not isinstance(buf, ConstantBuffer) and "output" in buf.name:
277277
_list = buf.name.split("_")
278278
idx = int(_list[1])
279279
array = _shapeBroadcast(deployer.ctxt, test_outputs[idx], f"output_{idx}")

TargetLibraries/Generic/src/Gemm_fp32.c

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -29,25 +29,36 @@
2929
#include "DeeployBasicMath.h"
3030

3131
void Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
32-
const float32_t *__restrict__ pSrcB,
33-
const float32_t *__restrict__ pDstC,
34-
float32_t *__restrict__ pDstY,
35-
uint32_t M,
36-
uint32_t N,
37-
uint32_t O,
38-
int32_t transA,
39-
int32_t transB
40-
) {
41-
for (uint32_t i = 0; i < M; ++i) {
42-
for (uint32_t j = 0; j < O; ++j) {
43-
float32_t sum = 0.0f;
44-
for (uint32_t k = 0; k < N; ++k) {
45-
uint32_t a_idx = transA ? (k * M + i) : (i * N + k);
46-
uint32_t b_idx = transB ? (j * N + k) : (k * O + j);
47-
48-
sum += pSrcA[a_idx] * pSrcB[b_idx];
32+
const float32_t *__restrict__ pSrcB,
33+
const float32_t *__restrict__ pDstC,
34+
float32_t *__restrict__ pDstY, uint32_t M,
35+
uint32_t N, uint32_t O, int32_t transA,
36+
int32_t transB) {
37+
if (pDstC == NULL) {
38+
for (uint32_t i = 0; i < M; ++i) {
39+
for (uint32_t j = 0; j < O; ++j) {
40+
float32_t sum = 0.0f;
41+
for (uint32_t k = 0; k < N; ++k) {
42+
uint32_t a_idx = transA ? (k * M + i) : (i * N + k);
43+
uint32_t b_idx = transB ? (j * N + k) : (k * O + j);
44+
45+
sum += pSrcA[a_idx] * pSrcB[b_idx];
46+
}
47+
pDstY[i * O + j] = sum;
48+
}
49+
}
50+
} else {
51+
for (uint32_t i = 0; i < M; ++i) {
52+
for (uint32_t j = 0; j < O; ++j) {
53+
float32_t sum = 0.0f;
54+
for (uint32_t k = 0; k < N; ++k) {
55+
uint32_t a_idx = transA ? (k * M + i) : (i * N + k);
56+
uint32_t b_idx = transB ? (j * N + k) : (k * O + j);
57+
58+
sum += pSrcA[a_idx] * pSrcB[b_idx];
59+
}
60+
pDstY[i * O + j] = sum + pDstC[i * O + j];
4961
}
50-
pDstY[i * O + j] = sum + pDstC[i * O + j];
5162
}
5263
}
5364
}

0 commit comments

Comments
 (0)