Skip to content

Commit 3a8bf48

Browse files
pauloohaha and runwangdl committed
[Deeploy PR] NE16 Linear Layer Kernels
- Add NE16 linear layer kernels, including a topology pass, NE16 templates, parsers, tile constraints, and bindings
- The topology pass recognizes NE16-compatible GEMM layers, adjusts the weight layout for the NE16, and converts the requant shift/scale to the NE16 format
- The template detects whether the input is signed; if so, it adds a +128 offset to the input at C runtime and compensates via the bias
- Add GAP9 SDK-based Dequant/Quant templates using CNN_Copy.c kernels, replacing the generic templates
- Add a generic DequantQuantMergePass that folds adjacent Dequant→Quant pairs into identity or RequantShift
- Add a GAP9-specific TopologyOptimizer (GAP9Optimizer) to replace PULPOptimizer

Bug fixes:
- Add output signedness check in QuantChecker
- Fix L3 DMA template (add proper casts) and remove the blocking L3 DMA hack
- Isolate dory memory functions from other libraries in CMakeLists so they compile with -Og while compute kernels compile with -O3
- Disable PULPAddRequantMergePass due to incorrect pattern matching when Add has multiple consumers

Co-authored-by: runwangdl <samanthawangdl@gmail.com>
1 parent 275537f commit 3a8bf48

17 files changed

Lines changed: 1087 additions & 44 deletions

File tree

Deeploy/Targets/GAP9/Bindings.py

Lines changed: 33 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,12 @@
1818
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
1919
from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
2020
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
21-
from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack
21+
from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma
2222
from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma
23+
from Deeploy.Targets.GAP9.Templates import GAP9SDKDequantQuantTemplate, NE16GEMMTemplate
2324
# Import templates from PULPOpen and Generic
2425
from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \
25-
FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
26+
FloatReduceSumTemplate, GatherTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
2627
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \
2728
GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \
2829
QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \
@@ -57,7 +58,7 @@
5758
MemoryManagementGeneration("L1"),
5859
TilingVariableReplacement("L2"),
5960
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
60-
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
61+
PULPL3Tiling("L3", "L2", GAP9L3Dma()), # Use GAP9-specific L3 DMA
6162
PULPProfileUntiled(),
6263
ArgumentStructGeneration(),
6364
L3MemoryAwareFunctionCallClosure(writeback = False),
@@ -76,7 +77,7 @@
7677
MemoryManagementGeneration("L1"),
7778
TilingVariableReplacement("L2"),
7879
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
79-
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
80+
PULPL3Tiling("L3", "L2", GAP9L3Dma()), # Use GAP9-specific L3 DMA
8081
PULPProfileUntiled(),
8182
ArgumentStructGeneration(),
8283
L3MemoryAwareFunctionCallClosure(writeback = False),
@@ -183,6 +184,26 @@
183184
GAP9Transformer) for type1, type2 in zip([int8_t, uint8_t, int8_t, uint8_t], [int8_t, uint8_t, uint8_t, int8_t])
184185
]
185186

187+
GAP9NE16RQSGEMMBindings = [
188+
NodeBinding(
189+
PULPLinearChecker([
190+
PointerClass(type1),
191+
PointerClass(int8_t),
192+
PointerClass(int32_t),
193+
PointerClass(uint8_t),
194+
PointerClass(uint8_t)
195+
], [PointerClass(type2)]), NE16GEMMTemplate.referenceTemplate, GAP9ClusterTransformer)
196+
for type1 in [int8_t, uint8_t]
197+
for type2 in [int8_t, uint8_t]
198+
]
199+
200+
GAP9NE16GEMMInt32Bindings = [
201+
NodeBinding(
202+
GEMMChecker([PointerClass(type1), PointerClass(int8_t),
203+
PointerClass(int32_t)], [PointerClass(int32_t)]), NE16GEMMTemplate.int32OutputTemplate,
204+
GAP9ClusterTransformer) for type1 in [int8_t, uint8_t]
205+
]
206+
186207
GAP9FloatGEMMBindings = [
187208
NodeBinding(
188209
GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
@@ -386,14 +407,17 @@
386407
]
387408

388409
GAP9QuantBindings = [
389-
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]), QuantTemplate.referenceTemplate,
390-
GAP9Transformer),
410+
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]),
411+
GAP9SDKDequantQuantTemplate.fp32QuantI8Template, GAP9Transformer),
412+
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(uint8_t)]),
413+
GAP9SDKDequantQuantTemplate.fp32QuantU8Template, GAP9Transformer),
391414
]
392415

393416
GAP9DequantBindings = [
394-
NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
395-
GAP9Transformer),
396-
] + [
417+
NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]),
418+
GAP9SDKDequantQuantTemplate.fp32DequantI8Template, GAP9Transformer),
419+
NodeBinding(DequantChecker([PointerClass(uint8_t)], [PointerClass(float32_t)]),
420+
GAP9SDKDequantQuantTemplate.fp32DequantU8Template, GAP9Transformer),
397421
NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
398422
GAP9Transformer),
399423
]

Deeploy/Targets/GAP9/DMA/L3Dma.py

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,7 @@
66
from typing import Dict, Tuple
77

88
from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
9-
from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \
10-
PerTensorWaitingStrategy
9+
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy
1110

1211

1312
class GAP9L3DmaFuture(Future):
@@ -29,7 +28,7 @@ class GAP9L3Dma(AsyncDma):
2928
_transferTemplates = {
3029
2:
3130
NodeTemplate(
32-
"pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
31+
"pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, (void *)${loc}, (uint32_t)${transfer_size}, (uint32_t)${stride}, (uint32_t)${length}, ${ext2loc}, &${future});"
3332
)
3433
}
3534
_waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture)
@@ -58,7 +57,3 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu
5857
"stride": strideExt[0],
5958
})
6059
return operatorRepresentation
61-
62-
63-
# Blocking adapter for L3 DMA (used in GAP9 L3 tiling)
64-
gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma())

Deeploy/Targets/GAP9/Parsers.py

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,37 @@
1+
# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Tuple
6+
7+
import onnx_graphsurgeon as gs
8+
9+
from Deeploy.DeeployTypes import NetworkContext
10+
from Deeploy.Targets.Generic.Parsers import GEMMParser, RQSParserInterface
11+
12+
13+
class NE16GEMMParser(GEMMParser, RQSParserInterface):
14+
"""Parser for NE16 RequantizedGemm nodes with 5 inputs [A, B, C, mul, scale_n]."""
15+
16+
def __init__(self):
17+
super().__init__(noBiasHoisting = True)
18+
19+
def parseNode(self, node: gs.Node) -> bool:
20+
ret_rqs = RQSParserInterface.parseNode(self, node)
21+
ret_matmul = GEMMParser.parseNode(self, node)
22+
ret = all([ret_rqs, ret_matmul, 'shift' in node.attrs, len(node.inputs) == 5])
23+
if ret:
24+
self.operatorRepresentation['shift'] = int(node.attrs['shift'].values)
25+
return ret
26+
27+
def parseNodeCtxt(self,
28+
ctxt: NetworkContext,
29+
node: gs.Node,
30+
channels_first: bool = True) -> Tuple[NetworkContext, bool]:
31+
newCtxt, ret = GEMMParser.parseNodeCtxt(self, ctxt, node, channels_first)
32+
if ret:
33+
inputs = ['A', 'B', 'C', 'mul', 'scale_n']
34+
for idx, inputNode in enumerate(node.inputs):
35+
self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
36+
return newCtxt, True
37+
return ctxt, False

Deeploy/Targets/GAP9/Platform.py

Lines changed: 57 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -5,24 +5,29 @@
55
import numpy as np
66
import onnx_graphsurgeon as gs
77

8+
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
9+
RemoveEmptyConvBiasPass, RemoveOnlySingletonReduceMeanPass
810
from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \
9-
NodeTemplate, StructBuffer, TransientBuffer, VariableBuffer
11+
NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
1012
from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
1113
from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
14+
from Deeploy.Targets.GAP9.Parsers import NE16GEMMParser
1215
from Deeploy.Targets.GAP9.Templates import AllocateTemplate, FreeTemplate
1316
# Import GAP9-specific tiler bindings
14-
from Deeploy.Targets.GAP9.Tiler import GAP9AddTilingReadyBindings, GAP9ConcatTilingReadyBindings, \
15-
GAP9Conv2DTilingReadyBindings, GAP9DWConv2DTilingReadyBindings, GAP9FlattenTilingReadyBindings, \
16-
GAP9FPGELUTilingReadyBindings, GAP9FPGEMMTilingReadyBindings, GAP9GatherTilingReadyBindings, \
17-
GAP9iHardswishTilingReadyBindings, GAP9iRMSNormTilingReadyBindings, GAP9iRQSGELUTilingReadyBindings, \
18-
GAP9LayernormTilingReadyBindings, GAP9MatMulTilingReadyBindings, GAP9MaxPool2DTilingReadyBindings, \
19-
GAP9MulTilingReadyBindings, GAP9ReduceSumTilingReadyBindings, GAP9ReluTilingReadyBindings, \
17+
from Deeploy.Targets.GAP9.Tiler import DeQuantTilingReadyBindings, GAP9AddTilingReadyBindings, \
18+
GAP9ConcatTilingReadyBindings, GAP9Conv2DTilingReadyBindings, GAP9DWConv2DTilingReadyBindings, \
19+
GAP9FlattenTilingReadyBindings, GAP9FPGELUTilingReadyBindings, GAP9FPGEMMTilingReadyBindings, \
20+
GAP9GatherTilingReadyBindings, GAP9iHardswishTilingReadyBindings, GAP9iRMSNormTilingReadyBindings, \
21+
GAP9iRQSGELUTilingReadyBindings, GAP9LayernormTilingReadyBindings, GAP9MatMulTilingReadyBindings, \
22+
GAP9MaxPool2DTilingReadyBindings, GAP9MulTilingReadyBindings, GAP9NE16GEMMInt32TilingReadyBindings, \
23+
GAP9NE16RQSGEMMTilingReadyBindings, GAP9ReduceSumTilingReadyBindings, GAP9ReluTilingReadyBindings, \
2024
GAP9RQAddTilingReadyBindings, GAP9RQSConv2DTilingReadyBindings, GAP9RQSDWConv2DTilingReadyBindings, \
2125
GAP9RQSGEMMTilingReadyBindings, GAP9RQSiHardswishTilingReadyBindings, GAP9RQSMatrixVecTilingReadyBindings, \
2226
GAP9RQSTallGEMMTilingReadyBindings, GAP9RQSTilingReadyBindings, GAP9SGDTilingReadyBindings, \
2327
GAP9SoftmaxCrossEntropyGradTilingReadyBindings, GAP9SoftmaxCrossEntropyTilingReadyBindings, \
2428
GAP9SoftmaxGradTilingReadyBindings, GAP9SoftmaxTilingReadyBindings, GAP9TransposeTilingReadyBindings, \
25-
GAP9UniformRQSTilingReadyBindings
29+
GAP9UniformRQSTilingReadyBindings, QuantTilingReadyBindings
30+
from Deeploy.Targets.GAP9.TopologyOptimizationPasses.Passes import NE16AdjustGEMMWeightLayoutPass
2631
from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \
2732
BasicRQIntegerDivBinding
2833
from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \
@@ -37,12 +42,18 @@
3742
SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \
3843
TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser
3944
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
40-
from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPDMASliceBindings, \
41-
PULPDWConv1DBinding, PULPReduceMeanBindings, PULPRQSConv1DBindings, PULPSliceBindings
45+
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, DequantQuantMergePass, \
46+
IntegerDivRequantMergePass, MatMulAddMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, \
47+
QuantPatternPass, RQSSplitPass, SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, \
48+
iHardswishRequantMergePass
49+
from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \
50+
PULPDMASliceBindings, PULPDWConv1DBinding, PULPReduceMeanBindings, PULPRQSConv1DBindings, PULPSliceBindings
4251
from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer
4352
from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \
4453
PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \
4554
PULPTallGEMMParser
55+
from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPConvRequantMergePass, \
56+
PULPGEMMRequantMergePass, PULPMatMulRequantMergePass
4657

4758
# Create GAP9-specific NodeMappers
4859
GAP9_RQAddMapper = NodeMapper(RQAddParser(), GAP9RQAddTilingReadyBindings)
@@ -90,9 +101,37 @@
90101
GAP9_SoftmaxCrossEntropyLossGradMapper = NodeMapper(SoftmaxCrossEntropyLossGradParser(),
91102
GAP9SoftmaxCrossEntropyGradTilingReadyBindings)
92103
GAP9_SGDMapper = NodeMapper(SGDParser(), GAP9SGDTilingReadyBindings)
93-
GAP9_QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings)
94-
GAP9_DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings)
104+
GAP9_QuantMapper = NodeMapper(QuantParser(), QuantTilingReadyBindings)
105+
GAP9_DequantMapper = NodeMapper(DequantParser(), DeQuantTilingReadyBindings)
95106
GAP9_GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings)
107+
GAP9_NE16GEMMMapper = NodeMapper(NE16GEMMParser(), GAP9NE16RQSGEMMTilingReadyBindings)
108+
GAP9_NE16GEMMInt32Mapper = NodeMapper(GEMMParser(), GAP9NE16GEMMInt32TilingReadyBindings)
109+
110+
GAP9Optimizer = TopologyOptimizer(
111+
[
112+
QuantPatternPass(),
113+
DequantPatternPass(),
114+
DequantQuantMergePass(),
115+
MatMulAddMergePass(),
116+
SkipEmptyConcatPass(),
117+
SkipUnityRequantPass(previous_op_regex = "Concat", num_inputs = 2),
118+
SkipUnityRequantPass(previous_op_regex = "Reshape|Transpose", num_inputs = 1),
119+
SkipUnityRequantPass(previous_op_regex = "Reshape|Transpose", num_inputs = 1),
120+
RQSSplitPass(),
121+
MergeTrueIntegerDivRequantShiftPass(),
122+
IntegerDivRequantMergePass(),
123+
iGELURequantMergePass(),
124+
iHardswishRequantMergePass(),
125+
PULPConvRequantMergePass(),
126+
MergeConstAddAndRequantPass(),
127+
PULPGEMMRequantMergePass(),
128+
PULPMatMulRequantMergePass(),
129+
# PULPAddRequantMergePass(),
130+
RemoveEmptyConvBiasPass(),
131+
RemoveOnlySingletonReduceMeanPass(),
132+
NE16AdjustGEMMWeightLayoutPass(),
133+
],
134+
name = "GAP9Optimizer")
96135

97136
# GAP9-specific mapping using ClDma
98137
GAP9Mapping = {
@@ -101,9 +140,9 @@
101140
'RequantizedConv':
102141
PULPRQSConvLayer([GAP9_Conv2DMapper, GAP9_DWConv2DMapper, GAP9_Conv1DMapper, GAP9_DWConv1DMapper]),
103142
'RequantizedGemm':
104-
PULPRQSGEMMLayer([GAP9_MatrixVecMapper, GAP9_TallGEMMMapper, GAP9_GEMMMapper]),
143+
PULPRQSGEMMLayer([GAP9_NE16GEMMMapper, GAP9_MatrixVecMapper, GAP9_TallGEMMMapper, GAP9_GEMMMapper]),
105144
'Gemm':
106-
GEMMLayer([GAP9_FloatGEMMMapper, GAP9_GEMMDequantMapper]),
145+
GEMMLayer([GAP9_NE16GEMMInt32Mapper, GAP9_FloatGEMMMapper, GAP9_GEMMDequantMapper]),
107146
'Gelu':
108147
GELULayer([GAP9_GELUMapper]),
109148
'LayerNormalization':
@@ -244,7 +283,10 @@ class GAP9StructBuffer(StructBuffer):
244283
deallocTemplate = NodeTemplate("")
245284

246285

247-
_includeList = ["pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h"]
286+
_includeList = [
287+
"pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h", "CNN_BasicKernels_fp32.h",
288+
"CNN_BasicKernels_NE16.h", "CNN_Copy.h", "ne16_utils.h"
289+
]
248290

249291

250292
class GAP9ClusterEngine(DeploymentEngine):

0 commit comments

Comments
 (0)