[NE16] isolate NE16 GEMM mappers and headers from shared GAP9 platform

runwangdl · runwangdl · commit 11a641b8cd3f · 2026-04-18T21:25:16.000Z
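NE16-specific header includes (CNN_BasicKernels_NE16.h, ne16_utils.h),
the NE16 GEMM mappers, and the NE16AdjustGEMMWeightLayoutPass
registration had been added to the shared GAP9 platform code, causing
plain GAP9 builds to fail with missing NE16 headers. Move the mapper
definitions, the include entries, and the pass registration into
NE16/Platform.py so only GAP9_w_NE16 picks them up. The parser, the
tiler bindings, and the pass implementation stay in the GAP9 packages
and are imported from there.

Note: the NE16 cluster engine now includes the SDK NE16 headers again.
The previous include list left out CNN_BasicKernels_NE16.h and
ne16_utils.h to avoid NE16_REG_* macro redefinitions against pulp-nnx's
ne16_task_defs.h under -Werror, and the comment documenting that is
removed by this change.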
diff --git a/Deeploy/Targets/GAP9/Platform.py b/Deeploy/Targets/GAP9/Platform.py
@@ -11,23 +11,21 @@
     NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
 from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
-from Deeploy.Targets.GAP9.Parsers import NE16GEMMParser
 from Deeploy.Targets.GAP9.Templates import AllocateTemplate, FreeTemplate
 # Import GAP9-specific tiler bindings
 from Deeploy.Targets.GAP9.Tiler import DeQuantTilingReadyBindings, GAP9AddTilingReadyBindings, \
     GAP9ConcatTilingReadyBindings, GAP9Conv2DTilingReadyBindings, GAP9DWConv2DTilingReadyBindings, \
     GAP9FlattenTilingReadyBindings, GAP9FPGELUTilingReadyBindings, GAP9FPGEMMTilingReadyBindings, \
     GAP9GatherTilingReadyBindings, GAP9iHardswishTilingReadyBindings, GAP9iRMSNormTilingReadyBindings, \
     GAP9iRQSGELUTilingReadyBindings, GAP9LayernormTilingReadyBindings, GAP9MatMulTilingReadyBindings, \
-    GAP9MaxPool2DTilingReadyBindings, GAP9MulTilingReadyBindings, GAP9NE16GEMMInt32TilingReadyBindings, \
-    GAP9NE16RQSGEMMTilingReadyBindings, GAP9ReduceSumTilingReadyBindings, GAP9ReluTilingReadyBindings, \
+    GAP9MaxPool2DTilingReadyBindings, GAP9MulTilingReadyBindings, GAP9ReduceSumTilingReadyBindings, \
+    GAP9ReluTilingReadyBindings, \
     GAP9RQAddTilingReadyBindings, GAP9RQSConv2DTilingReadyBindings, GAP9RQSDWConv2DTilingReadyBindings, \
     GAP9RQSGEMMTilingReadyBindings, GAP9RQSiHardswishTilingReadyBindings, GAP9RQSMatrixVecTilingReadyBindings, \
     GAP9RQSTallGEMMTilingReadyBindings, GAP9RQSTilingReadyBindings, GAP9SGDTilingReadyBindings, \
     GAP9SoftmaxCrossEntropyGradTilingReadyBindings, GAP9SoftmaxCrossEntropyTilingReadyBindings, \
     GAP9SoftmaxGradTilingReadyBindings, GAP9SoftmaxTilingReadyBindings, GAP9TransposeTilingReadyBindings, \
     GAP9UniformRQSTilingReadyBindings, QuantTilingReadyBindings
-from Deeploy.Targets.GAP9.TopologyOptimizationPasses.Passes import NE16AdjustGEMMWeightLayoutPass
 from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \
     BasicRQIntegerDivBinding
 from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \
@@ -104,8 +102,6 @@
 GAP9_QuantMapper = NodeMapper(QuantParser(), QuantTilingReadyBindings)
 GAP9_DequantMapper = NodeMapper(DequantParser(), DeQuantTilingReadyBindings)
 GAP9_GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings)
-GAP9_NE16GEMMMapper = NodeMapper(NE16GEMMParser(), GAP9NE16RQSGEMMTilingReadyBindings)
-GAP9_NE16GEMMInt32Mapper = NodeMapper(GEMMParser(), GAP9NE16GEMMInt32TilingReadyBindings)
 
 GAP9Optimizer = TopologyOptimizer(
     [
@@ -129,7 +125,6 @@
         # PULPAddRequantMergePass(),
         RemoveEmptyConvBiasPass(),
         RemoveOnlySingletonReduceMeanPass(),
-        NE16AdjustGEMMWeightLayoutPass(),
     ],
     name = "GAP9Optimizer")
 
@@ -140,9 +135,9 @@
     'RequantizedConv':
         PULPRQSConvLayer([GAP9_Conv2DMapper, GAP9_DWConv2DMapper, GAP9_Conv1DMapper, GAP9_DWConv1DMapper]),
     'RequantizedGemm':
-        PULPRQSGEMMLayer([GAP9_NE16GEMMMapper, GAP9_MatrixVecMapper, GAP9_TallGEMMMapper, GAP9_GEMMMapper]),
+        PULPRQSGEMMLayer([GAP9_MatrixVecMapper, GAP9_TallGEMMMapper, GAP9_GEMMMapper]),
     'Gemm':
-        GEMMLayer([GAP9_NE16GEMMInt32Mapper, GAP9_FloatGEMMMapper, GAP9_GEMMDequantMapper]),
+        GEMMLayer([GAP9_FloatGEMMMapper, GAP9_GEMMDequantMapper]),
     'Gelu':
         GELULayer([GAP9_GELUMapper]),
     'LayerNormalization':
@@ -284,8 +279,7 @@ class GAP9StructBuffer(StructBuffer):
 
 
 _includeList = [
-    "pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h", "CNN_BasicKernels_fp32.h",
-    "CNN_BasicKernels_NE16.h", "CNN_Copy.h", "ne16_utils.h"
+    "pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h", "CNN_BasicKernels_fp32.h", "CNN_Copy.h"
 ]
 
 
diff --git a/Deeploy/Targets/NE16/Platform.py b/Deeploy/Targets/NE16/Platform.py
@@ -6,15 +6,39 @@
 
 from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
     RequantizedGemmToPwPass
-from Deeploy.DeeployTypes import TopologyOptimizer
+from Deeploy.DeeployTypes import NodeMapper, TopologyOptimizer
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
-from Deeploy.Targets.GAP9.Platform import GAP9ClusterEngine, GAP9ConstantBuffer, GAP9Platform, GAP9StructBuffer, \
-    GAP9TransientBuffer, GAP9VariableBuffer, MemoryGAP9Platform, MemoryGAP9PlatformWrapper
+from Deeploy.Targets.GAP9.Parsers import NE16GEMMParser
+from Deeploy.Targets.GAP9.Platform import GAP9ClusterEngine, GAP9ConstantBuffer, GAP9Mapping, GAP9Platform, \
+    GAP9StructBuffer, GAP9TransientBuffer, GAP9VariableBuffer, GAP9_FloatGEMMMapper, GAP9_GEMMDequantMapper, \
+    GAP9_GEMMMapper, GAP9_MatrixVecMapper, GAP9_TallGEMMMapper, MemoryGAP9Platform, MemoryGAP9PlatformWrapper
+from Deeploy.Targets.GAP9.Tiler import GAP9NE16GEMMInt32TilingReadyBindings, GAP9NE16RQSGEMMTilingReadyBindings
+from Deeploy.Targets.GAP9.TopologyOptimizationPasses.Passes import NE16AdjustGEMMWeightLayoutPass
+from Deeploy.Targets.Generic.Layers import GEMMLayer
+from Deeploy.Targets.Generic.Parsers import GEMMParser
 from Deeploy.Targets.NE16.Engine import NE16Engine
+from Deeploy.Targets.PULPOpen.Layers import PULPRQSGEMMLayer
 from Deeploy.Targets.PULPOpen.Platform import PULPOptimizer
 
+# NE16-specific GEMM mappers (run on the cluster engine using GAP9 SDK NE16 kernels)
+GAP9_NE16GEMMMapper = NodeMapper(NE16GEMMParser(), GAP9NE16RQSGEMMTilingReadyBindings)
+GAP9_NE16GEMMInt32Mapper = NodeMapper(GEMMParser(), GAP9NE16GEMMInt32TilingReadyBindings)
+
+# Build a cluster mapping that includes the NE16 GEMM mappers
+_NE16ClusterMapping = dict(GAP9Mapping)
+_NE16ClusterMapping['RequantizedGemm'] = PULPRQSGEMMLayer(
+    [GAP9_NE16GEMMMapper, GAP9_MatrixVecMapper, GAP9_TallGEMMMapper, GAP9_GEMMMapper])
+_NE16ClusterMapping['Gemm'] = GEMMLayer([GAP9_NE16GEMMInt32Mapper, GAP9_FloatGEMMMapper, GAP9_GEMMDequantMapper])
+
+# NE16 include list for the cluster engine: add SDK NE16 headers for GEMM kernels
+_NE16ClusterIncludeList = [
+    "pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h", "CNN_BasicKernels_fp32.h",
+    "CNN_BasicKernels_NE16.h", "CNN_Copy.h", "ne16_utils.h"
+]
+
 NE16Optimizer = TopologyOptimizer([
     *PULPOptimizer.passes,
+    NE16AdjustGEMMWeightLayoutPass(),
     RequantizedGemmToPwPass(),
 ], name = "NE16Optimizer")
 
@@ -28,15 +52,10 @@ def __init__(self,
                  structBuffer = GAP9StructBuffer,
                  transientBuffer = GAP9TransientBuffer) -> None:
         if engines is None:
-            # Drop SDK NE16 headers from the cluster engine include list so the
-            # generated Network.c does not pull in CNN_BasicKernels_NE16.h /
-            # ne16_utils.h alongside pulp-nnx's ne16_task_defs.h
-            # (NE16_REG_* macros are defined in both, causing -Werror redefs).
             cluster = GAP9ClusterEngine(
                 "GAP9Cluster",
-                includeList = [
-                    "pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h", "CNN_BasicKernels_fp32.h"
-                ],
+                Mapping = _NE16ClusterMapping,
+                includeList = _NE16ClusterIncludeList,
             )
             engines = [NE16Engine("NE16"), cluster]
         super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)
@@ -54,15 +73,10 @@ def __init__(self,
                  structBuffer = GAP9StructBuffer,
                  transientBuffer = GAP9TransientBuffer) -> None:
         if engines is None:
-            # Drop SDK NE16 headers from the cluster engine include list so the
-            # generated Network.c does not pull in CNN_BasicKernels_NE16.h /
-            # ne16_utils.h alongside pulp-nnx's ne16_task_defs.h
-            # (NE16_REG_* macros are defined in both, causing -Werror redefs).
             cluster = GAP9ClusterEngine(
                 "GAP9Cluster",
-                includeList = [
-                    "pmsis.h", "DeeployGAP9Math.h", "pulp_nn_kernels.h", "DeeployMchan.h", "CNN_BasicKernels_fp32.h"
-                ],
+                Mapping = _NE16ClusterMapping,
+                includeList = _NE16ClusterIncludeList,
             )
             engines = [NE16Engine("NE16"), cluster]
         super().__init__(memoryHierarchy, defaultTargetMemoryLevel, engines, variableBuffer, constantBuffer,