pulp-platform · runwangdl · Apr 13, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+---
+name: _runner-gap9-w-ne16-tiled
+
+# Reusable workflow: the only trigger is `workflow_call`, and all three
+# inputs are mandatory strings.
+"on":
+  workflow_call:
+    inputs:
+      # Used as the job's `runs-on` value.
+      runner:
+        type: string
+        required: true
+      # Used as the job's `container.image` value.
+      docker-image:
+        type: string
+        required: true
+      # Passed to `pytest -m` in the "Run Test" step.
+      pytest-markers:
+        type: string
+        required: true
+
+jobs:
+  # One job: install Deeploy, run the marker-selected pytest suite, then
+  # print NE16 tiling profiles, all inside the caller-supplied container.
+  test-runner-gap9-w-ne16-tiled:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.docker-image }}
+    steps:
+      # Registers every path ('*') as a git safe.directory before checkout.
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
+      # Submodules are fetched recursively together with the repository.
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      # Editable install inside the GAP9 SDK virtualenv. Both the board-config
+      # `source` and the `pip install` end in `|| true`, so neither can fail
+      # this step.
+      # NOTE(review): a failed install is therefore silent -- confirm intended.
+      - name: Build Deeploy
+        shell: bash
+        run: |
+          source /app/install/gap9-sdk/.gap9-venv/bin/activate
+          source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
+          pip install -e . || true
+          deactivate
+      # Restore-only cache action; this job has no step that saves
+      # /app/.ccache back.
+      - name: Cache ccache
+        uses: actions/cache/restore@v4
+        with:
+          path: /app/.ccache
+          key: ccache-gap9
+      # Runs test_platforms.py from DeeployTest with the caller's marker
+      # expression, with CCACHE_DIR pointing at /app/.ccache.
+      - name: Run Test
+        shell: bash
+        run: |
+          source /app/install/gap9-sdk/.gap9-venv/bin/activate
+          source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
+          export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation
+          export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9
+          cd DeeployTest
+          mkdir -p /app/.ccache
+          export CCACHE_DIR=/app/.ccache
+          pytest test_platforms.py -v -m "${{ inputs.pytest-markers }}"
+          deactivate
+      # `if: always()` keeps this step running even when an earlier step
+      # failed. Each loop entry has the form "<test dir> --l1 <size>"; awk
+      # takes field 1 as the directory and field 3 as the L1 size. Every
+      # runner invocation ends in `|| true`, so a failing profile run does not
+      # fail the step.
+      - name: NE16 Profiling (cycle counts)
+        if: always()
+        shell: bash
+        run: |
+          source /app/install/gap9-sdk/.gap9-venv/bin/activate
+          source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
+          export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation
+          export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9
+          mkdir -p /app/.ccache
+          export CCACHE_DIR=/app/.ccache
+          cd DeeployTest
+          rm -rf TEST_GAP9_W_NE16/build_master
+          for test in \
+            "Tests/Kernels/Integer/Conv/Dense_2D_RQ_NE16Bench --l1 32000" \
+            "Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ --l1 32000" \
+            "Tests/Kernels/Integer/Conv/Dense_2D_RQ --l1 32000"; do
+            dir=$(echo $test | awk '{print $1}')
+            l1=$(echo $test | awk '{print $3}')
+            echo "========================================"
+            echo "PROFILING: $dir (L1=$l1)"
+            echo "========================================"
+            python3 deeployRunner_tiled_gap9_w_ne16.py \
+              -t "$dir" --l1 "$l1" \
+              --toolchain GCC --toolchain-install-dir /app/install/gcc/gap9 \
+              --cores 8 --enable-3x3 --profileTiling -v \
+              -D CMAKE_INTERPROCEDURAL_OPTIMIZATION=OFF 2>&1 || true
+          done
+          deactivate
@@ -33,10 +33,10 @@ jobs:
       - name: Build Deeploy
         shell: bash
         run: pip install -e .
-      - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores.
+      - name: Run Test # 2-way parallel: 4-way OOMs the GitHub runner on the FP32 GEMM/TransB build.
         run: |
           cd DeeployTest
           mkdir -p /app/.ccache
           export CCACHE_DIR=/app/.ccache
-          pytest test_platforms.py -v -n 4 -m "snitch_tiled and ${{ inputs.pytest-marker }}"
+          pytest test_platforms.py -v -n 2 -m "snitch_tiled and ${{ inputs.pytest-marker }}"
         shell: bash
@@ -25,6 +25,9 @@ concurrency:
 
 jobs:
   select-env:
+    # ghcr.io/pulp-platform/deeploy-gap9 is private; only upstream's
+    # self-hosted runners have credentials. Skip cleanly on forks.
+    if: github.repository == 'pulp-platform/Deeploy'
     uses: ./.github/workflows/_select-env.yml
     with:
       docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}

@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+---
+name: CI • GAP9 + NE16 (Tiled)
+
+"on":
+  push:
+    branches:
+      - "**"
+    tags:
+      - "v*.*.*"
+  pull_request:
+  workflow_dispatch:
+    inputs:
+      docker_image_deeploy:
+        description: "Deeploy Image to use"
+        required: false
+        default: "ghcr.io/pulp-platform/deeploy-gap9:devel"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  select-env:
+    # The GAP9 + NE16 image is hosted in pulp-platform's private ghcr.io
+    # registry; only upstream's self-hosted runners have credentials to
+    # pull it. On forks the docker pull always returns "denied", so skip
+    # the whole pipeline cleanly there. (Same constraint as the existing
+    # ci-platform-gap9{,-tiled}.yml jobs.)
+    if: github.repository == 'pulp-platform/Deeploy'
+    uses: ./.github/workflows/_select-env.yml
+    with:
+      docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}
+
+  # The four jobs below all call the same reusable runner workflow and take
+  # their runner label and container image from the select-env outputs. They
+  # differ only in the pytest marker expression:
+  # {kernels, models} x {singlebuffer, doublebuffer}, always with `l2`.
+
+  # Kernel tests, single-buffered.
+  gap9-w-ne16-kernels-tiled-singlebuffer-L2:
+    uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
+    needs: select-env
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-markers: "gap9_w_ne16_tiled and kernels and singlebuffer and l2"
+
+  # Kernel tests, double-buffered.
+  gap9-w-ne16-kernels-tiled-doublebuffer-L2:
+    uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
+    needs: select-env
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-markers: "gap9_w_ne16_tiled and kernels and doublebuffer and l2"
+
+  # Model tests, single-buffered.
+  gap9-w-ne16-models-tiled-singlebuffer-L2:
+    uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
+    needs: select-env
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-markers: "gap9_w_ne16_tiled and models and singlebuffer and l2"
+
+  # Model tests, double-buffered.
+  gap9-w-ne16-models-tiled-doublebuffer-L2:
+    uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
+    needs: select-env
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-markers: "gap9_w_ne16_tiled and models and doublebuffer and l2"
@@ -26,6 +26,9 @@ concurrency:
 
 jobs:
   select-env:
+    # ghcr.io/pulp-platform/deeploy-gap9 is private; only upstream's
+    # self-hosted runners have credentials. Skip cleanly on forks.
+    if: github.repository == 'pulp-platform/Deeploy'
     uses: ./.github/workflows/_select-env.yml
     with:
       docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}

@@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC)
   set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif()
 
-set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)")
-set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch)
+set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, GAP9_w_NE16, Generic, Snitch)")
+set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 GAP9_w_NE16 Generic Snitch)
 
 if(platform STREQUAL MemPool)
   message(STATUS "Building for platform 'MemPool'")
@@ -33,8 +33,8 @@ elseif(platform STREQUAL Siracusa_w_neureka)
   message(STATUS "Building for platform 'Siracusa_w_neureka'")
 elseif(platform STREQUAL PULPOpen)
   message(STATUS "Building for platform 'PULP-Open'")
-elseif(platform STREQUAL GAP9)
-  message(STATUS "Building for platform 'GAP9'")
+elseif(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
+  message(STATUS "Building for platform '${platform}'")
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
 
   # Select SDK config based on simulator type
@@ -62,7 +62,7 @@ endif()
 # Import useful functions / macros
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/Util.cmake)
 # Only if not GAP9
-if(NOT platform STREQUAL GAP9)
+if(NOT platform STREQUAL GAP9 AND NOT platform STREQUAL GAP9_w_NE16)
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake)
 endif()
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/simulation.cmake)
@@ -231,7 +231,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor
 
 endif()
 
-if(platform STREQUAL GAP9)
+if(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
   project(${TESTNAME} LANGUAGES C ASM)
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_gvsoc.cmake)
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_board.cmake)

@@ -18,11 +18,12 @@
 from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
 from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
-from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack
+from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma
 from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma
+from Deeploy.Targets.GAP9.Templates import GAP9SDKDequantQuantTemplate, NE16GEMMTemplate
 # Import templates from PULPOpen and Generic
 from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \
-    FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
+    FloatReduceSumTemplate, GatherTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \
     GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \
     QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \
@@ -57,7 +58,7 @@
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
-    PULPL3Tiling("L3", "L2", gap9L3DmaHack),  # Use GAP9-specific L3 DMA
+    PULPL3Tiling("L3", "L2", GAP9L3Dma()),  # Use GAP9-specific L3 DMA
     PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),
@@ -76,7 +77,7 @@
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
-    PULPL3Tiling("L3", "L2", gap9L3DmaHack),  # Use GAP9-specific L3 DMA
+    PULPL3Tiling("L3", "L2", GAP9L3Dma()),  # Use GAP9-specific L3 DMA
     PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),
@@ -183,6 +184,26 @@
         GAP9Transformer) for type1, type2 in zip([int8_t, uint8_t, int8_t, uint8_t], [int8_t, uint8_t, uint8_t, int8_t])
 ]
 
+GAP9NE16RQSGEMMBindings = [
+    NodeBinding(
+        PULPLinearChecker([
+            PointerClass(type1),
+            PointerClass(int8_t),
+            PointerClass(int32_t),
+            PointerClass(uint8_t),
+            PointerClass(uint8_t)
+        ], [PointerClass(type2)]), NE16GEMMTemplate.referenceTemplate, GAP9ClusterTransformer)
+    for type1 in [int8_t, uint8_t]
+    for type2 in [int8_t, uint8_t]
+]
+
+GAP9NE16GEMMInt32Bindings = [
+    NodeBinding(
+        GEMMChecker([PointerClass(type1), PointerClass(int8_t),
+                     PointerClass(int32_t)], [PointerClass(int32_t)]), NE16GEMMTemplate.int32OutputTemplate,
+        GAP9ClusterTransformer) for type1 in [int8_t, uint8_t]
+]
+
 GAP9FloatGEMMBindings = [
     NodeBinding(
         GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
@@ -386,14 +407,17 @@
 ]
 
 GAP9QuantBindings = [
-    NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]), QuantTemplate.referenceTemplate,
-                GAP9Transformer),
+    NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]),
+                GAP9SDKDequantQuantTemplate.fp32QuantI8Template, GAP9Transformer),
+    NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(uint8_t)]),
+                GAP9SDKDequantQuantTemplate.fp32QuantU8Template, GAP9Transformer),
 ]
 
 GAP9DequantBindings = [
-    NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
-                GAP9Transformer),
-] + [
+    NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]),
+                GAP9SDKDequantQuantTemplate.fp32DequantI8Template, GAP9Transformer),
+    NodeBinding(DequantChecker([PointerClass(uint8_t)], [PointerClass(float32_t)]),
+                GAP9SDKDequantQuantTemplate.fp32DequantU8Template, GAP9Transformer),
     NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
                 GAP9Transformer),
 ]
@@ -6,8 +6,7 @@
 from typing import Dict, Tuple
 
 from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
-from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \
-    PerTensorWaitingStrategy
+from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy
 
 
 class GAP9L3DmaFuture(Future):
@@ -29,7 +28,7 @@ class GAP9L3Dma(AsyncDma):
     _transferTemplates = {
         2:
             NodeTemplate(
-                "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
+                "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, (void *)${loc}, (uint32_t)${transfer_size}, (uint32_t)${stride}, (uint32_t)${length}, ${ext2loc}, &${future});"
             )
     }
     _waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture)
@@ -58,7 +57,3 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu
             "stride": strideExt[0],
         })
         return operatorRepresentation
-
-
-# Blocking adapter for L3 DMA (used in GAP9 L3 tiling)
-gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma())
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Tuple
+
+import onnx_graphsurgeon as gs
+
+from Deeploy.DeeployTypes import NetworkContext
+from Deeploy.Targets.Generic.Parsers import GEMMParser, RQSParserInterface
+
+
+class NE16GEMMParser(GEMMParser, RQSParserInterface):
+    """Parser for NE16 RequantizedGemm nodes with 5 inputs [A, B, C, mul, scale_n]."""
+
+    def __init__(self):
+        super().__init__(noBiasHoisting = True)
+
+    def parseNode(self, node: gs.Node) -> bool:
+        ret_rqs = RQSParserInterface.parseNode(self, node)
+        ret_matmul = GEMMParser.parseNode(self, node)
+        ret = all([ret_rqs, ret_matmul, 'shift' in node.attrs, len(node.inputs) == 5])
+        if ret:
+            self.operatorRepresentation['shift'] = int(node.attrs['shift'].values)
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        newCtxt, ret = GEMMParser.parseNodeCtxt(self, ctxt, node, channels_first)
+        if ret:
+            inputs = ['A', 'B', 'C', 'mul', 'scale_n']
+            for idx, inputNode in enumerate(node.inputs):
+                self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
+            return newCtxt, True
+        return ctxt, False