Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
275537f
[NE16] Add GAP9_w_NE16 platform: NE16 accelerator Engine on GAP9
runwangdl Apr 13, 2026
3a8bf48
[Deeploy PR] NE16 Linear Layer Kernels
pauloohaha Apr 14, 2026
6c8ae2b
[NE16] integrate Pu DENG's NE16 Linear PR with NE16-w platform
runwangdl Apr 14, 2026
e46a09a
[CI] skip gap9 pipelines on forks (private docker image)
runwangdl Apr 14, 2026
ddde88f
[Quant] add uint8 output bindings to QuantChecker / DequantChecker
runwangdl Apr 14, 2026
2373ff9
[CI] snitch-tiled: drop xdist parallelism 4 -> 2
runwangdl Apr 14, 2026
a47ae48
[NE16] add CNN_Libraries_HWPE include path for GAP9 SDK NE16 kernels
runwangdl Apr 18, 2026
6a3793a
[NE16] fix v4s/v4u type mismatch in NE16 GEMM template
runwangdl Apr 19, 2026
e6355a9
[CI] add NE16 profiling step to GAP9+NE16 tiled workflow
runwangdl Apr 19, 2026
74fac92
[GAP9] fix profileTiling: add CycleCounter.h to include lists and cmake
runwangdl Apr 19, 2026
2d75f76
[CI] fix NE16 profiling: disable LTO and clean build dir
runwangdl Apr 19, 2026
ee2f1b9
[NE16] wire --enable-3x3 flag in generateNetwork.py
runwangdl Apr 19, 2026
ac3d172
[NE16] pass --enable-3x3 from deeployRunner to generateNetwork
runwangdl Apr 19, 2026
822dd32
[NE16] add large Dense 3x3 conv benchmark for NE16 utilization test
runwangdl Apr 19, 2026
375a632
[NE16] enable MobileNetV1 (MLPerf VisualWakeWords) as L2 single-buffe…
runwangdl May 14, 2026
b8a518a
[NE16] engine-aware DW NHWC layout: fix mixed-engine graphs with --en…
runwangdl May 14, 2026
ecab71e
[NE16] enable --enableStrides for model tests: stride-2 convs on NE16
runwangdl May 14, 2026
bc0b99d
[NE16] enable L2 double-buffer model tests + fix Dense WeightCube rank
runwangdl May 14, 2026
7f6ce26
[CI] add GAP9_w_NE16 model jobs (singlebuffer + doublebuffer L2)
runwangdl May 14, 2026
438f100
[CI] gap9-w-ne16-tiled: mark workspace safe (match other runner workf…
runwangdl May 14, 2026
e010a75
[GAP9] gate NE16AdjustGEMMWeightLayoutPass on engine == "NE16"
runwangdl May 14, 2026
c01425b
[GAP9] re-enable PULPAddRequantMergePass; harden RQSSplitPass int32 cast
runwangdl May 14, 2026
51cd95e
[GAP9] unmask FP32 GEMM and SkipConnection paths
runwangdl May 14, 2026
9be1c57
[GAP9] lint: drop unused MatMulAddMergePass import + yapf format
runwangdl May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions .github/workflows/_runner-gap9-w-ne16-tiled.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

---
name: _runner-gap9-w-ne16-tiled

"on":
workflow_call:
inputs:
runner:
required: true
type: string
docker-image:
required: true
type: string
pytest-markers:
required: true
type: string

jobs:
test-runner-gap9-w-ne16-tiled:
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.docker-image }}
steps:
- name: Mark workspace as safe
run: git config --global --add safe.directory '*'
- name: Checkout Repo
uses: actions/checkout@v4
with:
submodules: recursive
- name: Build Deeploy
shell: bash
run: |
source /app/install/gap9-sdk/.gap9-venv/bin/activate
source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
pip install -e . || true
deactivate
- name: Cache ccache
uses: actions/cache/restore@v4
with:
path: /app/.ccache
key: ccache-gap9
- name: Run Test
run: |
source /app/install/gap9-sdk/.gap9-venv/bin/activate
source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation
export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9
cd DeeployTest
mkdir -p /app/.ccache
export CCACHE_DIR=/app/.ccache
pytest test_platforms.py -v -m "${{ inputs.pytest-markers }}"
deactivate
shell: bash
- name: NE16 Profiling (cycle counts)
if: always()
run: |
source /app/install/gap9-sdk/.gap9-venv/bin/activate
source /app/install/gap9-sdk/configs/gap9_evk_audio.sh || true
export GVSOC_INSTALL_DIR=/app/install/gap9-sdk/install/workstation
export GAP_RISCV_GCC_TOOLCHAIN=/app/install/gcc/gap9
mkdir -p /app/.ccache
export CCACHE_DIR=/app/.ccache
cd DeeployTest
rm -rf TEST_GAP9_W_NE16/build_master
for test in \
"Tests/Kernels/Integer/Conv/Dense_2D_RQ_NE16Bench --l1 32000" \
"Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ --l1 32000" \
"Tests/Kernels/Integer/Conv/Dense_2D_RQ --l1 32000"; do
dir=$(echo $test | awk '{print $1}')
l1=$(echo $test | awk '{print $3}')
echo "========================================"
echo "PROFILING: $dir (L1=$l1)"
echo "========================================"
python3 deeployRunner_tiled_gap9_w_ne16.py \
-t "$dir" --l1 "$l1" \
--toolchain GCC --toolchain-install-dir /app/install/gcc/gap9 \
--cores 8 --enable-3x3 --profileTiling -v \
-D CMAKE_INTERPROCEDURAL_OPTIMIZATION=OFF 2>&1 || true
done
deactivate
shell: bash
4 changes: 2 additions & 2 deletions .github/workflows/_runner-snitch-tiled-sequential.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ jobs:
- name: Build Deeploy
shell: bash
run: pip install -e .
- name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores.
- name: Run Test # 2-way parallel: 4-way OOMs the GitHub runner on the FP32 GEMM/TransB build.
run: |
cd DeeployTest
mkdir -p /app/.ccache
export CCACHE_DIR=/app/.ccache
pytest test_platforms.py -v -n 4 -m "snitch_tiled and ${{ inputs.pytest-marker }}"
pytest test_platforms.py -v -n 2 -m "snitch_tiled and ${{ inputs.pytest-marker }}"
shell: bash
3 changes: 3 additions & 0 deletions .github/workflows/ci-platform-gap9-tiled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ concurrency:

jobs:
select-env:
# ghcr.io/pulp-platform/deeploy-gap9 is private; only upstream's
# self-hosted runners have credentials. Skip cleanly on forks.
if: github.repository == 'pulp-platform/Deeploy'
uses: ./.github/workflows/_select-env.yml
with:
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}
Expand Down
68 changes: 68 additions & 0 deletions .github/workflows/ci-platform-gap9-w-ne16-tiled.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

---
name: CI • GAP9 + NE16 (Tiled)

"on":
push:
branches:
- "**"
tags:
- "v*.*.*"
pull_request:
workflow_dispatch:
inputs:
docker_image_deeploy:
description: "Deeploy Image to use"
required: false
default: "ghcr.io/pulp-platform/deeploy-gap9:devel"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
select-env:
# The GAP9 + NE16 image is hosted in pulp-platform's private ghcr.io
# registry; only upstream's self-hosted runners have credentials to
# pull it. On forks the docker pull always returns "denied", so skip
# the whole pipeline cleanly there. (Same constraint as the existing
# ci-platform-gap9{,-tiled}.yml jobs.)
if: github.repository == 'pulp-platform/Deeploy'
uses: ./.github/workflows/_select-env.yml
with:
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}

gap9-w-ne16-kernels-tiled-singlebuffer-L2:
needs: select-env
uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-markers: "gap9_w_ne16_tiled and kernels and singlebuffer and l2"

gap9-w-ne16-kernels-tiled-doublebuffer-L2:
needs: select-env
uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-markers: "gap9_w_ne16_tiled and kernels and doublebuffer and l2"

gap9-w-ne16-models-tiled-singlebuffer-L2:
needs: select-env
uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-markers: "gap9_w_ne16_tiled and models and singlebuffer and l2"

gap9-w-ne16-models-tiled-doublebuffer-L2:
needs: select-env
uses: ./.github/workflows/_runner-gap9-w-ne16-tiled.yml
with:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-markers: "gap9_w_ne16_tiled and models and doublebuffer and l2"
3 changes: 3 additions & 0 deletions .github/workflows/ci-platform-gap9.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ concurrency:

jobs:
select-env:
# ghcr.io/pulp-platform/deeploy-gap9 is private; only upstream's
# self-hosted runners have credentials. Skip cleanly on forks.
if: github.repository == 'pulp-platform/Deeploy'
uses: ./.github/workflows/_select-env.yml
with:
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}
Expand Down
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()

set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)")
set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch)
set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, GAP9_w_NE16, Generic, Snitch)")
set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 GAP9_w_NE16 Generic Snitch)

if(platform STREQUAL MemPool)
message(STATUS "Building for platform 'MemPool'")
Expand All @@ -33,8 +33,8 @@ elseif(platform STREQUAL Siracusa_w_neureka)
message(STATUS "Building for platform 'Siracusa_w_neureka'")
elseif(platform STREQUAL PULPOpen)
message(STATUS "Building for platform 'PULP-Open'")
elseif(platform STREQUAL GAP9)
message(STATUS "Building for platform 'GAP9'")
elseif(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
message(STATUS "Building for platform '${platform}'")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

# Select SDK config based on simulator type
Expand Down Expand Up @@ -62,7 +62,7 @@ endif()
# Import useful functions / macros
include(${CMAKE_CURRENT_LIST_DIR}/cmake/Util.cmake)
# Only if not a GAP9-based platform (GAP9 or GAP9_w_NE16)
if(NOT platform STREQUAL GAP9)
if(NOT platform STREQUAL GAP9 AND NOT platform STREQUAL GAP9_w_NE16)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/common.cmake)
endif()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/simulation.cmake)
Expand Down Expand Up @@ -231,7 +231,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor

endif()

if(platform STREQUAL GAP9)
if(platform STREQUAL GAP9 OR platform STREQUAL GAP9_w_NE16)
project(${TESTNAME} LANGUAGES C ASM)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_gvsoc.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/gap9/gap9_board.cmake)
Expand Down
42 changes: 33 additions & 9 deletions Deeploy/Targets/GAP9/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack
from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma
from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma
from Deeploy.Targets.GAP9.Templates import GAP9SDKDequantQuantTemplate, NE16GEMMTemplate
# Import templates from PULPOpen and Generic
from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \
FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
FloatReduceSumTemplate, GatherTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \
GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \
QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \
Expand Down Expand Up @@ -57,7 +58,7 @@
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
PULPL3Tiling("L3", "L2", GAP9L3Dma()), # Use GAP9-specific L3 DMA
PULPProfileUntiled(),
ArgumentStructGeneration(),
L3MemoryAwareFunctionCallClosure(writeback = False),
Expand All @@ -76,7 +77,7 @@
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
PULPL3Tiling("L3", "L2", GAP9L3Dma()), # Use GAP9-specific L3 DMA
PULPProfileUntiled(),
ArgumentStructGeneration(),
L3MemoryAwareFunctionCallClosure(writeback = False),
Expand Down Expand Up @@ -183,6 +184,26 @@
GAP9Transformer) for type1, type2 in zip([int8_t, uint8_t, int8_t, uint8_t], [int8_t, uint8_t, uint8_t, int8_t])
]

GAP9NE16RQSGEMMBindings = [
NodeBinding(
PULPLinearChecker([
PointerClass(type1),
PointerClass(int8_t),
PointerClass(int32_t),
PointerClass(uint8_t),
PointerClass(uint8_t)
], [PointerClass(type2)]), NE16GEMMTemplate.referenceTemplate, GAP9ClusterTransformer)
for type1 in [int8_t, uint8_t]
for type2 in [int8_t, uint8_t]
]

GAP9NE16GEMMInt32Bindings = [
NodeBinding(
GEMMChecker([PointerClass(type1), PointerClass(int8_t),
PointerClass(int32_t)], [PointerClass(int32_t)]), NE16GEMMTemplate.int32OutputTemplate,
GAP9ClusterTransformer) for type1 in [int8_t, uint8_t]
]

GAP9FloatGEMMBindings = [
NodeBinding(
GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
Expand Down Expand Up @@ -386,14 +407,17 @@
]

GAP9QuantBindings = [
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]), QuantTemplate.referenceTemplate,
GAP9Transformer),
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(int8_t)]),
GAP9SDKDequantQuantTemplate.fp32QuantI8Template, GAP9Transformer),
NodeBinding(QuantChecker([PointerClass(float32_t)], [PointerClass(uint8_t)]),
GAP9SDKDequantQuantTemplate.fp32QuantU8Template, GAP9Transformer),
]

GAP9DequantBindings = [
NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
GAP9Transformer),
] + [
NodeBinding(DequantChecker([PointerClass(int8_t)], [PointerClass(float32_t)]),
GAP9SDKDequantQuantTemplate.fp32DequantI8Template, GAP9Transformer),
NodeBinding(DequantChecker([PointerClass(uint8_t)], [PointerClass(float32_t)]),
GAP9SDKDequantQuantTemplate.fp32DequantU8Template, GAP9Transformer),
NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate,
GAP9Transformer),
]
9 changes: 2 additions & 7 deletions Deeploy/Targets/GAP9/DMA/L3Dma.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from typing import Dict, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \
PerTensorWaitingStrategy
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy


class GAP9L3DmaFuture(Future):
Expand All @@ -29,7 +28,7 @@ class GAP9L3Dma(AsyncDma):
_transferTemplates = {
2:
NodeTemplate(
"pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
"pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, (void *)${loc}, (uint32_t)${transfer_size}, (uint32_t)${stride}, (uint32_t)${length}, ${ext2loc}, &${future});"
)
}
_waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture)
Expand Down Expand Up @@ -58,7 +57,3 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu
"stride": strideExt[0],
})
return operatorRepresentation


# Blocking adapter for L3 DMA (used in GAP9 L3 tiling)
gap9L3DmaHack = BlockingDmaFromAsyncDmaAdapter(GAP9L3Dma())
37 changes: 37 additions & 0 deletions Deeploy/Targets/GAP9/Parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

from typing import Tuple

import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import NetworkContext
from Deeploy.Targets.Generic.Parsers import GEMMParser, RQSParserInterface


class NE16GEMMParser(GEMMParser, RQSParserInterface):
    """Parser for NE16 RequantizedGemm nodes.

    A node is accepted only when it has exactly five inputs, which are bound
    in order to the operator-representation keys ``A``, ``B``, ``C``, ``mul``
    and ``scale_n``, and when it carries a ``shift`` attribute.
    """

    def __init__(self):
        super().__init__(noBiasHoisting = True)

    def parseNode(self, node: gs.Node) -> bool:
        """Check the node's structure and record its ``shift`` attribute.

        Returns True only if both parent parsers accept the node, the node has
        a ``shift`` attribute, and it has exactly five inputs.
        """
        # Every check is evaluated up front (no short-circuiting), so both
        # parent parsers always get to run on the node.
        rqsAccepted = RQSParserInterface.parseNode(self, node)
        gemmAccepted = GEMMParser.parseNode(self, node)
        hasShift = 'shift' in node.attrs
        hasFiveInputs = len(node.inputs) == 5

        if not (rqsAccepted and gemmAccepted and hasShift and hasFiveInputs):
            return False

        self.operatorRepresentation['shift'] = int(node.attrs['shift'].values)
        return True

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        """Bind the node's inputs to their operator-representation keys.

        Delegates to ``GEMMParser.parseNodeCtxt`` first. If that fails, the
        original context is returned together with False. Otherwise each input
        is looked up in the new context and its buffer name is stored under
        the key matching its position.
        """
        newCtxt, gemmParsed = GEMMParser.parseNodeCtxt(self, ctxt, node, channels_first)
        if not gemmParsed:
            return ctxt, False

        # Positional order of the RequantizedGemm inputs.
        inputKeys = ('A', 'B', 'C', 'mul', 'scale_n')
        for position, tensor in enumerate(node.inputs):
            self.operatorRepresentation[inputKeys[position]] = newCtxt.lookup(tensor.name).name
        return newCtxt, True
Loading
Loading