diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c8a024c15..9ca0eda18f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch, Spatz)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch Spatz) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) message(STATUS "Building for platform 'Chimera'") +elseif(platform STREQUAL Spatz) + message(STATUS "Building for platform 'Spatz'") else() message(FATAL_ERROR "Invalid platform '${platform}' specified!") endif() @@ -299,5 +301,33 @@ if(platform STREQUAL Chimera) endif() +if(platform STREQUAL Spatz) + + if(NOT DEFINED ENV{SPATZ_HOME}) + message(FATAL_ERROR "Environment variable SPATZ_HOME not set.") + endif() + + set(SPATZ_HOME $ENV{SPATZ_HOME}) + + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/toolchain_llvm.cmake) + + include(${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/spatz.cmake) + + project(deeploy LANGUAGES C ASM) + + message(STATUS "============================= ${platform} Configuration ============================") + message(STATUS "[cMake ] ISA = " ${ISA}) + message(STATUS "================================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(TargetLibraries/Spatz) + 
target_include_directories(deeployspatz PUBLIC TargetLibraries/Generic/inc) + + add_subdirectory(DeeployTest) + target_link_libraries(deeploylib INTERFACE deeploybasic deeployspatz) + +endif() + print_simulation_config() diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 308b179aef..4b0ecfc258 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -19,12 +19,12 @@ GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \ MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \ - iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate + iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate, TopKTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ - SoftmaxChecker, TransposeChecker + SoftmaxChecker, TransposeChecker, TopKChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -327,3 +327,14 @@ ConvTransposeTemplate.referenceTemplate, BasicTransformer) for type in FloatDataTypes ] + +BasicTopKBindings = [ + NodeBinding( + TopKChecker( + [PointerClass(float32_t), PointerClass(int8_t)], # inputs + [PointerClass(float32_t), PointerClass(int8_t)] # outputs + ), + TopKTemplate.referenceTemplate, + BasicTransformer, + ) +] diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index 
cc733937cc..51b7b45dd4 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,15 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class TopKLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + +# def computeOps(self): +# ??? +# +# def computeShapes(self): +# ??? \ No newline at end of file diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..48a00c38f5 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -982,7 +982,7 @@ def parseNode(self, node: gs.Node) -> (bool): return False indices_shape = node.inputs[1].shape - assert np.prod(indices_shape) == 1, f"Only indices of size 1 supported. Got indices of shape {indices_shape}" + self.operatorRepresentation['num_indices'] = int(np.prod(indices_shape)) self.operatorRepresentation['axis'] = node.attrs['axis'] if 'axis' in node.attrs else 0 return True @@ -1002,10 +1002,17 @@ def parseNodeCtxt(self, axis = self.operatorRepresentation['axis'] shape = ctxt.lookup(node.inputs[0].name).shape - self.operatorRepresentation['batch'] = np.prod(shape[:axis]) - self.operatorRepresentation['batch_length'] = np.prod(shape[axis:]) - self.operatorRepresentation['axis_length'] = np.prod(shape[axis + 1:]) - self.operatorRepresentation['index'] = int(node.inputs[1].values.item()) + self.operatorRepresentation['batch'] = int(np.prod(shape[:axis])) if axis > 0 else 1 + self.operatorRepresentation['batch_length'] = int(np.prod(shape[axis:])) + self.operatorRepresentation['axis_length'] = int(np.prod(shape[axis + 1:])) if axis + 1 < len(shape) else 1 + + if self.operatorRepresentation['num_indices'] == 1: + try: + self.operatorRepresentation['index'] = int(node.inputs[1].values.item()) + except Exception: + self.operatorRepresentation['index'] = f"{self.operatorRepresentation['indices']}[0]" + else: + 
self.operatorRepresentation['index'] = 0  # unused in this case, but required by the Mako template
MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \ ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \ - SoftmaxLayer, SqrtLayer, TransposeLayer + SoftmaxLayer, SqrtLayer, TransposeLayer, TopKLayer from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \ DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \ Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \ - TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + TransposeParser, TopKParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -67,6 +67,7 @@ SoftmaxMapper = NodeMapper(SoftmaxParser(), BasicSoftmaxBindings) iSoftmaxMapper = NodeMapper(iSoftmaxParser(), BasicSoftmaxBindings) TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings) +TopKMapper = NodeMapper(TopKParser(), BasicTopKBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) @@ -113,6 +114,7 @@ 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 
'Transpose': TransposeLayer([TransposeMapper]), + 'TopK': TopKLayer([TopKMapper]), 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), 'Slice': SliceLayer([SliceMapper]), 'Quant': QuantLayer([QuantMapper]), diff --git a/Deeploy/Targets/Generic/Templates/GatherTemplate.py b/Deeploy/Targets/Generic/Templates/GatherTemplate.py index dd5e534fa4..4efce4d050 100644 --- a/Deeploy/Targets/Generic/Templates/GatherTemplate.py +++ b/Deeploy/Targets/Generic/Templates/GatherTemplate.py @@ -10,8 +10,18 @@ width = int(data_in_type.referencedType.typeWidth/8) %> BEGIN_SINGLE_CORE +% if num_indices == 1: for (uint32_t i=0; i<${batch}; ++i) { memcpy(${data_out} + i * ${axis_length}, ${data_in} + i * ${batch_length} + ${index} * ${axis_length}, ${axis_length} * ${width}); } +% else: +for (uint32_t i=0; i<${batch}; ++i) { + for (uint32_t j=0; j<${num_indices}; ++j) { + memcpy(${data_out} + i * (${num_indices} * ${axis_length}) + j * ${axis_length}, + ${data_in} + i * ${batch_length} + ${indices}[j] * ${axis_length}, + ${axis_length} * ${width}); + } +} +% endif END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/TopKTemplate.py b/Deeploy/Targets/Generic/Templates/TopKTemplate.py new file mode 100644 index 0000000000..3f9b6474fa --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/TopKTemplate.py @@ -0,0 +1,40 @@ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +referenceTemplate = NodeTemplate(""" +// TopK (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE +// Find the top ${k_value} values and their indices +// Assumes 1D input for simplicity +typedef struct { + ${data_in_type.referencedType.typeName} value; + uint32_t index; +} topk_pair_t; + +topk_pair_t pairs[${data_in_size}]; +for (uint32_t i = 0; i < ${data_in_size}; ++i) { + pairs[i].value = ((${data_in_type.referencedType.typeName}*)${data_in})[i]; + pairs[i].index = i; +} +// Simple selection sort for top-k +for (uint32_t i = 0; i 
< ${k_value}; ++i) { + uint32_t max_idx = i; + for (uint32_t j = i + 1; j < ${data_in_size}; ++j) { + if (pairs[j].value > pairs[max_idx].value) { + max_idx = j; + } + } + // Swap + if (max_idx != i) { + topk_pair_t tmp = pairs[i]; + pairs[i] = pairs[max_idx]; + pairs[max_idx] = tmp; + } + // Write output + ((${values_out_type.referencedType.typeName}*)${values_out})[i] = pairs[i].value; + ((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = pairs[i].index; +} +END_SINGLE_CORE +""") \ No newline at end of file diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..5d363206f8 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -610,3 +610,17 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + +# TopKChecker: infers types for both values and indices outputs of TopK operation +class TopKChecker(SignPropTypeChecker): + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: + # Output 0: values (same as input), Output 1: indices (integer, usually not quantized) + # We assume indices output is not quantized (set to 0 or 1) + return [inputs[0].nLevels, 1] + + def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: + # Output 0: values (same signedness as input), Output 1: indices (unsigned) + return [inputs[0]._signed, False] \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/Bindings.py b/Deeploy/Targets/Spatz/Bindings.py new file mode 100644 index 0000000000..78431b74e0 --- /dev/null +++ b/Deeploy/Targets/Spatz/Bindings.py @@ 
-0,0 +1,117 @@ +from functools import partial + +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ + MemoryManagementGeneration +from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t +from Deeploy.Targets.Generic.TypeCheckers import GatherChecker, MatMulChecker, TopKChecker, SoftmaxChecker + +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterTiling import SnitchClusterTiling +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchCoreFilter import SnitchCoreFilterPass +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterSynch import SnitchSynchCoresPass +from Deeploy.Targets.Spatz.DMA.SpatzDma import SpatzDma +from Deeploy.Targets.Spatz.Templates import GatherTemplate, MatMulTemplate as SpatzMatMulTemplate, TopKTemplate, SoftmaxTemplate +from Deeploy.Targets.Generic.Templates import MatMulTemplate, FloatMatMulTemplate +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ + TilingVariableReplacementUpdate + +TilingCallClosure = partial(ClosureGeneration, closureSuffix = "_tiling_closure") +MemoryAwareFunctionCallClosure = partial(MemoryAwareClosureGeneration, + closureSuffix = "_closure", + startRegion = "L3", + endRegion = "L1") + +BasicTransformer = CodeTransformation( + [ArgumentStructGeneration(), + MemoryManagementGeneration(), + FutureGeneration()]) + +TiledTransformer = CodeTransformation([ + SnitchCoreFilterPass("compute"), + TilingVariableReplacement("L1"), + TilingCallClosure(writeback = False), + 
SnitchSynchCoresPass(), # snrt_cluster_hw_barrier() + TilingVariableReplacementUpdate("L1"), + SnitchClusterTiling("L3", "L1", SpatzDma()), + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration() +]) + +SpatzGatherBindings = [ + NodeBinding( + GatherChecker( + [PointerClass(float32_t), PointerClass(type)], + [PointerClass(float32_t)] + ), + GatherTemplate.tilingReferenceTemplate, + TiledTransformer + ) for type in IntegerDataTypes +] +# [ +# NodeBinding( +# GatherChecker( +# [PointerClass(type), PointerClass(int32_t)], +# [PointerClass(type)] +# ), +# GatherTemplate.referenceTemplate, +# BasicTransformer +# ) for type in SignedIntegerDataTypes] + + +# with tiled transformer +SpatzMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + SpatzMatMulTemplate.spatzSIMatMulTemplate, TiledTransformer), + NodeBinding( + MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + SpatzMatMulTemplate.spatzFloatMatMulTemplate, TiledTransformer) +] +''' +# without tiled transformer +SpatzMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + SpatzMatMulTemplate.spatzSIMatMulTemplate, BasicTransformer), + NodeBinding( + MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + SpatzMatMulTemplate.spatzFloatMatMulTemplate, BasicTransformer) +] +# with BEGIN_SINGLE_CORE +# SpatzMatMulBindings = [ +# NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), +# MatMulTemplate.referenceTemplate, TiledTransformer) +# ] + [ +# NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), +# FloatMatMulTemplate.referenceTemplate, TiledTransformer) +# ] +''' + +SpatzTopKBindings = [ + 
NodeBinding( + TopKChecker( + [PointerClass(float32_t), PointerClass(int32_t)], # inputs + [PointerClass(float32_t), PointerClass(int32_t)] # outputs + ), + TopKTemplate.SpatzTilingTemplate, + TiledTransformer, + ) +] + + +SpatzSoftmaxBindings = [ + NodeBinding( + SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + SoftmaxTemplate.floatTilingTemplate, + TiledTransformer + ) +] +# [ +# NodeBinding( +# SoftmaxChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), +# SoftmaxTemplate.integerTilingTemplate, +# TiledTransformer +# ) +# ] diff --git a/Deeploy/Targets/Spatz/CodeTransformationPasses/SpatzCoreFilter.py b/Deeploy/Targets/Spatz/CodeTransformationPasses/SpatzCoreFilter.py new file mode 100644 index 0000000000..f5f6ac4797 --- /dev/null +++ b/Deeploy/Targets/Spatz/CodeTransformationPasses/SpatzCoreFilter.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Literal, Tuple + +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, _NoVerbosity + + +class SpatzCoreFilterPass(CodeTransformationPass): + + def __init__(self, coreType: Literal["dm", "compute"]): + super().__init__() + self.coreType = coreType + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + theother = self.coreType=="dm" ? 
"compute" : "dm" + executionBlock.addLeft(NodeTemplate(f"if (snrt_is_{theother}_core()) {{\n"), {}) + executionBlock.addRight(NodeTemplate("}\n"), {}) + return ctxt, executionBlock diff --git a/Deeploy/Targets/Spatz/DMA/SpatzDma.py b/Deeploy/Targets/Spatz/DMA/SpatzDma.py new file mode 100644 index 0000000000..ea0f19ab90 --- /dev/null +++ b/Deeploy/Targets/Spatz/DMA/SpatzDma.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy + + +class SnitchBarrierFuture(Future): + _initTemplate = NodeTemplate("") + _deinitTemplate = NodeTemplate("") + _allocTemplate = NodeTemplate("") + _waitTemplate = NodeTemplate("if (snrt_is_dm_core()) snrt_dma_wait_all();") + + +# LMACAN: TODO: Add single transfer waiting +class SnitchFuture(Future): + _initTemplate = NodeTemplate("snrt_dma_txid_t ${name} = (snrt_dma_txid_t) -1;") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("") + + _waitTemplate = NodeTemplate(#remove if condition -1 + "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait_all();") + # "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait(${name});") + + +class SpatzDma(AsyncDma): + + _transferTemplates = { + 2: + NodeTemplate(""" + if (snrt_is_dm_core()) { + ${future} = snrt_dma_start_2d(${dest}, ${src}, ${size}, ${stride_dest}, ${stride_src}, ${repeat}); + } + """), + } + _waitingStrategy = PerTensorWaitingStrategy(SnitchFuture) + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: 
Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + assert strideLoc[1] == 1 and strideExt[1] == 1, f"Supports only contigous transfers in the innermost dimension" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + operatorRepresentation: OperatorRepresentation = { + "dest": localBuffer.name if direction == "ExternalToLocal" else externalBuffer.name, + "src": externalBuffer.name if direction == "ExternalToLocal" else localBuffer.name, + "repeat": shape[0], + "size": shape[1], + "stride_dest": strideLoc[0] if direction == "ExternalToLocal" else strideExt[0], + "stride_src": strideExt[0] if direction == "ExternalToLocal" else strideLoc[0], + "future": future.name + } + return operatorRepresentation diff --git a/Deeploy/Targets/Spatz/Deployer.py b/Deeploy/Targets/Spatz/Deployer.py new file mode 100644 index 0000000000..2442059606 --- /dev/null +++ b/Deeploy/Targets/Spatz/Deployer.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Callable, Dict, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + NCHWtoNHWCPass, TransposeMatmulInputsPass +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes 
import TransposeConstOptPass, TransposeMergePass + + +class SpatzDeployer(SignPropDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets: Dict[str, int] = {}): + + super().__init__(graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) + +# self.inputOffsets = inputOffsets +# +# self.loweringOptimizer.passes += [ +# TransposeMatmulInputsPass(), +# NCHWtoNHWCPass(self.default_channels_first), +# TransposeMergePass(), +# TransposeConstOptPass(), +# DebugPrintMergePass() +# ] diff --git a/Deeploy/Targets/Spatz/Platform.py b/Deeploy/Targets/Spatz/Platform.py new file mode 100644 index 0000000000..c8af56ded5 --- /dev/null +++ b/Deeploy/Targets/Spatz/Platform.py @@ -0,0 +1,135 @@ +from typing import List +import numpy as np + +from Deeploy.DeeployTypes import VariableBuffer, TransientBuffer, ConstantBuffer, StructBuffer, \ + NodeMapper, NodeTemplate, TopologyOptimizer, DeploymentEngine, DeploymentPlatform + +from Deeploy.Targets.Generic.Templates import AllocateTemplate as GenericAllocateTemplate +from Deeploy.Targets.Spatz.Templates import AllocateTemplate as SpatzAllocateTemplate +from Deeploy.Targets.Spatz.Templates import FreeTemplate as SpatzFreeTemplate +from Deeploy.Targets.Snitch.Templates import AllocateTemplate as SnitchAllocateTemplate, FreeTemplate as SnitchFreeTemplate + +from Deeploy.Targets.Spatz.Bindings import SpatzGatherBindings, SpatzMatMulBindings, SpatzTopKBindings +from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicMatMulBindings, BasicSoftmaxBindings, BasicTopKBindings +from Deeploy.Targets.Spatz.Tiler import SpatzMatMulTilingBindings, 
SpatzGatherTilingBindings, SpatzTopKTilingBindings, SpatzSoftmaxTilingBindings +from Deeploy.Targets.Generic.Layers import AddLayer, GEMMLayer, SoftmaxLayer, TopKLayer, GatherLayer +from Deeploy.Targets.Generic.Parsers import AddParser, MatMulParser, SoftmaxParser, TopKParser, GatherParser + +# # print(SpatzMatMulBindings) +# # for binding in SpatzMatMulBindings: +# # print(binding.template.tileConstraint) +# +# print(SpatzMatMulTilingReadyBindings) +# for binding in SpatzMatMulTilingReadyBindings: +# print(binding.template.tileConstraint) +# +# print(SpatzMatMulTilingReadyBindings[0].template.tileConstraint) +# print(SpatzMatMulTilingReadyBindings[1].template.tileConstraint) + +SpatzAddMapper = NodeMapper(AddParser(), BasicAddBindings) +# MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) +MatMulMapper = NodeMapper(MatMulParser(), SpatzMatMulTilingBindings) +# SoftmaxMapper = NodeMapper(SoftmaxParser(), BasicSoftmaxBindings) +SoftmaxMapper = NodeMapper(SoftmaxParser(), SpatzSoftmaxTilingBindings) +# TopKMapper = NodeMapper(TopKParser(), SpatzTopKBindings) +TopKMapper = NodeMapper(TopKParser(), SpatzTopKTilingBindings) +# GatherMapper = NodeMapper(GatherParser(), SpatzGatherBindings) +GatherMapper = NodeMapper(GatherParser(), SpatzGatherTilingBindings) + +SpatzMapping = { + 'Add': AddLayer([SpatzAddMapper]), + 'MatMul': GEMMLayer([MatMulMapper]), + 'Softmax': SoftmaxLayer([SoftmaxMapper]), + 'TopK': TopKLayer([TopKMapper]), + 'Gather': GatherLayer([GatherMapper]), +} + + +class SpatzVariableBuffer(VariableBuffer): + initTemplate = GenericAllocateTemplate.referenceInitTemplate + allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate + deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate + + def _bufferRepresentation(self): + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + return { + "type": self._instance, + "name": self.name, + "size": int(np.prod(self.shape)), + "_memoryLevel": memoryLevel + 
} + +class SpatzTransientBuffer(TransientBuffer): + initTemplate = GenericAllocateTemplate.referenceInitTemplate + allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate + deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate +# def _bufferRepresentation(self): +# +# if hasattr(self, "_memoryLevel"): +# memoryLevel = self._memoryLevel +# else: +# memoryLevel = None +# +# return { +# "type": self._type, +# "name": self.name, +# "size": self.size, +# "_memoryLevel": memoryLevel +# } + + +class SpatzConstantBuffer(ConstantBuffer): + initTemplate = SnitchAllocateTemplate.snitchGenericGlobalInitTemplate + allocTemplate = NodeTemplate("") + deallocTemplate = NodeTemplate("") # const not deallocated + + def _bufferRepresentation(self): + operatorRepresentation = super()._bufferRepresentation() + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + operatorRepresentation["_memoryLevel"] = memoryLevel + + return operatorRepresentation + + +class SpatzStructBuffer(StructBuffer): + initTemplate = GenericAllocateTemplate.referenceStructInitTemplate + allocTemplate = GenericAllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") # struct not deallocated ? + + +SpatzOptimizer = TopologyOptimizer([ + # TODO add something ? 
+], name = "SpatzOptimizer") + +includeList = [ + "DeeploySpatzMath.h", +] + + +class SpatzEngine(DeploymentEngine): + def __init__(self, name: str, Mapping = SpatzMapping, initCode = "", includeList = includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) + + +class SpatzPlatform(DeploymentPlatform): + + def __init__( self, + engines = [SpatzEngine("SpatzVectorProcessor")], + variableBuffer = SpatzVariableBuffer, + transientBuffer = SpatzTransientBuffer, + constantBuffer = SpatzConstantBuffer, + structBuffer = SpatzStructBuffer, + includeList: List[str] = includeList + ): + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + diff --git a/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py new file mode 100644 index 0000000000..1bc11bc76d --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py @@ -0,0 +1,16 @@ +from Deeploy.DeeployTypes import NodeTemplate + +# allocate +referenceAllocateTemplate = NodeTemplate( + "${name} = (${type.typeName}) snrt_l1alloc(${type.referencedType.typeWidth//8} * ${size});\n") + +spatzGenericAllocate = NodeTemplate(""" +% if _memoryLevel == "L1": +${name} = (${type.typeName}) snrt_l1alloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% elif _memoryLevel == "L3" or _memoryLevel is None: +${name} = (${type.typeName}) snrt_l3alloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% else: +// COMPILER WARNING — unsupported memory level ${_memoryLevel}, defaulting to L3 +${name} = (${type.typeName}) snrt_l3alloc(${type.referencedType.typeWidth//8} * ${size}); +% endif +""") diff --git a/Deeploy/Targets/Spatz/Templates/FreeTemplate.py b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py new file mode 100644 index 0000000000..f67cb3de38 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py @@ -0,0 +1,5 @@ +from Deeploy.DeeployTypes import NodeTemplate + +# snrt_l1alloc currently does 
not support freeing of memory
i=0; i<${batch}; ++i) { + for (uint32_t j=0; j<${num_indices}; ++j) { + memcpy(${data_out} + i * (${num_indices} * ${axis_length}) + j * ${axis_length}, + ${data_in} + i * ${batch_length} + ${indices}[j] * ${axis_length}, + ${axis_length} * ${width}); + } +} +% endif +""") \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py new file mode 100644 index 0000000000..ba354fe422 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MatMulTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = 0 + operatorRepresentation['B_offset'] = 0 + operatorRepresentation['C_offset'] = 0 + if hasattr(A, "_signed") and hasattr(A, "nLevels"): + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + if hasattr(B, "_signed") and hasattr(B, "nLevels"): + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + if hasattr(C, "_signed") and hasattr(C, "nLevels"): + operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +# signed integer +spatzSIMatMulTemplate = _MatMulTemplate(""" +// MatMul (Name: ${nodeName}, Op: ${nodeOp}) +${A_type.typeName} ref_${data_out}_${A} = ${A}; +${B_type.typeName} ref_${data_out}_${B} = ${B}; +${data_out_type.typeName} 
ref_${data_out}_${data_out} = ${data_out}; + +for(uint32_t i=0;i<${batch};i++){ + MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${A_offset}, ${B_offset}, ${C_offset} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; +} +""") + +# supports single precision float (fp32) +# also possible to add half and double precision +spatzFloatMatMulTemplate = NodeTemplate(""" +// Matmul (Name: ${nodeName}, Op: ${nodeOp}) +${A_type.typeName} ref_${data_out}_${A} = ${A}; +${B_type.typeName} ref_${data_out}_${B} = ${B}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for(uint32_t i=0; i<${batch}; i++){ + Spatz_MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; +} +""") diff --git a/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py b/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py new file mode 100644 index 0000000000..2ddcc2c9b0 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py @@ -0,0 +1,8 @@ +from Deeploy.DeeployTypes import NodeTemplate + +# integerTilingTemplate + +floatTilingTemplate = NodeTemplate(""" +// Softmax (Name: ${nodeName}, Op: ${nodeOp}) +Spatz_Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength}); +""") diff --git a/Deeploy/Targets/Spatz/Templates/TopKTemplate.py b/Deeploy/Targets/Spatz/Templates/TopKTemplate.py new file mode 100644 index 0000000000..84764656d6 --- 
/dev/null +++ b/Deeploy/Targets/Spatz/Templates/TopKTemplate.py @@ -0,0 +1,37 @@ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +SpatzTilingTemplate = NodeTemplate(""" +// TopK node: finds the top ${k_value} values and their indices +// Assumes 1D input +${data_in_type.referencedType.typeName} *values_tmp = snrt_l1alloc(sizeof(${data_in_type.referencedType.typeName})*${data_in_size}); +${indices_out_type.referencedType.typeName} *indices_tmp = snrt_l1alloc(sizeof(${indices_out_type.referencedType.typeName})*${data_in_size}); + +for (uint32_t i = 0; i < ${data_in_size}; ++i) { + values_tmp[i] = ((${data_in_type.referencedType.typeName}*)${data_in})[i]; + indices_tmp[i] = i; +} +// Simple selection sort for top-k +for (uint32_t i = 0; i < ${k_value}; ++i) { + uint32_t max_idx = i; + for (uint32_t j = i + 1; j < ${data_in_size}; ++j) { + if (values_tmp[j] > values_tmp[max_idx]) { + max_idx = j; + } + } + // Swap + if (max_idx != i) { + float32_t tmp_val = values_tmp[i]; + int32_t tmp_idx = indices_tmp[i]; + values_tmp[i] = values_tmp[max_idx]; + indices_tmp[i] = indices_tmp[max_idx]; + values_tmp[max_idx] = tmp_val; + indices_tmp[max_idx] = tmp_idx; + } + // Write output + ((${values_out_type.referencedType.typeName}*)${values_out})[i] = values_tmp[i]; + ((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = indices_tmp[i]; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py new file mode 100644 index 0000000000..5c5fc8eb7a --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from 
Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class GatherTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + pointer: List[str] = [] + + for key, value in parseDict.items(): + if not isinstance(value, str): + continue + + if ctxt.is_global(value) or ctxt.is_local(value): + pointer.append(value) + + for tensorName in pointer: + + _buffer = ctxt.lookup(tensorName) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, tensorName) + + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'indices', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + indicesBuffer = 
ctxt.lookup(operatorRepresentation['indices']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + indicesCube = HyperRectangle(offset = (0,) * len(indicesBuffer.shape), dims = tuple(indicesBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + # Gather execution policy: load full inputs in L1, execute once, then store output tile. + inputLoadSchedule.append({'data_in': dataInCube, 'indices': indicesCube}) + outputLoadSchedule.append({'data_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git a/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py new file mode 100644 index 0000000000..c34b84890f --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class SoftmaxTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Register and pin all referenced tensors to full shape to avoid tiling. 
+ # This also covers constant inputs that may appear as parseDict string references. + tensorNames: List[str] = [] + + for value in parseDict.values(): + if not isinstance(value, str): + continue + if ctxt.is_global(value) or ctxt.is_local(value): + tensorNames.append(value) + + for tensorName in tensorNames: + _buffer = ctxt.lookup(tensorName) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, tensorName) + + for idx, shapeDim in enumerate(_buffer.shape): + tileDimVar = tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) + tilerModel.addConstraint(tileDimVar == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + inputLoadSchedule.append({'data_in': dataInCube}) + outputLoadSchedule.append({'data_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git 
a/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py new file mode 100644 index 0000000000..30572d5819 --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class TopKTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Register and pin all referenced tensors to full shape to avoid tiling. + # This also covers constant inputs that may appear as parseDict string references. 
+ tensorNames: List[str] = [] + + for value in parseDict.values(): + if not isinstance(value, str): + continue + if ctxt.is_global(value) or ctxt.is_local(value): + tensorNames.append(value) + + for tensorName in tensorNames: + _buffer = ctxt.lookup(tensorName) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, tensorName) + + for idx, shapeDim in enumerate(_buffer.shape): + tileDimVar = tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) + tilerModel.addConstraint(tileDimVar == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # k_value is a scalar parsed in operatorRepresentation, not a tensor to transfer. + addrNames = ['data_in', 'values_out', 'indices_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + # TopK execution policy: load full input in L1, execute once, then store both outputs. 
+ inputLoadSchedule.append({'data_in': dataInCube}) + outputLoadSchedule.append({'values_out': out, 'indices_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git a/Deeploy/Targets/Spatz/Tiler.py b/Deeploy/Targets/Spatz/Tiler.py new file mode 100644 index 0000000000..96a0a09bfe --- /dev/null +++ b/Deeploy/Targets/Spatz/Tiler.py @@ -0,0 +1,18 @@ +from Deeploy.Targets.Spatz.Bindings import SpatzMatMulBindings, SpatzGatherBindings, SpatzTopKBindings, SpatzSoftmaxBindings +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings +from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.TopKTileConstraint import TopKTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.SoftmaxTileConstraint import SoftmaxTileConstraint + +SpatzMatMulTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzMatMulBindings, + tileConstraint = MatMulTileConstraint()) + +SpatzGatherTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzGatherBindings, + tileConstraint = GatherTileConstraint()) + +SpatzTopKTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzTopKBindings, + tileConstraint = TopKTileConstraint()) + +SpatzSoftmaxTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzSoftmaxBindings, + tileConstraint = SoftmaxTileConstraint()) diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index 5b067b2ce9..9a2aa6b9d9 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -131,7 +131,9 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List return solution, solutionLengths - assert 
len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!" + # Support multi-output nodes: use first output tensor to determine tiling structure. + # For operators like TopK with multiple outputs, all outputs share the same tiling pattern. + assert len(tilingSolution.outputTensorMemoryConstraints) >= 1, "Expected node to have at least one output!" outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) memoryPath = list(outTensorConstraint.memoryConstraints.keys()) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index 9b48d9456c..1903c8178a 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -532,6 +532,15 @@ def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContex parseDict = layerBinding[node.name].mapper.parser.operatorRepresentation template = layerBinding[node.name].mapper.binder.template + # print("eccomi===================") + # print(layerBinding) + # print(node.name) + # print(layerBinding[node.name]) + # print(layerBinding[node.name].mapper) + # print(layerBinding[node.name].mapper.parser) + # print(layerBinding[node.name].mapper.binder) + # print(layerBinding[node.name].mapper.parser.operatorRepresentation) + # print(layerBinding[node.name].mapper.binder.template) tilerModel = template.tileConstraint.addGeometricalConstraint(tilerModel, parseDict = parseDict, @@ -556,7 +565,7 @@ def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedul patternMemSizeExpr: IntVar = 0 for tensor in patternTensorList: - if not ctxt.lookup(tensor.name)._deploy: + if not ctxt.lookup(tensor.name)._deploy or isinstance(ctxt.lookup(tensor.name), ConstantBuffer): continue patternMemSizeExpr += tilerModel.getTensorNumberOfEltVar( diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py index db83974f0c..080211270b 100644 
--- a/Deeploy/TilingExtension/TilerModel.py +++ b/Deeploy/TilingExtension/TilerModel.py @@ -10,6 +10,7 @@ import numpy as np from ortools.constraint_solver.pywrapcp import IntExpr, IntVar, SolutionCollector, Solver +from Deeploy.DeeployTypes import ConstantBuffer from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel @@ -170,6 +171,10 @@ def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyId tensor = ctxt.lookup(tensorName) + # Skip constant buffers: they don't participate in tiling and don't need num_elements variables + if isinstance(tensor, ConstantBuffer): + return + tensorDimProductExpr = 1 for idx, _ in enumerate([ diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..71f632cbd2 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -50,6 +50,8 @@ elseif(DEEPLOY_ARCH STREQUAL SNITCH) add_subdirectory(Platforms/Snitch) elseif(DEEPLOY_ARCH STREQUAL CHIMERA) add_subdirectory(Platforms/Chimera) +elseif(DEEPLOY_ARCH STREQUAL SPATZ) + add_subdirectory(Platforms/Spatz) elseif(platform STREQUAL GAP9) # Search for hex files generated by Python code generator diff --git a/DeeployTest/Platforms/Spatz/CMakeLists.txt b/DeeployTest/Platforms/Spatz/CMakeLists.txt new file mode 100644 index 0000000000..2d0f730e52 --- /dev/null +++ b/DeeployTest/Platforms/Spatz/CMakeLists.txt @@ -0,0 +1,22 @@ +set(ProjectId ${TESTNAME}) + +file(GLOB_RECURSE SOURCES + main.c +) + +list(APPEND SOURCES + ${SPATZ_HOME}/sw/spatzBenchmarks/benchmark/benchmark.c +) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) + +set(SPATZ_BENCHMARK_INCLUDE_DIR + ${SPATZ_HOME}/sw/spatzBenchmarks/include +) +target_include_directories(${ProjectId} PRIVATE ${SPATZ_BENCHMARK_INCLUDE_DIR}) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) 
+target_compile_options(${ProjectId} INTERFACE network) + +add_spatz_gvsoc_emulation(${ProjectId} "spatz_v2") +add_spatz_vsim_simulation(${ProjectId}) \ No newline at end of file diff --git a/DeeployTest/Platforms/Spatz/main.c b/DeeployTest/Platforms/Spatz/main.c new file mode 100644 index 0000000000..c9084b8ee5 --- /dev/null +++ b/DeeployTest/Platforms/Spatz/main.c @@ -0,0 +1,86 @@ + +#include +#include +#include "printf.h" + +#include "Network.h" +#include "testinputs.h" +#include "testoutputs.h" + +int main() { + const unsigned int core_id = snrt_cluster_core_idx(); + unsigned int timer_start, timer_end, timer; + + if (core_id == 0) printf("[INFO] Running on %d cores\n", snrt_cluster_core_num()); + if (snrt_is_dm_core()){printf("[INFO] DM core is core number %d\n", core_id);} + snrt_cluster_hw_barrier(); + + // do it only with one of the two spatz cores + if (snrt_is_dm_core()){ + timer_start = benchmark_get_cycle(); + + printf("Initializing network...\r\n"); + InitNetwork(0, 1); + + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]); + // DeeployNetwork_inputs[buf] = (void *)testInputVector[buf]; TODO ??? 
+ } + + + printf("Running network...\r\n"); + } + + snrt_cluster_hw_barrier(); + if (snrt_is_dm_core()){ timer_start = benchmark_get_cycle(); } + RunNetwork(core_id, 2); + + snrt_cluster_hw_barrier(); + + if (snrt_is_dm_core()){ + timer_end = benchmark_get_cycle(); + timer = timer_end - timer_start; + + printf("Network ran in %d cycles.\r\nChecking Outputs...\r\n", timer); + int32_t tot_err = 0; + uint32_t tot = 0; + OUTPUTTYPE diff; + OUTPUTTYPE expected, actual; + + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + tot += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + actual = ((OUTPUTTYPE *)DeeployNetwork_outputs[buf])[i]; + diff = expected - actual; + +#if ISOUTPUTFLOAT == 1 + // RUNWANG: Allow margin of error for float32_t + // MATTIA: if diff is a quiet nan 0x7FC00000 we want to error + if ((diff < -1e-4f) || (diff > 1e-4f) || *(uint32_t*)&diff == 0x7FC00000) { + tot_err += 1; + // printf("Expected: %f Actual: %f Diff: %f at Index %12u in Output %u\r\n", expected, actual, diff, i, buf); + printf("Expected: 0x%08x Actual: 0x%08x Diff: 0x%08x at Index %12u in Output %u\r\n", *(uint32_t*)&expected, *(uint32_t*)&actual, *(uint32_t*)&diff, i, buf); + } +#else + // RUNWANG: No margin for integer comparison + if (diff != 0) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); + } +#endif + } + } + + printf("Errors: %d out of %d \r\n", tot_err, tot); + } + + printf("core %d arrived at the end\r\n", core_id); + snrt_cluster_hw_barrier(); + printf("We are after hw barrier\r\n"); + + return 0; +} diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz new file mode 100644 index 0000000000..eb073685c7 Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx new file mode 100644 index 0000000000..c20c89bd05 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz new file mode 100644 index 0000000000..ed786d2e1d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Big/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/inputs.npz new file mode 100644 index 0000000000..930d02b187 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Big/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/network.onnx new file mode 100644 index 0000000000..f1f3c60551 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/network.onnx @@ -0,0 +1,18 @@ +pytorch2.7.0:o + +a +bout/MatMul"MatMul +main_graphZ +a + + +  +ÀZ +b +  +À +Pb +out +  +  +PB \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Big/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/outputs.npz new file mode 100644 index 0000000000..9915d63151 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/inputs.npz new file mode 100644 index 0000000000..d9a6ad5605 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/network.onnx new file mode 100644 index 0000000000..8719d9bc82 --- /dev/null +++ 
b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/network.onnx @@ -0,0 +1,17 @@ +pytorch2.7.0:m + +a +bout/MatMul"MatMul +main_graphZ +a +  +  + Z +b +  + +Pb +out +  +  +PB \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/outputs.npz new file mode 100644 index 0000000000..1f79303b10 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/inputs.npz new file mode 100644 index 0000000000..d8f4a477c3 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/network.onnx new file mode 100644 index 0000000000..3388387955 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/network.onnx @@ -0,0 +1,17 @@ +pytorch2.7.0:m + +a +bout/MatMul"MatMul +main_graphZ +a +  + + Z +b +  + +€b +out +  + +€B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/outputs.npz new file mode 100644 index 0000000000..e91f150988 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/MatMul/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/MatMul/Regular/inputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/MatMul/network.onnx rename to DeeployTest/Tests/Kernels/FP32/MatMul/Regular/network.onnx diff --git 
a/DeeployTest/Tests/Kernels/FP32/MatMul/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/MatMul/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/MatMul/Regular/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/inputs.npz new file mode 100644 index 0000000000..d0aed0662f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/network.onnx new file mode 100644 index 0000000000..72e83fab2f --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/network.onnx @@ -0,0 +1,17 @@ +pytorch2.7.0:k + +a +bout/MatMul"MatMul +main_graphZ +a +  + + Z +b +  + +b +out +  + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/outputs.npz new file mode 100644 index 0000000000..6982ce772a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/inputs.npz new file mode 100644 index 0000000000..542bc5789f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/network.onnx new file mode 100644 index 0000000000..7beeeef202 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/network.onnx @@ -0,0 +1,16 @@ +pytorch2.0.1:j + +a +bout/MatMul"MatMul torch_jitZ +a +  + +Z +b +  + +b +out +  + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/outputs.npz 
new file mode 100644 index 0000000000..42ffb87810 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz new file mode 100644 index 0000000000..afc11e34d7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx new file mode 100644 index 0000000000..94e265be97 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx @@ -0,0 +1,13 @@ +pytorch2.7.0:^ +& +VA/Softmax"Softmax* +axis  +main_graphZ +V +  + +b +A +  + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz new file mode 100644 index 0000000000..f5f6daea15 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttention/inputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttention/inputs.npz new file mode 100644 index 0000000000..1f27f7766e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttention/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttention/network.onnx b/DeeployTest/Tests/Kernels/FP32/SparseAttention/network.onnx new file mode 100644 index 0000000000..e35b7eb103 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttention/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttention/outputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttention/outputs.npz new file mode 100644 index 0000000000..b65882fc0a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttention/outputs.npz differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/inputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/inputs.npz new file mode 100644 index 0000000000..5ad6b6d71a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/network.onnx b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/network.onnx new file mode 100644 index 0000000000..7e6acbde40 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/outputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/outputs.npz new file mode 100644 index 0000000000..1c877a4b96 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/10/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/10/inputs.npz new file mode 100644 index 0000000000..a02c827160 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/10/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/10/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopK/10/network.onnx new file mode 100644 index 0000000000..13fae39c48 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/10/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/10/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/10/outputs.npz new file mode 100644 index 0000000000..b9bc6c4183 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/10/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/20/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/20/inputs.npz new file mode 100644 index 0000000000..0b66bfd41b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/20/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/20/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopK/20/network.onnx new file mode 
100644 index 0000000000..1a82699d31 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/20/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/20/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/20/outputs.npz new file mode 100644 index 0000000000..abccfba295 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/20/outputs.npz differ diff --git a/DeeployTest/deeployRunner_spatz.py b/DeeployTest/deeployRunner_spatz.py new file mode 100644 index 0000000000..5404defc13 --- /dev/null +++ b/DeeployTest/deeployRunner_spatz.py @@ -0,0 +1,12 @@ +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit( + main( + default_platform = "Spatz", + default_simulator = "gvsoc", + tiling_enabled = False, + ) + ) diff --git a/DeeployTest/deeployRunner_tiled_spatz.py b/DeeployTest/deeployRunner_tiled_spatz.py new file mode 100644 index 0000000000..6900d7010e --- /dev/null +++ b/DeeployTest/deeployRunner_tiled_spatz.py @@ -0,0 +1,12 @@ +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit( + main( + default_platform = "Spatz", + default_simulator = "gvsoc", + tiling_enabled = True, + ) + ) diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 01216984af..686fa99d8f 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -250,11 +250,20 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg test_inputs = [test_inputs[0]] test_outputs = [test_outputs[-2]] - # Instantiate Classes Requried for Memory Level Annotation Extension - L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) - L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) - L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) - memoryLevels = [L3, L2, L1] + # Instantiate Classes Required for Memory Level Annotation Extension + if args.platform == "Spatz": + # Spatz cluster has only TCDM (L1) 
+ external DRAM (L3). No on-chip L2. + # Declare L1 and L3 as direct neighbours so BFS-based tile-path + # generation does not insert a phantom L2 staging buffer. + L3 = MemoryLevel(name = "L3", neighbourNames = ["L1"], size = 64000000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L3"], size = args.l1) + memoryLevels = [L3, L1] + else: + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryLevels = [L3, L2, L1] + if args.neureka_wmem: memoryLevels.append(MemoryLevel(name = "WeightMemory_SRAM", neighbourNames = [], size = 4 * 1024 * 1024)) diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..cdbd0af3db 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -6,6 +6,7 @@ import shutil import subprocess import sys +import threading from pathlib import Path from Deeploy.Logging import DEFAULT_LOGGER as log @@ -191,15 +192,43 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: log.debug(f"[Execution] Simulation command: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output = True, text = True, env = env) - - if result.stdout: - print(result.stdout, end = '') - if result.stderr: - print(result.stderr, end = '', file = sys.stderr) + process = subprocess.Popen( + cmd, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE, + text = True, + env = env, + bufsize = 1, + ) + + stdout_chunks = [] + stderr_chunks = [] + + def _stream_reader(pipe, chunks, is_stderr: bool = False) -> None: + assert pipe is not None + for line in iter(pipe.readline, ''): + chunks.append(line) + if is_stderr: + print(line, end = '', file = sys.stderr, flush = True) + else: + print(line, end = '', flush = True) + pipe.close() + + stdout_thread = threading.Thread(target = 
_stream_reader, args = (process.stdout, stdout_chunks), daemon = True) + stderr_thread = threading.Thread(target = _stream_reader, args = (process.stderr, stderr_chunks, True), daemon = True) + + stdout_thread.start() + stderr_thread.start() + + returncode = process.wait() + stdout_thread.join() + stderr_thread.join() + + stdout = ''.join(stdout_chunks) + stderr = ''.join(stderr_chunks) # Parse output for error count and cycles - test_result = parse_test_output(result.stdout, result.stderr) + test_result = parse_test_output(stdout, stderr) if not test_result.success and test_result.error_count == -1: log.warning(f"Could not parse error count from output") diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..0c98e254aa 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -348,6 +348,7 @@ def main(default_platform: Optional[str] = None, "snitch": "Snitch", "chimera": "Chimera", "softhier": "SoftHier", + "spatz": "Spatz", } if args.platform: @@ -388,6 +389,7 @@ def main(default_platform: Optional[str] = None, "Snitch": "gvsoc", "Chimera": "gvsoc", "SoftHier": "gvsoc", + "Spatz": "vsim", } simulator = simulator_map.get(platform, "host") log.info(f"No simulator specified, using default for {platform}: {simulator}") diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 9d526906f9..69a83f1e8d 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -10,6 +10,8 @@ from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper +from Deeploy.Targets.Spatz.Deployer import SpatzDeployer +from Deeploy.Targets.Spatz.Platform import SpatzOptimizer, SpatzPlatform 
from Deeploy.Targets.Chimera.Deployer import ChimeraDeployer from Deeploy.Targets.Chimera.Platform import ChimeraOptimizer, ChimeraPlatform from Deeploy.Targets.CortexM.Deployer import CMSISDeployer @@ -31,7 +33,7 @@ from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "Spatz"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -76,6 +78,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Chimera": Platform = ChimeraPlatform() + elif platformName == "Spatz": + Platform = SpatzPlatform() + else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -272,6 +277,18 @@ def mapDeployer(platform: DeploymentPlatform, name = name, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + + elif isinstance(platform, (SpatzPlatform)): + deployer = SpatzDeployer( + graph, + platform, + inputTypes, + SpatzOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir + ) else: raise RuntimeError(f"Deployer for platform {platform} is not implemented") diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..6be4bef197 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -110,6 +110,14 @@ def param_id(param): "model_tests": SNITCH_MODEL_TESTS, "default_num_cores": SNITCH_DEFAULT_NUM_CORES, }, + "spatz": { + "platform": "Spatz", + "simulator": "vsim", + # TODO: Define KERNEL_TESTS and MODEL_TESTS for Spatz + "kernel_tests": [], + "model_tests": [], + # "default_num_cores": , + }, "gap9": { "platform": "GAP9", "simulator": 
"gvsoc", diff --git a/Makefile b/Makefile index d40a49da11..0bc7ffa7fe 100644 --- a/Makefile +++ b/Makefile @@ -27,10 +27,12 @@ PICOLIBC_RV32IMF_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imf CHIMERA_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/chimera-sdk PULP_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/pulp-sdk SNITCH_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/snitch_cluster +SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/spatz QEMU_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/qemu BANSHEE_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/banshee MEMPOOL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/mempool GVSOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc +GVSOC_SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc_spatz SOFTHIER_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/softhier MINIMALLOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/minimalloc XTL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/xtl @@ -44,8 +46,10 @@ PICOLIBC_COMMIT_HASH ?= 31ff1b3601b379e4cab63837f253f59729ce1fef PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 +SPATZ_COMMIT_HASH ?= 6bd9f3094e237dab392983edb827105bce8e3e86 SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated -GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 +# GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 # old +GVSOC_COMMIT_HASH ?= 209c147cbd293d5c1590694e68c489122c777acc # new MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea XTL_VERSION ?= 0.7.5 @@ -69,7 +73,7 @@ else $(error unsupported platform $(OS)) endif -all: toolchain emulators docs echo-bash +all: toolchain emulators # docs echo-bash echo-bash: @@ -79,8 +83,10 @@ echo-bash: @echo "export PULP_SDK_HOME=${PULP_SDK_INSTALL_DIR}" @echo "export CHIMERA_SDK_HOME=${CHIMERA_SDK_INSTALL_DIR}" @echo "export SNITCH_HOME=${SNITCH_INSTALL_DIR}" + @echo "export 
SPATZ_HOME=${SPATZ_INSTALL_DIR}" @echo "export GVSOC_INSTALL_DIR=${GVSOC_INSTALL_DIR}" @echo "export SOFTHIER_INSTALL_DIR=${SOFTHIER_INSTALL_DIR}" + @echo "export BANSHEE_INSTALL_DIR=${BANSHEE_INSTALL_DIR}" @echo "export LLVM_INSTALL_DIR=${LLVM_INSTALL_DIR}" @echo "export MEMPOOL_HOME=${MEMPOOL_INSTALL_DIR}" @echo "export CMAKE=$$(which cmake)" @@ -91,9 +97,9 @@ echo-bash: @echo "source ${PULP_SDK_INSTALL_DIR}/configs/siracusa.sh" -toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv +toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv xtensor minimalloc # xtensor needed for gvsoc, minimalloc for tiling -emulators: snitch_runtime pulp-sdk qemu banshee mempool +emulators: snitch_runtime spatz_runtime pulp-sdk qemu banshee mempool gvsoc ${TOOLCHAIN_DIR}/llvm-project: cd ${TOOLCHAIN_DIR} && \ @@ -124,6 +130,7 @@ ${LLVM_INSTALL_DIR}: ${TOOLCHAIN_DIR}/llvm-project llvm: ${LLVM_INSTALL_DIR} +# runtimes for different architectures ${LLVM_CLANG_RT_RISCV_RV32IM}: ${TOOLCHAIN_DIR}/llvm-project cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build-compiler-rt-riscv-rv32im \ && cd build-compiler-rt-riscv-rv32im; \ @@ -429,16 +436,55 @@ ${SNITCH_INSTALL_DIR}: ${TOOLCHAIN_DIR}/snitch_cluster snitch_runtime: ${SNITCH_INSTALL_DIR} +${TOOLCHAIN_DIR}/spatz: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/pulp-platform/spatz.git && \ + cd ${TOOLCHAIN_DIR}/spatz && git checkout ${SPATZ_COMMIT_HASH} && \ + git submodule update --init --recursive + +${SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/spatz + mkdir -p ${SPATZ_INSTALL_DIR} + cp -r ${TOOLCHAIN_DIR}/spatz/ ${SPATZ_INSTALL_DIR}/../ + cd ${SPATZ_INSTALL_DIR} + make all -j8 && \ + python3.6 -m venv .venv && \ + .venv/bin/pip install jsonref jsonschema jstyleson dataclasses hjson mako && \ + source .venv/bin/activate && \ + source util/iis-env.sh && \ + make init && \ + cd hw/system/spatz_cluster/ && \ + make sw + +spatz_runtime: ${SPATZ_INSTALL_DIR} + +# 
${TOOLCHAIN_DIR}/gvsoc_spatz: +# cd ${TOOLCHAIN_DIR} && \ +# git clone https://github.com/gvsoc/gvsoc.git gvsoc_spatz && \ +# cd ${TOOLCHAIN_DIR}/gvsoc_spatz && git checkout ${GVSOC_SPATZ_COMMIT_HASH} && \ +# git submodule update --init --recursive && \ +# python3 -m venv venv && source venv/bin/activate &&\ +# pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil && \ +# cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch +# +# +# ${GVSOC_SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc_spatz +# cd ${TOOLCHAIN_DIR}/gvsoc_spatz && \ +# source venv/bin/activate &&\ +# CXX=g++-11.2.0 CC=gcc-11.2.0 CMAKE=cmake-3.18.1 make all TARGETS=spatz_v2 INSTALLDIR=${GVSOC_SPATZ_INSTALL_DIR} +# +# gvsoc_spatz: ${GVSOC_SPATZ_INSTALL_DIR} + ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ git clone https://github.com/gvsoc/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ - pip install -r core/requirements.txt && pip install -r gapy/requirements.txt + pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil &&\ + cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch ${GVSOC_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc cd ${TOOLCHAIN_DIR}/gvsoc && \ - XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera" build INSTALLDIR=${GVSOC_INSTALL_DIR} + XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera spatz_v2" build INSTALLDIR=${GVSOC_INSTALL_DIR} gvsoc: ${GVSOC_INSTALL_DIR} @@ -543,7 +589,7 @@ ${TOOLCHAIN_DIR}/minimalloc: cd ${TOOLCHAIN_DIR} && \ git clone --recursive https://github.com/google/minimalloc.git && \ cd 
${TOOLCHAIN_DIR}/minimalloc && git checkout ${MINIMALLOC_COMMMIT_HASH} && \ - cmake -DCMAKE_BUILD_TYPE=Release && make -j && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=17 && make -j && \ mkdir -p ${MINIMALLOC_INSTALL_DIR} && cp minimalloc ${MINIMALLOC_INSTALL_DIR} ${CHIMERA_SDK_INSTALL_DIR}: diff --git a/TargetLibraries/Spatz/CMakeLists.txt b/TargetLibraries/Spatz/CMakeLists.txt new file mode 100644 index 0000000000..ef0fd63ab8 --- /dev/null +++ b/TargetLibraries/Spatz/CMakeLists.txt @@ -0,0 +1,18 @@ +file(GLOB_RECURSE SOURCES + "src/**" +) + +list(APPEND SOURCES + ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c +) + +include(cmake/spatz-runtime-precompiled.cmake) + +add_deeploy_library(deeployspatz STATIC ${SOURCES}) +target_include_directories(deeployspatz + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/inc +) +target_include_directories(deeployspatz PRIVATE ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel) +target_include_directories(deeployspatz SYSTEM PUBLIC ${SPATZ_RUNTIME_INCLUDE}) +target_link_libraries(deeployspatz INTERFACE spatz-runtime) diff --git a/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake new file mode 100644 index 0000000000..42e15e1b31 --- /dev/null +++ b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake @@ -0,0 +1,27 @@ + +set(SPATZ_RUNTIME_BASE_INCLUDE + ${SPATZ_HOME}/sw/snRuntime/include + ${SPATZ_HOME}/sw/snRuntime/vendor + ${SPATZ_HOME}/sw/toolchain/riscv-opcodes +) + +set(SPATZ_CLUSTER_LINK_INCLUDE + ${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime +) + +set(SPATZ_LINKER_SCRIPT ${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime/common.ld) +# set(SPATZ_LINKER_SCRIPT ${SNITCH_RUNTIME_HOME}/base.ld) +if(NOT EXISTS ${SPATZ_LINKER_SCRIPT}) + message(FATAL_ERROR "Spatz linker script not found: ${SPATZ_LINKER_SCRIPT}") +endif() + +set(SPATZ_CLUSTER_LINK_OPTIONS + -Wl,--gc-sections + -T ${SPATZ_LINKER_SCRIPT} +) + 
+set(SPATZ_RUNTIME_INCLUDE ${SPATZ_RUNTIME_BASE_INCLUDE}) + +add_library(spatz-runtime INTERFACE) +target_link_directories(spatz-runtime INTERFACE ${SPATZ_CLUSTER_LINK_INCLUDE}) +target_link_libraries(spatz-runtime INTERFACE ${SPATZ_CLUSTER_LINK_OPTIONS} libsnRuntime-cluster.a) diff --git a/TargetLibraries/Spatz/inc/DeeploySpatzMath.h b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h new file mode 100644 index 0000000000..027fbbc974 --- /dev/null +++ b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_SPATZ_MATH_HEADER_ +#define __DEEPLOY_SPATZ_MATH_HEADER_ + +#include +#include + +#include "DeeployBasicMath.h" +#include "snrt.h" + +void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O); + +void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size, + int32_t last_dim_length); + +// void Spatz_MatMul_fp16_fp16_fp16(const __fp16 *__restrict__ pSrcA, +// const __fp16 *__restrict__ pSrcB, +// __fp16 *__restrict__ pDstY, uint32_t M, +// uint32_t N, uint32_t O); +// +// void Spatz_MatMul_fp64_fp64_fp64(const double *__restrict__ pSrcA, +// const double *__restrict__ pSrcB, +// double *__restrict__ pDstY, uint32_t M, +// uint32_t N, uint32_t O); + +#define BEGIN_SINGLE_CORE if (core_id == 0) { +#define END_SINGLE_CORE } +#define SINGLE_CORE if (core_id == 0) + +#endif // __DEEPLOY_SPATZ_MATH_HEADER_ diff --git a/TargetLibraries/Spatz/inc/Util.h b/TargetLibraries/Spatz/inc/Util.h new file mode 100644 index 0000000000..893d687fa1 --- /dev/null +++ b/TargetLibraries/Spatz/inc/Util.h @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SPATZ_UTIL_H +#define SPATZ_UTIL_H + +void 
spatz_util_dummy(void); + +#endif // SPATZ_UTIL_H diff --git a/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c new file mode 100644 index 0000000000..babc94b795 --- /dev/null +++ b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c @@ -0,0 +1,59 @@ +#include "DeeploySpatzMath.h" +#include + +// functions defined in ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c +void matmul_2xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + + +void matmul_4xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + + +void matmul_8xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + +// void matmul(float *c, const float *a, const float *b, const unsigned int M, +// const unsigned int N, const unsigned int P); + +// void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, +// const float32_t *__restrict__ pSrcB, +// float32_t *__restrict__ pDstY, uint32_t M, +// uint32_t N, uint32_t O) { +// // defined in ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c +// matmul(pDstY, pSrcA, pSrcB, M, N, O); +// } + +/* +a * b = c +*/ +void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ a, + const float32_t *__restrict__ b, + float32_t *__restrict__ c, uint32_t M, + uint32_t N, uint32_t P) { + // const unsigned int num_cores = snrt_cluster_core_num(); = 2 for spatz + const unsigned int cid = snrt_cluster_core_idx(); + + unsigned int m_start, m_end; + if (cid == 0){ + m_start = 0; + m_end = (M/2); + } else { + m_start = (M/2); + m_end = M; + } + + if (M 
<= 4) { + matmul_2xVL(c, a, b, m_start, m_end, N, P, 0, P); + } else if (M <= 8) { + matmul_4xVL(c, a, b, m_start, m_end, N, P, 0, P); + } else { + matmul_8xVL(c, a, b, m_start, m_end, N, P, 0, P); + } +} diff --git a/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c new file mode 100644 index 0000000000..f50571f906 --- /dev/null +++ b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployBasicMath.h" +#include + +float32_t myexpf(float32_t x){ + const float32_t inv_ln2 = 1.4426950409f; + const float32_t ln2 = 0.6931471806f; + + // Range reduction: x = k * ln(2) + r, with r kept small so the polynomial is accurate. + float32_t scaled = x * inv_ln2; + int32_t k = (int32_t)(scaled + (scaled >= 0.0f ? 0.5f : -0.5f)); + float32_t r = x - ((float32_t)k * ln2); + + float32_t r2 = r * r; + float32_t r3 = r2 * r; + float32_t r4 = r3 * r; + float32_t r5 = r4 * r; + float32_t r6 = r5 * r; + float32_t r7 = r6 * r; + + float32_t poly = 1.0f + r + (r2 * 0.5f) + (r3 * 0.1666666667f) + (r4 * 0.0416666667f) + (r5 * 0.0083333333f) + (r6 * 0.0013888889f) + (r7 * 0.0001984127f); + + return ldexpf(poly, k); +} + +// inverse funciton that doesnt use fdiv.s +float32_t myinv(float32_t x){ + uint32_t i = *(uint32_t*)&x; + i = 0x7EEEEEEE - i; + float y = *(float*)&i; + + // Newton-Raphson steps (Multiplication only!) 
+ y = y * (2.0f - x * y); + y = y * (2.0f - x * y); + y = y * (2.0f - x * y); + + return y; +} + +void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size, int32_t last_dim_length) { + const unsigned int cid = snrt_cluster_core_idx(); + int32_t batch_size = size / last_dim_length; + // divide in two cores + unsigned int items_per_core = (batch_size + 1) / 2; + + unsigned int b_start, b_end; + + if (cid == 0) { + b_start = 0; + b_end = items_per_core; + } else { + b_start = items_per_core; + // Core 1 always ends at the total batch size + b_end = batch_size; + } + for (int b = b_start; b < b_end; b++) { + float32_t max_val = -inf; + float sum = 0.0f; + + for (int i = 0; i < last_dim_length; i++) { + if (input[b * last_dim_length + i] > max_val) { + max_val = input[b * last_dim_length + i]; + } + } + + for (int i = 0; i < last_dim_length; i++) { + float32_t exp_val = input[b * last_dim_length + i] - max_val; + output[b * last_dim_length + i] = myexpf(exp_val); + sum += output[b * last_dim_length + i]; + } + + float32_t sum_1 = myinv(sum); + for (int i = 0; i < last_dim_length; i++) { + output[b * last_dim_length + i] = output[b * last_dim_length + i] * sum_1; + } + } +} diff --git a/TargetLibraries/Spatz/src/Util.c b/TargetLibraries/Spatz/src/Util.c new file mode 100644 index 0000000000..9c30c11f49 --- /dev/null +++ b/TargetLibraries/Spatz/src/Util.c @@ -0,0 +1,5 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// SPDX-License-Identifier: Apache-2.0 + +// Minimal stub for Spatz runtime linkage +void spatz_util_dummy(void) {} diff --git a/cmake/simulation.cmake b/cmake/simulation.cmake index 55525feedd..983dc0e4ee 100644 --- a/cmake/simulation.cmake +++ b/cmake/simulation.cmake @@ -102,3 +102,19 @@ macro(add_gvsoc_emulation name target) USES_TERMINAL ) endmacro() + +macro(add_spatz_gvsoc_emulation name target) + set(GVSOC_WORKDIR ${CMAKE_BINARY_DIR}/gvsoc_workdir) + make_directory(${GVSOC_WORKDIR}) + set(GVSOC_BINARY 
"${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}") + add_custom_target(gvsoc_${name} + DEPENDS ${name} + WORKING_DIRECTORY ${GVSOC_WORKDIR} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/*.bin ${GVSOC_WORKDIR}/ || true + COMMAND bash -c "${GVSOC_INSTALL_DIR}/bin/gvrun --target ${target} --param chip/soc/binary=${GVSOC_BINARY} run" + COMMENT "Simulating deeploytest ${name} with gvsoc for the target ${target}" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() diff --git a/cmake/spatz/spatz.cmake b/cmake/spatz/spatz.cmake new file mode 100644 index 0000000000..b715f625c9 --- /dev/null +++ b/cmake/spatz/spatz.cmake @@ -0,0 +1,30 @@ +add_compile_definitions( + DEEPLOY_SPATZ_PLATFORM +) + +set(DEEPLOY_ARCH SPATZ) + +set(num_threads ${NUM_CORES}) + +macro(add_spatz_vsim_simulation name) + add_custom_target(vsim_${name} + WORKING_DIRECTORY ${SPATZ_HOME}/hw/system/spatz_cluster + DEPENDS ${name} + COMMAND ${QUESTA} bin/spatz_cluster.vsim + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name} || true + COMMENT "Simulating deeploytest with vsim (Spatz cluster)" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() + +add_compile_options( + -ffast-math +) + +add_link_options( + -ffast-math + -Wl,--gc-sections +) + diff --git a/cmake/spatz/toolchain_llvm.cmake b/cmake/spatz/toolchain_llvm.cmake new file mode 100644 index 0000000000..3a149c04f0 --- /dev/null +++ b/cmake/spatz/toolchain_llvm.cmake @@ -0,0 +1,72 @@ + +set(CMAKE_SYSTEM_NAME Generic) + +# Crucial: Point CMake to the specialized Clang toolchain instead of system cc +set(SPATZ_TOOLCHAIN_DIR ${SPATZ_HOME}/sw/toolchain/llvm-project/build/bin) + +set(CMAKE_C_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang) +set(CMAKE_CXX_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang++) +set(CMAKE_ASM_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang) +set(CMAKE_OBJCOPY ${SPATZ_TOOLCHAIN_DIR}/llvm-objcopy) +set(CMAKE_OBJDUMP ${SPATZ_TOOLCHAIN_DIR}/llvm-objdump) +set(CMAKE_LINKER ${SPATZ_TOOLCHAIN_DIR}/ld.lld) +set(CMAKE_EXECUTABLE_SUFFIX ".elf") + +set(ISA 
rv32imafdvzfh_xdma) + +# Compile options based on user's manual compilation commands +add_compile_options( + -target riscv32-unknown-elf + # -MP + -mcpu=snitch + -mcmodel=small + + -ffast-math + -fno-builtin-printf + -fno-common + -falign-loops=16 + -ffunction-sections + -Wextra + + # LLVM specific flags from user command + -mllvm -misched-topdown + -menable-experimental-extensions + -mno-relax + + -march=${ISA} + -mabi=ilp32d + -isystem ${SPATZ_HOME}/sw/toolchain/riscv-gnu-toolchain/riscv-newlib/newlib/libc/include + + # Optimization and debug + -O3 + -g +) + +# Link options matching user command +add_link_options( + # -target riscv32-unknown-elf + -mcpu=snitch + -march=${ISA} + -mabi=ilp32d + -mcmodel=small + + -fuse-ld=lld + -nostartfiles + + -ffast-math + -fno-common + -fno-builtin-printf + + -static + -Wl,-z,norelro + -Wl,--gc-sections + -Wl,--no-relax + + --gcc-toolchain=/usr/pack/riscv-1.0-kgf/spatz-gcc-7.1.1 +) + +# User command linked: -lm -lgcc -lm -lgcc libsnRuntime-cluster.a +# libsnRuntime-cluster.a is handled by our target_link_libraries(deeployspatz INTERFACE spatz-runtime) +link_libraries( + -lm -lgcc -lm -lgcc +) diff --git a/conda_enviroment_deeply_mattia.yml b/conda_enviroment_deeply_mattia.yml new file mode 100644 index 0000000000..ad24ceee15 --- /dev/null +++ b/conda_enviroment_deeply_mattia.yml @@ -0,0 +1,81 @@ +name: /scratch/mmm/.conda/envs/deeploy_conda_venv2 +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2025.12.2=h06a4308_0 + - ld_impl_linux-64=2.44=h9e0c5a2_3 + - libexpat=2.7.5=h7354ed3_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc=15.2.0=h69a1729_7 + - libgcc-ng=15.2.0=h166f726_7 + - libgomp=15.2.0=h4751f2c_7 + - libnsl=2.0.0=h5eee18b_0 + - libstdcxx=15.2.0=h39759b7_7 + - libstdcxx-ng=15.2.0=hc03a8fd_7 + - libuuid=1.41.5=h5eee18b_0 + - libxcb=1.17.0=h9b100fa_0 + - libzlib=1.3.1=hb25bd0a_0 + - ncurses=6.5=h7934f7d_0 + - openssl=3.5.5=h1b28b03_0 + - 
packaging=25.0=py311h06a4308_1 + - pip=26.0.1=pyhc872135_0 + - pthread-stubs=0.3=h0ce48e5_1 + - python=3.11.15=h741d88c_0 + - readline=8.3=hc2a1206_0 + - setuptools=80.10.2=py311h06a4308_0 + - sqlite=3.51.2=h3e8d24a_0 + - tk=8.6.15=h54e0aa7_0 + - tzdata=2026a=he532380_0 + - wheel=0.46.3=py311h06a4308_0 + - xorg-libx11=1.8.12=h9b100fa_1 + - xorg-libxau=1.0.12=h9b100fa_0 + - xorg-libxdmcp=1.1.5=h9b100fa_0 + - xorg-xorgproto=2024.1=h5eee18b_1 + - xz=5.8.2=h448239c_0 + - zlib=1.3.1=hb25bd0a_0 + - pip: + - absl-py==2.4.0 + - argparse==1.4.0 + - beautifulsoup4==4.14.3 + - certifi==2026.2.25 + - chardet==5.2.0 + - charset-normalizer==3.4.6 + - contourpy==1.3.3 + - cycler==0.12.1 + - deeploy-pulp==0.2.1 + - flatbuffers==25.12.19 + - fonttools==4.62.1 + - idna==3.11 + - imagesize==2.0.0 + - iniconfig==2.3.0 + - jinja2==3.1.6 + - kiwisolver==1.5.0 + - lz4==4.4.5 + - markdown-it-py==4.0.0 + - markupsafe==3.0.3 + - mdurl==0.1.2 + - mpmath==1.3.0 + - narwhals==2.18.1 + - pillow==12.1.1 + - plotly==6.6.0 + - pluggy==1.6.0 + - psutil==7.2.2 + - ptyprocess==0.7.0 + - pyparsing==3.3.2 + - pytest==9.0.2 + - python-dateutil==2.9.0.post0 + - pytz==2026.1.post1 + - six==1.17.0 + - snowballstemmer==3.0.1 + - soupsieve==2.8.3 + - sphinxcontrib-jsmath==1.0.1 + - sympy==1.14.0 + - tabulate==0.10.0 + - toml==0.10.2 + - typing-extensions==4.15.0 + - urllib3==2.6.3 + - wcwidth==0.6.0 +prefix: /scratch/mmm/.conda/envs/deeploy_conda_venv2 diff --git a/toolchain/gvsoc.patch b/toolchain/gvsoc.patch new file mode 100644 index 0000000000..22e65922a9 --- /dev/null +++ b/toolchain/gvsoc.patch @@ -0,0 +1,12 @@ +diff --git a/engine/src/launcher.cpp b/engine/src/launcher.cpp +index f0b1b654..48c83592 100644 +--- a/engine/src/launcher.cpp ++++ b/engine/src/launcher.cpp +@@ -21,6 +21,7 @@ + + #include + #include ++#include + #include + + #include