diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c8a024c15..9ca0eda18f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch, Spatz)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch Spatz) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) message(STATUS "Building for platform 'Chimera'") +elseif(platform STREQUAL Spatz) + message(STATUS "Building for platform 'Spatz'") else() message(FATAL_ERROR "Invalid platform '${platform}' specified!") endif() @@ -299,5 +301,33 @@ if(platform STREQUAL Chimera) endif() +if(platform STREQUAL Spatz) + + if(NOT DEFINED ENV{SPATZ_HOME}) + message(FATAL_ERROR "Environment variable SPATZ_HOME not set.") + endif() + + set(SPATZ_HOME $ENV{SPATZ_HOME}) + + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/toolchain_llvm.cmake) + + include(${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/spatz.cmake) + + project(deeploy LANGUAGES C ASM) + + message(STATUS "============================= ${platform} Configuration ============================") + message(STATUS "[cMake ] ISA = " ${ISA}) + message(STATUS "================================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(TargetLibraries/Spatz) + 
target_include_directories(deeployspatz PUBLIC TargetLibraries/Generic/inc) + + add_subdirectory(DeeployTest) + target_link_libraries(deeploylib INTERFACE deeploybasic deeployspatz) + +endif() + print_simulation_config() diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 308b179aef..4b0ecfc258 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -19,12 +19,12 @@ GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \ MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \ - iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate + iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate, TopKTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ - SoftmaxChecker, TransposeChecker + SoftmaxChecker, TransposeChecker, TopKChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -327,3 +327,14 @@ ConvTransposeTemplate.referenceTemplate, BasicTransformer) for type in FloatDataTypes ] + +BasicTopKBindings = [ + NodeBinding( + TopKChecker( + [PointerClass(float32_t), PointerClass(int8_t)], # inputs + [PointerClass(float32_t), PointerClass(int8_t)] # outputs + ), + TopKTemplate.referenceTemplate, + BasicTransformer, + ) +] diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index 
cc733937cc..51b7b45dd4 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,15 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class TopKLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + +# def computeOps(self): +# ??? +# +# def computeShapes(self): +# ??? \ No newline at end of file diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..48a00c38f5 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -982,7 +982,7 @@ def parseNode(self, node: gs.Node) -> (bool): return False indices_shape = node.inputs[1].shape - assert np.prod(indices_shape) == 1, f"Only indices of size 1 supported. Got indices of shape {indices_shape}" + self.operatorRepresentation['num_indices'] = int(np.prod(indices_shape)) self.operatorRepresentation['axis'] = node.attrs['axis'] if 'axis' in node.attrs else 0 return True @@ -1002,10 +1002,17 @@ def parseNodeCtxt(self, axis = self.operatorRepresentation['axis'] shape = ctxt.lookup(node.inputs[0].name).shape - self.operatorRepresentation['batch'] = np.prod(shape[:axis]) - self.operatorRepresentation['batch_length'] = np.prod(shape[axis:]) - self.operatorRepresentation['axis_length'] = np.prod(shape[axis + 1:]) - self.operatorRepresentation['index'] = int(node.inputs[1].values.item()) + self.operatorRepresentation['batch'] = int(np.prod(shape[:axis])) if axis > 0 else 1 + self.operatorRepresentation['batch_length'] = int(np.prod(shape[axis:])) + self.operatorRepresentation['axis_length'] = int(np.prod(shape[axis + 1:])) if axis + 1 < len(shape) else 1 + + if self.operatorRepresentation['num_indices'] == 1: + try: + self.operatorRepresentation['index'] = int(node.inputs[1].values.item()) + except Exception: + self.operatorRepresentation['index'] = f"{self.operatorRepresentation['indices']}[0]" + else: + 
self.operatorRepresentation['index'] = 0  # unused in this case, but required by the Mako template
MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \ ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \ - SoftmaxLayer, SqrtLayer, TransposeLayer + SoftmaxLayer, SqrtLayer, TransposeLayer, TopKLayer from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \ DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \ Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \ - TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + TransposeParser, TopKParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -67,6 +67,7 @@ SoftmaxMapper = NodeMapper(SoftmaxParser(), BasicSoftmaxBindings) iSoftmaxMapper = NodeMapper(iSoftmaxParser(), BasicSoftmaxBindings) TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings) +TopKMapper = NodeMapper(TopKParser(), BasicTopKBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) @@ -113,6 +114,7 @@ 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 
'Transpose': TransposeLayer([TransposeMapper]), + 'TopK': TopKLayer([TopKMapper]), 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), 'Slice': SliceLayer([SliceMapper]), 'Quant': QuantLayer([QuantMapper]), diff --git a/Deeploy/Targets/Generic/Templates/GatherTemplate.py b/Deeploy/Targets/Generic/Templates/GatherTemplate.py index dd5e534fa4..4efce4d050 100644 --- a/Deeploy/Targets/Generic/Templates/GatherTemplate.py +++ b/Deeploy/Targets/Generic/Templates/GatherTemplate.py @@ -10,8 +10,18 @@ width = int(data_in_type.referencedType.typeWidth/8) %> BEGIN_SINGLE_CORE +% if num_indices == 1: for (uint32_t i=0; i<${batch}; ++i) { memcpy(${data_out} + i * ${axis_length}, ${data_in} + i * ${batch_length} + ${index} * ${axis_length}, ${axis_length} * ${width}); } +% else: +for (uint32_t i=0; i<${batch}; ++i) { + for (uint32_t j=0; j<${num_indices}; ++j) { + memcpy(${data_out} + i * (${num_indices} * ${axis_length}) + j * ${axis_length}, + ${data_in} + i * ${batch_length} + ${indices}[j] * ${axis_length}, + ${axis_length} * ${width}); + } +} +% endif END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/TopKTemplate.py b/Deeploy/Targets/Generic/Templates/TopKTemplate.py new file mode 100644 index 0000000000..3f9b6474fa --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/TopKTemplate.py @@ -0,0 +1,40 @@ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +referenceTemplate = NodeTemplate(""" +// TopK (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE +// Find the top ${k_value} values and their indices +// Assumes 1D input for simplicity +typedef struct { + ${data_in_type.referencedType.typeName} value; + uint32_t index; +} topk_pair_t; + +topk_pair_t pairs[${data_in_size}]; +for (uint32_t i = 0; i < ${data_in_size}; ++i) { + pairs[i].value = ((${data_in_type.referencedType.typeName}*)${data_in})[i]; + pairs[i].index = i; +} +// Simple selection sort for top-k +for (uint32_t i = 0; i 
< ${k_value}; ++i) { + uint32_t max_idx = i; + for (uint32_t j = i + 1; j < ${data_in_size}; ++j) { + if (pairs[j].value > pairs[max_idx].value) { + max_idx = j; + } + } + // Swap + if (max_idx != i) { + topk_pair_t tmp = pairs[i]; + pairs[i] = pairs[max_idx]; + pairs[max_idx] = tmp; + } + // Write output + ((${values_out_type.referencedType.typeName}*)${values_out})[i] = pairs[i].value; + ((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = pairs[i].index; +} +END_SINGLE_CORE +""") \ No newline at end of file diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..5d363206f8 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -610,3 +610,17 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + +# TopKChecker: infers types for both values and indices outputs of TopK operation +class TopKChecker(SignPropTypeChecker): + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: + # Output 0: values (same as input), Output 1: indices (integer, usually not quantized) + # We assume indices output is not quantized (set to 0 or 1) + return [inputs[0].nLevels, 1] + + def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: + # Output 0: values (same signedness as input), Output 1: indices (unsigned) + return [inputs[0]._signed, False] \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/Bindings.py b/Deeploy/Targets/Spatz/Bindings.py new file mode 100644 index 0000000000..78431b74e0 --- /dev/null +++ b/Deeploy/Targets/Spatz/Bindings.py @@ 
-0,0 +1,117 @@ +from functools import partial + +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ + MemoryManagementGeneration +from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t +from Deeploy.Targets.Generic.TypeCheckers import GatherChecker, MatMulChecker, TopKChecker, SoftmaxChecker + +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterTiling import SnitchClusterTiling +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchCoreFilter import SnitchCoreFilterPass +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterSynch import SnitchSynchCoresPass +from Deeploy.Targets.Spatz.DMA.SpatzDma import SpatzDma +from Deeploy.Targets.Spatz.Templates import GatherTemplate, MatMulTemplate as SpatzMatMulTemplate, TopKTemplate, SoftmaxTemplate +from Deeploy.Targets.Generic.Templates import MatMulTemplate, FloatMatMulTemplate +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ + TilingVariableReplacementUpdate + +TilingCallClosure = partial(ClosureGeneration, closureSuffix = "_tiling_closure") +MemoryAwareFunctionCallClosure = partial(MemoryAwareClosureGeneration, + closureSuffix = "_closure", + startRegion = "L3", + endRegion = "L1") + +BasicTransformer = CodeTransformation( + [ArgumentStructGeneration(), + MemoryManagementGeneration(), + FutureGeneration()]) + +TiledTransformer = CodeTransformation([ + SnitchCoreFilterPass("compute"), + TilingVariableReplacement("L1"), + TilingCallClosure(writeback = False), + 
SnitchSynchCoresPass(), # snrt_cluster_hw_barrier() + TilingVariableReplacementUpdate("L1"), + SnitchClusterTiling("L3", "L1", SpatzDma()), + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration() +]) + +SpatzGatherBindings = [ + NodeBinding( + GatherChecker( + [PointerClass(float32_t), PointerClass(type)], + [PointerClass(float32_t)] + ), + GatherTemplate.tilingReferenceTemplate, + TiledTransformer + ) for type in IntegerDataTypes +] +# [ +# NodeBinding( +# GatherChecker( +# [PointerClass(type), PointerClass(int32_t)], +# [PointerClass(type)] +# ), +# GatherTemplate.referenceTemplate, +# BasicTransformer +# ) for type in SignedIntegerDataTypes] + + +# with tiled transformer +SpatzMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + SpatzMatMulTemplate.spatzSIMatMulTemplate, TiledTransformer), + NodeBinding( + MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + SpatzMatMulTemplate.spatzFloatMatMulTemplate, TiledTransformer) +] +''' +# without tiled transformer +SpatzMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + SpatzMatMulTemplate.spatzSIMatMulTemplate, BasicTransformer), + NodeBinding( + MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + SpatzMatMulTemplate.spatzFloatMatMulTemplate, BasicTransformer) +] +# with BEGIN_SINGLE_CORE +# SpatzMatMulBindings = [ +# NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), +# MatMulTemplate.referenceTemplate, TiledTransformer) +# ] + [ +# NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), +# FloatMatMulTemplate.referenceTemplate, TiledTransformer) +# ] +''' + +SpatzTopKBindings = [ + 
NodeBinding( + TopKChecker( + [PointerClass(float32_t), PointerClass(int32_t)], # inputs + [PointerClass(float32_t), PointerClass(int32_t)] # outputs + ), + TopKTemplate.SpatzTilingTemplate, + TiledTransformer, + ) +] + + +SpatzSoftmaxBindings = [ + NodeBinding( + SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + SoftmaxTemplate.floatTilingTemplate, + TiledTransformer + ) +] +# [ +# NodeBinding( +# SoftmaxChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), +# SoftmaxTemplate.integerTilingTemplate, +# TiledTransformer +# ) +# ] diff --git a/Deeploy/Targets/Spatz/CodeTransformationPasses/SpatzCoreFilter.py b/Deeploy/Targets/Spatz/CodeTransformationPasses/SpatzCoreFilter.py new file mode 100644 index 0000000000..f5f6ac4797 --- /dev/null +++ b/Deeploy/Targets/Spatz/CodeTransformationPasses/SpatzCoreFilter.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Literal, Tuple + +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, _NoVerbosity + + +class SpatzCoreFilterPass(CodeTransformationPass): + + def __init__(self, coreType: Literal["dm", "compute"]): + super().__init__() + self.coreType = coreType + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + theother = self.coreType=="dm" ? 
"compute" : "dm" + executionBlock.addLeft(NodeTemplate(f"if (snrt_is_{theother}_core()) {{\n"), {}) + executionBlock.addRight(NodeTemplate("}\n"), {}) + return ctxt, executionBlock diff --git a/Deeploy/Targets/Spatz/DMA/SpatzDma.py b/Deeploy/Targets/Spatz/DMA/SpatzDma.py new file mode 100644 index 0000000000..ea0f19ab90 --- /dev/null +++ b/Deeploy/Targets/Spatz/DMA/SpatzDma.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy + + +class SnitchBarrierFuture(Future): + _initTemplate = NodeTemplate("") + _deinitTemplate = NodeTemplate("") + _allocTemplate = NodeTemplate("") + _waitTemplate = NodeTemplate("if (snrt_is_dm_core()) snrt_dma_wait_all();") + + +# LMACAN: TODO: Add single transfer waiting +class SnitchFuture(Future): + _initTemplate = NodeTemplate("snrt_dma_txid_t ${name} = (snrt_dma_txid_t) -1;") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("") + + _waitTemplate = NodeTemplate(#remove if condition -1 + "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait_all();") + # "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait(${name});") + + +class SpatzDma(AsyncDma): + + _transferTemplates = { + 2: + NodeTemplate(""" + if (snrt_is_dm_core()) { + ${future} = snrt_dma_start_2d(${dest}, ${src}, ${size}, ${stride_dest}, ${stride_src}, ${repeat}); + } + """), + } + _waitingStrategy = PerTensorWaitingStrategy(SnitchFuture) + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: 
Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + assert strideLoc[1] == 1 and strideExt[1] == 1, f"Supports only contigous transfers in the innermost dimension" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + operatorRepresentation: OperatorRepresentation = { + "dest": localBuffer.name if direction == "ExternalToLocal" else externalBuffer.name, + "src": externalBuffer.name if direction == "ExternalToLocal" else localBuffer.name, + "repeat": shape[0], + "size": shape[1], + "stride_dest": strideLoc[0] if direction == "ExternalToLocal" else strideExt[0], + "stride_src": strideExt[0] if direction == "ExternalToLocal" else strideLoc[0], + "future": future.name + } + return operatorRepresentation diff --git a/Deeploy/Targets/Spatz/Deployer.py b/Deeploy/Targets/Spatz/Deployer.py new file mode 100644 index 0000000000..2442059606 --- /dev/null +++ b/Deeploy/Targets/Spatz/Deployer.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Callable, Dict, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + NCHWtoNHWCPass, TransposeMatmulInputsPass +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes 
import TransposeConstOptPass, TransposeMergePass + + +class SpatzDeployer(SignPropDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda x: x, + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets: Dict[str, int] = {}): + + super().__init__(graph, + deploymentPlatform, + inputTypes, + loweringOptimizer, + scheduler, + name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) + +# self.inputOffsets = inputOffsets +# +# self.loweringOptimizer.passes += [ +# TransposeMatmulInputsPass(), +# NCHWtoNHWCPass(self.default_channels_first), +# TransposeMergePass(), +# TransposeConstOptPass(), +# DebugPrintMergePass() +# ] diff --git a/Deeploy/Targets/Spatz/Platform.py b/Deeploy/Targets/Spatz/Platform.py new file mode 100644 index 0000000000..c8af56ded5 --- /dev/null +++ b/Deeploy/Targets/Spatz/Platform.py @@ -0,0 +1,135 @@ +from typing import List +import numpy as np + +from Deeploy.DeeployTypes import VariableBuffer, TransientBuffer, ConstantBuffer, StructBuffer, \ + NodeMapper, NodeTemplate, TopologyOptimizer, DeploymentEngine, DeploymentPlatform + +from Deeploy.Targets.Generic.Templates import AllocateTemplate as GenericAllocateTemplate +from Deeploy.Targets.Spatz.Templates import AllocateTemplate as SpatzAllocateTemplate +from Deeploy.Targets.Spatz.Templates import FreeTemplate as SpatzFreeTemplate +from Deeploy.Targets.Snitch.Templates import AllocateTemplate as SnitchAllocateTemplate, FreeTemplate as SnitchFreeTemplate + +from Deeploy.Targets.Spatz.Bindings import SpatzGatherBindings, SpatzMatMulBindings, SpatzTopKBindings +from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicMatMulBindings, BasicSoftmaxBindings, BasicTopKBindings +from Deeploy.Targets.Spatz.Tiler import SpatzMatMulTilingBindings, 
SpatzGatherTilingBindings, SpatzTopKTilingBindings, SpatzSoftmaxTilingBindings +from Deeploy.Targets.Generic.Layers import AddLayer, GEMMLayer, SoftmaxLayer, TopKLayer, GatherLayer +from Deeploy.Targets.Generic.Parsers import AddParser, MatMulParser, SoftmaxParser, TopKParser, GatherParser + +# # print(SpatzMatMulBindings) +# # for binding in SpatzMatMulBindings: +# # print(binding.template.tileConstraint) +# +# print(SpatzMatMulTilingReadyBindings) +# for binding in SpatzMatMulTilingReadyBindings: +# print(binding.template.tileConstraint) +# +# print(SpatzMatMulTilingReadyBindings[0].template.tileConstraint) +# print(SpatzMatMulTilingReadyBindings[1].template.tileConstraint) + +SpatzAddMapper = NodeMapper(AddParser(), BasicAddBindings) +# MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) +MatMulMapper = NodeMapper(MatMulParser(), SpatzMatMulTilingBindings) +# SoftmaxMapper = NodeMapper(SoftmaxParser(), BasicSoftmaxBindings) +SoftmaxMapper = NodeMapper(SoftmaxParser(), SpatzSoftmaxTilingBindings) +# TopKMapper = NodeMapper(TopKParser(), SpatzTopKBindings) +TopKMapper = NodeMapper(TopKParser(), SpatzTopKTilingBindings) +# GatherMapper = NodeMapper(GatherParser(), SpatzGatherBindings) +GatherMapper = NodeMapper(GatherParser(), SpatzGatherTilingBindings) + +SpatzMapping = { + 'Add': AddLayer([SpatzAddMapper]), + 'MatMul': GEMMLayer([MatMulMapper]), + 'Softmax': SoftmaxLayer([SoftmaxMapper]), + 'TopK': TopKLayer([TopKMapper]), + 'Gather': GatherLayer([GatherMapper]), +} + + +class SpatzVariableBuffer(VariableBuffer): + initTemplate = GenericAllocateTemplate.referenceInitTemplate + allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate + deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate + + def _bufferRepresentation(self): + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + return { + "type": self._instance, + "name": self.name, + "size": int(np.prod(self.shape)), + "_memoryLevel": memoryLevel + 
} + +class SpatzTransientBuffer(TransientBuffer): + initTemplate = GenericAllocateTemplate.referenceInitTemplate + allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate + deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate +# def _bufferRepresentation(self): +# +# if hasattr(self, "_memoryLevel"): +# memoryLevel = self._memoryLevel +# else: +# memoryLevel = None +# +# return { +# "type": self._type, +# "name": self.name, +# "size": self.size, +# "_memoryLevel": memoryLevel +# } + + +class SpatzConstantBuffer(ConstantBuffer): + initTemplate = SnitchAllocateTemplate.snitchGenericGlobalInitTemplate + allocTemplate = NodeTemplate("") + deallocTemplate = NodeTemplate("") # const not deallocated + + def _bufferRepresentation(self): + operatorRepresentation = super()._bufferRepresentation() + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + operatorRepresentation["_memoryLevel"] = memoryLevel + + return operatorRepresentation + + +class SpatzStructBuffer(StructBuffer): + initTemplate = GenericAllocateTemplate.referenceStructInitTemplate + allocTemplate = GenericAllocateTemplate.referenceStructAllocateTemplate + deallocTemplate = NodeTemplate("") # struct not deallocated ? + + +SpatzOptimizer = TopologyOptimizer([ + # TODO add something ? 
+], name = "SpatzOptimizer") + +includeList = [ + "DeeploySpatzMath.h", +] + + +class SpatzEngine(DeploymentEngine): + def __init__(self, name: str, Mapping = SpatzMapping, initCode = "", includeList = includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) + + +class SpatzPlatform(DeploymentPlatform): + + def __init__( self, + engines = [SpatzEngine("SpatzVectorProcessor")], + variableBuffer = SpatzVariableBuffer, + transientBuffer = SpatzTransientBuffer, + constantBuffer = SpatzConstantBuffer, + structBuffer = SpatzStructBuffer, + includeList: List[str] = includeList + ): + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + diff --git a/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py new file mode 100644 index 0000000000..1bc11bc76d --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py @@ -0,0 +1,16 @@ +from Deeploy.DeeployTypes import NodeTemplate + +# allocate +referenceAllocateTemplate = NodeTemplate( + "${name} = (${type.typeName}) snrt_l1alloc(${type.referencedType.typeWidth//8} * ${size});\n") + +spatzGenericAllocate = NodeTemplate(""" +% if _memoryLevel == "L1": +${name} = (${type.typeName}) snrt_l1alloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% elif _memoryLevel == "L3" or _memoryLevel is None: +${name} = (${type.typeName}) snrt_l3alloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% else: +// COMPILER WARNING — unsupported memory level ${_memoryLevel}, defaulting to L3 +${name} = (${type.typeName}) snrt_l3alloc(${type.referencedType.typeWidth//8} * ${size}); +% endif +""") diff --git a/Deeploy/Targets/Spatz/Templates/FreeTemplate.py b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py new file mode 100644 index 0000000000..f67cb3de38 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py @@ -0,0 +1,5 @@ +from Deeploy.DeeployTypes import NodeTemplate + +# snrt_l1alloc currently does 
not support freeing of memory
i=0; i<${batch}; ++i) { + for (uint32_t j=0; j<${num_indices}; ++j) { + memcpy(${data_out} + i * (${num_indices} * ${axis_length}) + j * ${axis_length}, + ${data_in} + i * ${batch_length} + ${indices}[j] * ${axis_length}, + ${axis_length} * ${width}); + } +} +% endif +""") \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py new file mode 100644 index 0000000000..ba354fe422 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MatMulTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = 0 + operatorRepresentation['B_offset'] = 0 + operatorRepresentation['C_offset'] = 0 + if hasattr(A, "_signed") and hasattr(A, "nLevels"): + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + if hasattr(B, "_signed") and hasattr(B, "nLevels"): + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + if hasattr(C, "_signed") and hasattr(C, "nLevels"): + operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +# signed integer +spatzSIMatMulTemplate = _MatMulTemplate(""" +// MatMul (Name: ${nodeName}, Op: ${nodeOp}) +${A_type.typeName} ref_${data_out}_${A} = ${A}; +${B_type.typeName} ref_${data_out}_${B} = ${B}; +${data_out_type.typeName} 
ref_${data_out}_${data_out} = ${data_out}; + +for(uint32_t i=0;i<${batch};i++){ + MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${A_offset}, ${B_offset}, ${C_offset} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; +} +""") + +# supports single precision float (fp32) +# also possible to add half and double precision +spatzFloatMatMulTemplate = NodeTemplate(""" +// Matmul (Name: ${nodeName}, Op: ${nodeOp}) +${A_type.typeName} ref_${data_out}_${A} = ${A}; +${B_type.typeName} ref_${data_out}_${B} = ${B}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for(uint32_t i=0; i<${batch}; i++){ + Spatz_MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; +} +""") diff --git a/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py b/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py new file mode 100644 index 0000000000..2ddcc2c9b0 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py @@ -0,0 +1,8 @@ +from Deeploy.DeeployTypes import NodeTemplate + +# integerTilingTemplate + +floatTilingTemplate = NodeTemplate(""" +// Softmax (Name: ${nodeName}, Op: ${nodeOp}) +Spatz_Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength}); +""") diff --git a/Deeploy/Targets/Spatz/Templates/TopKTemplate.py b/Deeploy/Targets/Spatz/Templates/TopKTemplate.py new file mode 100644 index 0000000000..84764656d6 --- 
/dev/null +++ b/Deeploy/Targets/Spatz/Templates/TopKTemplate.py @@ -0,0 +1,37 @@ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +SpatzTilingTemplate = NodeTemplate(""" +// TopK node: finds the top ${k_value} values and their indices +// Assumes 1D input +${data_in_type.referencedType.typeName} *values_tmp = snrt_l1alloc(sizeof(${data_in_type.referencedType.typeName})*${data_in_size}); +${indices_out_type.referencedType.typeName} *indices_tmp = snrt_l1alloc(sizeof(${indices_out_type.referencedType.typeName})*${data_in_size}); + +for (uint32_t i = 0; i < ${data_in_size}; ++i) { + values_tmp[i] = ((${data_in_type.referencedType.typeName}*)${data_in})[i]; + indices_tmp[i] = i; +} +// Simple selection sort for top-k +for (uint32_t i = 0; i < ${k_value}; ++i) { + uint32_t max_idx = i; + for (uint32_t j = i + 1; j < ${data_in_size}; ++j) { + if (values_tmp[j] > values_tmp[max_idx]) { + max_idx = j; + } + } + // Swap + if (max_idx != i) { + float32_t tmp_val = values_tmp[i]; + int32_t tmp_idx = indices_tmp[i]; + values_tmp[i] = values_tmp[max_idx]; + indices_tmp[i] = indices_tmp[max_idx]; + values_tmp[max_idx] = tmp_val; + indices_tmp[max_idx] = tmp_idx; + } + // Write output + ((${values_out_type.referencedType.typeName}*)${values_out})[i] = values_tmp[i]; + ((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = indices_tmp[i]; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py new file mode 100644 index 0000000000..5c5fc8eb7a --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from 
Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class GatherTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + pointer: List[str] = [] + + for key, value in parseDict.items(): + if not isinstance(value, str): + continue + + if ctxt.is_global(value) or ctxt.is_local(value): + pointer.append(value) + + for tensorName in pointer: + + _buffer = ctxt.lookup(tensorName) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, tensorName) + + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'indices', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + indicesBuffer = 
ctxt.lookup(operatorRepresentation['indices']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + indicesCube = HyperRectangle(offset = (0,) * len(indicesBuffer.shape), dims = tuple(indicesBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + # Gather execution policy: load full inputs in L1, execute once, then store output tile. + inputLoadSchedule.append({'data_in': dataInCube, 'indices': indicesCube}) + outputLoadSchedule.append({'data_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git a/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py new file mode 100644 index 0000000000..c34b84890f --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class SoftmaxTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Register and pin all referenced tensors to full shape to avoid tiling. 
+ # This also covers constant inputs that may appear as parseDict string references. + tensorNames: List[str] = [] + + for value in parseDict.values(): + if not isinstance(value, str): + continue + if ctxt.is_global(value) or ctxt.is_local(value): + tensorNames.append(value) + + for tensorName in tensorNames: + _buffer = ctxt.lookup(tensorName) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, tensorName) + + for idx, shapeDim in enumerate(_buffer.shape): + tileDimVar = tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) + tilerModel.addConstraint(tileDimVar == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + inputLoadSchedule.append({'data_in': dataInCube}) + outputLoadSchedule.append({'data_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git 
a/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py new file mode 100644 index 0000000000..30572d5819 --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class TopKTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Register and pin all referenced tensors to full shape to avoid tiling. + # This also covers constant inputs that may appear as parseDict string references. 
+ tensorNames: List[str] = [] + + for value in parseDict.values(): + if not isinstance(value, str): + continue + if ctxt.is_global(value) or ctxt.is_local(value): + tensorNames.append(value) + + for tensorName in tensorNames: + _buffer = ctxt.lookup(tensorName) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, tensorName) + + for idx, shapeDim in enumerate(_buffer.shape): + tileDimVar = tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) + tilerModel.addConstraint(tileDimVar == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # k_value is a scalar parsed in operatorRepresentation, not a tensor to transfer. + addrNames = ['data_in', 'values_out', 'indices_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + # TopK execution policy: load full input in L1, execute once, then store both outputs. 
+ inputLoadSchedule.append({'data_in': dataInCube}) + outputLoadSchedule.append({'values_out': out, 'indices_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git a/Deeploy/Targets/Spatz/Tiler.py b/Deeploy/Targets/Spatz/Tiler.py new file mode 100644 index 0000000000..96a0a09bfe --- /dev/null +++ b/Deeploy/Targets/Spatz/Tiler.py @@ -0,0 +1,18 @@ +from Deeploy.Targets.Spatz.Bindings import SpatzMatMulBindings, SpatzGatherBindings, SpatzTopKBindings, SpatzSoftmaxBindings +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings +from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.TopKTileConstraint import TopKTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.SoftmaxTileConstraint import SoftmaxTileConstraint + +SpatzMatMulTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzMatMulBindings, + tileConstraint = MatMulTileConstraint()) + +SpatzGatherTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzGatherBindings, + tileConstraint = GatherTileConstraint()) + +SpatzTopKTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzTopKBindings, + tileConstraint = TopKTileConstraint()) + +SpatzSoftmaxTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzSoftmaxBindings, + tileConstraint = SoftmaxTileConstraint()) diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index 5b067b2ce9..9a2aa6b9d9 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -131,7 +131,9 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List return solution, solutionLengths - assert 
len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!" + # Support multi-output nodes: use first output tensor to determine tiling structure. + # For operators like TopK with multiple outputs, all outputs share the same tiling pattern. + assert len(tilingSolution.outputTensorMemoryConstraints) >= 1, "Expected node to have at least one output!" outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) memoryPath = list(outTensorConstraint.memoryConstraints.keys()) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index 9b48d9456c..1903c8178a 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -532,6 +532,15 @@ def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContex parseDict = layerBinding[node.name].mapper.parser.operatorRepresentation template = layerBinding[node.name].mapper.binder.template + # print("eccomi===================") + # print(layerBinding) + # print(node.name) + # print(layerBinding[node.name]) + # print(layerBinding[node.name].mapper) + # print(layerBinding[node.name].mapper.parser) + # print(layerBinding[node.name].mapper.binder) + # print(layerBinding[node.name].mapper.parser.operatorRepresentation) + # print(layerBinding[node.name].mapper.binder.template) tilerModel = template.tileConstraint.addGeometricalConstraint(tilerModel, parseDict = parseDict, @@ -556,7 +565,7 @@ def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedul patternMemSizeExpr: IntVar = 0 for tensor in patternTensorList: - if not ctxt.lookup(tensor.name)._deploy: + if not ctxt.lookup(tensor.name)._deploy or isinstance(ctxt.lookup(tensor.name), ConstantBuffer): continue patternMemSizeExpr += tilerModel.getTensorNumberOfEltVar( diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py index db83974f0c..080211270b 100644 
--- a/Deeploy/TilingExtension/TilerModel.py +++ b/Deeploy/TilingExtension/TilerModel.py @@ -10,6 +10,7 @@ import numpy as np from ortools.constraint_solver.pywrapcp import IntExpr, IntVar, SolutionCollector, Solver +from Deeploy.DeeployTypes import ConstantBuffer from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel @@ -170,6 +171,10 @@ def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyId tensor = ctxt.lookup(tensorName) + # Skip constant buffers: they don't participate in tiling and don't need num_elements variables + if isinstance(tensor, ConstantBuffer): + return + tensorDimProductExpr = 1 for idx, _ in enumerate([ diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..71f632cbd2 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -50,6 +50,8 @@ elseif(DEEPLOY_ARCH STREQUAL SNITCH) add_subdirectory(Platforms/Snitch) elseif(DEEPLOY_ARCH STREQUAL CHIMERA) add_subdirectory(Platforms/Chimera) +elseif(DEEPLOY_ARCH STREQUAL SPATZ) + add_subdirectory(Platforms/Spatz) elseif(platform STREQUAL GAP9) # Search for hex files generated by Python code generator diff --git a/DeeployTest/Platforms/Spatz/CMakeLists.txt b/DeeployTest/Platforms/Spatz/CMakeLists.txt new file mode 100644 index 0000000000..2d0f730e52 --- /dev/null +++ b/DeeployTest/Platforms/Spatz/CMakeLists.txt @@ -0,0 +1,22 @@ +set(ProjectId ${TESTNAME}) + +file(GLOB_RECURSE SOURCES + main.c +) + +list(APPEND SOURCES + ${SPATZ_HOME}/sw/spatzBenchmarks/benchmark/benchmark.c +) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) + +set(SPATZ_BENCHMARK_INCLUDE_DIR + ${SPATZ_HOME}/sw/spatzBenchmarks/include +) +target_include_directories(${ProjectId} PRIVATE ${SPATZ_BENCHMARK_INCLUDE_DIR}) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) 
+target_compile_options(${ProjectId} INTERFACE network) + +add_spatz_gvsoc_emulation(${ProjectId} "spatz_v2") +add_spatz_vsim_simulation(${ProjectId}) \ No newline at end of file diff --git a/DeeployTest/Platforms/Spatz/main.c b/DeeployTest/Platforms/Spatz/main.c new file mode 100644 index 0000000000..c9084b8ee5 --- /dev/null +++ b/DeeployTest/Platforms/Spatz/main.c @@ -0,0 +1,86 @@ + +#include +#include +#include "printf.h" + +#include "Network.h" +#include "testinputs.h" +#include "testoutputs.h" + +int main() { + const unsigned int core_id = snrt_cluster_core_idx(); + unsigned int timer_start, timer_end, timer; + + if (core_id == 0) printf("[INFO] Running on %d cores\n", snrt_cluster_core_num()); + if (snrt_is_dm_core()){printf("[INFO] DM core is core number %d\n", core_id);} + snrt_cluster_hw_barrier(); + + // do it only with one of the two spatz cores + if (snrt_is_dm_core()){ + timer_start = benchmark_get_cycle(); + + printf("Initializing network...\r\n"); + InitNetwork(0, 1); + + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]); + // DeeployNetwork_inputs[buf] = (void *)testInputVector[buf]; TODO ??? 
+ } + + + printf("Running network...\r\n"); + } + + snrt_cluster_hw_barrier(); + if (snrt_is_dm_core()){ timer_start = benchmark_get_cycle(); } + RunNetwork(core_id, 2); + + snrt_cluster_hw_barrier(); + + if (snrt_is_dm_core()){ + timer_end = benchmark_get_cycle(); + timer = timer_end - timer_start; + + printf("Network ran in %d cycles.\r\nChecking Outputs...\r\n", timer); + int32_t tot_err = 0; + uint32_t tot = 0; + OUTPUTTYPE diff; + OUTPUTTYPE expected, actual; + + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + tot += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + actual = ((OUTPUTTYPE *)DeeployNetwork_outputs[buf])[i]; + diff = expected - actual; + +#if ISOUTPUTFLOAT == 1 + // RUNWANG: Allow margin of error for float32_t + // MATTIA: if diff is a quiet nan 0x7FC00000 we want to error + if ((diff < -1e-4f) || (diff > 1e-4f) || *(uint32_t*)&diff == 0x7FC00000) { + tot_err += 1; + // printf("Expected: %f Actual: %f Diff: %f at Index %12u in Output %u\r\n", expected, actual, diff, i, buf); + printf("Expected: 0x%08x Actual: 0x%08x Diff: 0x%08x at Index %12u in Output %u\r\n", *(uint32_t*)&expected, *(uint32_t*)&actual, *(uint32_t*)&diff, i, buf); + } +#else + // RUNWANG: No margin for integer comparison + if (diff != 0) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); + } +#endif + } + } + + printf("Errors: %d out of %d \r\n", tot_err, tot); + } + + printf("core %d arrived at the end\r\n", core_id); + snrt_cluster_hw_barrier(); + printf("We are after hw barrier\r\n"); + + return 0; +} diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz new file mode 100644 index 0000000000..eb073685c7 Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx new file mode 100644 index 0000000000..c20c89bd05 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz new file mode 100644 index 0000000000..ed786d2e1d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Big/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/inputs.npz new file mode 100644 index 0000000000..930d02b187 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Big/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/network.onnx new file mode 100644 index 0000000000..f1f3c60551 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/network.onnx @@ -0,0 +1,18 @@ +pytorch2.7.0:o + +a +bout/MatMul"MatMul +main_graphZ +a + + +  +ÀZ +b +  +À +Pb +out +  +  +PB \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Big/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/outputs.npz new file mode 100644 index 0000000000..9915d63151 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Big/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/inputs.npz new file mode 100644 index 0000000000..d9a6ad5605 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/network.onnx new file mode 100644 index 0000000000..8719d9bc82 --- /dev/null +++ 
b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/network.onnx @@ -0,0 +1,17 @@ +pytorch2.7.0:m + +a +bout/MatMul"MatMul +main_graphZ +a +  +  + Z +b +  + +Pb +out +  +  +PB \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/outputs.npz new file mode 100644 index 0000000000..1f79303b10 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Medium/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/inputs.npz new file mode 100644 index 0000000000..d8f4a477c3 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/network.onnx new file mode 100644 index 0000000000..3388387955 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/network.onnx @@ -0,0 +1,17 @@ +pytorch2.7.0:m + +a +bout/MatMul"MatMul +main_graphZ +a +  + + Z +b +  + +€b +out +  + +€B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/outputs.npz new file mode 100644 index 0000000000..e91f150988 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/MidMedium/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/MatMul/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/MatMul/Regular/inputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/MatMul/network.onnx rename to DeeployTest/Tests/Kernels/FP32/MatMul/Regular/network.onnx diff --git 
a/DeeployTest/Tests/Kernels/FP32/MatMul/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/MatMul/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/MatMul/Regular/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/inputs.npz new file mode 100644 index 0000000000..d0aed0662f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/network.onnx new file mode 100644 index 0000000000..72e83fab2f --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/network.onnx @@ -0,0 +1,17 @@ +pytorch2.7.0:k + +a +bout/MatMul"MatMul +main_graphZ +a +  + + Z +b +  + +b +out +  + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/outputs.npz new file mode 100644 index 0000000000..6982ce772a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Regular2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/inputs.npz new file mode 100644 index 0000000000..542bc5789f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/network.onnx new file mode 100644 index 0000000000..7beeeef202 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/network.onnx @@ -0,0 +1,16 @@ +pytorch2.0.1:j + +a +bout/MatMul"MatMul torch_jitZ +a +  + +Z +b +  + +b +out +  + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/outputs.npz 
new file mode 100644 index 0000000000..42ffb87810 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/MatMul/Smal/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz new file mode 100644 index 0000000000..afc11e34d7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx new file mode 100644 index 0000000000..94e265be97 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx @@ -0,0 +1,13 @@ +pytorch2.7.0:^ +& +VA/Softmax"Softmax* +axis  +main_graphZ +V +  + +b +A +  + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz new file mode 100644 index 0000000000..f5f6daea15 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttention/inputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttention/inputs.npz new file mode 100644 index 0000000000..1f27f7766e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttention/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttention/network.onnx b/DeeployTest/Tests/Kernels/FP32/SparseAttention/network.onnx new file mode 100644 index 0000000000..e35b7eb103 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttention/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttention/outputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttention/outputs.npz new file mode 100644 index 0000000000..b65882fc0a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttention/outputs.npz differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/inputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/inputs.npz new file mode 100644 index 0000000000..5ad6b6d71a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/network.onnx b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/network.onnx new file mode 100644 index 0000000000..7e6acbde40 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/outputs.npz b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/outputs.npz new file mode 100644 index 0000000000..1c877a4b96 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SparseAttentionBig/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/10/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/10/inputs.npz new file mode 100644 index 0000000000..a02c827160 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/10/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/10/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopK/10/network.onnx new file mode 100644 index 0000000000..13fae39c48 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/10/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/10/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/10/outputs.npz new file mode 100644 index 0000000000..b9bc6c4183 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/10/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/20/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/20/inputs.npz new file mode 100644 index 0000000000..0b66bfd41b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/20/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/20/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopK/20/network.onnx new file mode 
100644 index 0000000000..1a82699d31 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/20/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/20/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/20/outputs.npz new file mode 100644 index 0000000000..abccfba295 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/20/outputs.npz differ diff --git a/DeeployTest/deeployRunner_spatz.py b/DeeployTest/deeployRunner_spatz.py new file mode 100644 index 0000000000..5404defc13 --- /dev/null +++ b/DeeployTest/deeployRunner_spatz.py @@ -0,0 +1,12 @@ +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit( + main( + default_platform = "Spatz", + default_simulator = "gvsoc", + tiling_enabled = False, + ) + ) diff --git a/DeeployTest/deeployRunner_tiled_spatz.py b/DeeployTest/deeployRunner_tiled_spatz.py new file mode 100644 index 0000000000..6900d7010e --- /dev/null +++ b/DeeployTest/deeployRunner_tiled_spatz.py @@ -0,0 +1,12 @@ +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit( + main( + default_platform = "Spatz", + default_simulator = "gvsoc", + tiling_enabled = True, + ) + ) diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 01216984af..686fa99d8f 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -250,11 +250,20 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg test_inputs = [test_inputs[0]] test_outputs = [test_outputs[-2]] - # Instantiate Classes Requried for Memory Level Annotation Extension - L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) - L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) - L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) - memoryLevels = [L3, L2, L1] + # Instantiate Classes Required for Memory Level Annotation Extension + if args.platform == "Spatz": + # Spatz cluster has only TCDM (L1) 
+ external DRAM (L3). No on-chip L2. + # Declare L1 and L3 as direct neighbours so BFS-based tile-path + # generation does not insert a phantom L2 staging buffer. + L3 = MemoryLevel(name = "L3", neighbourNames = ["L1"], size = 64000000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L3"], size = args.l1) + memoryLevels = [L3, L1] + else: + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryLevels = [L3, L2, L1] + if args.neureka_wmem: memoryLevels.append(MemoryLevel(name = "WeightMemory_SRAM", neighbourNames = [], size = 4 * 1024 * 1024)) diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..cdbd0af3db 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -6,6 +6,7 @@ import shutil import subprocess import sys +import threading from pathlib import Path from Deeploy.Logging import DEFAULT_LOGGER as log @@ -191,15 +192,43 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: log.debug(f"[Execution] Simulation command: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output = True, text = True, env = env) - - if result.stdout: - print(result.stdout, end = '') - if result.stderr: - print(result.stderr, end = '', file = sys.stderr) + process = subprocess.Popen( + cmd, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE, + text = True, + env = env, + bufsize = 1, + ) + + stdout_chunks = [] + stderr_chunks = [] + + def _stream_reader(pipe, chunks, is_stderr: bool = False) -> None: + assert pipe is not None + for line in iter(pipe.readline, ''): + chunks.append(line) + if is_stderr: + print(line, end = '', file = sys.stderr, flush = True) + else: + print(line, end = '', flush = True) + pipe.close() + + stdout_thread = threading.Thread(target = 
_stream_reader, args = (process.stdout, stdout_chunks), daemon = True) + stderr_thread = threading.Thread(target = _stream_reader, args = (process.stderr, stderr_chunks, True), daemon = True) + + stdout_thread.start() + stderr_thread.start() + + returncode = process.wait() + stdout_thread.join() + stderr_thread.join() + + stdout = ''.join(stdout_chunks) + stderr = ''.join(stderr_chunks) # Parse output for error count and cycles - test_result = parse_test_output(result.stdout, result.stderr) + test_result = parse_test_output(stdout, stderr) if not test_result.success and test_result.error_count == -1: log.warning(f"Could not parse error count from output") diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..0c98e254aa 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -348,6 +348,7 @@ def main(default_platform: Optional[str] = None, "snitch": "Snitch", "chimera": "Chimera", "softhier": "SoftHier", + "spatz": "Spatz", } if args.platform: @@ -388,6 +389,7 @@ def main(default_platform: Optional[str] = None, "Snitch": "gvsoc", "Chimera": "gvsoc", "SoftHier": "gvsoc", + "Spatz": "vsim", } simulator = simulator_map.get(platform, "host") log.info(f"No simulator specified, using default for {platform}: {simulator}") diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 9d526906f9..69a83f1e8d 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -10,6 +10,8 @@ from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper +from Deeploy.Targets.Spatz.Deployer import SpatzDeployer +from Deeploy.Targets.Spatz.Platform import SpatzOptimizer, SpatzPlatform 
from Deeploy.Targets.Chimera.Deployer import ChimeraDeployer from Deeploy.Targets.Chimera.Platform import ChimeraOptimizer, ChimeraPlatform from Deeploy.Targets.CortexM.Deployer import CMSISDeployer @@ -31,7 +33,7 @@ from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "Spatz"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -76,6 +78,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Chimera": Platform = ChimeraPlatform() + elif platformName == "Spatz": + Platform = SpatzPlatform() + else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -272,6 +277,18 @@ def mapDeployer(platform: DeploymentPlatform, name = name, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + + elif isinstance(platform, (SpatzPlatform)): + deployer = SpatzDeployer( + graph, + platform, + inputTypes, + SpatzOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir + ) else: raise RuntimeError(f"Deployer for platform {platform} is not implemented") diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..6be4bef197 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -110,6 +110,14 @@ def param_id(param): "model_tests": SNITCH_MODEL_TESTS, "default_num_cores": SNITCH_DEFAULT_NUM_CORES, }, + "spatz": { + "platform": "Spatz", + "simulator": "vsim", + # TODO: Define KERNEL_TESTS and MODEL_TESTS for Spatz + "kernel_tests": [], + "model_tests": [], + # "default_num_cores": , + }, "gap9": { "platform": "GAP9", "simulator": 
"gvsoc", diff --git a/Makefile b/Makefile index d40a49da11..0bc7ffa7fe 100644 --- a/Makefile +++ b/Makefile @@ -27,10 +27,12 @@ PICOLIBC_RV32IMF_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imf CHIMERA_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/chimera-sdk PULP_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/pulp-sdk SNITCH_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/snitch_cluster +SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/spatz QEMU_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/qemu BANSHEE_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/banshee MEMPOOL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/mempool GVSOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc +GVSOC_SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc_spatz SOFTHIER_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/softhier MINIMALLOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/minimalloc XTL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/xtl @@ -44,8 +46,10 @@ PICOLIBC_COMMIT_HASH ?= 31ff1b3601b379e4cab63837f253f59729ce1fef PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 +SPATZ_COMMIT_HASH ?= 6bd9f3094e237dab392983edb827105bce8e3e86 SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated -GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 +# GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 # old +GVSOC_COMMIT_HASH ?= 209c147cbd293d5c1590694e68c489122c777acc # new MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea XTL_VERSION ?= 0.7.5 @@ -69,7 +73,7 @@ else $(error unsupported platform $(OS)) endif -all: toolchain emulators docs echo-bash +all: toolchain emulators # docs echo-bash echo-bash: @@ -79,8 +83,10 @@ echo-bash: @echo "export PULP_SDK_HOME=${PULP_SDK_INSTALL_DIR}" @echo "export CHIMERA_SDK_HOME=${CHIMERA_SDK_INSTALL_DIR}" @echo "export SNITCH_HOME=${SNITCH_INSTALL_DIR}" + @echo "export 
SPATZ_HOME=${SPATZ_INSTALL_DIR}" @echo "export GVSOC_INSTALL_DIR=${GVSOC_INSTALL_DIR}" @echo "export SOFTHIER_INSTALL_DIR=${SOFTHIER_INSTALL_DIR}" + @echo "export BANSHEE_INSTALL_DIR=${BANSHEE_INSTALL_DIR}" @echo "export LLVM_INSTALL_DIR=${LLVM_INSTALL_DIR}" @echo "export MEMPOOL_HOME=${MEMPOOL_INSTALL_DIR}" @echo "export CMAKE=$$(which cmake)" @@ -91,9 +97,9 @@ echo-bash: @echo "source ${PULP_SDK_INSTALL_DIR}/configs/siracusa.sh" -toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv +toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv xtensor minimalloc # xtensor needed for gvsoc, minimalloc for tiling -emulators: snitch_runtime pulp-sdk qemu banshee mempool +emulators: snitch_runtime spatz_runtime pulp-sdk qemu banshee mempool gvsoc ${TOOLCHAIN_DIR}/llvm-project: cd ${TOOLCHAIN_DIR} && \ @@ -124,6 +130,7 @@ ${LLVM_INSTALL_DIR}: ${TOOLCHAIN_DIR}/llvm-project llvm: ${LLVM_INSTALL_DIR} +# runtimes for different architectures ${LLVM_CLANG_RT_RISCV_RV32IM}: ${TOOLCHAIN_DIR}/llvm-project cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build-compiler-rt-riscv-rv32im \ && cd build-compiler-rt-riscv-rv32im; \ @@ -429,16 +436,55 @@ ${SNITCH_INSTALL_DIR}: ${TOOLCHAIN_DIR}/snitch_cluster snitch_runtime: ${SNITCH_INSTALL_DIR} +${TOOLCHAIN_DIR}/spatz: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/pulp-platform/spatz.git && \ + cd ${TOOLCHAIN_DIR}/spatz && git checkout ${SPATZ_COMMIT_HASH} && \ + git submodule update --init --recursive + +${SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/spatz + mkdir -p ${SPATZ_INSTALL_DIR} + cp -r ${TOOLCHAIN_DIR}/spatz/ ${SPATZ_INSTALL_DIR}/../ + cd ${SPATZ_INSTALL_DIR} + make all -j8 && \ + python3.6 -m venv .venv && \ + .venv/bin/pip install jsonref jsonschema jstyleson dataclasses hjson mako && \ + source .venv/bin/activate && \ + source util/iis-env.sh && \ + make init && \ + cd hw/system/spatz_cluster/ && \ + make sw + +spatz_runtime: ${SPATZ_INSTALL_DIR} + +# 
${TOOLCHAIN_DIR}/gvsoc_spatz: +# cd ${TOOLCHAIN_DIR} && \ +# git clone https://github.com/gvsoc/gvsoc.git gvsoc_spatz && \ +# cd ${TOOLCHAIN_DIR}/gvsoc_spatz && git checkout ${GVSOC_SPATZ_COMMIT_HASH} && \ +# git submodule update --init --recursive && \ +# python3 -m venv venv && source venv/bin/activate &&\ +# pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil && \ +# cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch +# +# +# ${GVSOC_SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc_spatz +# cd ${TOOLCHAIN_DIR}/gvsoc_spatz && \ +# source venv/bin/activate &&\ +# CXX=g++-11.2.0 CC=gcc-11.2.0 CMAKE=cmake-3.18.1 make all TARGETS=spatz_v2 INSTALLDIR=${GVSOC_SPATZ_INSTALL_DIR} +# +# gvsoc_spatz: ${GVSOC_SPATZ_INSTALL_DIR} + ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ git clone https://github.com/gvsoc/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ - pip install -r core/requirements.txt && pip install -r gapy/requirements.txt + pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil &&\ + cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch ${GVSOC_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc cd ${TOOLCHAIN_DIR}/gvsoc && \ - XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera" build INSTALLDIR=${GVSOC_INSTALL_DIR} + XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera spatz_v2" build INSTALLDIR=${GVSOC_INSTALL_DIR} gvsoc: ${GVSOC_INSTALL_DIR} @@ -543,7 +589,7 @@ ${TOOLCHAIN_DIR}/minimalloc: cd ${TOOLCHAIN_DIR} && \ git clone --recursive https://github.com/google/minimalloc.git && \ cd 
${TOOLCHAIN_DIR}/minimalloc && git checkout ${MINIMALLOC_COMMMIT_HASH} && \ - cmake -DCMAKE_BUILD_TYPE=Release && make -j && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=17 && make -j && \ mkdir -p ${MINIMALLOC_INSTALL_DIR} && cp minimalloc ${MINIMALLOC_INSTALL_DIR} ${CHIMERA_SDK_INSTALL_DIR}: diff --git a/TargetLibraries/Spatz/CMakeLists.txt b/TargetLibraries/Spatz/CMakeLists.txt new file mode 100644 index 0000000000..ef0fd63ab8 --- /dev/null +++ b/TargetLibraries/Spatz/CMakeLists.txt @@ -0,0 +1,18 @@ +file(GLOB_RECURSE SOURCES + "src/**" +) + +list(APPEND SOURCES + ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c +) + +include(cmake/spatz-runtime-precompiled.cmake) + +add_deeploy_library(deeployspatz STATIC ${SOURCES}) +target_include_directories(deeployspatz + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/inc +) +target_include_directories(deeployspatz PRIVATE ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel) +target_include_directories(deeployspatz SYSTEM PUBLIC ${SPATZ_RUNTIME_INCLUDE}) +target_link_libraries(deeployspatz INTERFACE spatz-runtime) diff --git a/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake new file mode 100644 index 0000000000..42e15e1b31 --- /dev/null +++ b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake @@ -0,0 +1,27 @@ + +set(SPATZ_RUNTIME_BASE_INCLUDE + ${SPATZ_HOME}/sw/snRuntime/include + ${SPATZ_HOME}/sw/snRuntime/vendor + ${SPATZ_HOME}/sw/toolchain/riscv-opcodes +) + +set(SPATZ_CLUSTER_LINK_INCLUDE + ${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime +) + +set(SPATZ_LINKER_SCRIPT ${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime/common.ld) +# set(SPATZ_LINKER_SCRIPT ${SNITCH_RUNTIME_HOME}/base.ld) +if(NOT EXISTS ${SPATZ_LINKER_SCRIPT}) + message(FATAL_ERROR "Spatz linker script not found: ${SPATZ_LINKER_SCRIPT}") +endif() + +set(SPATZ_CLUSTER_LINK_OPTIONS + -Wl,--gc-sections + -T ${SPATZ_LINKER_SCRIPT} +) + 
+set(SPATZ_RUNTIME_INCLUDE ${SPATZ_RUNTIME_BASE_INCLUDE}) + +add_library(spatz-runtime INTERFACE) +target_link_directories(spatz-runtime INTERFACE ${SPATZ_CLUSTER_LINK_INCLUDE}) +target_link_libraries(spatz-runtime INTERFACE ${SPATZ_CLUSTER_LINK_OPTIONS} libsnRuntime-cluster.a) diff --git a/TargetLibraries/Spatz/inc/DeeploySpatzMath.h b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h new file mode 100644 index 0000000000..027fbbc974 --- /dev/null +++ b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_SPATZ_MATH_HEADER_ +#define __DEEPLOY_SPATZ_MATH_HEADER_ + +#include +#include + +#include "DeeployBasicMath.h" +#include "snrt.h" + +void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O); + +void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size, + int32_t last_dim_length); + +// void Spatz_MatMul_fp16_fp16_fp16(const __fp16 *__restrict__ pSrcA, +// const __fp16 *__restrict__ pSrcB, +// __fp16 *__restrict__ pDstY, uint32_t M, +// uint32_t N, uint32_t O); +// +// void Spatz_MatMul_fp64_fp64_fp64(const double *__restrict__ pSrcA, +// const double *__restrict__ pSrcB, +// double *__restrict__ pDstY, uint32_t M, +// uint32_t N, uint32_t O); + +#define BEGIN_SINGLE_CORE if (core_id == 0) { +#define END_SINGLE_CORE } +#define SINGLE_CORE if (core_id == 0) + +#endif // __DEEPLOY_SPATZ_MATH_HEADER_ diff --git a/TargetLibraries/Spatz/inc/Util.h b/TargetLibraries/Spatz/inc/Util.h new file mode 100644 index 0000000000..893d687fa1 --- /dev/null +++ b/TargetLibraries/Spatz/inc/Util.h @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SPATZ_UTIL_H +#define SPATZ_UTIL_H + +void 
spatz_util_dummy(void); + +#endif // SPATZ_UTIL_H diff --git a/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c new file mode 100644 index 0000000000..babc94b795 --- /dev/null +++ b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c @@ -0,0 +1,59 @@ +#include "DeeploySpatzMath.h" +#include + +// functions defined in ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c +void matmul_2xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + + +void matmul_4xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + + +void matmul_8xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + +// void matmul(float *c, const float *a, const float *b, const unsigned int M, +// const unsigned int N, const unsigned int P); + +// void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, +// const float32_t *__restrict__ pSrcB, +// float32_t *__restrict__ pDstY, uint32_t M, +// uint32_t N, uint32_t O) { +// // defined in ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c +// matmul(pDstY, pSrcA, pSrcB, M, N, O); +// } + +/* +a * b = c +*/ +void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ a, + const float32_t *__restrict__ b, + float32_t *__restrict__ c, uint32_t M, + uint32_t N, uint32_t P) { + // const unsigned int num_cores = snrt_cluster_core_num(); = 2 for spatz + const unsigned int cid = snrt_cluster_core_idx(); + + unsigned int m_start, m_end; + if (cid == 0){ + m_start = 0; + m_end = (M/2); + } else { + m_start = (M/2); + m_end = M; + } + + if (M 
<= 4) { + matmul_2xVL(c, a, b, m_start, m_end, N, P, 0, P); + } else if (M <= 8) { + matmul_4xVL(c, a, b, m_start, m_end, N, P, 0, P); + } else { + matmul_8xVL(c, a, b, m_start, m_end, N, P, 0, P); + } +} diff --git a/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c new file mode 100644 index 0000000000..f50571f906 --- /dev/null +++ b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployBasicMath.h" +#include + +float32_t myexpf(float32_t x){ + const float32_t inv_ln2 = 1.4426950409f; + const float32_t ln2 = 0.6931471806f; + + // Range reduction: x = k * ln(2) + r, with r kept small so the polynomial is accurate. + float32_t scaled = x * inv_ln2; + int32_t k = (int32_t)(scaled + (scaled >= 0.0f ? 0.5f : -0.5f)); + float32_t r = x - ((float32_t)k * ln2); + + float32_t r2 = r * r; + float32_t r3 = r2 * r; + float32_t r4 = r3 * r; + float32_t r5 = r4 * r; + float32_t r6 = r5 * r; + float32_t r7 = r6 * r; + + float32_t poly = 1.0f + r + (r2 * 0.5f) + (r3 * 0.1666666667f) + (r4 * 0.0416666667f) + (r5 * 0.0083333333f) + (r6 * 0.0013888889f) + (r7 * 0.0001984127f); + + return ldexpf(poly, k); +} + +// inverse funciton that doesnt use fdiv.s +float32_t myinv(float32_t x){ + uint32_t i = *(uint32_t*)&x; + i = 0x7EEEEEEE - i; + float y = *(float*)&i; + + // Newton-Raphson steps (Multiplication only!) 
+ y = y * (2.0f - x * y); + y = y * (2.0f - x * y); + y = y * (2.0f - x * y); + + return y; +} + +void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size, int32_t last_dim_length) { + const unsigned int cid = snrt_cluster_core_idx(); + int32_t batch_size = size / last_dim_length; + // divide in two cores + unsigned int items_per_core = (batch_size + 1) / 2; + + unsigned int b_start, b_end; + + if (cid == 0) { + b_start = 0; + b_end = items_per_core; + } else { + b_start = items_per_core; + // Core 1 always ends at the total batch size + b_end = batch_size; + } + for (int b = b_start; b < b_end; b++) { + float32_t max_val = -inf; + float sum = 0.0f; + + for (int i = 0; i < last_dim_length; i++) { + if (input[b * last_dim_length + i] > max_val) { + max_val = input[b * last_dim_length + i]; + } + } + + for (int i = 0; i < last_dim_length; i++) { + float32_t exp_val = input[b * last_dim_length + i] - max_val; + output[b * last_dim_length + i] = myexpf(exp_val); + sum += output[b * last_dim_length + i]; + } + + float32_t sum_1 = myinv(sum); + for (int i = 0; i < last_dim_length; i++) { + output[b * last_dim_length + i] = output[b * last_dim_length + i] * sum_1; + } + } +} diff --git a/TargetLibraries/Spatz/src/Util.c b/TargetLibraries/Spatz/src/Util.c new file mode 100644 index 0000000000..9c30c11f49 --- /dev/null +++ b/TargetLibraries/Spatz/src/Util.c @@ -0,0 +1,5 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// SPDX-License-Identifier: Apache-2.0 + +// Minimal stub for Spatz runtime linkage +void spatz_util_dummy(void) {} diff --git a/cmake/simulation.cmake b/cmake/simulation.cmake index 55525feedd..983dc0e4ee 100644 --- a/cmake/simulation.cmake +++ b/cmake/simulation.cmake @@ -102,3 +102,19 @@ macro(add_gvsoc_emulation name target) USES_TERMINAL ) endmacro() + +macro(add_spatz_gvsoc_emulation name target) + set(GVSOC_WORKDIR ${CMAKE_BINARY_DIR}/gvsoc_workdir) + make_directory(${GVSOC_WORKDIR}) + set(GVSOC_BINARY 
"${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}") + add_custom_target(gvsoc_${name} + DEPENDS ${name} + WORKING_DIRECTORY ${GVSOC_WORKDIR} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/*.bin ${GVSOC_WORKDIR}/ || true + COMMAND bash -c "${GVSOC_INSTALL_DIR}/bin/gvrun --target ${target} --param chip/soc/binary=${GVSOC_BINARY} run" + COMMENT "Simulating deeploytest ${name} with gvsoc for the target ${target}" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() diff --git a/cmake/spatz/spatz.cmake b/cmake/spatz/spatz.cmake new file mode 100644 index 0000000000..b715f625c9 --- /dev/null +++ b/cmake/spatz/spatz.cmake @@ -0,0 +1,30 @@ +add_compile_definitions( + DEEPLOY_SPATZ_PLATFORM +) + +set(DEEPLOY_ARCH SPATZ) + +set(num_threads ${NUM_CORES}) + +macro(add_spatz_vsim_simulation name) + add_custom_target(vsim_${name} + WORKING_DIRECTORY ${SPATZ_HOME}/hw/system/spatz_cluster + DEPENDS ${name} + COMMAND ${QUESTA} bin/spatz_cluster.vsim + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name} || true + COMMENT "Simulating deeploytest with vsim (Spatz cluster)" + POST_BUILD + USES_TERMINAL + VERBATIM + ) +endmacro() + +add_compile_options( + -ffast-math +) + +add_link_options( + -ffast-math + -Wl,--gc-sections +) + diff --git a/cmake/spatz/toolchain_llvm.cmake b/cmake/spatz/toolchain_llvm.cmake new file mode 100644 index 0000000000..3a149c04f0 --- /dev/null +++ b/cmake/spatz/toolchain_llvm.cmake @@ -0,0 +1,72 @@ + +set(CMAKE_SYSTEM_NAME Generic) + +# Crucial: Point CMake to the specialized Clang toolchain instead of system cc +set(SPATZ_TOOLCHAIN_DIR ${SPATZ_HOME}/sw/toolchain/llvm-project/build/bin) + +set(CMAKE_C_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang) +set(CMAKE_CXX_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang++) +set(CMAKE_ASM_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang) +set(CMAKE_OBJCOPY ${SPATZ_TOOLCHAIN_DIR}/llvm-objcopy) +set(CMAKE_OBJDUMP ${SPATZ_TOOLCHAIN_DIR}/llvm-objdump) +set(CMAKE_LINKER ${SPATZ_TOOLCHAIN_DIR}/ld.lld) +set(CMAKE_EXECUTABLE_SUFFIX ".elf") + +set(ISA 
rv32imafdvzfh_xdma) + +# Compile options based on user's manual compilation commands +add_compile_options( + -target riscv32-unknown-elf + # -MP + -mcpu=snitch + -mcmodel=small + + -ffast-math + -fno-builtin-printf + -fno-common + -falign-loops=16 + -ffunction-sections + -Wextra + + # LLVM specific flags from user command + -mllvm -misched-topdown + -menable-experimental-extensions + -mno-relax + + -march=${ISA} + -mabi=ilp32d + -isystem ${SPATZ_HOME}/sw/toolchain/riscv-gnu-toolchain/riscv-newlib/newlib/libc/include + + # Optimization and debug + -O3 + -g +) + +# Link options matching user command +add_link_options( + # -target riscv32-unknown-elf + -mcpu=snitch + -march=${ISA} + -mabi=ilp32d + -mcmodel=small + + -fuse-ld=lld + -nostartfiles + + -ffast-math + -fno-common + -fno-builtin-printf + + -static + -Wl,-z,norelro + -Wl,--gc-sections + -Wl,--no-relax + + --gcc-toolchain=/usr/pack/riscv-1.0-kgf/spatz-gcc-7.1.1 +) + +# User command linked: -lm -lgcc -lm -lgcc libsnRuntime-cluster.a +# libsnRuntime-cluster.a is handled by our target_link_libraries(deeployspatz INTERFACE spatz-runtime) +link_libraries( + -lm -lgcc -lm -lgcc +) diff --git a/conda_enviroment_deeply_mattia.yml b/conda_enviroment_deeply_mattia.yml new file mode 100644 index 0000000000..ad24ceee15 --- /dev/null +++ b/conda_enviroment_deeply_mattia.yml @@ -0,0 +1,81 @@ +name: /scratch/mmm/.conda/envs/deeploy_conda_venv2 +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2025.12.2=h06a4308_0 + - ld_impl_linux-64=2.44=h9e0c5a2_3 + - libexpat=2.7.5=h7354ed3_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc=15.2.0=h69a1729_7 + - libgcc-ng=15.2.0=h166f726_7 + - libgomp=15.2.0=h4751f2c_7 + - libnsl=2.0.0=h5eee18b_0 + - libstdcxx=15.2.0=h39759b7_7 + - libstdcxx-ng=15.2.0=hc03a8fd_7 + - libuuid=1.41.5=h5eee18b_0 + - libxcb=1.17.0=h9b100fa_0 + - libzlib=1.3.1=hb25bd0a_0 + - ncurses=6.5=h7934f7d_0 + - openssl=3.5.5=h1b28b03_0 + - 
packaging=25.0=py311h06a4308_1 + - pip=26.0.1=pyhc872135_0 + - pthread-stubs=0.3=h0ce48e5_1 + - python=3.11.15=h741d88c_0 + - readline=8.3=hc2a1206_0 + - setuptools=80.10.2=py311h06a4308_0 + - sqlite=3.51.2=h3e8d24a_0 + - tk=8.6.15=h54e0aa7_0 + - tzdata=2026a=he532380_0 + - wheel=0.46.3=py311h06a4308_0 + - xorg-libx11=1.8.12=h9b100fa_1 + - xorg-libxau=1.0.12=h9b100fa_0 + - xorg-libxdmcp=1.1.5=h9b100fa_0 + - xorg-xorgproto=2024.1=h5eee18b_1 + - xz=5.8.2=h448239c_0 + - zlib=1.3.1=hb25bd0a_0 + - pip: + - absl-py==2.4.0 + - argparse==1.4.0 + - beautifulsoup4==4.14.3 + - certifi==2026.2.25 + - chardet==5.2.0 + - charset-normalizer==3.4.6 + - contourpy==1.3.3 + - cycler==0.12.1 + - deeploy-pulp==0.2.1 + - flatbuffers==25.12.19 + - fonttools==4.62.1 + - idna==3.11 + - imagesize==2.0.0 + - iniconfig==2.3.0 + - jinja2==3.1.6 + - kiwisolver==1.5.0 + - lz4==4.4.5 + - markdown-it-py==4.0.0 + - markupsafe==3.0.3 + - mdurl==0.1.2 + - mpmath==1.3.0 + - narwhals==2.18.1 + - pillow==12.1.1 + - plotly==6.6.0 + - pluggy==1.6.0 + - psutil==7.2.2 + - ptyprocess==0.7.0 + - pyparsing==3.3.2 + - pytest==9.0.2 + - python-dateutil==2.9.0.post0 + - pytz==2026.1.post1 + - six==1.17.0 + - snowballstemmer==3.0.1 + - soupsieve==2.8.3 + - sphinxcontrib-jsmath==1.0.1 + - sympy==1.14.0 + - tabulate==0.10.0 + - toml==0.10.2 + - typing-extensions==4.15.0 + - urllib3==2.6.3 + - wcwidth==0.6.0 +prefix: /scratch/mmm/.conda/envs/deeploy_conda_venv2 diff --git a/toolchain/gvsoc.patch b/toolchain/gvsoc.patch new file mode 100644 index 0000000000..22e65922a9 --- /dev/null +++ b/toolchain/gvsoc.patch @@ -0,0 +1,12 @@ +diff --git a/engine/src/launcher.cpp b/engine/src/launcher.cpp +index f0b1b654..48c83592 100644 +--- a/engine/src/launcher.cpp ++++ b/engine/src/launcher.cpp +@@ -21,6 +21,7 @@ + + #include + #include ++#include + #include + + #include