From f6efa833613234d297a96ad038a220ac7e5818be Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 18 Apr 2024 15:23:29 +0100
Subject: [PATCH 01/16] [Deconv] Initial draft of deconv node

---
 fetch-repos.sh                                |   3 +-
 src/finn/custom_op/fpgadataflow/__init__.py   |   4 +-
 .../custom_op/fpgadataflow/deconvolution.py   | 167 +++++++++
 .../custom_op/fpgadataflow/hls/__init__.py    |   2 +
 .../fpgadataflow/hls/deconvolution_hls.py     | 348 ++++++++++++++++++
 src/finn/util/data_packing.py                 |  51 +++
 .../fpgadataflow/test_fpgadataflow_deconv.py  |  65 ++++
 7 files changed, 638 insertions(+), 2 deletions(-)
 create mode 100644 src/finn/custom_op/fpgadataflow/deconvolution.py
 create mode 100644 src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py

diff --git a/fetch-repos.sh b/fetch-repos.sh
index 073c052d67..45e0effb94 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 # Copyright (c) 2020-2022, Xilinx, Inc.
+# Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -32,7 +33,7 @@ FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2"
 BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
+HLSLIB_COMMIT="e80d94ca1f28b80476b0289d154d239976c25fef"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..a22831ca37 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -1,5 +1,5 @@
 # Copyright (C) 2020-2022, Xilinx, Inc.
-# Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+# Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,7 @@
 from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
     ConvolutionInputGenerator,
 )
+from finn.custom_op.fpgadataflow.deconvolution import Deconvolution
 from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams
 from finn.custom_op.fpgadataflow.fmpadding import FMPadding
@@ -68,6 +69,7 @@
 custom_op["AddStreams"] = AddStreams
 custom_op["ChannelwiseOp"] = ChannelwiseOp
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
+custom_op["Deconvolution"] = Deconvolution
 custom_op["DownSampler"] = DownSampler
 custom_op["DuplicateStreams"] = DuplicateStreams
 custom_op["FMPadding"] = FMPadding
diff --git a/src/finn/custom_op/fpgadataflow/deconvolution.py b/src/finn/custom_op/fpgadataflow/deconvolution.py
new file mode 100644
index 0000000000..0e14bea641
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/deconvolution.py
@@ -0,0 +1,167 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class Deconvolution(HWCustomOp):
+    """Abstraction layer for HW implementation of Deconvolution"""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "KernelDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "IFMChannels": ("i", True, 0),
+            "OFMChannels": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "PE": ("i", True, 0),
+            "SIMD": ("i", True, 0),
+            "Stride": ("ints", True, [1, 1]),  # [H, W] = [Y, X]
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        fold = int(ifm_ch / simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, fold, simd)
+        return folded_ishape
+
+    def get_normal_output_shape(self, ind=0):
+        idim_h, idim_w = self.get_nodeattr("IFMDim")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        ofm_ch = self.get_nodeattr("OFMChannels")
+        pad_h = int(k_h / stride_h) - 1
+        pad_w = int(k_w / stride_w) - 1
+        odim_h = (idim_h - 1) * stride_h - 2 * pad_h + (k_h - 1) + 1
+        odim_w = (idim_w - 1) * stride_w - 2 * pad_w + (k_w - 1) + 1
+        oshape = (1, odim_h, odim_w, ofm_ch)
+        return oshape
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = self.get_normal_output_shape()
+        odim_h = normal_oshape[1]
+        odim_w = normal_oshape[2]
+        ofm_ch = normal_oshape[3]
+        pe = self.get_nodeattr("PE")
+        fold = int(ofm_ch / pe)
+        folded_oshape = (1, odim_h, odim_w, fold, pe)
+        return folded_oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for Deconv."
+        # implement tensor with correct shape
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_weight_datatype(self):
+        """Returns FINN DataType of weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        """Returns the input stream width in bits, i.e. SIMD elements of the
+        input datatype per stream word."""
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        in_width = simd * ibits
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        o_bits = self.get_output_datatype().bitwidth()
+        out_width = o_bits * self.get_nodeattr("PE")
+        return out_width
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        num_output_elems = np.prod(folded_oshape[:-1])
+        return num_output_elems
+
+    def get_exp_cycles(self):
+        return 0
+
+    def bram_estimation(self):
+        return 0
+
+    def lut_estimation(self):
+        return 0
+
+    def uram_estimation(self):
+        return 0
+
+    def execute_node(self, context, graph):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 405c47a08d..c91541cab7 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -33,6 +33,7 @@
 from finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls import (
     ConvolutionInputGenerator_hls,
 )
+from finn.custom_op.fpgadataflow.hls.deconvolution_hls import Deconvolution_hls
 from finn.custom_op.fpgadataflow.hls.downsampler_hls import DownSampler_hls
 from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls
 from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls
@@ -61,6 +62,7 @@
 custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls
 custom_op["CheckSum_hls"] = CheckSum_hls
 custom_op["ConvolutionInputGenerator_hls"] = ConvolutionInputGenerator_hls
+custom_op["Deconvolution_hls"] = Deconvolution_hls
 custom_op["DownSampler_hls"] = DownSampler_hls
 custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls
 custom_op["FMPadding_hls"] = FMPadding_hls
diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
new file mode 100644
index 0000000000..0448f04b07
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -0,0 +1,348 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions
+
+from finn.custom_op.fpgadataflow.deconvolution import Deconvolution
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code1,
+    rtlsim_output_to_npy,
+)
+
+
+class Deconvolution_hls(Deconvolution, HLSBackend):
+    """Corresponds to finn-hlslib deconv function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(Deconvolution.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ofm_ch = self.get_nodeattr("OFMChannels")
+        kernel_2 = np.prod(self.get_nodeattr("KernelDim"))
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
+        assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
+        wmem = (ofm_ch / pe) * kernel_2 * (ifm_ch / simd)
+        return int(wmem)
+
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        # save hlslib-compatible weights in params.h
+        weight_filename = "{}/params.h".format(code_gen_dir)
+        self.make_weight_file(weights, "hls_header", weight_filename)
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : only "hls_header" is handled by this layer; for
+          any other value no file is written
+        * weight_file_name : filename for the weight file to be generated
+
+        """
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hw_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        if weight_file_mode == "hls_header":
+            weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", True, True)
+            # write weights into C++ header file as dictated by finn-hlslib
+            f_weights = open(weight_file_name, "w")
+            f_weights.write(
+                "static hls::vector<hls::vector<{}, {}>, {}> const weights[{}] = ".format(
+                    export_wdt.get_hls_datatype_str(),
+                    self.get_nodeattr("SIMD"),
+                    self.get_nodeattr("PE"),
+                    self.calc_wmem(),
+                )
+            )
+            f_weights.write(weight_hls_code)
+            f_weights.close()
+
+    def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure OCH % PE == 0 and ICH % SIMD == 0
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD) and return
+        """
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ofm_ch = self.get_nodeattr("OFMChannels")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            k_h * k_w * ifm_ch,
+            ofm_ch,
+        ), """Weights matrix doesn't
+        have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
+        assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
+        assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
+    def global_includes(self):
+        # self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+        # self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+
+        self.code_gen_dict["$GLOBALS$"] = ['#include "deconv.hpp"']
+
+    def defines(self, var):
+        ifm_dim = self.get_nodeattr("IFMDim")
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define Kernel {}\n #define Stride {}\n
+            #define IFMH {}\n #define IFMW {}\n #define ICH {}\n
+            #define OCH {}\n #define SIMD1 {}\n #define PE1 {}""".format(
+                self.get_nodeattr("KernelDim")[0],
+                self.get_nodeattr("Stride")[0],
+                ifm_dim[0],
+                ifm_dim[1],
+                self.get_nodeattr("IFMChannels"),
+                self.get_nodeattr("OFMChannels"),
+                self.get_nodeattr("SIMD"),
+                self.get_nodeattr("PE"),
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_hls_type = dtype.get_hls_datatype_str()
+        simd = self.get_nodeattr("SIMD")
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # NOTE(review): last argument is false, so the innermost dim is presumably not reversed
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintvectorstream<%s, %s, %d>("%s", in0_%s, false);'
+            % (
+                elem_hls_type,
+                npy_type,
+                simd,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+    def strm_decl(self):
+        idtype = self.get_input_datatype()
+        odtype = self.get_output_datatype()
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<hls::vector<{},{}>> in0_{} ("in0_{}");'.format(
+                idtype.get_hls_datatype_str(), simd, self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<hls::vector<{},{}>> out_{} ("out_{}");'.format(
+                odtype.get_hls_datatype_str(), pe, self.hls_sname(), self.hls_sname()
+            )
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """deconv<Kernel, Stride, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
+            (weights, in0_{}, out_{});""".format(
+                self.hls_sname(),
+                self.hls_sname(),
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        pe = self.get_nodeattr("PE")
+        dtype = self.get_output_datatype()
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintvectorstream2npy<%s, %s, %d>(out_%s, %s, "%s", false);'
+            % (
+                elem_hls_type,
+                npy_type,
+                pe,
+                self.hls_sname(),
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0_{},
+                hls::stream<ap_uint<{}>> &out_{}
+                )""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.hls_sname(),
+                self.get_outstream_width(),
+                self.hls_sname(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+        self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+        # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
+        # partition for parallel access along the PE dimension (dim 1)
+        # self.code_gen_dict["$PRAGMAS$"].append(
+        #    ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+        # )
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create a npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # the second input are the weights
+            # the third input are the thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType["BINARY"]
+                else:
+                    export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_normal_output_shape()
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
+            self.reset_rtlsim(sim)
+            self.toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 7698850029..f23f409f39 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -281,6 +281,57 @@ def elem2str(x):
     return ret
 
 
+def numpy_to_hls_code1(ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False):
+    """Return C++ code representation of a numpy ndarray with FINN DataType
+    dtype, using hls_var_name as the resulting C++ variable name. If
+    pack_innermost_dim is specified, the innermost dimension of the ndarray
+    will be packed into a hex string using array2hexstring. If no_decl is
+    set to True, no variable name and type will be generated as part of the
+    emitted string.
+    """
+    hls_dtype = dtype.get_hls_datatype_str()
+    if type(ndarray) != np.ndarray or ndarray.dtype != np.float32:
+        # try to convert to a float numpy array (container dtype is float)
+        ndarray = np.asarray(ndarray, dtype=np.float32)
+    if pack_innermost_dim:
+        idimlen = ndarray.shape[-1]
+        idimbits = idimlen * dtype.bitwidth()
+        idimbits = roundup_to_integer_multiple(idimbits, 4)
+        ndarray = pack_innermost_dim_as_hex_string(ndarray, dtype, idimbits)
+        hls_dtype = "ap_uint<%d>" % idimbits
+    ndims = ndarray.ndim
+    # add type string and variable name
+    # e.g. "const ap_uint<64>" "weightMem0"
+    ret = "%s %s" % (hls_dtype, hls_var_name)
+    # add dimensions
+    for d in range(ndims):
+        ret += "[%d]" % ndarray.shape[d]
+    orig_printops = np.get_printoptions()
+    np.set_printoptions(threshold=sys.maxsize)
+
+    # define a function to convert a single element into a C++ init string
+    # a single element can be a hex string if we are using packing
+    def elem2str(x):
+        if type(x) == str or type(x) == np.str_:
+            return '{{"%s",},}' % (x)
+        elif type(x) == np.float32:
+            if dtype.is_integer():
+                return str(int(x))
+            else:
+                return str(x)
+        else:
+            raise Exception("Unsupported type for numpy_to_hls_code")
+
+    strarr = np.array2string(ndarray, separator=", ", formatter={"all": elem2str})
+    np.set_printoptions(**orig_printops)
+    strarr = strarr.replace("[", "{").replace("]", "}")
+    if no_decl:
+        ret = strarr + ";"
+    else:
+        ret = ret + " = \n" + strarr + ";"
+    return ret
+
+
 def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=True):
     """Convert the multidimensional NumPy array of integers (stored as floats)
     from input_file into a flattened sequence of Python arbitrary-precision
diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index f1fc989066..981430db81 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -123,6 +123,71 @@ def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
     return model
 
 
+def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
+    idim_h, idim_w = idim
+    stride_h, stride_w = stride
+    odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
+    odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
+
+    inp = helper.make_tensor_value_info(
+        "inp",
+        TensorProto.FLOAT,
+        [
+            1,
+            idim_h,
+            idim_w,
+            ifm_ch,
+        ],
+    )
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, ofm_ch])
+
+    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch])
+
+    Deconv = helper.make_node(
+        "Deconvolution_hls",
+        ["inp", "W"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow.hls",
+        backend="fpgadataflow",
+        KernelDim=[k, k],
+        IFMChannels=ifm_ch,
+        OFMChannels=ofm_ch,
+        IFMDim=idim,
+        Stride=[stride_h, stride_w],
+        PE=1,
+        SIMD=1,
+        inputDataType=idt.name,
+        weightDataType=wdt.name,
+        outputDataType=odt.name,
+    )
+
+    node_list = [Deconv]
+    value_info = [W]
+
+    graph = helper.make_graph(
+        nodes=node_list,
+        name="convtranspose_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=value_info,
+    )
+
+    model = qonnx_make_model(graph, producer_name="convtranspose-model")
+    model = ModelWrapper(model)
+
+    # initialize model
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype(model.graph.output[0].name, odt)
+    model.set_tensor_datatype("W", wdt)
+
+    w_tensor = gen_finn_dt_tensor(wdt, [ifm_ch * k * k, ofm_ch])
+    model.set_initializer("W", w_tensor)
+
+    model = model.transform(InferShapes())
+
+    return model
+
+
 # input image dimension
 @pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
 # number of rows and number of cols to add

From db9130b041feddf349cc976211a04a27a0dd9f76 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Thu, 25 Apr 2024 15:29:30 +0100
Subject: [PATCH 02/16] [Deconv] Add padding and cleanup hls code generation

---
 fetch-repos.sh                                |  2 +-
 .../custom_op/fpgadataflow/deconvolution.py   |  4 +-
 .../fpgadataflow/hls/deconvolution_hls.py     | 54 ++++++++++---------
 src/finn/custom_op/fpgadataflow/templates.py  |  1 +
 src/finn/util/data_packing.py                 |  4 +-
 5 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index 45e0effb94..0c4a4a9349 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -33,7 +33,7 @@ FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2"
 BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="e80d94ca1f28b80476b0289d154d239976c25fef"
+HLSLIB_COMMIT="d56b1d0c1eeb844a873fb29a29240a86e00d9f80"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
diff --git a/src/finn/custom_op/fpgadataflow/deconvolution.py b/src/finn/custom_op/fpgadataflow/deconvolution.py
index 0e14bea641..8dd33fea83 100644
--- a/src/finn/custom_op/fpgadataflow/deconvolution.py
+++ b/src/finn/custom_op/fpgadataflow/deconvolution.py
@@ -48,6 +48,7 @@ def get_nodeattr_types(self):
             "PE": ("i", True, 0),
             "SIMD": ("i", True, 0),
             "Stride": ("ints", True, [1, 1]),  # [H, W] = [Y, X]
+            "Padding": ("ints", True, []),  # [H, W] = [Y, X]
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
@@ -76,8 +77,7 @@ def get_normal_output_shape(self, ind=0):
         stride_h, stride_w = self.get_nodeattr("Stride")
         k_h, k_w = self.get_nodeattr("KernelDim")
         ofm_ch = self.get_nodeattr("OFMChannels")
-        pad_h = int(k_h / stride_h) - 1
-        pad_w = int(k_w / stride_w) - 1
+        pad_h, pad_w = self.get_nodeattr("Padding")
         odim_h = (idim_h - 1) * stride_h - 2 * pad_h + (k_h - 1) + 1
         odim_w = (idim_w - 1) * stride_w - 2 * pad_w + (k_w - 1) + 1
         oshape = (1, odim_h, odim_w, ofm_ch)
diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index 0448f04b07..8931d818d4 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -93,11 +93,11 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
             # write weights into C++ header file as dictated by finn-hlslib
             f_weights = open(weight_file_name, "w")
             f_weights.write(
-                "static hls::vector<hls::vector<{}, {}>, {}> const weights[{}] = ".format(
+                "static {} const weights[{}][{}][{}] = ".format(
                     export_wdt.get_hls_datatype_str(),
-                    self.get_nodeattr("SIMD"),
-                    self.get_nodeattr("PE"),
                     self.calc_wmem(),
+                    self.get_nodeattr("PE"),
+                    self.get_nodeattr("SIMD"),
                 )
             )
             f_weights.write(weight_hls_code)
@@ -117,19 +117,15 @@ def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
         simd = self.get_nodeattr("SIMD")
         wmem = self.calc_wmem()
         assert orig_weight_matrix.shape == (
-            k_h * k_w * ifm_ch,
             ofm_ch,
+            k_h * k_w * ifm_ch,
         ), """Weights matrix doesn't
-        have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
+        #have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
         assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
         assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
-        # start by transposing the original weight matrix, since ONNX and
-        # finn-hlslib use different assumptions
-        # ONNX uses (in_features, out_features) and matmul(x, W)
-        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
-        ret = orig_weight_matrix.T
         # interleave rows between PEs and reshape
         # distribute rows between PEs
+        ret = orig_weight_matrix
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         # create SIMD as innermost dimension and add a dummy outer dim
         ret = ret.reshape(1, pe, wmem, simd)
@@ -138,19 +134,19 @@ def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
         return ret
 
     def global_includes(self):
-        # self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
-        # self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
-
         self.code_gen_dict["$GLOBALS$"] = ['#include "deconv.hpp"']
 
     def defines(self, var):
         ifm_dim = self.get_nodeattr("IFMDim")
         self.code_gen_dict["$DEFINES$"] = [
-            """#define Kernel {}\n #define Stride {}\n
-            #define IFMH {}\n #define IFMW {}\n #define ICH {}\n
-            #define OCH {}\n #define SIMD1 {}\n #define PE1 {}""".format(
+            """constexpr unsigned Kernel = {};\n constexpr unsigned Stride = {};\n
+            constexpr unsigned Padding = {};\n constexpr unsigned IFMH = {};\n
+            constexpr unsigned IFMW = {};\n constexpr unsigned ICH = {};\n
+            constexpr unsigned OCH = {};\n constexpr unsigned SIMD1 = {};\n
+            constexpr unsigned PE1 = {};""".format(
                 self.get_nodeattr("KernelDim")[0],
                 self.get_nodeattr("Stride")[0],
+                self.get_nodeattr("Padding")[0],
                 ifm_dim[0],
                 ifm_dim[1],
                 self.get_nodeattr("IFMChannels"),
@@ -170,7 +166,7 @@ def read_npy_data(self):
         self.code_gen_dict["$READNPYDATA$"] = []
         # note: the innermost dim is reversed for the input
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintvectorstream<%s, %s, %d>("%s", in0_%s, false);'
+            'npy2vectorstream<%s, %s, %d>("%s", in0_%s, false);'
             % (
                 elem_hls_type,
                 npy_type,
@@ -198,13 +194,27 @@ def strm_decl(self):
         )
 
     def docompute(self):
+        odtype = self.get_output_datatype()
+        pe = self.get_nodeattr("PE")
+        ishape = self.get_normal_input_shape()
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """deconv<Kernel, Stride, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
+            "hls::stream<hls::vector<{},{}>> strm;".format(odtype.get_hls_datatype_str(), pe)
+        ]
+        self.code_gen_dict["$DOCOMPUTE$"].append("unsigned  timeout = 0;")
+        self.code_gen_dict["$DOCOMPUTE$"].append("while(timeout < %s) {" % np.prod(ishape))
+        self.code_gen_dict["$DOCOMPUTE$"].append(
+            """deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
             (weights, in0_{}, out_{});""".format(
                 self.hls_sname(),
                 self.hls_sname(),
             )
-        ]
+        )
+        self.code_gen_dict["$DOCOMPUTE$"].append("if(out_V.empty())  timeout++;")
+        self.code_gen_dict["$DOCOMPUTE$"].append("else {")
+        self.code_gen_dict["$DOCOMPUTE$"].append("strm << out_V.read();")
+        self.code_gen_dict["$DOCOMPUTE$"].append("timeout = 0;")
+        self.code_gen_dict["$DOCOMPUTE$"].append("}")
+        self.code_gen_dict["$DOCOMPUTE$"].append("}")
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -218,20 +228,16 @@ def dataoutstrm(self):
 
         # note: the innermost dim is not reversed for the output
         self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintvectorstream2npy<%s, %s, %d>(out_%s, %s, "%s", false);'
+            'vectorstream2npy<%s, %s, %d>(strm, %s, "%s", false);'
             % (
                 elem_hls_type,
                 npy_type,
                 pe,
-                self.hls_sname(),
                 shape_cpp_str,
                 npy_out,
             )
         ]
 
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """void {}(hls::stream<ap_uint<{}>> &in0_{},
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 3d89a0ab23..72c607731a 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -32,6 +32,7 @@
 #define AP_INT_MAX_W $AP_INT_MAX_W$
 #include "cnpy.h"
 #include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
 #include <vector>
 #include "bnn-library.h"
 
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index f23f409f39..6133316fc1 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -322,9 +322,9 @@ def elem2str(x):
         else:
             raise Exception("Unsupported type for numpy_to_hls_code")
 
-    strarr = np.array2string(ndarray, separator=", ", formatter={"all": elem2str})
+    strarr = np.array2string(ndarray.flatten(), separator=", ", formatter={"all": elem2str})
     np.set_printoptions(**orig_printops)
-    strarr = strarr.replace("[", "{").replace("]", "}")
+    strarr = strarr.replace("[", "{").replace("]", ",}")
     if no_decl:
         ret = strarr + ";"
     else:

From 629a65e79ebcd470ef51bf3696a297854a949661 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Tue, 15 Oct 2024 10:01:55 +0200
Subject: [PATCH 03/16] update finn-hlslib commit hash

---
 fetch-repos.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index 5c54e8ecf7..66e1add27f 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -33,7 +33,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
 BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="d56b1d0c1eeb844a873fb29a29240a86e00d9f80"
+HLSLIB_COMMIT="be554a3c4b47e6c3082f6158c057098d926f0d58"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"

From b8501c9e460ae4796723ec5c19c17f79796a3342 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Tue, 15 Oct 2024 10:03:48 +0200
Subject: [PATCH 04/16] deconv test now passing multiple channels

---
 .../fpgadataflow/hls/deconvolution_hls.py     | 14 ++--
 .../fpgadataflow/test_fpgadataflow_deconv.py  | 77 ++++++++++---------
 2 files changed, 48 insertions(+), 43 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index 8931d818d4..7442dca99d 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -116,17 +116,19 @@ def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         wmem = self.calc_wmem()
-        assert orig_weight_matrix.shape == (
-            ofm_ch,
-            k_h * k_w * ifm_ch,
-        ), """Weights matrix doesn't
-        #have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
+        # assert orig_weight_matrix.shape == (
+        #     k_h * k_w * ifm_ch,
+        #     ofm_ch,
+        # ), """Weights matrix doesn't
+        # have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
         assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
         assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
         # interleave rows between PEs and reshape
         # distribute rows between PEs
         ret = orig_weight_matrix
-        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        ret = ret.flatten()
+        # breakpoint()
+        # ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         # create SIMD as innermost dimension and add a dummy outer dim
         ret = ret.reshape(1, pe, wmem, simd)
         # reverse the SIMD dimension
diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index a09e788570..b642abc9a8 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -64,12 +64,11 @@
 target_clk_ns = 10
 
 
-def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
+def set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
     idim_h, idim_w = idim
     stride_h, stride_w = stride
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
     odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
-    odt = DataType["INT32"]
 
     inp = helper.make_tensor_value_info(
         "inp",
@@ -120,10 +119,10 @@ def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
 
     model = model.transform(InferShapes())
 
-    return model
+    return model, w_tensor
 
 
-def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
+def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor):
     idim_h, idim_w = idim
     stride_h, stride_w = stride
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
@@ -141,7 +140,8 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
     )
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, ofm_ch])
 
-    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch])
+    # W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch])
+    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ofm_ch, k, k, ifm_ch])
 
     Deconv = helper.make_node(
         "Deconvolution_hls",
@@ -154,6 +154,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
         OFMChannels=ofm_ch,
         IFMDim=idim,
         Stride=[stride_h, stride_w],
+        Padding = [padding, padding],
         PE=1,
         SIMD=1,
         inputDataType=idt.name,
@@ -180,7 +181,9 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
     model.set_tensor_datatype(model.graph.output[0].name, odt)
     model.set_tensor_datatype("W", wdt)
 
-    w_tensor = gen_finn_dt_tensor(wdt, [ifm_ch * k * k, ofm_ch])
+    # w_tensor = gen_finn_dt_tensor(wdt, [ifm_ch * k * k, ofm_ch])
+    # w_tensor = w_tensor.reshape(ifm_ch * k * k, ofm_ch)
+    w_tensor = w_tensor.transpose(1, 2, 3, 0)
     model.set_initializer("W", w_tensor)
 
     model = model.transform(InferShapes())
@@ -189,33 +192,35 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
 
 
 # input image dimension
-@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
+@pytest.mark.parametrize("idim", [[8, 8]])
 # number of rows and number of cols to add
-@pytest.mark.parametrize("stride", [[2, 2], [2, 3]])
+@pytest.mark.parametrize("stride", [[2, 2]])
 # number of channels
-@pytest.mark.parametrize("ifm_ch", [2])
+@pytest.mark.parametrize("ifm_ch", [4])
 # number of channels
-@pytest.mark.parametrize("ofm_ch", [4])
+@pytest.mark.parametrize("ofm_ch", [6])
 # Input parallelism
-@pytest.mark.parametrize("simd", [1, 2])
+@pytest.mark.parametrize("simd", [1])
 # PE
-@pytest.mark.parametrize("pe", [1, 2])
+@pytest.mark.parametrize("pe", [1])
 # kernel size
 @pytest.mark.parametrize("k", [2])
 # padding
-@pytest.mark.parametrize("padding", [0, 1])
+@pytest.mark.parametrize("padding", [0])
 # exec mode
-@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.parametrize("exec_mode", ["cppsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode):
-    idt = wdt = DataType["INT4"]
+    idt = wdt = DataType["INT8"]
     wdt = idt
+    odt = DataType["INT32"]
     idim_h, idim_w = idim
     stride_h, stride_w = stride
 
-    ref_model = set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding)
+    ref_model, w_tensor = set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding)
+    model = create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor)
 
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
     odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
@@ -225,34 +230,31 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,
 
     y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"]
 
-    model = ref_model.transform(InferPixelPaddingDeconv())
-    model = model.transform(InferConvInpGen())
-    model = model.transform(InferQuantizedMatrixVectorActivation())
-    model = model.transform(InferShapes())
-    model = model.transform(GiveUniqueNodeNames())
-
-    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-    assert (y_produced == y_expected).all()
+    # model = model.transform(InferShapes())
+    # model = model.transform(GiveUniqueNodeNames())
+    input_tensor_nhwc = input_tensor.transpose(0, 2, 3, 1)
+    input_dict_nhwc = {"inp": input_tensor_nhwc}
+    # y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
+    # assert (y_produced == y_expected).all()
 
-    model = model.transform(SpecializeLayers(test_fpga_part))
-    model = model.transform(MinimizeAccumulatorWidth())
+    # model = model.transform(SpecializeLayers(test_fpga_part))
+    # model = model.transform(MinimizeAccumulatorWidth())
 
     for n in model.graph.node:
-        if n.op_type.startswith("ConvolutionInputGenerator"):
-            convinputgen_node = getCustomOp(n)
-            convinputgen_node.set_nodeattr("SIMD", simd)
-        elif n.op_type.startswith("MVAU"):
-            mvau_node = getCustomOp(n)
-            mvau_node.set_nodeattr("PE", pe)
-            mvau_node.set_nodeattr("SIMD", simd)
-
-    expected_oshape = (1, ofm_ch, odim_h, odim_w)
+        if n.op_type.startswith("Deconvolution_hls"):
+            deconv_node = getCustomOp(n)
+            deconv_node.set_nodeattr("PE", pe)
+            deconv_node.set_nodeattr("SIMD", simd)
 
+    expected_oshape = (1, odim_h, odim_w, ofm_ch)
+    # model.save("deconv.onnx")
     # cppsim
     if exec_mode == "cppsim":
+        model = model.transform(GiveUniqueNodeNames())
         model = model.transform(PrepareCppSim())
         model = model.transform(CompileCppSim())
         model = model.transform(SetExecMode("cppsim"))
+        # breakpoint()
 
     # rtlsim
     else:
@@ -262,12 +264,13 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,
         model = model.transform(PrepareRTLSim())
         model = model.transform(SetExecMode("rtlsim"))
 
-    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
     assert y_produced.shape == expected_oshape
+    y_produced = y_produced.transpose(0, 3, 1, 2)
     assert (y_produced == y_expected).all()
 
     if exec_mode == "rtlsim":
-        node = model.get_nodes_by_op_type("FMPadding_Pixel_hls")[0]
+        node = model.get_nodes_by_op_type("Deconvolution_hls")[0]
         inst = getCustomOp(node)
         cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
         exp_cycles_dict = model.analysis(exp_cycles_per_layer)

From c5aa42766eb7b8391c51994c7b1fd43873847139 Mon Sep 17 00:00:00 2001
From: hlebleve <hugo.leblevec@gmail.com>
Date: Wed, 6 Nov 2024 10:51:19 +0000
Subject: [PATCH 05/16] [Deconv] Updating tests and custom HLS node, passing
 most cppsim tests

---
 .../fpgadataflow/hls/deconvolution_hls.py     | 24 +++++++++----------
 .../fpgadataflow/test_fpgadataflow_deconv.py  | 23 ++++--------------
 2 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index 7442dca99d..6f0f8cbcff 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -89,7 +89,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         weight_tensor = self.get_hw_compatible_weight_tensor(weights)
         export_wdt = self.get_weight_datatype()
         if weight_file_mode == "hls_header":
-            weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", True, True)
+            weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", False, True)
             # write weights into C++ header file as dictated by finn-hlslib
             f_weights = open(weight_file_name, "w")
             f_weights.write(
@@ -116,23 +116,20 @@ def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
         pe = self.get_nodeattr("PE")
         simd = self.get_nodeattr("SIMD")
         wmem = self.calc_wmem()
-        # assert orig_weight_matrix.shape == (
-        #     k_h * k_w * ifm_ch,
-        #     ofm_ch,
-        # ), """Weights matrix doesn't
-        # have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
+        assert orig_weight_matrix.shape == (
+            ofm_ch, k_h, k_w, ifm_ch
+        ), """Weights matrix doesn't
+        have expected shape (ofm_ch, k_h, k_w, ifm_ch)"""
         assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
         assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
         # interleave rows between PEs and reshape
         # distribute rows between PEs
         ret = orig_weight_matrix
-        ret = ret.flatten()
-        # breakpoint()
-        # ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(ofm_ch, k_h * k_w * ifm_ch)
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension
         ret = ret.reshape(1, pe, wmem, simd)
-        # reverse the SIMD dimension
-        ret = np.flip(ret, axis=-1)
+        ret = ret.transpose(0, 2, 1, 3)
         return ret
 
     def global_includes(self):
@@ -199,11 +196,12 @@ def docompute(self):
         odtype = self.get_output_datatype()
         pe = self.get_nodeattr("PE")
         ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
         self.code_gen_dict["$DOCOMPUTE$"] = [
             "hls::stream<hls::vector<{},{}>> strm;".format(odtype.get_hls_datatype_str(), pe)
         ]
         self.code_gen_dict["$DOCOMPUTE$"].append("unsigned  timeout = 0;")
-        self.code_gen_dict["$DOCOMPUTE$"].append("while(timeout < %s) {" % np.prod(ishape))
+        self.code_gen_dict["$DOCOMPUTE$"].append("while(timeout < %s) {" % (2 * np.prod(oshape)))
         self.code_gen_dict["$DOCOMPUTE$"].append(
             """deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
             (weights, in0_{}, out_{});""".format(
diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index b642abc9a8..74224abeb5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -41,22 +41,11 @@
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.convert_to_hw_layers import (
-    InferConvInpGen,
-    InferQuantizedMatrixVectorActivation,
-)
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import (
-    InferPixelPaddingDeconv,
-)
-from finn.transformation.fpgadataflow.minimize_accumulator_width import (
-    MinimizeAccumulatorWidth,
-)
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.util.basic import pynq_part_map
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -139,8 +128,6 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
         ],
     )
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, ofm_ch])
-
-    # W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch])
     W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ofm_ch, k, k, ifm_ch])
 
     Deconv = helper.make_node(
@@ -181,8 +168,6 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
     model.set_tensor_datatype(model.graph.output[0].name, odt)
     model.set_tensor_datatype("W", wdt)
 
-    # w_tensor = gen_finn_dt_tensor(wdt, [ifm_ch * k * k, ofm_ch])
-    # w_tensor = w_tensor.reshape(ifm_ch * k * k, ofm_ch)
     w_tensor = w_tensor.transpose(1, 2, 3, 0)
     model.set_initializer("W", w_tensor)
 
@@ -200,13 +185,13 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
 # number of channels
 @pytest.mark.parametrize("ofm_ch", [6])
 # Input parallelism
-@pytest.mark.parametrize("simd", [1])
+@pytest.mark.parametrize("simd", [1,2,4])
 # PE
-@pytest.mark.parametrize("pe", [1])
+@pytest.mark.parametrize("pe", [1,3,6])
 # kernel size
-@pytest.mark.parametrize("k", [2])
+@pytest.mark.parametrize("k", [2,4])
 # padding
-@pytest.mark.parametrize("padding", [0])
+@pytest.mark.parametrize("padding", [0,1,2])
 # exec mode
 @pytest.mark.parametrize("exec_mode", ["cppsim"])
 @pytest.mark.fpgadataflow

From f7cbd48b6c5e130d40c089ee55c64df75aadad15 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Wed, 6 Nov 2024 12:07:15 +0100
Subject: [PATCH 06/16] [Deconv] Updating tests and custom HLS node, passing
 most cppsim tests

---
 .../fpgadataflow/hls/deconvolution_hls.py          |  5 ++++-
 tests/fpgadataflow/test_fpgadataflow_deconv.py     | 14 ++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index 6f0f8cbcff..a3dca69ca9 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -117,7 +117,10 @@ def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
         simd = self.get_nodeattr("SIMD")
         wmem = self.calc_wmem()
         assert orig_weight_matrix.shape == (
-            ofm_ch, k_h, k_w, ifm_ch
+            ofm_ch,
+            k_h,
+            k_w,
+            ifm_ch,
         ), """Weights matrix doesn't
         have expected shape (ofm_ch, k_h, k_w, ifm_ch)"""
         assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index 74224abeb5..7a5da68240 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -141,7 +141,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
         OFMChannels=ofm_ch,
         IFMDim=idim,
         Stride=[stride_h, stride_w],
-        Padding = [padding, padding],
+        Padding=[padding, padding],
         PE=1,
         SIMD=1,
         inputDataType=idt.name,
@@ -185,13 +185,13 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
 # number of channels
 @pytest.mark.parametrize("ofm_ch", [6])
 # Input parallelism
-@pytest.mark.parametrize("simd", [1,2,4])
+@pytest.mark.parametrize("simd", [1, 2, 4])
 # PE
-@pytest.mark.parametrize("pe", [1,3,6])
+@pytest.mark.parametrize("pe", [1, 3, 6])
 # kernel size
-@pytest.mark.parametrize("k", [2,4])
+@pytest.mark.parametrize("k", [2, 4])
 # padding
-@pytest.mark.parametrize("padding", [0,1,2])
+@pytest.mark.parametrize("padding", [0, 1, 2])
 # exec mode
 @pytest.mark.parametrize("exec_mode", ["cppsim"])
 @pytest.mark.fpgadataflow
@@ -204,7 +204,9 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,
     idim_h, idim_w = idim
     stride_h, stride_w = stride
 
-    ref_model, w_tensor = set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding)
+    ref_model, w_tensor = set_up_reference_model(
+        idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding
+    )
     model = create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor)
 
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1

From 891a305ff371ed3d976bcd285f061f8d5e17ade9 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Wed, 6 Nov 2024 12:08:15 +0100
Subject: [PATCH 07/16] [Deconv] Updating tests and custom HLS node, passing
 most cppsim tests

---
 src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index a3dca69ca9..48c0e63453 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -198,7 +198,7 @@ def strm_decl(self):
     def docompute(self):
         odtype = self.get_output_datatype()
         pe = self.get_nodeattr("PE")
-        ishape = self.get_normal_input_shape()
+        # ishape = self.get_normal_input_shape()
         oshape = self.get_normal_output_shape()
         self.code_gen_dict["$DOCOMPUTE$"] = [
             "hls::stream<hls::vector<{},{}>> strm;".format(odtype.get_hls_datatype_str(), pe)

From c157dc0c694a80ef968843fc2381e014f59eaa53 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Thu, 16 Jan 2025 14:03:53 +0100
Subject: [PATCH 08/16] Changing the computation of the timeout value to be
 based on parameters

---
 .../fpgadataflow/hls/deconvolution_hls.py     | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index 48c0e63453..f43fb0dbbd 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -198,13 +198,27 @@ def strm_decl(self):
     def docompute(self):
         odtype = self.get_output_datatype()
         pe = self.get_nodeattr("PE")
-        # ishape = self.get_normal_input_shape()
-        oshape = self.get_normal_output_shape()
+        simd = self.get_nodeattr("SIMD")
+        i_ch = self.get_nodeattr("IFMChannels")
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        s_h, s_w = self.get_nodeattr("Stride")
+        i_h, i_w = self.get_nodeattr("IFMDim")
+        p_h, p_w = self.get_nodeattr("Padding")
+        if p_w >= k_w - s_w:
+            padup = 0
+        else:
+            padup = (k_w - p_w - 1) / s_w
+        crop = s_w * padup - ((k_w - s_w) - p_w)
+        sf = i_ch / simd
+        w_eff = padup + i_w + padup
+        wo_eff = (w_eff - 1) * s_w + k_w
         self.code_gen_dict["$DOCOMPUTE$"] = [
             "hls::stream<hls::vector<{},{}>> strm;".format(odtype.get_hls_datatype_str(), pe)
         ]
         self.code_gen_dict["$DOCOMPUTE$"].append("unsigned  timeout = 0;")
-        self.code_gen_dict["$DOCOMPUTE$"].append("while(timeout < %s) {" % (2 * np.prod(oshape)))
+        self.code_gen_dict["$DOCOMPUTE$"].append(
+            "while(timeout < %s) {" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * sf + 50)
+        )
         self.code_gen_dict["$DOCOMPUTE$"].append(
             """deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
             (weights, in0_{}, out_{});""".format(

From cee4e70a05941d573937f829faa60d148c12b219 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Thu, 16 Jan 2025 14:31:57 +0100
Subject: [PATCH 09/16] Increasing the timeout value as it fails some test
 configurations

---
 src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index f43fb0dbbd..f1e7d7aca8 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -217,7 +217,7 @@ def docompute(self):
         ]
         self.code_gen_dict["$DOCOMPUTE$"].append("unsigned  timeout = 0;")
         self.code_gen_dict["$DOCOMPUTE$"].append(
-            "while(timeout < %s) {" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * sf + 50)
+            "while(timeout < %s) {" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * 2 * sf + 50)
         )
         self.code_gen_dict["$DOCOMPUTE$"].append(
             """deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>

From 248848ef95c9c69a1aa133283fbcde5a71e738a6 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Thu, 16 Jan 2025 15:14:40 +0100
Subject: [PATCH 10/16] Updating HLSLIB commit hash to the most recent

---
 fetch-repos.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch-repos.sh b/fetch-repos.sh
index 9ff546bdbb..57e14bc291 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -33,7 +33,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
 BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="be554a3c4b47e6c3082f6158c057098d926f0d58"
+HLSLIB_COMMIT="16cfc4b3ab895babf30f7db7c4bcac27d68317a9"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"

From 91e811615017cd7b0992d82be815045003530cf8 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Thu, 16 Jan 2025 15:15:17 +0100
Subject: [PATCH 11/16] Setting test parameters to failing case

---
 tests/fpgadataflow/test_fpgadataflow_deconv.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index 7a5da68240..26509bc738 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -181,17 +181,17 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
 # number of rows and number of cols to add
 @pytest.mark.parametrize("stride", [[2, 2]])
 # number of channels
-@pytest.mark.parametrize("ifm_ch", [4])
+@pytest.mark.parametrize("ifm_ch", [2])
 # number of channels
-@pytest.mark.parametrize("ofm_ch", [6])
+@pytest.mark.parametrize("ofm_ch", [3])
 # Input parallelism
-@pytest.mark.parametrize("simd", [1, 2, 4])
+@pytest.mark.parametrize("simd", [1])
 # PE
-@pytest.mark.parametrize("pe", [1, 3, 6])
+@pytest.mark.parametrize("pe", [1])
 # kernel size
-@pytest.mark.parametrize("k", [2, 4])
+@pytest.mark.parametrize("k", [4])
 # padding
-@pytest.mark.parametrize("padding", [0, 1, 2])
+@pytest.mark.parametrize("padding", [1])
 # exec mode
 @pytest.mark.parametrize("exec_mode", ["cppsim"])
 @pytest.mark.fpgadataflow

From 05f9a7066b96f41bac083952c6b8700afc6164bd Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Wed, 7 May 2025 15:45:17 +0200
Subject: [PATCH 12/16] updating templates to match recent changes

---
 .../fpgadataflow/hls/deconvolution_hls.py     | 117 ++++++++++++------
 src/finn/custom_op/fpgadataflow/hlsbackend.py |  14 +++
 src/finn/custom_op/fpgadataflow/templates.py  |  45 +++++++
 3 files changed, 135 insertions(+), 41 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index f1e7d7aca8..f30f91fad4 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -31,6 +31,7 @@
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions
 
+from finn.custom_op.fpgadataflow import templates
 from finn.custom_op.fpgadataflow.deconvolution import Deconvolution
 from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
 from finn.util.data_packing import (
@@ -195,30 +196,14 @@ def strm_decl(self):
             )
         )
 
-    def docompute(self):
-        odtype = self.get_output_datatype()
-        pe = self.get_nodeattr("PE")
-        simd = self.get_nodeattr("SIMD")
-        i_ch = self.get_nodeattr("IFMChannels")
-        k_h, k_w = self.get_nodeattr("KernelDim")
-        s_h, s_w = self.get_nodeattr("Stride")
-        i_h, i_w = self.get_nodeattr("IFMDim")
-        p_h, p_w = self.get_nodeattr("Padding")
-        if p_w >= k_w - s_w:
-            padup = 0
-        else:
-            padup = (k_w - p_w - 1) / s_w
-        crop = s_w * padup - ((k_w - s_w) - p_w)
-        sf = i_ch / simd
-        w_eff = padup + i_w + padup
-        wo_eff = (w_eff - 1) * s_w + k_w
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            "hls::stream<hls::vector<{},{}>> strm;".format(odtype.get_hls_datatype_str(), pe)
-        ]
-        self.code_gen_dict["$DOCOMPUTE$"].append("unsigned  timeout = 0;")
-        self.code_gen_dict["$DOCOMPUTE$"].append(
-            "while(timeout < %s) {" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * 2 * sf + 50)
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<hls::vector<{},{}>> debug_out_{} ("out_{}");'.format(
+                odtype.get_hls_datatype_str(), pe, self.hls_sname(), self.hls_sname()
+            )
         )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = []
         self.code_gen_dict["$DOCOMPUTE$"].append(
             """deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
             (weights, in0_{}, out_{});""".format(
@@ -226,12 +211,6 @@ def docompute(self):
                 self.hls_sname(),
             )
         )
-        self.code_gen_dict["$DOCOMPUTE$"].append("if(out_V.empty())  timeout++;")
-        self.code_gen_dict["$DOCOMPUTE$"].append("else {")
-        self.code_gen_dict["$DOCOMPUTE$"].append("strm << out_V.read();")
-        self.code_gen_dict["$DOCOMPUTE$"].append("timeout = 0;")
-        self.code_gen_dict["$DOCOMPUTE$"].append("}")
-        self.code_gen_dict["$DOCOMPUTE$"].append("}")
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -245,28 +224,34 @@ def dataoutstrm(self):
 
         # note: the innermost dim is not reversed for the output
         self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'vectorstream2npy<%s, %s, %d>(strm, %s, "%s", false);'
+            'vectorstream2npy<%s, %s, %d>(debug_out_%s, %s, "%s", false);'
             % (
                 elem_hls_type,
                 npy_type,
                 pe,
+                self.hls_sname(),
                 shape_cpp_str,
                 npy_out,
             )
         ]
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0_{},
-                hls::stream<ap_uint<{}>> &out_{}
-                )""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.hls_sname(),
-                self.get_outstream_width(),
-                self.hls_sname(),
-            )
-        ]
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        output_elem_hls_type = self.get_output_datatype().get_hls_datatype_str()
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        in_stream = "hls::stream<hls::vector<%s, %d>> &in0_%s" % (
+            input_elem_hls_type,
+            simd,
+            self.hls_sname(),
+        )
+        out_stream = "hls::stream<hls::vector<%s, %d>> &out_%s" % (
+            output_elem_hls_type,
+            pe,
+            self.hls_sname(),
+        )
+        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_stream, out_stream)
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
 
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = [
@@ -369,3 +354,53 @@ def execute_node(self, context, graph):
                     mode
                 )
             )
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.read_npy_data()
+        self.strm_decl()
+        self.pragmas()
+        self.docompute()
+        self.dataoutstrm()
+        self.save_as_npy()
+        self.timeout_value()
+        self.timeout_condition()
+        self.timeout_read_stream()
+
+        template = templates.docompute_template_timeout
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def timeout_value(self):
+        """Set timeout value for HLS functions defined for one clock cycle"""
+        simd = self.get_nodeattr("SIMD")
+        i_ch = self.get_nodeattr("IFMChannels")
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        s_h, s_w = self.get_nodeattr("Stride")
+        i_h, i_w = self.get_nodeattr("IFMDim")
+        p_h, p_w = self.get_nodeattr("Padding")
+        if p_w >= k_w - s_w:
+            padup = 0
+        else:
+            padup = (k_w - p_w - 1) / s_w
+        crop = s_w * padup - ((k_w - s_w) - p_w)
+        sf = i_ch / simd
+        w_eff = padup + i_w + padup
+        wo_eff = (w_eff - 1) * s_w + k_w
+        self.code_gen_dict["$TIMEOUT_VALUE$"] = [
+            "%s" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * 4 * sf + 50)
+        ]
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index d8210fd684..c03a9029db 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -474,3 +474,17 @@ def get_ap_int_max_w(self):
         ret = max([instream, outstream])
         assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret
         return ret
+
+    def timeout_value(self):
+        """Set timeout value for HLS functions defined for one clock cycle"""
+        self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"]
+
+    def timeout_condition(self):
+        """Set timeout condition for HLS functions defined for one clock cycle"""
+        self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())]
+
+    def timeout_read_stream(self):
+        """Set reading output stream procedure for HLS functions defined for one clock cycle"""
+        self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [
+            "debug_out_{} << out_{}.read();".format(self.hls_sname(), self.hls_sname())
+        ]
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 72c607731a..d2100a7516 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -59,6 +59,51 @@
 
 """
 
+# template for single node execution with timeout (for single clock hls operations)
+docompute_template_timeout = """
+#define AP_INT_MAX_W $AP_INT_MAX_W$
+#include "cnpy.h"
+#include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
+#include <vector>
+#include "bnn-library.h"
+
+// includes for network parameters
+$GLOBALS$
+
+// defines for network parameters
+$DEFINES$
+
+int main(){
+$PRAGMAS$
+
+$STREAMDECLARATIONS$
+
+$READNPYDATA$
+
+unsigned timeout = 0;
+while(timeout < $TIMEOUT_VALUE$){
+
+$DOCOMPUTE$
+
+if($TIMEOUT_CONDITION$){
+timeout++;
+}
+
+else{
+$TIMEOUT_READ_STREAM$
+timeout = 0;
+}
+}
+
+$DATAOUTSTREAM$
+
+$SAVEASCNPY$
+
+}
+
+"""
+
 # templates for single node ip generation
 
 # cpp file

From 7deb2969112f5f0e9c19c35db55e5a8cbb3f38e9 Mon Sep 17 00:00:00 2001
From: Hugo LE BLEVEC <hugo.leblevec@gmail.com>
Date: Wed, 7 May 2025 17:22:51 +0200
Subject: [PATCH 13/16] changing stream names to match the template

---
 src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index f30f91fad4..eb2b800ee7 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -197,8 +197,8 @@ def strm_decl(self):
         )
 
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<hls::vector<{},{}>> debug_out_{} ("out_{}");'.format(
-                odtype.get_hls_datatype_str(), pe, self.hls_sname(), self.hls_sname()
+            'hls::stream<hls::vector<{},{}>> strm ("strm");'.format(
+                odtype.get_hls_datatype_str(), pe
             )
         )
 
@@ -224,12 +224,11 @@ def dataoutstrm(self):
 
         # note: the innermost dim is not reversed for the output
         self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'vectorstream2npy<%s, %s, %d>(debug_out_%s, %s, "%s", false);'
+            'vectorstream2npy<%s, %s, %d>(strm, %s, "%s", false);'
             % (
                 elem_hls_type,
                 npy_type,
                 pe,
-                self.hls_sname(),
                 shape_cpp_str,
                 npy_out,
             )

From 12832025b969c31ebd0685e85062a405ab53305e Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Tue, 12 Aug 2025 16:11:28 +0100
Subject: [PATCH 14/16] [Deconv] Align custom op with changes from dev

---
 .../custom_op/fpgadataflow/deconvolution.py   |  46 ++--
 .../fpgadataflow/hls/deconvolution_hls.py     | 208 +-----------------
 .../fpgadataflow/test_fpgadataflow_deconv.py  |   2 +
 3 files changed, 35 insertions(+), 221 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/deconvolution.py b/src/finn/custom_op/fpgadataflow/deconvolution.py
index 8dd33fea83..ad7a0bda1e 100644
--- a/src/finn/custom_op/fpgadataflow/deconvolution.py
+++ b/src/finn/custom_op/fpgadataflow/deconvolution.py
@@ -26,7 +26,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import numpy as np
 import warnings
 from qonnx.core.datatype import DataType
 
@@ -58,18 +57,27 @@ def get_nodeattr_types(self):
         return my_attrs
 
     def get_normal_input_shape(self, ind=0):
-        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        if ind == 0:
+            ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        else:
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            ofm_ch = self.get_nodeattr("OFMChannels")
+            k_h, k_w = self.get_nodeattr("KernelDim")
+            ishape = (ofm_ch, k_h, k_w, ifm_ch)
         return ishape
 
     def get_folded_input_shape(self, ind=0):
-        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        simd = self.get_nodeattr("SIMD")
-        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        fold = int(ifm_ch / simd)
-        folded_ishape = (1, ifm_dim_h, ifm_dim_w, fold, simd)
+        if ind == 0:
+            ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            simd = self.get_nodeattr("SIMD")
+            assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+            fold = int(ifm_ch / simd)
+            folded_ishape = (1, ifm_dim_h, ifm_dim_w, fold, simd)
+        else:
+            folded_ishape = self.get_normal_input_shape(ind)
         return folded_ishape
 
     def get_normal_output_shape(self, ind=0):
@@ -134,11 +142,14 @@ def get_output_datatype(self, ind=0):
     def get_instream_width(self, ind=0):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
-        ibits = self.get_input_datatype().bitwidth()
-        simd = self.get_nodeattr("SIMD")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        in_width = simd * ibits
+        if ind == 0:
+            ibits = self.get_input_datatype().bitwidth()
+            simd = self.get_nodeattr("SIMD")
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+            in_width = simd * ibits
+        else:
+            in_width = 0
         return in_width
 
     def get_outstream_width(self, ind=0):
@@ -146,11 +157,6 @@ def get_outstream_width(self, ind=0):
         out_width = o_bits * self.get_nodeattr("PE")
         return out_width
 
-    def get_number_output_values(self):
-        folded_oshape = self.get_folded_output_shape()
-        num_output_elems = np.prod(folded_oshape[:-1])
-        return num_output_elems
-
     def get_exp_cycles(self):
         return 0
 
diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index eb2b800ee7..39c01fa8dc 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -27,18 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
-import os
-from qonnx.core.datatype import DataType
 from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions
 
-from finn.custom_op.fpgadataflow import templates
 from finn.custom_op.fpgadataflow.deconvolution import Deconvolution
 from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
-from finn.util.data_packing import (
-    npy_to_rtlsim_input,
-    numpy_to_hls_code1,
-    rtlsim_output_to_npy,
-)
+from finn.util.data_packing import numpy_to_hls_code1
 
 
 class Deconvolution_hls(Deconvolution, HLSBackend):
@@ -159,106 +152,32 @@ def defines(self, var):
             )
         ]
 
-    def read_npy_data(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_input_datatype()
-        elem_hls_type = dtype.get_hls_datatype_str()
-        simd = self.get_nodeattr("SIMD")
-        npy_type = "float"
-        npy_in = "%s/input_0.npy" % code_gen_dir
-        self.code_gen_dict["$READNPYDATA$"] = []
-        # note: the innermost dim is reversed for the input
-        self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2vectorstream<%s, %s, %d>("%s", in0_%s, false);'
-            % (
-                elem_hls_type,
-                npy_type,
-                simd,
-                npy_in,
-                self.hls_sname(),
-            )
-        )
-
-    def strm_decl(self):
-        idtype = self.get_input_datatype()
-        odtype = self.get_output_datatype()
-        simd = self.get_nodeattr("SIMD")
-        pe = self.get_nodeattr("PE")
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<hls::vector<{},{}>> in0_{} ("in0_{}");'.format(
-                idtype.get_hls_datatype_str(), simd, self.hls_sname(), self.hls_sname()
-            )
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<hls::vector<{},{}>> out_{} ("out_{}");'.format(
-                odtype.get_hls_datatype_str(), pe, self.hls_sname(), self.hls_sname()
-            )
-        )
-
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<hls::vector<{},{}>> strm ("strm");'.format(
-                odtype.get_hls_datatype_str(), pe
-            )
-        )
-
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = []
         self.code_gen_dict["$DOCOMPUTE$"].append(
             """deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
-            (weights, in0_{}, out_{});""".format(
-                self.hls_sname(),
-                self.hls_sname(),
-            )
+            (weights, in0_V, out0_V);"""
         )
 
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        pe = self.get_nodeattr("PE")
-        dtype = self.get_output_datatype()
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        shape = self.get_folded_output_shape()
-        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
-
-        # note: the innermost dim is not reversed for the output
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'vectorstream2npy<%s, %s, %d>(strm, %s, "%s", false);'
-            % (
-                elem_hls_type,
-                npy_type,
-                pe,
-                shape_cpp_str,
-                npy_out,
-            )
-        ]
-
     def blackboxfunction(self):
         input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
         output_elem_hls_type = self.get_output_datatype().get_hls_datatype_str()
         simd = self.get_nodeattr("SIMD")
         pe = self.get_nodeattr("PE")
-        in_stream = "hls::stream<hls::vector<%s, %d>> &in0_%s" % (
+        in_stream = "hls::stream<hls::vector<%s, %d>> &in0_V" % (
             input_elem_hls_type,
             simd,
-            self.hls_sname(),
         )
-        out_stream = "hls::stream<hls::vector<%s, %d>> &out_%s" % (
+        out_stream = "hls::stream<hls::vector<%s, %d>> &out0_V" % (
             output_elem_hls_type,
             pe,
-            self.hls_sname(),
         )
         blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_stream, out_stream)
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
 
     def pragmas(self):
-        self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
-        ]
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
-        )
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0_V"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out0_V")
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
 
         self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
@@ -269,120 +188,7 @@ def pragmas(self):
         # )
 
     def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
-        node = self.onnx_node
-
-        # TODO ensure codegen dir exists
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-        # create a npy file fore each input of the node (in_ind is input index)
-        in_ind = 0
-        for inputs in node.input:
-            # it is assumed that the first input of the node is the data input
-            # the second input are the weights
-            # the third input are the thresholds
-            if in_ind == 0:
-                assert (
-                    str(context[inputs].dtype) == "float32"
-                ), """Input datatype is
-                not float32 as expected."""
-                expected_inp_shape = self.get_folded_input_shape()
-                reshaped_input = context[inputs].reshape(expected_inp_shape)
-                if self.get_input_datatype() == DataType["BIPOLAR"]:
-                    # store bipolar activations as binary
-                    reshaped_input = (reshaped_input + 1) / 2
-                    export_idt = DataType["BINARY"]
-                else:
-                    export_idt = self.get_input_datatype()
-                # make copy before saving the array
-                reshaped_input = reshaped_input.copy()
-                np.save(
-                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
-                    reshaped_input,
-                )
-            elif in_ind > 2:
-                raise Exception("Unexpected input found for MatrixVectorActivation")
-            in_ind += 1
-
-        if mode == "cppsim":
-            # execute the precompiled model
-            super().exec_precompiled_singlenode_model()
-            # load output npy file
-            super().npy_to_dynamic_output(context)
-            # reinterpret binary output as bipolar where needed
-            if self.get_output_datatype() == DataType["BIPOLAR"]:
-                out = context[node.output[0]]
-                out = 2 * out - 1
-                context[node.output[0]] = out
-            assert (
-                context[node.output[0]].shape == self.get_normal_output_shape()
-            ), "cppsim did not produce expected output shape"
-        elif mode == "rtlsim":
-            sim = self.get_rtlsim()
-            nbits = self.get_instream_width()
-            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
-            self.reset_rtlsim(sim)
-            self.toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
-            odt = self.get_output_datatype()
-            target_bits = odt.bitwidth()
-            packed_bits = self.get_outstream_width()
-            out_npy_path = "{}/output.npy".format(code_gen_dir)
-            out_shape = self.get_folded_output_shape()
-            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
-
-            # load and reshape output
-            output = np.load(out_npy_path)
-            oshape = self.get_normal_output_shape()
-            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
-            context[node.output[0]] = output
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-    def code_generation_cppsim(self, model):
-        """Generates c++ code for simulation (cppsim)."""
-        node = self.onnx_node
-        path = self.get_nodeattr("code_gen_dir_cppsim")
-        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
-        self.generate_params(model, path)
-        self.global_includes()
-        self.defines("cppsim")
-        self.read_npy_data()
-        self.strm_decl()
-        self.pragmas()
-        self.docompute()
-        self.dataoutstrm()
-        self.save_as_npy()
-        self.timeout_value()
-        self.timeout_condition()
-        self.timeout_read_stream()
-
-        template = templates.docompute_template_timeout
-
-        for key in self.code_gen_dict:
-            # transform list into long string separated by '\n'
-            code_gen_line = "\n".join(self.code_gen_dict[key])
-            template = template.replace(key, code_gen_line)
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
-        f.write(template)
-        f.close()
-        self.code_gen_dict.clear()
+        HLSBackend.execute_node(self, context, graph)
 
     def timeout_value(self):
         """Set timeout value for HLS functions defined for one clock cycle"""
diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index 3f9282f8e9..ba7f43134b 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -146,6 +146,8 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
+        cpp_interface="hls_vector",
+        hls_style="freerunning",
     )
 
     node_list = [Deconv]

From 7e68ea6a917eb0a8afbc3e69c52bc95b644f980a Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Tue, 12 Aug 2025 16:27:15 +0100
Subject: [PATCH 15/16] [Tests] Bring back pixel padding test

---
 .../fpgadataflow/test_fpgadataflow_deconv.py  | 104 +++++++++++++++++-
 1 file changed, 103 insertions(+), 1 deletion(-)

diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index ba7f43134b..c545200016 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -40,11 +40,22 @@
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.convert_to_hw_layers import (
+    InferConvInpGen,
+    InferQuantizedMatrixVectorActivation,
+)
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import (
+    InferPixelPaddingDeconv,
+)
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.util.basic import pynq_part_map
 
 test_pynq_board = "Pynq-Z1"
@@ -177,6 +188,97 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
     return model
 
 
+# input image dimension
+@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
+# stride in height and width (stride_h, stride_w)
+@pytest.mark.parametrize("stride", [[2, 2], [2, 3]])
+# number of input channels
+@pytest.mark.parametrize("ifm_ch", [2])
+# number of output channels
+@pytest.mark.parametrize("ofm_ch", [4])
+# Input parallelism
+@pytest.mark.parametrize("simd", [1, 2])
+# PE
+@pytest.mark.parametrize("pe", [1, 2])
+# kernel size
+@pytest.mark.parametrize("k", [2])
+# padding
+@pytest.mark.parametrize("padding", [0, 1])
+# exec mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_deconv_pixel_pad(
+    idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode
+):
+    idt = wdt = DataType["INT4"]
+    wdt = idt
+    odt = DataType["INT32"]
+    idim_h, idim_w = idim
+    stride_h, stride_w = stride
+
+    ref_model = set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding)[0]
+
+    odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
+    odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
+
+    input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, idim_h, idim_w])
+    input_dict = {"inp": input_tensor}
+
+    y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"]
+
+    model = ref_model.transform(InferPixelPaddingDeconv())
+    model = model.transform(InferConvInpGen())
+    model = model.transform(InferQuantizedMatrixVectorActivation())
+    model = model.transform(InferShapes())
+    model = model.transform(GiveUniqueNodeNames())
+
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    assert (y_produced == y_expected).all()
+
+    model = model.transform(SpecializeLayers(test_fpga_part))
+    model = model.transform(MinimizeAccumulatorWidth())
+
+    for n in model.graph.node:
+        if n.op_type.startswith("ConvolutionInputGenerator"):
+            convinputgen_node = getCustomOp(n)
+            convinputgen_node.set_nodeattr("SIMD", simd)
+        elif n.op_type.startswith("MVAU"):
+            mvau_node = getCustomOp(n)
+            mvau_node.set_nodeattr("PE", pe)
+            mvau_node.set_nodeattr("SIMD", simd)
+
+    expected_oshape = (1, ofm_ch, odim_h, odim_w)
+
+    # cppsim
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+
+    # rtlsim
+    else:
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+        model = model.transform(SetExecMode("rtlsim"))
+
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    assert y_produced.shape == expected_oshape
+    assert (y_produced == y_expected).all()
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("FMPadding_Pixel_hls")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
+
+
 # input image dimension
 @pytest.mark.parametrize("idim", [[8, 8]])
 # number of rows and number of cols to add
@@ -198,7 +300,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding,
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode):
+def test_fpgadataflow_deconv_revd2(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode):
     idt = wdt = DataType["INT8"]
     wdt = idt
     odt = DataType["INT32"]

From ee7d7115bb46d3403da433e649042fa1b390ad10 Mon Sep 17 00:00:00 2001
From: auphelia <jakobapk@web.de>
Date: Tue, 12 Aug 2025 16:52:40 +0100
Subject: [PATCH 16/16] [Util] Remove custom numpy to hls code conversion for
 deconv and use existing function

---
 .../fpgadataflow/hls/deconvolution_hls.py     |  6 ++-
 src/finn/util/data_packing.py                 | 51 -------------------
 2 files changed, 4 insertions(+), 53 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index 39c01fa8dc..d0e90661e3 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -31,7 +31,7 @@
 
 from finn.custom_op.fpgadataflow.deconvolution import Deconvolution
 from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
-from finn.util.data_packing import numpy_to_hls_code1
+from finn.util.data_packing import numpy_to_hls_code
 
 
 class Deconvolution_hls(Deconvolution, HLSBackend):
@@ -83,7 +83,9 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         weight_tensor = self.get_hw_compatible_weight_tensor(weights)
         export_wdt = self.get_weight_datatype()
         if weight_file_mode == "hls_header":
-            weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", False, True)
+            weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", False, True)
+            # remove framing {}
+            weight_hls_code = weight_hls_code[1:-2] + ";"
             # write weights into C++ header file as dictated by finn-hlslib
             f_weights = open(weight_file_name, "w")
             f_weights.write(
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 08f801aba2..61773d29b4 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -285,57 +285,6 @@ def elem2str(x):
     return ret
 
 
-def numpy_to_hls_code1(ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False):
-    """Return C++ code representation of a numpy ndarray with FINN DataType
-    dtype, using hls_var_name as the resulting C++ variable name. If
-    pack_innermost_dim is specified, the innermost dimension of the ndarray
-    will be packed into a hex string using array2hexstring. If no_decl is
-    set to True, no variable name and type will be generated as part of the
-    emitted string.
-    """
-    hls_dtype = dtype.get_hls_datatype_str()
-    if type(ndarray) != np.ndarray or ndarray.dtype != np.float32:
-        # try to convert to a float numpy array (container dtype is float)
-        ndarray = np.asarray(ndarray, dtype=np.float32)
-    if pack_innermost_dim:
-        idimlen = ndarray.shape[-1]
-        idimbits = idimlen * dtype.bitwidth()
-        idimbits = roundup_to_integer_multiple(idimbits, 4)
-        ndarray = pack_innermost_dim_as_hex_string(ndarray, dtype, idimbits)
-        hls_dtype = "ap_uint<%d>" % idimbits
-    ndims = ndarray.ndim
-    # add type string and variable name
-    # e.g. "const ap_uint<64>" "weightMem0"
-    ret = "%s %s" % (hls_dtype, hls_var_name)
-    # add dimensions
-    for d in range(ndims):
-        ret += "[%d]" % ndarray.shape[d]
-    orig_printops = np.get_printoptions()
-    np.set_printoptions(threshold=sys.maxsize)
-
-    # define a function to convert a single element into a C++ init string
-    # a single element can be a hex string if we are using packing
-    def elem2str(x):
-        if type(x) == str or type(x) == np.str_:
-            return '{{"%s",},}' % (x)
-        elif type(x) == np.float32:
-            if dtype.is_integer():
-                return str(int(x))
-            else:
-                return str(x)
-        else:
-            raise Exception("Unsupported type for numpy_to_hls_code")
-
-    strarr = np.array2string(ndarray.flatten(), separator=", ", formatter={"all": elem2str})
-    np.set_printoptions(**orig_printops)
-    strarr = strarr.replace("[", "{").replace("]", ",}")
-    if no_decl:
-        ret = strarr + ";"
-    else:
-        ret = ret + " = \n" + strarr + ";"
-    return ret
-
-
 def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=True):
     """Convert the multidimensional NumPy array of integers (stored as floats)
     from input_file into a flattened sequence of Python arbitrary-precision