fastmachinelearning · bugracyln · Jan 26, 2026 · Jan 27, 2026 · Jan 27, 2026 · Feb 9, 2026
diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
     Embedding,
     Layer,
     SimpleRNN,
-    Softmax,
 )
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
         if layer.get_attr('recurrent_activation') == 'tanh':
             layer.set_attr('recurrent_activation', 'dense_tanh')
 
-    @layer_optimizer(Softmax)
-    def init_softmax(self, layer):
-        if layer.model.config.get_config_value('IOType') == 'io_parallel':
-            assert len(layer.get_input_variable().shape) == 1, (
-                'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
-            )
-
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
         if layer.attributes['n_in'] is None:

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -2,6 +2,7 @@
 from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
 from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
 from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
+from hls4ml.utils.fixed_point_utils import ceil_log2
 
 # Dense templates
 
@@ -195,11 +196,32 @@ def format(self, node):
 softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned table_size = {table_size};
+    static constexpr unsigned exp_table_size = {exp_table_size};
+    static constexpr unsigned inv_table_size = {inv_table_size};
     static constexpr unsigned io_type = nnet::{iotype};
     static constexpr unsigned reuse_factor = {reuse};
+
     static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
+    typedef {smax_accum_t} accum_t;
     typedef {exp_table_t.name} exp_table_t;
-    typedef {inv_table_t.name} inv_table_t;
+    typedef {inv_table_t.name} inv_table_t;"""
+
+softmax_config_table_template = """
+
+    using {exp_table_name}_arr_t = nnet::array<exp_table_t, exp_table_size>;
+    using {inv_table_name}_arr_t = nnet::array<inv_table_t, inv_table_size>;
+    static constexpr const {exp_table_name}_arr_t exp_table = {exp_table_name};
+    static constexpr const {inv_table_name}_arr_t invert_table = {inv_table_name};
+}};\n"""
+
+softmax_config_table_template_stable = """
+    typedef {inv_inp_t.name} inv_inp_t;
+    typedef {inp_norm_t.name} inp_norm_t;
+
+    using {exp_table_name}_arr_t = nnet::array<exp_table_t, exp_table_size>;
+    using {inv_table_name}_arr_t = nnet::array<inv_table_t, inv_table_size>;
+    static constexpr const {exp_table_name}_arr_t exp_table = {exp_table_name};
+    static constexpr const {inv_table_name}_arr_t invert_table = {inv_table_name};
 }};\n"""
 
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
@@ -221,6 +243,71 @@ def format(self, node):
         params = self._default_config_params(node)
         params['type'] = node.get_attr('activation')
 
+        if params['type'] == 'softmax':
+            # The lookup input (x - x_max) is always <= 0, so only the negative half of the table is needed
+            if 'exp_table_size' in params and params['exp_table_size'] is not None:
+                params['exp_table_size'] //= 2
+            else:
+                # Use the default precision
+                params['exp_table_size'] = 2 ** (params['table_t'].precision.width - 1)
+                params['exp_table_t'].precision.width = ceil_log2(params['exp_table_size'])
+                params['exp_table_t'].precision.integer = params['table_t'].precision.integer - 1
+                params['exp_table_t'].precision.signed = False
+
+            params.setdefault('table_size', params['exp_table_size'])  # Not sure if necessary
+
+            # Keep a configured accum_t; for the model default, widen exp_table_t by ceil_log2(n_in) bits instead.
+            if params['accum_t'].name == 'model_default_t':
+                extra_bits_req = ceil_log2(params['n_in'])
+                s = 'true' if params['exp_table_t'].precision.signed else 'false'
+                w = params['exp_table_t'].precision.width + extra_bits_req
+                i = params['exp_table_t'].precision.integer + extra_bits_req
+                params['smax_accum_t'] = f'ac_fixed<{str(w)},{str(i)},{s}>'
+            else:
+                params['smax_accum_t'] = params['accum_t'].name
+
+            if 'inp_norm_t' not in params:
+                input_t = node.get_input_variable().type.precision
+                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
+                width, iwidth = width - signed, iwidth - signed
+                import copy
+
+                params['inp_norm_t'] = copy.deepcopy(params['exp_table_t'])  # assign type,later override
+
+                # If no inv_table_size was given (default table sizes), derive the precision from exp_table_t
+                if 'inv_table_size' not in params:
+                    params['inp_norm_t'].precision.width = params['exp_table_t'].precision.width + 1
+                    params['inp_norm_t'].precision.integer = params['exp_table_t'].precision.integer + 1
+                    params['inp_norm_t'].precision.signed = True
+                    params['inp_norm_t'].name = f'{node.name}_inp_norm_t'
+                else:
+                    params[
+                        'inp_norm_t'
+                    ].name = f'ac_fixed<{width},{iwidth},{"true" if signed else "false"},AC_RND,AC_SAT_SYM>'
+
+                node.set_attr('inp_norm_t', params['inp_norm_t'])
+
+            # Again, we only look up 1/sum(e^x), which is >= 0, so there is no need for the entire address space
+            if 'inv_table_size' in params:
+                params['inv_table_size'] //= 2
+            else:
+                params['inv_table_size'] = 2 ** (params['table_t'].precision.width - 1)
+                params['inv_table_t'].precision.width = ceil_log2(params['inv_table_size'])
+                params['inv_table_t'].precision.integer = params['table_t'].precision.integer - 1
+                params['inv_table_t'].precision.signed = False
+
+                params['inv_inp_t'].precision.width = params['inv_table_t'].precision.width + 1
+                params['inv_inp_t'].precision.integer = params['inv_table_t'].precision.integer + 1
+                params['inv_inp_t'].precision.signed = True
+
+            if params['implementation'] == 'stable':
+                self.template = softmax_config_template + softmax_config_table_template_stable
+            else:
+                self.template = softmax_config_template + softmax_config_table_template
+
+            params['exp_table_name'] = node.name + '_exp_table'
+            params['inv_table_name'] = node.name + '_inv_table'
+
         return self.template.format(**params)
 
 

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -99,12 +99,13 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_
 
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
-template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
+template <class data_T, unsigned table_size> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
     // Number of address bits for table
-    static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
+    static constexpr int N = ceillog2<table_size>::val;
 
     // Slice the top N bits of the input
     [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
+
     // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
     if (x != 0 && y == 0)
         y[0] = 1;
@@ -121,38 +122,38 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
 }
 
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-// Look-up tables
-#include "activation_tables/exp_table.tb"
-#include "activation_tables/invert_table.tb"
 
     // Find maximum
     Op_max<typename data_T::value_type> op_max;
     [[intel::fpga_register]] auto x_max =
         reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);
 
     // For the diffs, use the same type as the input but force rounding and saturation
-    [[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
-        d_xi_xmax[CONFIG_T::n_in];
+    [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         d_xi_xmax[i] = data[i] - x_max;
     }
 
     // Calculate all the e^x's
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
+    [[intel::fpga_register]] typename CONFIG_T::accum_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
+        exp_res[i] =
+            CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(
+                d_xi_xmax[i])]; // input_t, CONFIG_T
     }
 
     // Explicitly sum previously calculated exponentials with an adder tree
-    Op_add<typename CONFIG_T::exp_table_t> op_add;
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+    Op_add<typename CONFIG_T::accum_t> op_add;
+    [[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
+        reduce<typename CONFIG_T::accum_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
     // Multiply previously calculated exponetials with the reciprocal of the sum
     [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+        CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(
+            exp_sum)];
+
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
@@ -265,6 +266,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
     }
 }
 
+// *************************************************
+//       Multidimensional Softmax
+// *************************************************
+
+// Helper to remap the config for the core softmax function
+template <class CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
+    static constexpr unsigned n_in = CONFIG_T::n_slice;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
+    using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
+    using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
+    using slice_config = softmax_multidim_slice_config<CONFIG_T>;
+
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
+        #pragma unroll
+        for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {
+
+            [[intel::fpga_register]] buffer_data_t buffer_in;
+            [[intel::fpga_register]] buffer_res_t buffer_out;
+
+            // Gather Phase
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                buffer_in[j] = data[idx];
+            }
+
+            nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);
+
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                res[idx] = buffer_out[j];
+            }
+        }
+    }
+}
 // *************************************************
 //       TanH Activation
 // *************************************************

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
@@ -271,73 +271,61 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void softsign_stre
 // *************************************************
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void softmax_stable_stream() {
-#include "activation_tables/exp_table.tb"
-#include "activation_tables/invert_table.tb"
 
-    constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
+    using input_arr_t = typename ExtractPipeType<data_pipe>::value_type;
+    using input_t = typename ExtractPipeType<data_pipe>::value_type::value_type;
+    constexpr unsigned input_arr_size = std::tuple_size<input_arr_t>{};
 
-    [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type
-        data_array[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+    constexpr unsigned multiplier_limit = DIV_ROUNDUP(input_arr_size, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = input_arr_size / multiplier_limit;
+
+    [[intel::fpga_register]] input_t data_array[input_arr_size];
 
 SoftmaxArrayLoop:
-    [[intel::initiation_interval(pipeline)]] for (unsigned i = 0;
-                                                  i < CONFIG_T::n_in /
-                                                          std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
-                                                  i++) {
+    [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / input_arr_size; i++) {
         auto in_pack = data_pipe::read();
 
     SoftmaxArrayPackLoop:
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             data_array[j] = in_pack[j];
         }
 
         // Find the max and compute all delta(x_i, x_max)
-        Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type> op_max;
-        [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type x_max =
-            reduce<typename ExtractPipeType<data_pipe>::value_type::value_type,
-                   std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
-                   Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type>>(data_array, op_max);
-
-        // For the diffs, use the same type as the input but force rounding and saturation
-        [[intel::fpga_register]] ac_fixed<ExtractPipeType<data_pipe>::value_type::value_type::width,
-                                          ExtractPipeType<data_pipe>::value_type::value_type::i_width, true, AC_RND, AC_SAT>
-            d_xi_xmax[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+        Op_max<input_t> op_max;
+        [[intel::fpga_register]] input_t x_max = reduce<input_t, input_arr_size, Op_max<input_t>>(data_array, op_max);
+
+        [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[input_arr_size];
+
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             d_xi_xmax[j] = data_array[j] - x_max;
         }
 
         // Calculate all the e^x's
-        [[intel::fpga_register]]
-        typename CONFIG_T::exp_table_t exp_res[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+        [[intel::fpga_register]] typename CONFIG_T::accum_t exp_res[input_arr_size];
+
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
-            exp_res[j] =
-                exp_table[softmax_stable_idx_from_real_val<typename ExtractPipeType<data_pipe>::value_type::value_type,
-                                                           CONFIG_T>(d_xi_xmax[j])];
+        for (unsigned j = 0; j < input_arr_size; j++) {
+            exp_res[j] = CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t,
+                                                                              CONFIG_T::exp_table_size>(d_xi_xmax[j])];
         }
 
         // Explicitly sum the results with an adder tree.
         // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
-        Op_add<typename CONFIG_T::exp_table_t> op_add;
-        [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-            reduce<typename CONFIG_T::exp_table_t, std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
-                   Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+        Op_add<typename CONFIG_T::accum_t> op_add;
+        [[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
+            reduce<typename CONFIG_T::accum_t, input_arr_size, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
         [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-            invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+            CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(
+                exp_sum)];
+
         typename ExtractPipeType<res_pipe>::value_type out_pack;
 
     SoftmaxInvPackLoop:
         #pragma unroll
         for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
-
-            // TODO - Find Quartus-equivalent pragma
-            // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation
-
             out_pack[j] = exp_res[j] * inv_exp_sum;
         }
 

diff --git a/hls4ml/templates/oneapi/firmware/parameters.h b/hls4ml/templates/oneapi/firmware/parameters.h
@@ -6,6 +6,8 @@
 #include "nnet_utils/nnet_code_gen.h"
 #include "nnet_utils/nnet_helpers.h"
 
+// hls-fpga-machine-learning insert softmax tables
+
 // hls-fpga-machine-learning insert includes
 
 // hls-fpga-machine-learning insert layer-config