fastmachinelearning · jmitrevs · May 20, 2026 · May 20, 2026 · May 20, 2026 · Jun 8, 2026
diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
@@ -129,22 +129,33 @@ def __init__(self, name):
             ConfigurableAttribute('skip', value_type=bool, default=False, description=descriptions.softmax_skip),
             TypeAttribute(
                 'exp_table',
-                default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                default=FixedPrecisionType(
+                    18, 8, signed=False, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT
+                ),
                 description=descriptions.table_type,
             ),
             TypeAttribute(
                 'inv_table',
-                default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                default=FixedPrecisionType(
+                    18, 8, signed=False, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT
+                ),
                 description=descriptions.table_type,
             ),
             TypeAttribute(
                 'inv_inp',
-                default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                default=FixedPrecisionType(
+                    18, 8, signed=False, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT
+                ),
+                description='What the accumulated value is cast to before accessing the inversion table (only in stable)',
             ),
             TypeAttribute(
-                'accum',
-                default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+                'inp_norm',
+                default=FixedPrecisionType(
+                    18, 8, signed=False, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT
+                ),
+                description='The internal width used for the exp table lookup (only in stable)',
             ),
+            TypeAttribute('accum', description=descriptions.accum_type),
         ]
         self.attribute_map[Softmax] = softmax_attrs
 

diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py
@@ -90,6 +90,9 @@ def _infer_precision(self, node, types_to_infer):
 
         if node_class in ['PReLU']:
             return self._infer_prelu_act_precision(node, types_to_infer)
+
+        if node_class in ['Softmax']:
+            return self._infer_softmax_precision(node, types_to_infer)
         # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent
         # this in config_from_* functions
 
@@ -605,6 +608,26 @@ def _infer_prelu_act_precision(self, node, types_to_infer):
 
         return inferred_types
 
+    def _infer_softmax_precision(self, node, types_to_infer):
+        inferred_types = []
+
+        # for softmax, the table parameters have a default setting, so they don't need to be inferred
+        # here. We never expect them to be of type auto.
+
+        # For result, we leave it to be set externally (model default if not set). We expect it to
+        # likely be the output value, in which case the output format would determine its precision.
+        # Therefore, only the accum is configured here: it is the exp_table type with ceil(log2(n_in))
+        # bits added to both the width and the integer part, keeping the exp_table signedness.
+        if 'accum_t' in types_to_infer:
+            exp_w = node.types['exp_table_t'].precision.width
+            exp_i = node.types['exp_table_t'].precision.integer
+            exp_s = node.types['exp_table_t'].precision.signed
+            ceillog = math.ceil(np.log2(node.get_attr('n_in')))
+            node.types['accum_t'].precision = FixedPrecisionType(exp_w + ceillog, exp_i + ceillog, signed=exp_s)
+            inferred_types.append('accum_t')
+
+        return inferred_types
+
 
 def _get_precision_from_constant(value: int | float, max_width=8):
     """A utility function to find a fixed type to store the constant

diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
@@ -189,14 +189,13 @@ void softmax_latency(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice
         // Note we are exponentiating the inputs, which have type data_T
         init_exp_table<data_T, CONFIG_T>(exp_table);
-        // Note we are inverting the exponentials, which have type exp_table_t
+        // Note we are inverting the summed exponentials, which have type accum_t
-        init_invert_table<typename CONFIG_T::inv_inp_t, CONFIG_T>(invert_table);
+        init_invert_table<typename CONFIG_T::accum_t, CONFIG_T>(invert_table);
         initialized = true;
     }
 
     // Calculate all the e^x's
     typename CONFIG_T::accum_t exp_res[CONFIG_T::n_slice];
     #pragma HLS array_partition variable=exp_res complete
-    typename CONFIG_T::inv_inp_t exp_sum(0);
     for (unsigned i = 0; i < CONFIG_T::n_slice; i++) {
         #pragma HLS unroll
         unsigned x = softmax_idx_from_real_val<data_T, CONFIG_T::exp_table_size>(data[i]);
@@ -206,10 +205,11 @@ void softmax_latency(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice
     // Explicitly sum the results with an adder tree.
     // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
     Op_add<typename CONFIG_T::accum_t> op_add;
-    exp_sum = reduce<typename CONFIG_T::accum_t, CONFIG_T::n_slice, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
+    typename CONFIG_T::accum_t exp_sum =
+        reduce<typename CONFIG_T::accum_t, CONFIG_T::n_slice, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
     typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];
+        invert_table[softmax_idx_from_real_val<typename CONFIG_T::accum_t, CONFIG_T::inv_table_size>(exp_sum)];
     for (unsigned i = 0; i < CONFIG_T::n_slice; i++) {
         #pragma HLS unroll
         res[i] = exp_res[i] * inv_exp_sum;
@@ -251,7 +251,6 @@ void softmax_stable(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice]
     // Calculate all the e^x's
     typename CONFIG_T::accum_t exp_res[CONFIG_T::n_slice];
     #pragma HLS array_partition variable=exp_res complete
-    typename CONFIG_T::inv_inp_t exp_sum(0);
     for (unsigned i = 0; i < CONFIG_T::n_slice; i++) {
         #pragma HLS unroll
         unsigned x = softmax_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(d_xi_xmax[i]);
@@ -261,7 +260,8 @@ void softmax_stable(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice]
     // Explicitly sum the results with an adder tree.
     // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
     Op_add<typename CONFIG_T::accum_t> op_add;
-    exp_sum = reduce<typename CONFIG_T::accum_t, CONFIG_T::n_slice, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
+    typename CONFIG_T::inv_inp_t exp_sum =
+        reduce<typename CONFIG_T::accum_t, CONFIG_T::n_slice, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
     typename CONFIG_T::inv_table_t inv_exp_sum =
         invert_table[softmax_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];
@@ -271,18 +271,18 @@ void softmax_stable(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice]
     }
 }
 
-template <typename CONFIG_T, int N_TABLE> void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) {
+template <typename CONFIG_T, int N_TABLE> void init_exp_table_legacy(typename CONFIG_T::exp_table_t table_out[N_TABLE]) {
     for (int ii = 0; ii < N_TABLE; ii++) {
         // First, convert from table index to X-value (signed 8-bit, range -8 to +8)
         float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
         // Next, compute lookup table function
-        typename CONFIG_T::table_t real_val = exp_fcn_float(in_val);
+        typename CONFIG_T::exp_table_t real_val = exp_fcn_float(in_val);
         // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl;
         table_out[ii] = real_val;
     }
 }
 
-template <typename CONFIG_T, int N_TABLE> void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) {
+template <typename CONFIG_T, int N_TABLE> void init_invert_table_legacy(typename CONFIG_T::inv_table_t table_out[N_TABLE]) {
     // Inversion function:
     //   result = 1/x
     for (int ii = 0; ii < N_TABLE; ii++) {
@@ -301,12 +301,12 @@ void softmax_legacy(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice]
     // Initialize the lookup table
 #ifdef __HLS_SYN__
     bool initialized = false;
-    typename CONFIG_T::table_t exp_table[CONFIG_T::exp_table_size];
-    typename CONFIG_T::table_t invert_table[CONFIG_T::inv_table_size];
+    typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size];
+    typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size];
 #else
     static bool initialized = false;
-    static typename CONFIG_T::table_t exp_table[CONFIG_T::exp_table_size];
-    static typename CONFIG_T::table_t invert_table[CONFIG_T::inv_table_size];
+    static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size];
+    static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size];
 #endif
     if (!initialized) {
         init_exp_table_legacy<CONFIG_T, CONFIG_T::exp_table_size>(exp_table);
@@ -317,22 +317,23 @@ void softmax_legacy(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice]
     #pragma HLS PIPELINE
 
     // Index into the lookup table based on data for exponentials
-    typename CONFIG_T::table_t exp_res[CONFIG_T::n_slice]; // different, independent, fixed point precision
-    typename CONFIG_T::table_t exp_diff_res;               // different, independent, fixed point precision
+    typename CONFIG_T::accum_t exp_res[CONFIG_T::n_slice]; // different, independent, fixed point precision
+    typename CONFIG_T::exp_table_t exp_diff_res;           // different, independent, fixed point precision
     data_T data_cache[CONFIG_T::n_slice];
-    int data_round;
     int index;
+
     for (int ii = 0; ii < CONFIG_T::n_slice; ii++) {
         data_cache[ii] = data[ii];
         exp_res[ii] = 0;
     }
 
+    // first calculate 1/softmax as a sum over fractions.
     for (int ii = 0; ii < CONFIG_T::n_slice; ii++) {
         for (int jj = 0; jj < CONFIG_T::n_slice; jj++) {
             if (ii == jj)
                 exp_diff_res = 1;
             else {
-                data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::exp_table_size / 16;
+                auto data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::exp_table_size / 16;
                 index = data_round + 8 * CONFIG_T::exp_table_size / 16;
                 if (index < 0)
                     index = 0;
@@ -352,7 +353,7 @@ void softmax_legacy(data_T data[CONFIG_T::n_slice], res_T res[CONFIG_T::n_slice]
         if (exp_res_index > CONFIG_T::inv_table_size - 1)
             exp_res_index = CONFIG_T::inv_table_size - 1;
         // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index];
-        res[ii] = (res_T)invert_table[exp_res_index];
+        res[ii] = static_cast<res_T>(invert_table[exp_res_index]);
     }
 }
 

diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h
@@ -120,8 +120,8 @@ void softmax_latency(hls::stream<data_T> &data, hls::stream<res_T> &res) {
     if (!initialized) {
         // Note we are exponentiating the inputs, which have type data_T
         init_exp_table<typename data_T::value_type, CONFIG_T>(exp_table);
-        // Note we are inverting the exponentials, which have type exp_table_t
-        init_invert_table<typename CONFIG_T::inv_inp_t, CONFIG_T>(invert_table);
+        // Note we are inverting the summed exponentials, which have type accum_t
+        init_invert_table<typename CONFIG_T::accum_t, CONFIG_T>(invert_table);
         initialized = true;
     }
 
@@ -150,7 +150,7 @@ void softmax_latency(hls::stream<data_T> &data, hls::stream<res_T> &res) {
         exp_sum = reduce<typename CONFIG_T::accum_t, data_T::size, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
         typename CONFIG_T::inv_table_t inv_exp_sum =
-            invert_table[softmax_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];
+            invert_table[softmax_idx_from_real_val<typename CONFIG_T::accum_t, CONFIG_T::inv_table_size>(exp_sum)];
 
         res_T out_pack;
         PRAGMA_DATA_PACK(out_pack)
@@ -216,7 +216,6 @@ void softmax_stable(hls::stream<data_T> &data, hls::stream<res_T> &res) {
         // Calculate all the e^x's
         typename CONFIG_T::accum_t exp_res[data_T::size];
         #pragma HLS ARRAY_PARTITION variable=exp_res complete
-        typename CONFIG_T::inv_inp_t exp_sum(0);
         for (unsigned j = 0; j < data_T::size; j++) {
             #pragma HLS UNROLL
             unsigned x = softmax_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(d_xi_xmax[j]);
@@ -226,7 +225,8 @@ void softmax_stable(hls::stream<data_T> &data, hls::stream<res_T> &res) {
         // Explicitly sum the results with an adder tree.
         // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
         Op_add<typename CONFIG_T::accum_t> op_add;
-        exp_sum = reduce<typename CONFIG_T::accum_t, data_T::size, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
+        typename CONFIG_T::inv_inp_t exp_sum =
+            reduce<typename CONFIG_T::accum_t, data_T::size, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
         typename CONFIG_T::inv_table_t inv_exp_sum =
             invert_table[softmax_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];
@@ -249,22 +249,22 @@ void softmax_legacy(hls::stream<data_T> &data, hls::stream<res_T> &res) {
     // Initialize the lookup table
 #ifdef __HLS_SYN__
     bool initialized = false;
-    typename CONFIG_T::table_t exp_table[CONFIG_T::table_size];
-    typename CONFIG_T::table_t invert_table[CONFIG_T::table_size];
+    typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size];
+    typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size];
 #else
     static bool initialized = false;
-    static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size];
-    static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size];
+    static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size];
+    static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size];
 #endif
     if (!initialized) {
-        init_exp_table_legacy<CONFIG_T, CONFIG_T::table_size>(exp_table);
-        init_invert_table_legacy<CONFIG_T, CONFIG_T::table_size>(invert_table);
+        init_exp_table_legacy<CONFIG_T, CONFIG_T::exp_table_size>(exp_table);
+        init_invert_table_legacy<CONFIG_T, CONFIG_T::inv_table_size>(invert_table);
         initialized = true;
     }
 
     // Index into the lookup table based on data for exponentials
-    typename CONFIG_T::table_t exp_res[data_T::size];
-    typename CONFIG_T::table_t exp_diff_res;
+    typename CONFIG_T::accum_t exp_res[data_T::size];
+    typename CONFIG_T::exp_table_t exp_diff_res;
     typename data_T::value_type data_cache[data_T::size];
 
 SoftmaxInitLoop:
@@ -288,12 +288,12 @@ void softmax_legacy(hls::stream<data_T> &data, hls::stream<res_T> &res) {
                 if (i == j) {
                     exp_diff_res = 1;
                 } else {
-                    int data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16;
-                    int index = data_round + 8 * CONFIG_T::table_size / 16;
+                    auto data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::exp_table_size / 16;
+                    int index = data_round + 8 * CONFIG_T::exp_table_size / 16; // same size as the clamp below
                     if (index < 0)
                         index = 0;
-                    if (index > CONFIG_T::table_size - 1)
-                        index = CONFIG_T::table_size - 1;
+                    if (index > CONFIG_T::exp_table_size - 1)
+                        index = CONFIG_T::exp_table_size - 1;
                     exp_diff_res = exp_table[index];
                 }
 
@@ -311,10 +311,10 @@ void softmax_legacy(hls::stream<data_T> &data, hls::stream<res_T> &res) {
             int exp_res_index = exp_res[j] * CONFIG_T::table_size / 64;
             if (exp_res_index < 0)
                 exp_res_index = 0;
-            if (exp_res_index > CONFIG_T::table_size - 1)
-                exp_res_index = CONFIG_T::table_size - 1;
+            if (exp_res_index > CONFIG_T::inv_table_size - 1)
+                exp_res_index = CONFIG_T::inv_table_size - 1;
 
-            out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index];
+            out_pack[j] = static_cast<typename res_T::value_type>(invert_table[exp_res_index]);
         }
         res.write(out_pack);
     }

diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py
@@ -1,3 +1,4 @@
+import math
 from pathlib import Path
 
 import numpy as np
@@ -13,10 +14,12 @@
     ReLU,
     SeparableConv1D,
     SeparableConv2D,
+    Softmax,
 )
 from tensorflow.keras.models import Sequential
 
 import hls4ml
+import hls4ml.model.layers
 from hls4ml.model.optimizer.passes.infer_precision import _get_precision_from_constant
 
 test_root_path = Path(__file__).parent
@@ -285,3 +288,37 @@ def test_precision_from_constant_unit(val, expected_width):
     quantum = 2.0**-fp.fractional
     if expected_width < max_width:
         assert val % quantum == 0
+
+
+@pytest.mark.parametrize('n_in', [4, 8, 16])
+@pytest.mark.parametrize('backend', ['Vitis', 'oneAPI'])
+def test_auto_precision_softmax(test_case_id, n_in, backend):
+    """Check that the Softmax accum_t is exp_table_t widened by ceil(log2(n_in)) bits, with the same signedness."""
+    model = Sequential()
+    model.add(Softmax(input_shape=(n_in,)))
+    model.compile()
+
+    config = hls4ml.utils.config_from_keras_model(model, backend=backend, granularity='name')
+
+    odir = str(test_root_path / test_case_id)
+    hls_model = hls4ml.converters.convert_from_keras_model(model, hls_config=config, output_dir=odir, backend=backend)
+
+    # Find the first Softmax layer in the converted model and read back its accum_t and exp_table_t precisions
+    softmax_layer = next((layer for layer in hls_model.get_layers() if isinstance(layer, hls4ml.model.layers.Softmax)), None)
+    assert softmax_layer is not None, 'No Softmax layer found in converted model'
+
+    accum_t = softmax_layer.types['accum_t'].precision
+    exp_table_t = softmax_layer.types['exp_table_t'].precision
+
+    ceillog = math.ceil(math.log2(n_in))
+    expected_width = exp_table_t.width + ceillog
+    expected_integer = exp_table_t.integer + ceillog
+    expected_signed = exp_table_t.signed
+
+    assert accum_t.width == expected_width, f'Expected accum_t width {expected_width}, got {accum_t.width} (n_in={n_in})'
+    assert accum_t.integer == expected_integer, (
+        f'Expected accum_t integer {expected_integer}, got {accum_t.integer} (n_in={n_in})'
+    )
+    assert accum_t.signed == expected_signed, (
+        f'Expected accum_t signed={expected_signed}, got {accum_t.signed} (n_in={n_in})'
+    )