Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3d463b3
weights for dense
laurilaatu Jan 26, 2026
d678573
hgq2 homogeneous quant fix
calad0i Jan 27, 2026
77258bc
Merge branch 'hgq2_homo_quant' of github.com:calad0i/hls4ml into onea…
laurilaatu Jan 27, 2026
59bd96f
Changes required for oneAPI MHA
laurilaatu Feb 9, 2026
dbb207b
Original weight implementation
laurilaatu Feb 9, 2026
0c59255
Merge branch 'main' of github.com:fastmachinelearning/hls4ml into one…
laurilaatu Feb 9, 2026
51efff0
Restore oneAPI weight placement
laurilaatu Feb 9, 2026
6067bea
pre-commit
laurilaatu Feb 9, 2026
06fda4e
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 10, 2026
bf38a6b
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 13, 2026
e27fd11
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 16, 2026
9f4a448
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 20, 2026
16ca197
softmax multidim templates
laurilaatu Feb 24, 2026
564b692
Merge branch 'oneapi_qmha' of github.com:laurilaatu/hls4ml into oneap…
laurilaatu Feb 24, 2026
974e75a
pre-commit
laurilaatu Feb 24, 2026
060c398
uncomment
laurilaatu Feb 24, 2026
f78558c
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 25, 2026
772b93a
int_inp_t to config
laurilaatu Feb 25, 2026
d2b8921
Merge branch 'oneapi_qmha' of github.com:laurilaatu/hls4ml into oneap…
laurilaatu Feb 25, 2026
a1ad891
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 26, 2026
d65544d
Merge branch 'main' into oneapi_qmha
laurilaatu Mar 16, 2026
2d6a5cc
Merge branch 'main' into oneapi_qmha
laurilaatu Mar 30, 2026
c3a4584
softmax fixed
bugracyln Apr 13, 2026
9b1cf17
Merge branch 'main' into oneapi_qmha
laurilaatu Apr 13, 2026
31b7ad6
table generation cleanup
bugracyln Apr 14, 2026
70b19d1
Merge pull request #4 from bugracyln/smax_fix
laurilaatu Apr 15, 2026
29bdbb3
Merge branch 'main' into oneapi_qmha
laurilaatu Jun 9, 2026
cab4cbc
Fix formatting of inp_norm_t name string
laurilaatu Jun 10, 2026
42ece34
pre-commit for core templates
laurilaatu Jun 10, 2026
7e2798a
pre-commit all
laurilaatu Jun 10, 2026
bd4778e
softmax update
bugracyln Jun 25, 2026
3946858
minor syntax fix
bugracyln Jun 25, 2026
be76917
Merge branch 'oneapi_qmha' into softmax_updated
bugracyln Jun 25, 2026
189f64a
Merge branch 'main' into softmax_updated
jmitrevs Jun 25, 2026
e0aba71
default case handling improvement
bugracyln Jun 28, 2026
584c4f7
minor improvements to default rollback and added comments
bugracyln Jun 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions hls4ml/backends/oneapi/oneapi_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
Embedding,
Layer,
SimpleRNN,
Softmax,
)
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
Expand Down Expand Up @@ -257,13 +256,6 @@ def init_activation(self, layer):
if layer.get_attr('recurrent_activation') == 'tanh':
layer.set_attr('recurrent_activation', 'dense_tanh')

@layer_optimizer(Softmax)
def init_softmax(self, layer):
    """Reject multidimensional Softmax inputs when the model uses io_parallel.

    Reads the model-level 'IOType' config value; when it is 'io_parallel', the
    layer's input variable must have a one-dimensional shape, otherwise an
    AssertionError is raised. Other IO types are left unchecked.
    """
    io_type = layer.model.config.get_config_value('IOType')
    if io_type == 'io_parallel':
        input_shape = layer.get_input_variable().shape
        assert len(input_shape) == 1, 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'

@layer_optimizer(Embedding)
def init_embed(self, layer):
if layer.attributes['n_in'] is None:
Expand Down
89 changes: 88 additions & 1 deletion hls4ml/backends/oneapi/passes/core_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
from hls4ml.utils.fixed_point_utils import ceil_log2

# Dense templates

Expand Down Expand Up @@ -195,11 +196,32 @@ def format(self, node):
softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
static constexpr unsigned n_in = {n_in};
static constexpr unsigned table_size = {table_size};
static constexpr unsigned exp_table_size = {exp_table_size};
static constexpr unsigned inv_table_size = {inv_table_size};
static constexpr unsigned io_type = nnet::{iotype};
static constexpr unsigned reuse_factor = {reuse};

static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
typedef {smax_accum_t} accum_t;
typedef {exp_table_t.name} exp_table_t;
typedef {inv_table_t.name} inv_table_t;
typedef {inv_table_t.name} inv_table_t;"""

softmax_config_table_template = """

using {exp_table_name}_arr_t = nnet::array<exp_table_t, exp_table_size>;
using {inv_table_name}_arr_t = nnet::array<inv_table_t, inv_table_size>;
static constexpr const {exp_table_name}_arr_t exp_table = {exp_table_name};
static constexpr const {inv_table_name}_arr_t invert_table = {inv_table_name};
}};\n"""

softmax_config_table_template_stable = """
typedef {inv_inp_t.name} inv_inp_t;
typedef {inp_norm_t.name} inp_norm_t;

using {exp_table_name}_arr_t = nnet::array<exp_table_t, exp_table_size>;
using {inv_table_name}_arr_t = nnet::array<inv_table_t, inv_table_size>;
static constexpr const {exp_table_name}_arr_t exp_table = {exp_table_name};
static constexpr const {inv_table_name}_arr_t invert_table = {inv_table_name};
}};\n"""

activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
Expand All @@ -221,6 +243,71 @@ def format(self, node):
params = self._default_config_params(node)
params['type'] = node.get_attr('activation')

if params['type'] == 'softmax':
# The lookup input (x - x_max) is always <= 0, so only the negative half of the address space is needed
if 'exp_table_size' in params and params['exp_table_size'] is not None:
params['exp_table_size'] //= 2
else:
# Use the default precision
params['exp_table_size'] = 2 ** (params['table_t'].precision.width - 1)
params['exp_table_t'].precision.width = ceil_log2(params['exp_table_size'])
params['exp_table_t'].precision.integer = params['table_t'].precision.integer - 1
params['exp_table_t'].precision.signed = False

params.setdefault('table_size', params['exp_table_size']) # Not sure if necessary

# Determine accumulator type if present, else derive it yourself based on the input size.
if params['accum_t'].name == 'model_default_t':
extra_bits_req = ceil_log2(params['n_in'])
s = 'true' if params['exp_table_t'].precision.signed else 'false'
w = params['exp_table_t'].precision.width + extra_bits_req
i = params['exp_table_t'].precision.integer + extra_bits_req
params['smax_accum_t'] = f'ac_fixed<{str(w)},{str(i)},{s}>'
else:
params['smax_accum_t'] = params['accum_t'].name

if 'inp_norm_t' not in params:
input_t = node.get_input_variable().type.precision
width, iwidth, signed = input_t.width, input_t.integer, input_t.signed # noqa: F841
width, iwidth = width - signed, iwidth - signed
import copy

params['inp_norm_t'] = copy.deepcopy(params['exp_table_t']) # assign type,later override

# This checks if table sizes will be default, if it is just use the table size to derive precision
if 'inv_table_size' not in params:
params['inp_norm_t'].precision.width = params['exp_table_t'].precision.width + 1
params['inp_norm_t'].precision.integer = params['exp_table_t'].precision.integer + 1
params['inp_norm_t'].precision.signed = True
params['inp_norm_t'].name = f'{node.name}_inp_norm_t'
else:
params[
'inp_norm_t'
].name = f'ac_fixed<{width},{iwidth},{"true" if signed else "false"},AC_RND,AC_SAT_SYM>'

node.set_attr('inp_norm_t', params['inp_norm_t'])

# Again, we only look up 1/sum(e^x), which is >= 0, so there is no need for the entire address space
if 'inv_table_size' in params:
params['inv_table_size'] //= 2
else:
params['inv_table_size'] = 2 ** (params['table_t'].precision.width - 1)
params['inv_table_t'].precision.width = ceil_log2(params['inv_table_size'])
params['inv_table_t'].precision.integer = params['table_t'].precision.integer - 1
params['inv_table_t'].precision.signed = False

params['inv_inp_t'].precision.width = params['inv_table_t'].precision.width + 1
params['inv_inp_t'].precision.integer = params['inv_table_t'].precision.integer + 1
params['inv_inp_t'].precision.signed = True

if params['implementation'] == 'stable':
self.template = softmax_config_template + softmax_config_table_template_stable
else:
self.template = softmax_config_template + softmax_config_table_template

params['exp_table_name'] = node.name + '_exp_table'
params['inv_table_name'] = node.name + '_inv_table'

return self.template.format(**params)


Expand Down
66 changes: 53 additions & 13 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,13 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_

enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };

template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
template <class data_T, unsigned table_size> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
// Number of address bits for table
static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
static constexpr int N = ceillog2<table_size>::val;

// Slice the top N bits of the input
[[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);

// If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
if (x != 0 && y == 0)
y[0] = 1;
Expand All @@ -121,38 +122,38 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
}

template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
// Look-up tables
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

// Find maximum
Op_max<typename data_T::value_type> op_max;
[[intel::fpga_register]] auto x_max =
reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);

// For the diffs, use the same type as the input but force rounding and saturation
[[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
d_xi_xmax[CONFIG_T::n_in];
[[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
d_xi_xmax[i] = data[i] - x_max;
}

// Calculate all the e^x's
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
[[intel::fpga_register]] typename CONFIG_T::accum_t exp_res[CONFIG_T::n_in];
#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
exp_res[i] =
CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(
d_xi_xmax[i])]; // input_t, CONFIG_T
}

// Explicitly sum previously calculated exponentials with an adder tree
Op_add<typename CONFIG_T::exp_table_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
Op_add<typename CONFIG_T::accum_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
reduce<typename CONFIG_T::accum_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);

// Multiply previously calculated exponentials with the reciprocal of the sum
[[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(
exp_sum)];

#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
res[i] = exp_res[i] * inv_exp_sum;
Expand Down Expand Up @@ -265,6 +266,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
}
}

// *************************************************
// Multidimensional Softmax
// *************************************************

// Config adapter used to remap the config for the core softmax function:
// it inherits every member of CONFIG_T unchanged and shadows n_in with
// CONFIG_T::n_slice, so code reading n_in through this type sees the slice length.
template <typename CONFIG_T> struct softmax_multidim_slice_config : public CONFIG_T {
    static constexpr unsigned n_in = CONFIG_T::n_slice;
};

// Softmax along the middle axis of a tensor that is addressed as a flattened
// (n_outer, n_slice, n_inner) array: element (i, j, k) lives at index
// i * n_slice * n_inner + j * n_inner + k.
//
// For every (i, k) pair the n_slice elements along j are gathered into a local
// buffer, passed to nnet::softmax with a config whose n_in is n_slice, and the
// results are written back to the same indices of res.
//
// data: input container, indexed with operator[] and exposing value_type.
// res:  output container, indexed with operator[] and exposing value_type.
template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
    using slice_in_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
    using slice_out_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
    using slice_config = softmax_multidim_slice_config<CONFIG_T>;

    // Consecutive elements of one slice are n_inner apart; consecutive outer
    // indices are n_slice * n_inner apart.
    constexpr unsigned slice_stride = CONFIG_T::n_inner;
    constexpr unsigned outer_stride = CONFIG_T::n_slice * CONFIG_T::n_inner;

    #pragma unroll
    for (unsigned outer = 0; outer < CONFIG_T::n_outer; outer++) {
        #pragma unroll
        for (unsigned inner = 0; inner < CONFIG_T::n_inner; inner++) {
            // Flat index of the first element (j == 0) of this slice
            const unsigned base = outer * outer_stride + inner;

            [[intel::fpga_register]] slice_in_t slice_in;
            [[intel::fpga_register]] slice_out_t slice_out;

            // Gather phase: copy the strided slice into a contiguous buffer
            #pragma unroll
            for (unsigned s = 0; s < CONFIG_T::n_slice; s++) {
                slice_in[s] = data[base + s * slice_stride];
            }

            nnet::softmax<slice_in_t, slice_out_t, slice_config>(slice_in, slice_out);

            // Scatter phase: write the results back to the strided positions
            #pragma unroll
            for (unsigned s = 0; s < CONFIG_T::n_slice; s++) {
                res[base + s * slice_stride] = slice_out[s];
            }
        }
    }
}
// *************************************************
// TanH Activation
// *************************************************
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,73 +271,61 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void softsign_stre
// *************************************************

template <class data_pipe, class res_pipe, typename CONFIG_T> void softmax_stable_stream() {
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

constexpr unsigned multiplier_limit =
DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
using input_arr_t = typename ExtractPipeType<data_pipe>::value_type;
using input_t = typename ExtractPipeType<data_pipe>::value_type::value_type;
constexpr unsigned input_arr_size = std::tuple_size<input_arr_t>{};

[[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type
data_array[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
constexpr unsigned multiplier_limit = DIV_ROUNDUP(input_arr_size, CONFIG_T::reuse_factor);
constexpr unsigned pipeline = input_arr_size / multiplier_limit;

[[intel::fpga_register]] input_t data_array[input_arr_size];

SoftmaxArrayLoop:
[[intel::initiation_interval(pipeline)]] for (unsigned i = 0;
i < CONFIG_T::n_in /
std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
i++) {
[[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / input_arr_size; i++) {
auto in_pack = data_pipe::read();

SoftmaxArrayPackLoop:
#pragma unroll
for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
for (unsigned j = 0; j < input_arr_size; j++) {
data_array[j] = in_pack[j];
}

// Find the max and compute all delta(x_i, x_max)
Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type> op_max;
[[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type x_max =
reduce<typename ExtractPipeType<data_pipe>::value_type::value_type,
std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type>>(data_array, op_max);

// For the diffs, use the same type as the input but force rounding and saturation
[[intel::fpga_register]] ac_fixed<ExtractPipeType<data_pipe>::value_type::value_type::width,
ExtractPipeType<data_pipe>::value_type::value_type::i_width, true, AC_RND, AC_SAT>
d_xi_xmax[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
Op_max<input_t> op_max;
[[intel::fpga_register]] input_t x_max = reduce<input_t, input_arr_size, Op_max<input_t>>(data_array, op_max);

[[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[input_arr_size];

#pragma unroll
for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
for (unsigned j = 0; j < input_arr_size; j++) {
d_xi_xmax[j] = data_array[j] - x_max;
}

// Calculate all the e^x's
[[intel::fpga_register]]
typename CONFIG_T::exp_table_t exp_res[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
[[intel::fpga_register]] typename CONFIG_T::accum_t exp_res[input_arr_size];

#pragma unroll
for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
exp_res[j] =
exp_table[softmax_stable_idx_from_real_val<typename ExtractPipeType<data_pipe>::value_type::value_type,
CONFIG_T>(d_xi_xmax[j])];
for (unsigned j = 0; j < input_arr_size; j++) {
exp_res[j] = CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t,
CONFIG_T::exp_table_size>(d_xi_xmax[j])];
}

// Explicitly sum the results with an adder tree.
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
Op_add<typename CONFIG_T::exp_table_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
reduce<typename CONFIG_T::exp_table_t, std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
Op_add<typename CONFIG_T::accum_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
reduce<typename CONFIG_T::accum_t, input_arr_size, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);

[[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(
exp_sum)];

typename ExtractPipeType<res_pipe>::value_type out_pack;

SoftmaxInvPackLoop:
#pragma unroll
for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {

// TODO - Find Quartus-equivalent pragma
// #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation

out_pack[j] = exp_res[j] * inv_exp_sum;
}

Expand Down
2 changes: 2 additions & 0 deletions hls4ml/templates/oneapi/firmware/parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "nnet_utils/nnet_code_gen.h"
#include "nnet_utils/nnet_helpers.h"

// hls-fpga-machine-learning insert softmax tables

// hls-fpga-machine-learning insert includes

// hls-fpga-machine-learning insert layer-config
Expand Down
Loading