diff --git a/python/tinyProp.py b/python/tinyProp.py
index c643979..44ee2f0 100644
--- a/python/tinyProp.py
+++ b/python/tinyProp.py
@@ -2,7 +2,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import Union
-from torch.nn.common_types import _size_2_t
+from torch.nn.common_types import _size_1_t, _size_2_t


 # classes to hold TinyProp parameters on Net and Layer scope
@@ -55,7 +55,26 @@ def selectGradients(self, grad_output, params):
         idx = torch.hstack(idx)
         val = torch.cat(val)
         return idx, val
-
+
+
+#========== Helper functions ==========#
+
+def _apply_tinyprop_mask(tp_info: "TinyPropLayer", grad_output: torch.Tensor, tp_params: TinyPropParams) -> torch.Tensor:
+    """Zero every entry of ``grad_output`` that TinyProp did not select.
+
+    All dimensions after the first are flattened, ``tp_info.selectGradients`` picks the
+    entries to keep, and those values are written into a zero tensor created with
+    ``torch.zeros_like``. The result is returned in the original shape of ``grad_output``.
+    """
+    flattened = torch.flatten(grad_output, start_dim=1)
+    indices, values = tp_info.selectGradients(flattened, tp_params)
+
+    masked_flat = torch.zeros_like(flattened)
+    if values.numel() > 0:  # nothing selected -> keep the all-zero mask
+        masked_flat[indices[0], indices[1]] = values
+
+    return masked_flat.view_as(grad_output)
+


 #========== LINEAR ==========#
@@ -113,9 +132,102 @@ def forward(self, input):
         return SparseLinear.apply(input, self.weight, self.tpParams, self, self.bias)


-#========== CONVOLUTION ==========#
+#========== CONVOLUTION 1D ==========#
+
+class SparseConv1d(torch.autograd.Function):
+    """1D convolution whose backward pass only uses the gradients selected by TinyProp."""
+
+    @staticmethod
+    def forward(ctx, input, weight, bias, stride, padding, dilation, groups, padding_mode,
+                _reversed_padding_repeated_twice, tpParams: TinyPropParams, tpInfo: TinyPropLayer):
+        ctx.save_for_backward(input, weight, bias)
+
+        ctx.stride = stride
+        ctx.padding = padding
+        ctx.dilation = dilation
+        ctx.groups = groups
+        ctx.tpParams = tpParams
+        ctx.tpInfo = tpInfo
+
+        # torch.nn.grad.conv1d_* only model numeric zero padding. For every other case
+        # (non-'zeros' padding modes, 'same'/'valid' strings) backward has to replay the
+        # padding explicitly, so remember how to rebuild the padded input.
+        ctx.explicit_pad = None
+        if padding_mode != 'zeros':
+            ctx.explicit_pad = (tuple(_reversed_padding_repeated_twice), padding_mode)
+        elif isinstance(padding, str):
+            ctx.explicit_pad = (tuple(_reversed_padding_repeated_twice), 'constant')
+
+        if padding_mode != 'zeros':
+            return F.conv1d(F.pad(input, _reversed_padding_repeated_twice, mode=padding_mode),
+                            weight, bias, stride, 0, dilation, groups)
+        return F.conv1d(input, weight, bias, stride, padding, dilation, groups)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, weight, bias = ctx.saved_tensors
+
+        grad_input = grad_weight = grad_bias = None
+
+        masked_grad = _apply_tinyprop_mask(ctx.tpInfo, grad_output, ctx.tpParams)
+
+        conv_input, padding, leaf = input, ctx.padding, None
+        if ctx.explicit_pad is not None:
+            # Rebuild the tensor the convolution really saw; autograd records the padding so
+            # the gradient can be mapped back onto the unpadded input afterwards.
+            pad, mode = ctx.explicit_pad
+            with torch.enable_grad():
+                leaf = input.detach().requires_grad_(True)
+                conv_input = F.pad(leaf, pad, mode=mode)
+            padding = 0
+
+        if ctx.needs_input_grad[0]:
+            grad_input = torch.nn.grad.conv1d_input(conv_input.shape, weight, masked_grad, ctx.stride,
+                                                    padding, ctx.dilation, ctx.groups)
+            if leaf is not None:
+                grad_input, = torch.autograd.grad(conv_input, leaf, grad_input)
+        if ctx.needs_input_grad[1]:
+            grad_weight = torch.nn.grad.conv1d_weight(conv_input.detach(), weight.shape, masked_grad, ctx.stride,
+                                                      padding, ctx.dilation, ctx.groups)
+        if bias is not None and ctx.needs_input_grad[2]:
+            sum_dims = (0,) + tuple(range(2, masked_grad.dim()))
+            grad_bias = masked_grad.sum(dim=sum_dims)
+
+        return grad_input, grad_weight, grad_bias, None, None, None, None, None, None, None, None
+
+
+class TinyPropConv1d(TinyPropLayer, nn.Conv1d):
+    """nn.Conv1d variant that routes its forward pass through SparseConv1d."""
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 tinyPropParams: TinyPropParams,
+                 layer_number: int,
+                 stride: _size_1_t = 1,
+                 padding: Union[str, _size_1_t] = 0,
+                 dilation: _size_1_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 device=None,
+                 dtype=None):
+        TinyPropLayer.__init__(self, tinyPropParams.number_of_layers - layer_number)
+        nn.Conv1d.__init__(self, in_channels, out_channels, kernel_size, stride=stride, padding=padding,
+                           dilation=dilation, groups=groups, bias=bias, padding_mode=padding_mode,
+                           device=device, dtype=dtype)
+
+        self.tpParams = tinyPropParams

-class SparseConv2d(torch.autograd.Function):
+    def forward(self, input):
+        return SparseConv1d.apply(input, self.weight, self.bias, self.stride, self.padding, self.dilation,
+                                  self.groups, self.padding_mode, self._reversed_padding_repeated_twice,
+                                  self.tpParams, self)
+
+
+#========== CONVOLUTION 2D ==========#
+
+class SparseConv2d(torch.autograd.Function):
     # keep in mind that convolution operations DO NOT reduce the batchSize (in contrast to matmul)!

     @staticmethod
@@ -150,44 +262,21 @@ def backward(ctx, grad_output):
         # Initialize all gradients w.r.t. inputs to None
         grad_input = grad_weight = grad_bias = None

-        # This is the TinyProp part: conv can't handle sparse matrices so I have to build a masked version based on the selected gradients
-        out_ch = grad_output.shape[1]
-        out_width = grad_output.shape[2]
-        out_height = grad_output.shape[3]
-        # flatten elements to work with the gradient selection
-        flattened = torch.flatten(grad_output, start_dim=1)
-        indices, values = ctx.tpInfo.selectGradients(flattened, ctx.tpParams)
-        # mask grad_output by reinitializing with zeros
-        grad_output = torch.zeros(flattened.size())
-        # then loop over and set all selected gradient entries
-        for i in range(indices.size(1)):
-            grad_output[indices[0, i], indices[1, i]] = values[i]
-        # undo the flattening
-        grad_output = grad_output.view(-1, out_ch, out_width, out_height).to(weight.device)
-
-
-        # proceed with layer specific computations
+        masked_grad = _apply_tinyprop_mask(ctx.tpInfo, grad_output, ctx.tpParams)
+        # NOTE(review): conv2d_input/conv2d_weight model ctx.padding as numeric zero padding. If forward
+        # pads explicitly for padding_mode != 'zeros' (not visible in this hunk), confirm the gradients.
+
         if ctx.needs_input_grad[0]:
-            # can be solved by deconvolving grad_output with weight
-            grad_input = F.conv_transpose2d(grad_output, weight, None, ctx.stride, ctx.padding, groups=ctx.groups, dilation=ctx.dilation)
+            grad_input = torch.nn.grad.conv2d_input(input.shape, weight, masked_grad, ctx.stride,
+                                                    ctx.padding, ctx.dilation, ctx.groups)
         if ctx.needs_input_grad[1]:
-            # can be solved by convolving input with grad_output, but the resulting grad_weight is 5d which the conv function can't handle.
-            # I mitigate this problem by slicing the input by input channel. I can then do the convolution with this reduced dimension, where
-            # I can process the batch-dimension as input channel. Later grad_weight is constructed from these sub-convolutions
-
-            # use batch-dimension as in-channel [out, b, w, h] = [out, in, w, h]
-            permutated = grad_output.permute(1, 0, 2, 3)
-            # dismantle real input-channel
-            input_channels = torch.unbind(input, dim=1)
-            res = []
-            for channel in input_channels:
-                res.append(F.conv2d(channel, permutated, None, ctx.stride, ctx.padding, groups=ctx.groups, dilation=ctx.dilation))
-            grad_weight = torch.stack(res, dim=0).permute(1, 0, 2, 3)
+            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, masked_grad, ctx.stride,
+                                                      ctx.padding, ctx.dilation, ctx.groups)

         if bias is not None and ctx.needs_input_grad[2]:
-            # simply sum up all elements over width, height
-            grad_bias = torch.sum(grad_output, dim=(2,3))
+            sum_dims = (0,) + tuple(range(2, masked_grad.dim()))
+            grad_bias = masked_grad.sum(dim=sum_dims)

         return grad_input, grad_weight, grad_bias, None, None, None, None, None, None, None, None


@@ -202,11 +291,15 @@ def __init__(self,
                  stride: _size_2_t = 1,
                  padding: Union[str, _size_2_t] = 0,
                  dilation: _size_2_t = 1,
+                 groups: int = 1,
                  bias: bool = True,
+                 padding_mode: str = 'zeros',
                  device = None,
                  dtype = None):
         TinyPropLayer.__init__(self, tinyPropParams.number_of_layers - layer_number)
-        nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, 1, bias, device=device, dtype=dtype)
+        nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size, stride=stride, padding=padding,
+                           dilation=dilation, groups=groups, bias=bias, padding_mode=padding_mode,
+                           device=device, dtype=dtype)

         # Saving variables like this will pass it by REFERENCE, so changes
         # made in backwards are reflected in layer
diff --git a/src/aifes.h b/src/aifes.h
index a078f8f..5551121 100644
--- a/src/aifes.h
+++ b/src/aifes.h
@@ -43,6 +43,8 @@ extern "C" {

 // Include the layer base implementations
 #include "basic/base/ailayer/ailayer_dense.h"
+#include "basic/base/ailayer/ailayer_conv1d.h"
+#include
"basic/base/ailayer/ailayer_conv2d.h"
 #include "basic/base/ailayer/ailayer_input.h"
 #include "basic/base/ailayer/ailayer_relu.h"
 #include "basic/base/ailayer/ailayer_leaky_relu.h"
@@ -68,6 +70,8 @@ extern "C" {

 // Include the layers in default implementation
 #include "basic/default/ailayer/ailayer_dense_default.h"
+#include "basic/default/ailayer/ailayer_conv1d_default.h"
+#include "basic/default/ailayer/ailayer_conv2d_default.h"
 #include "basic/default/ailayer/ailayer_input_default.h"
 #include "basic/default/ailayer/ailayer_relu_default.h"
 #include "basic/default/ailayer/ailayer_leaky_relu_default.h"
diff --git a/src/basic/base/ailayer/ailayer_conv1d.c b/src/basic/base/ailayer/ailayer_conv1d.c
new file mode 100644
index 0000000..d361422
--- /dev/null
+++ b/src/basic/base/ailayer/ailayer_conv1d.c
@@ -0,0 +1,278 @@
+/**
+ * \file basic/base/ailayer/ailayer_conv1d.c
+ * \version 2.0alpha
+ * \date 27.05.2024
+ * \copyright  Copyright (C) 2020-2024  Fraunhofer Institute for Microelectronic Circuits and Systems.
+    All rights reserved.
+
+    AIfES is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "basic/base/ailayer/ailayer_conv1d.h"
+#include "basic/base/aimath/aimath_basic.h"
+
+const aicore_layertype_t ailayer_conv1d_type_s = {
+#ifdef AIDEBUG_PRINT_MODULE_SPECS
+    .name = "Conv1D",
+    .print_specs = ailayer_conv1d_print_specs
+#else
+    .name = 0,
+    .print_specs = 0
+#endif
+};
+const aicore_layertype_t *ailayer_conv1d_type = &ailayer_conv1d_type_s;
+
+/* Initializes the Conv1D layer, links it behind input_layer and returns its base (0 on invalid configuration). */
+ailayer_t *ailayer_conv1d(ailayer_conv1d_t *layer, ailayer_t *input_layer)
+{
+    layer->requires_grad = 0x03; /* weights and bias enabled by default */
+    layer->base.layer_type = ailayer_conv1d_type;
+
+    layer->base.input_layer = input_layer;
+    input_layer->output_layer = &(layer->base);
+
+    layer->base.layer_configuration = layer;
+    layer->base.result.dtype = layer->result_dtype;
+    layer->base.result.dim = 3;
+    layer->base.result.shape = layer->result_shape;
+
+    layer->base.deltas.dtype = layer->result_dtype;
+    layer->base.deltas.dim = input_layer->result.dim;
+#ifdef DEBUG_CHECKS
+    if(input_layer->result.dim != 3)
+    {
+        LOG_E("Conv1D: input tensor must have 3 dimensions.\n");
+        return 0;
+    }
+#endif
+    layer->base.deltas.shape = layer->deltas_shape;
+
+    uint8_t i;
+    for(i = 0; i < input_layer->result.dim && i < 3; i++){
+        layer->deltas_shape[i] = input_layer->result.shape[i];
+    }
+
+    layer->weights.dim = 3;
+    layer->weights.dtype = layer->weights_dtype;
+    layer->weights.shape = layer->weights_shape;
+    layer->weights.shape[0] = layer->out_channels;
+    if(layer->groups == 0){
+        /* Checked in every build: the division by groups below would be undefined otherwise. */
+#ifdef DEBUG_CHECKS
+        LOG_E("Conv1D: groups must be greater than zero.\n");
+#endif
+        return 0;
+    }
+#ifdef DEBUG_CHECKS
+    if(input_layer->result.shape[1] % layer->groups != 0){
+        LOG_E("Conv1D: input channels not divisible by groups.\n");
+        return 0;
+    }
+    if(layer->out_channels % layer->groups != 0){
+        LOG_E("Conv1D: output channels not divisible by groups.\n");
+        return 0;
+    }
+#endif
+    layer->weights.shape[1] = input_layer->result.shape[1] / layer->groups;
+    layer->weights.shape[2] = layer->kernel_size;
+
+    layer->bias.dim = 1;
+    layer->bias.dtype = layer->bias_dtype;
+    layer->bias.shape = layer->bias_shape;
+    layer->bias.shape[0] = layer->out_channels;
+
+    layer->base.forward = ailayer_conv1d_forward;
+    layer->base.backward = ailayer_conv1d_backward;
+    layer->base.backward_meProp = ailayer_conv1d_backward_meProp;
+
+    layer->base.calc_result_shape = ailayer_conv1d_calc_result_shape;
+    layer->base.sizeof_paramem = ailayer_conv1d_sizeof_paramem;
+    layer->base.set_paramem = ailayer_conv1d_set_paramem;
+    layer->base.sizeof_trainmem = ailayer_conv1d_sizeof_trainmem;
+    layer->base.set_trainmem = ailayer_conv1d_set_trainmem;
+
+    layer->base.get_result_bound = 0;
+
+    layer->base.trainable_params_count = 2;
+    layer->base.trainable_params = layer->trainable_params;
+    layer->base.gradients = layer->gradients;
+    layer->base.optimem = layer->optimem;
+
+    layer->trainable_params[0] = &(layer->weights);
+    layer->trainable_params[1] = &(layer->bias);
+
+    return &(layer->base);
+}
+
+/* Forward pass: runs the configured conv function on the result of the input layer. */
+void ailayer_conv1d_forward(ailayer_t *self)
+{
+    ailayer_conv1d_t *layer = (ailayer_conv1d_t *)(self->layer_configuration);
+    aitensor_t *input_tensor = &(self->input_layer->result);
+    aitensor_t *result_tensor = &(self->result);
+
+    layer->conv(input_tensor, &(layer->weights), &(layer->bias),
+                layer->stride, layer->padding, layer->dilation, layer->groups,
+                result_tensor);
+}
+
+/* Backward pass: computes the enabled weight/bias gradients and the deltas of this layer. */
+void ailayer_conv1d_backward(ailayer_t *self)
+{
+    ailayer_conv1d_t *layer = (ailayer_conv1d_t *)(self->layer_configuration);
+    aitensor_t *delta_in = &(self->deltas);
+    aitensor_t *delta_out = &(self->output_layer->deltas);
+    aitensor_t *x_in = &(self->input_layer->result);
+
+    if(layer->requires_grad & 0x01){
+        layer->conv_weight_grad(delta_out, x_in,
+                                layer->stride, layer->padding, layer->dilation, layer->groups,
+                                layer->gradients[0]);
+    }
+    if(layer->requires_grad & 0x02){
+        layer->conv_bias_grad(delta_out, layer->gradients[1]);
+    }
+
+    layer->conv_input_grad(delta_out, &(layer->weights),
+                           layer->stride, layer->padding, layer->dilation, layer->groups,
+                           delta_in);
+}
+
+/* meProp variant: the sparsification parameters are ignored and the dense backward pass is executed. */
+void ailayer_conv1d_backward_meProp(ailayer_t *self, float maxBpr, float minBpr, float damping, int dense_counter)
+{
+    (void)maxBpr;
+    (void)minBpr;
+    (void)damping;
+    (void)dense_counter;
+    ailayer_conv1d_backward(self);
+}
+
+/* Derives the result shape (batch, out_channels, out_length) and the deltas shape from the input shape. */
+void ailayer_conv1d_calc_result_shape(ailayer_t *self)
+{
+    ailayer_conv1d_t *layer = (ailayer_conv1d_t *)(self->layer_configuration);
+    aitensor_t *x_in = &(self->input_layer->result);
+
+    uint16_t batch = x_in->shape[0];
+    uint16_t in_length = x_in->shape[2];
+
+    /* An output length of 0 is reported when the dilated kernel does not fit into the padded
+     * input or when stride is 0, instead of wrapping around in uint16_t or dividing by zero. */
+    int32_t numerator = (int32_t)in_length + 2 * (int32_t)layer->padding -
+                        (int32_t)layer->dilation * ((int32_t)layer->kernel_size - 1) - 1;
+    uint16_t out_length = (numerator < 0 || layer->stride == 0) ? 0 : (uint16_t)(numerator / layer->stride + 1);
+
+    self->result.shape[0] = batch;
+    self->result.shape[1] = layer->out_channels;
+    self->result.shape[2] = out_length;
+
+    layer->deltas_shape[0] = batch;
+    layer->deltas_shape[1] = x_in->shape[1];
+    layer->deltas_shape[2] = in_length;
+}
+
+/* Returns the number of bytes needed for weights and bias (tensor params plus data). */
+uint32_t ailayer_conv1d_sizeof_paramem(const ailayer_t *self)
+{
+    const ailayer_conv1d_t *layer = (const ailayer_conv1d_t *)(self->layer_configuration);
+    uint32_t memory = 0;
+
+    memory += layer->weights_dtype->tensor_params_size;
+    memory += layer->out_channels * layer->weights.shape[1] * layer->kernel_size *
+              aimath_sizeof_dtype(layer->weights_dtype);
+
+    memory += layer->bias_dtype->tensor_params_size;
+    memory += layer->out_channels * aimath_sizeof_dtype(layer->bias_dtype);
+
+    return memory;
+}
+
+/* Distributes the parameter memory block to the weights and bias tensors. */
+void ailayer_conv1d_set_paramem(ailayer_t *self, void *memory_ptr)
+{
+    uint32_t address_counter = 0;
+    ailayer_conv1d_t *layer = (ailayer_conv1d_t *)(self->layer_configuration);
+
+    layer->weights.tensor_params = memory_ptr + address_counter;
+    address_counter += layer->weights_dtype->tensor_params_size;
+    layer->weights.dim = 3;
+    layer->weights.dtype = layer->weights_dtype;
+    layer->weights.shape = layer->weights_shape;
+    layer->weights.shape[0] = layer->out_channels;
+    layer->weights.shape[1] = self->input_layer->result.shape[1] / layer->groups;
+    layer->weights.shape[2] = layer->kernel_size;
+    layer->weights.data = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_data(&(layer->weights));
+
+    layer->bias.tensor_params = memory_ptr + address_counter;
+    address_counter += layer->bias_dtype->tensor_params_size;
+    layer->bias.dim = 1;
+    layer->bias.dtype = layer->bias_dtype;
+    layer->bias.shape = layer->bias_shape;
+    layer->bias.shape[0] = layer->out_channels;
+    layer->bias.data = memory_ptr + address_counter;
+
+    layer->trainable_params[0] = &(layer->weights);
+    layer->trainable_params[1] = &(layer->bias);
+}
+
+/* Returns the number of bytes needed for the gradient tensors of weights and bias. */
+uint32_t ailayer_conv1d_sizeof_trainmem(const ailayer_t *self)
+{
+    const ailayer_conv1d_t *layer = (const ailayer_conv1d_t *)(self->layer_configuration);
+    uint32_t memory = 0;
+
+    memory += aimath_sizeof_tensor(&(layer->weights));
+    memory += aimath_sizeof_tensor(&(layer->bias));
+
+    return memory;
+}
+
+/* Distributes the training memory block to the two gradient tensors (weights first, then bias). */
+void ailayer_conv1d_set_trainmem(ailayer_t *self, void *memory_ptr)
+{
+    uint32_t address_counter = 0;
+    ailayer_conv1d_t *layer = (ailayer_conv1d_t *)(self->layer_configuration);
+
+    self->gradients[0] = memory_ptr + address_counter;
+    address_counter += sizeof(aitensor_t);
+    self->gradients[0]->dim = layer->weights.dim;
+    self->gradients[0]->dtype = layer->weights.dtype;
+    self->gradients[0]->shape = layer->weights.shape;
+    self->gradients[0]->data = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_data(self->gradients[0]);
+    self->gradients[0]->tensor_params = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_params(self->gradients[0]);
+
+    self->gradients[1] = memory_ptr + address_counter;
+    address_counter += sizeof(aitensor_t);
+    self->gradients[1]->dim = layer->bias.dim;
+    self->gradients[1]->dtype = layer->bias.dtype;
+    self->gradients[1]->shape = layer->bias.shape;
+    self->gradients[1]->data = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_data(self->gradients[1]);
+    self->gradients[1]->tensor_params = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_params(self->gradients[1]);
+}
+
+#ifdef AIDEBUG_PRINT_MODULE_SPECS
+/* Prints out_channels, kernel_size and stride using the given print function. */
+void ailayer_conv1d_print_specs(const ailayer_t *self, int (*print)(const char *format, ...))
+{
+    const ailayer_conv1d_t *layer = (const ailayer_conv1d_t *)(self->layer_configuration);
+    print("out_channels: %lu, kernel_size: %lu, stride: %lu", (unsigned long)layer->out_channels,
+          (unsigned long)layer->kernel_size, (unsigned long)layer->stride);
+}
+#endif
diff --git a/src/basic/base/ailayer/ailayer_conv1d.h b/src/basic/base/ailayer/ailayer_conv1d.h
new file mode 100644
index 0000000..655f39e
--- /dev/null
+++ b/src/basic/base/ailayer/ailayer_conv1d.h
@@ -0,0 +1,108 @@
+/**
+ * \file basic/base/ailayer/ailayer_conv1d.h
+ * \internal
+ * \date 27.05.2024
+ * \endinternal
+ * \version 2.0alpha
+ * \copyright  Copyright (C) 2020-2024  Fraunhofer Institute for Microelectronic Circuits and Systems.
+    All rights reserved.
+
+    AIfES is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef AILAYER_CONV1D
+#define AILAYER_CONV1D
+
+#include "core/aifes_core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Forward declaration of the Conv1D layer structure. */
+typedef struct ailayer_conv1d ailayer_conv1d_t;
+
+/**
+ * @brief General Conv1D layer structure.
+ */
+struct ailayer_conv1d {
+    ailayer_t base; /**< Inherited field members from general ailayer struct. */
+    const aimath_dtype_t *result_dtype; /**< Data type of the inference result values. */
+    const aimath_dtype_t *weights_dtype; /**< Data type of the weights. */
+    const aimath_dtype_t *bias_dtype; /**< Data type of the bias. */
+
+    /** @name Layer configuration */
+    ///@{
+    uint16_t out_channels; /**< Number of output channels. */
+    uint16_t kernel_size; /**< Size of the convolution kernel. */
+    uint16_t stride; /**< Stride of the convolution. */
+    uint16_t padding; /**< Zero padding applied to both sides of the input. */
+    uint16_t dilation; /**< Kernel dilation factor. */
+    uint16_t groups; /**< Number of blocked connections from input channels to output channels. */
+    ///@}
+
+    /** @name Trainable parameters */
+    ///@{
+    aitensor_t weights; /**< Tensor containing the convolution kernels. */
+    aitensor_t bias; /**< Tensor containing the bias. */
+
+    uint16_t result_shape[3]; /**< Result tensor shape (batch, channels, length). */
+    uint16_t deltas_shape[3]; /**< Delta tensor shape. */
+    uint16_t weights_shape[3]; /**< Weights tensor shape (out_channels, in_channels / groups, kernel_size). */
+    uint16_t bias_shape[1]; /**< Bias tensor shape (out_channels). */
+
+    uint8_t requires_grad; /**< Bit mask to control gradient calculation (bit0: weights, bit1: bias). */
+
+    aitensor_t *trainable_params[2]; /**< Pointers to trainable parameter tensors. */
+    aitensor_t *gradients[2]; /**< Gradient tensors (same ordering as trainable_params). */
+    void *optimem[2]; /**< Memory used by the training optimizer. */
+    ///@}
+
+    /** @name Math functions */
+    ///@{
+    void (*conv)(const aitensor_t *input, const aitensor_t *weights, const aitensor_t *bias,
+                 uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups,
+                 aitensor_t *output); /**< Forward convolution, called by ailayer_conv1d_forward(). */
+    void (*conv_input_grad)(const aitensor_t *delta_out, const aitensor_t *weights,
+                            uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups,
+                            aitensor_t *delta_in); /**< Writes the deltas of this layer, called by ailayer_conv1d_backward(). */
+    void (*conv_weight_grad)(const aitensor_t *delta_out, const aitensor_t *input,
+                             uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups,
+                             aitensor_t *d_weights); /**< Writes the weight gradients, called when bit0 of requires_grad is set. */
+    void (*conv_bias_grad)(const aitensor_t *delta_out, aitensor_t *d_bias); /**< Writes the bias gradients, called when bit1 of requires_grad is set. */
+    ///@}
+};
+
+/** @brief Conv1D layer type indicator. */
+extern const aicore_layertype_t *ailayer_conv1d_type;
+
+ailayer_t *ailayer_conv1d(ailayer_conv1d_t *layer, ailayer_t *input_layer); /**< Initializes the layer and links it behind input_layer. */
+void ailayer_conv1d_forward(ailayer_t *self); /**< Forward pass via the conv function pointer. */
+void ailayer_conv1d_backward(ailayer_t *self); /**< Backward pass via the conv_*_grad function pointers. */
+void ailayer_conv1d_backward_meProp(ailayer_t *self, float maxBpr, float minBpr, float damping, int dense_counter); /**< Ignores the meProp parameters and calls ailayer_conv1d_backward(). */
+void ailayer_conv1d_calc_result_shape(ailayer_t *self); /**< Sets the result and deltas shapes from the input shape. */
+uint32_t ailayer_conv1d_sizeof_paramem(const ailayer_t *self); /**< Bytes required for weights and bias. */
+void ailayer_conv1d_set_paramem(ailayer_t *self, void *memory_ptr); /**< Assigns parameter memory to weights and bias. */
+uint32_t ailayer_conv1d_sizeof_trainmem(const ailayer_t *self); /**< Bytes required for the gradient tensors. */
+void ailayer_conv1d_set_trainmem(ailayer_t *self, void *memory_ptr); /**< Assigns training memory to the gradient tensors. */
+
+#ifdef AIDEBUG_PRINT_MODULE_SPECS
+void ailayer_conv1d_print_specs(const ailayer_t *self, int (*print)(const char *format, ...)); /**< Prints out_channels, kernel_size and stride. */
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AILAYER_CONV1D
diff --git a/src/basic/base/ailayer/ailayer_conv2d.c b/src/basic/base/ailayer/ailayer_conv2d.c
new file mode 100644
index 0000000..9047aba
--- /dev/null
+++ b/src/basic/base/ailayer/ailayer_conv2d.c
@@ -0,0 +1,295 @@
+/**
+ * \file basic/base/ailayer/ailayer_conv2d.c
+ * \version 2.0alpha
+ * \date 27.05.2024
+ * \copyright  Copyright (C) 2020-2024  Fraunhofer Institute for Microelectronic Circuits and Systems.
+    All rights reserved.
+
+    AIfES is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "basic/base/ailayer/ailayer_conv2d.h"
+#include "basic/base/aimath/aimath_basic.h"
+
+const aicore_layertype_t ailayer_conv2d_type_s = {
+#ifdef AIDEBUG_PRINT_MODULE_SPECS
+    .name = "Conv2D",
+    .print_specs = ailayer_conv2d_print_specs
+#else
+    .name = 0,
+    .print_specs = 0
+#endif
+};
+const aicore_layertype_t *ailayer_conv2d_type = &ailayer_conv2d_type_s;
+
+/* Initializes the Conv2D layer, links it behind input_layer and returns its base (0 on invalid configuration). */
+ailayer_t *ailayer_conv2d(ailayer_conv2d_t *layer, ailayer_t *input_layer)
+{
+    layer->requires_grad = 0x03; /* weights and bias enabled by default */
+    layer->base.layer_type = ailayer_conv2d_type;
+
+    layer->base.input_layer = input_layer;
+    input_layer->output_layer = &(layer->base);
+
+    layer->base.layer_configuration = layer;
+    layer->base.result.dtype = layer->result_dtype;
+    layer->base.result.dim = 4;
+    layer->base.result.shape = layer->result_shape;
+
+    layer->base.deltas.dtype = layer->result_dtype;
+    layer->base.deltas.dim = input_layer->result.dim;
+#ifdef DEBUG_CHECKS
+    if(input_layer->result.dim != 4)
+    {
+        LOG_E("Conv2D: input tensor must have 4 dimensions.\n");
+        return 0;
+    }
+#endif
+    layer->base.deltas.shape = layer->deltas_shape;
+
+    uint8_t i;
+    for(i = 0; i < input_layer->result.dim && i < 4; i++){
+        layer->deltas_shape[i] = input_layer->result.shape[i];
+    }
+
+    layer->weights.dim = 4;
+    layer->weights.dtype = layer->weights_dtype;
+    layer->weights.shape = layer->weights_shape;
+    layer->weights.shape[0] = layer->out_channels;
+    if(layer->groups == 0){
+        /* Checked in every build: the division by groups below would be undefined otherwise. */
+#ifdef DEBUG_CHECKS
+        LOG_E("Conv2D: groups must be greater than zero.\n");
+#endif
+        return 0;
+    }
+#ifdef DEBUG_CHECKS
+    if(input_layer->result.shape[1] % layer->groups != 0){
+        LOG_E("Conv2D: input channels not divisible by groups.\n");
+        return 0;
+    }
+    if(layer->out_channels % layer->groups != 0){
+        LOG_E("Conv2D: output channels not divisible by groups.\n");
+        return 0;
+    }
+#endif
+    layer->weights.shape[1] = input_layer->result.shape[1] / layer->groups;
+    layer->weights.shape[2] = layer->kernel_height;
+    layer->weights.shape[3] = layer->kernel_width;
+
+    layer->bias.dim = 1;
+    layer->bias.dtype = layer->bias_dtype;
+    layer->bias.shape = layer->bias_shape;
+    layer->bias.shape[0] = layer->out_channels;
+
+    layer->base.forward = ailayer_conv2d_forward;
+    layer->base.backward = ailayer_conv2d_backward;
+    layer->base.backward_meProp = ailayer_conv2d_backward_meProp;
+
+    layer->base.calc_result_shape = ailayer_conv2d_calc_result_shape;
+    layer->base.sizeof_paramem = ailayer_conv2d_sizeof_paramem;
+    layer->base.set_paramem = ailayer_conv2d_set_paramem;
+    layer->base.sizeof_trainmem = ailayer_conv2d_sizeof_trainmem;
+    layer->base.set_trainmem = ailayer_conv2d_set_trainmem;
+
+    layer->base.get_result_bound = 0;
+
+    layer->base.trainable_params_count = 2;
+    layer->base.trainable_params = layer->trainable_params;
+    layer->base.gradients = layer->gradients;
+    layer->base.optimem = layer->optimem;
+
+    layer->trainable_params[0] = &(layer->weights);
+    layer->trainable_params[1] = &(layer->bias);
+
+    return &(layer->base);
+}
+
+/* Forward pass: runs the configured conv function on the result of the input layer. */
+void ailayer_conv2d_forward(ailayer_t *self)
+{
+    ailayer_conv2d_t *layer = (ailayer_conv2d_t *)(self->layer_configuration);
+    aitensor_t *input_tensor = &(self->input_layer->result);
+    aitensor_t *result_tensor = &(self->result);
+
+    layer->conv(input_tensor, &(layer->weights), &(layer->bias),
+                layer->stride_height, layer->stride_width,
+                layer->padding_height, layer->padding_width,
+                layer->dilation_height, layer->dilation_width,
+                layer->groups, result_tensor);
+}
+
+/* Backward pass: computes the enabled weight/bias gradients and the deltas of this layer. */
+void ailayer_conv2d_backward(ailayer_t *self)
+{
+    ailayer_conv2d_t *layer = (ailayer_conv2d_t *)(self->layer_configuration);
+    aitensor_t *delta_in = &(self->deltas);
+    aitensor_t *delta_out = &(self->output_layer->deltas);
+    aitensor_t *x_in = &(self->input_layer->result);
+
+    if(layer->requires_grad & 0x01){
+        layer->conv_weight_grad(delta_out, x_in,
+                                layer->stride_height, layer->stride_width,
+                                layer->padding_height, layer->padding_width,
+                                layer->dilation_height, layer->dilation_width,
+                                layer->groups, layer->gradients[0]);
+    }
+    if(layer->requires_grad & 0x02){
+        layer->conv_bias_grad(delta_out, layer->gradients[1]);
+    }
+
+    layer->conv_input_grad(delta_out, &(layer->weights),
+                           layer->stride_height, layer->stride_width,
+                           layer->padding_height, layer->padding_width,
+                           layer->dilation_height, layer->dilation_width,
+                           layer->groups, delta_in);
+}
+
+/* meProp variant: the sparsification parameters are ignored and the dense backward pass is executed. */
+void ailayer_conv2d_backward_meProp(ailayer_t *self, float maxBpr, float minBpr, float damping, int dense_counter)
+{
+    (void)maxBpr;
+    (void)minBpr;
+    (void)damping;
+    (void)dense_counter;
+    ailayer_conv2d_backward(self);
+}
+
+/* Derives the result shape (batch, out_channels, out_height, out_width) and the deltas shape from the input shape. */
+void ailayer_conv2d_calc_result_shape(ailayer_t *self)
+{
+    ailayer_conv2d_t *layer = (ailayer_conv2d_t *)(self->layer_configuration);
+    aitensor_t *x_in = &(self->input_layer->result);
+
+    uint16_t batch = x_in->shape[0];
+    uint16_t in_height = x_in->shape[2];
+    uint16_t in_width = x_in->shape[3];
+
+    /* An output size of 0 is reported when the dilated kernel does not fit into the padded
+     * input or when a stride is 0, instead of wrapping around in uint16_t or dividing by zero. */
+    int32_t numerator_h = (int32_t)in_height + 2 * (int32_t)layer->padding_height -
+                          (int32_t)layer->dilation_height * ((int32_t)layer->kernel_height - 1) - 1;
+    uint16_t out_height = (numerator_h < 0 || layer->stride_height == 0) ? 0 : (uint16_t)(numerator_h / layer->stride_height + 1);
+
+    int32_t numerator_w = (int32_t)in_width + 2 * (int32_t)layer->padding_width -
+                          (int32_t)layer->dilation_width * ((int32_t)layer->kernel_width - 1) - 1;
+    uint16_t out_width = (numerator_w < 0 || layer->stride_width == 0) ? 0 : (uint16_t)(numerator_w / layer->stride_width + 1);
+
+    self->result.shape[0] = batch;
+    self->result.shape[1] = layer->out_channels;
+    self->result.shape[2] = out_height;
+    self->result.shape[3] = out_width;
+
+    layer->deltas_shape[0] = batch;
+    layer->deltas_shape[1] = x_in->shape[1];
+    layer->deltas_shape[2] = in_height;
+    layer->deltas_shape[3] = in_width;
+}
+
+/* Returns the number of bytes needed for weights and bias (tensor params plus data). */
+uint32_t ailayer_conv2d_sizeof_paramem(const ailayer_t *self)
+{
+    const ailayer_conv2d_t *layer = (const ailayer_conv2d_t *)(self->layer_configuration);
+    uint32_t memory = 0;
+
+    memory += layer->weights_dtype->tensor_params_size;
+    memory += layer->out_channels * layer->weights.shape[1] * layer->kernel_height * layer->kernel_width *
+              aimath_sizeof_dtype(layer->weights_dtype);
+
+    memory += layer->bias_dtype->tensor_params_size;
+    memory += layer->out_channels * aimath_sizeof_dtype(layer->bias_dtype);
+
+    return memory;
+}
+
+/* Distributes the parameter memory block to the weights and bias tensors. */
+void ailayer_conv2d_set_paramem(ailayer_t *self, void *memory_ptr)
+{
+    uint32_t address_counter = 0;
+    ailayer_conv2d_t *layer = (ailayer_conv2d_t *)(self->layer_configuration);
+
+    layer->weights.tensor_params = memory_ptr + address_counter;
+    address_counter += layer->weights_dtype->tensor_params_size;
+    layer->weights.dim = 4;
+    layer->weights.dtype = layer->weights_dtype;
+    layer->weights.shape = layer->weights_shape;
+    layer->weights.shape[0] = layer->out_channels;
+    layer->weights.shape[1] = self->input_layer->result.shape[1] / layer->groups;
+    layer->weights.shape[2] = layer->kernel_height;
+    layer->weights.shape[3] = layer->kernel_width;
+    layer->weights.data = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_data(&(layer->weights));
+
+    layer->bias.tensor_params = memory_ptr + address_counter;
+    address_counter += layer->bias_dtype->tensor_params_size;
+    layer->bias.dim = 1;
+    layer->bias.dtype = layer->bias_dtype;
+    layer->bias.shape = layer->bias_shape;
+    layer->bias.shape[0] = layer->out_channels;
+    layer->bias.data = memory_ptr + address_counter;
+
+    layer->trainable_params[0] = &(layer->weights);
+    layer->trainable_params[1] = &(layer->bias);
+}
+
+/* Returns the number of bytes needed for the gradient tensors of weights and bias. */
+uint32_t ailayer_conv2d_sizeof_trainmem(const ailayer_t *self)
+{
+    const ailayer_conv2d_t *layer = (const ailayer_conv2d_t *)(self->layer_configuration);
+    uint32_t memory = 0;
+
+    memory += aimath_sizeof_tensor(&(layer->weights));
+    memory += aimath_sizeof_tensor(&(layer->bias));
+
+    return memory;
+}
+
+/* Distributes the training memory block to the two gradient tensors (weights first, then bias). */
+void ailayer_conv2d_set_trainmem(ailayer_t *self, void *memory_ptr)
+{
+    uint32_t address_counter = 0;
+    ailayer_conv2d_t *layer = (ailayer_conv2d_t *)(self->layer_configuration);
+
+    self->gradients[0] = memory_ptr + address_counter;
+    address_counter += sizeof(aitensor_t);
+    self->gradients[0]->dim = layer->weights.dim;
+    self->gradients[0]->dtype = layer->weights.dtype;
+    self->gradients[0]->shape = layer->weights.shape;
+    self->gradients[0]->data = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_data(self->gradients[0]);
+    self->gradients[0]->tensor_params = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_params(self->gradients[0]);
+
+    self->gradients[1] = memory_ptr + address_counter;
+    address_counter += sizeof(aitensor_t);
+    self->gradients[1]->dim = layer->bias.dim;
+    self->gradients[1]->dtype = layer->bias.dtype;
+    self->gradients[1]->shape = layer->bias.shape;
+    self->gradients[1]->data = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_data(self->gradients[1]);
+    self->gradients[1]->tensor_params = memory_ptr + address_counter;
+    address_counter += aimath_sizeof_tensor_params(self->gradients[1]);
+}
+
+#ifdef AIDEBUG_PRINT_MODULE_SPECS
+/* Prints out_channels, kernel size and stride using the given print function. */
+void ailayer_conv2d_print_specs(const ailayer_t *self, int (*print)(const char *format, ...))
+{
+    const ailayer_conv2d_t *layer = (const ailayer_conv2d_t *)(self->layer_configuration);
+    print("out_channels: %lu, kernel: %lux%lu, stride: %lux%lu",
+          (unsigned long)layer->out_channels,
+          (unsigned long)layer->kernel_height, (unsigned long)layer->kernel_width,
+          (unsigned long)layer->stride_height, (unsigned long)layer->stride_width);
+}
+#endif
diff --git a/src/basic/base/ailayer/ailayer_conv2d.h b/src/basic/base/ailayer/ailayer_conv2d.h
new file mode 100644
index 0000000..947c18f
--- /dev/null
+++ b/src/basic/base/ailayer/ailayer_conv2d.h
@@ -0,0 +1,118 @@
+/**
+ * \file basic/base/ailayer/ailayer_conv2d.h
+ * \internal
+ * \date 27.05.2024
+ * \endinternal
+ * \version 2.0alpha
+ * \copyright  Copyright (C) 2020-2024  Fraunhofer Institute for Microelectronic Circuits and Systems.
+    All rights reserved.
+
+    AIfES is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef AILAYER_CONV2D
+#define AILAYER_CONV2D
+
+#include "core/aifes_core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Forward declaration of the Conv2D layer structure. */
+typedef struct ailayer_conv2d ailayer_conv2d_t;
+
+/**
+ * @brief General Conv2D layer structure.
+ */
+struct ailayer_conv2d {
+    ailayer_t base; /**< Inherited field members from general ailayer struct. */
+    const aimath_dtype_t *result_dtype; /**< Data type of the inference result values. */
+    const aimath_dtype_t *weights_dtype; /**< Data type of the weights. */
+    const aimath_dtype_t *bias_dtype; /**< Data type of the bias. */
+
+    /** @name Layer configuration */
+    ///@{
+    uint16_t out_channels; /**< Number of output feature maps. */
+    uint16_t kernel_height; /**< Kernel height. */
+    uint16_t kernel_width; /**< Kernel width.
*/ + uint16_t stride_height; /**< Stride along the height dimension. */ + uint16_t stride_width; /**< Stride along the width dimension. */ + uint16_t padding_height; /**< Padding applied along the height dimension. */ + uint16_t padding_width; /**< Padding applied along the width dimension. */ + uint16_t dilation_height; /**< Dilation along the height dimension. */ + uint16_t dilation_width; /**< Dilation along the width dimension. */ + uint16_t groups; /**< Number of blocked connections. */ + ///@} + + /** @name Trainable parameters */ + ///@{ + aitensor_t weights; /**< Tensor containing the convolution kernels. */ + aitensor_t bias; /**< Tensor containing the bias. */ + + uint16_t result_shape[4]; /**< Result tensor shape (batch, channels, height, width). */ + uint16_t deltas_shape[4]; /**< Delta tensor shape. */ + uint16_t weights_shape[4]; /**< Weight tensor shape (out_channels, in_channels/groups, kernel_h, kernel_w). */ + uint16_t bias_shape[1]; /**< Bias tensor shape (out_channels). */ + + uint8_t requires_grad; /**< Bit mask to control gradient calculation (bit0: weights, bit1: bias). */ + + aitensor_t *trainable_params[2]; /**< Pointers to trainable parameter tensors. */ + aitensor_t *gradients[2]; /**< Gradient tensors (same ordering as trainable_params). */ + void *optimem[2]; /**< Memory used by the training optimizer. 
*/ + ///@} + + /** @name Math functions */ + ///@{ + void (*conv)(const aitensor_t *input, const aitensor_t *weights, const aitensor_t *bias, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *output); + void (*conv_input_grad)(const aitensor_t *delta_out, const aitensor_t *weights, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *delta_in); + void (*conv_weight_grad)(const aitensor_t *delta_out, const aitensor_t *input, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *d_weights); + void (*conv_bias_grad)(const aitensor_t *delta_out, aitensor_t *d_bias); + ///@} +}; + +/** @brief Conv2D layer type indicator. 
*/ +extern const aicore_layertype_t *ailayer_conv2d_type; + +ailayer_t *ailayer_conv2d(ailayer_conv2d_t *layer, ailayer_t *input_layer); +void ailayer_conv2d_forward(ailayer_t *self); +void ailayer_conv2d_backward(ailayer_t *self); +void ailayer_conv2d_backward_meProp(ailayer_t *self, float maxBpr, float minBpr, float damping, int dense_counter); +void ailayer_conv2d_calc_result_shape(ailayer_t *self); +uint32_t ailayer_conv2d_sizeof_paramem(const ailayer_t *self); +void ailayer_conv2d_set_paramem(ailayer_t *self, void *memory_ptr); +uint32_t ailayer_conv2d_sizeof_trainmem(const ailayer_t *self); +void ailayer_conv2d_set_trainmem(ailayer_t *self, void *memory_ptr); + +#ifdef AIDEBUG_PRINT_MODULE_SPECS +void ailayer_conv2d_print_specs(const ailayer_t *self, int (*print)(const char *format, ...)); +#endif + +#ifdef __cplusplus +} +#endif + +#endif // AILAYER_CONV2D diff --git a/src/basic/default/ailayer/ailayer_conv1d_default.c b/src/basic/default/ailayer/ailayer_conv1d_default.c new file mode 100644 index 0000000..4292e9e --- /dev/null +++ b/src/basic/default/ailayer/ailayer_conv1d_default.c @@ -0,0 +1,36 @@ +/** + * \file basic/default/ailayer/ailayer_conv1d_default.c + * \version 2.0alpha + * \date 27.05.2024 + * \copyright Copyright (C) 2020-2024 Fraunhofer Institute for Microelectronic Circuits and Systems. + All rights reserved. + + AIfES is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ */ + +#include "basic/default/ailayer/ailayer_conv1d_default.h" + +ailayer_t *ailayer_conv1d_f32_default(ailayer_conv1d_f32_t *layer, ailayer_t *input_layer) +{ + layer->result_dtype = aif32; + layer->weights_dtype = aif32; + layer->bias_dtype = aif32; + + layer->conv = aimath_f32_default_conv1d_forward; + layer->conv_input_grad = aimath_f32_default_conv1d_input_grad; + layer->conv_weight_grad = aimath_f32_default_conv1d_weight_grad; + layer->conv_bias_grad = aimath_f32_default_conv_bias_grad; + + return ailayer_conv1d(layer, input_layer); +} diff --git a/src/basic/default/ailayer/ailayer_conv1d_default.h b/src/basic/default/ailayer/ailayer_conv1d_default.h new file mode 100644 index 0000000..7e73b33 --- /dev/null +++ b/src/basic/default/ailayer/ailayer_conv1d_default.h @@ -0,0 +1,42 @@ +/** + * \file basic/default/ailayer/ailayer_conv1d_default.h + * \internal + * \date 27.05.2024 + * \endinternal + * \version 2.0alpha + * \copyright Copyright (C) 2020-2024 Fraunhofer Institute for Microelectronic Circuits and Systems. + All rights reserved. + + AIfES is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ */ + +#ifndef AILAYER_CONV1D_DEFAULT +#define AILAYER_CONV1D_DEFAULT + +#include "basic/base/ailayer/ailayer_conv1d.h" +#include "basic/default/aimath/aimath_f32_default.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct ailayer_conv1d ailayer_conv1d_f32_t; + +ailayer_t *ailayer_conv1d_f32_default(ailayer_conv1d_f32_t *layer, ailayer_t *input_layer); + +#ifdef __cplusplus +} +#endif + +#endif // AILAYER_CONV1D_DEFAULT diff --git a/src/basic/default/ailayer/ailayer_conv2d_default.c b/src/basic/default/ailayer/ailayer_conv2d_default.c new file mode 100644 index 0000000..7cc734a --- /dev/null +++ b/src/basic/default/ailayer/ailayer_conv2d_default.c @@ -0,0 +1,36 @@ +/** + * \file basic/default/ailayer/ailayer_conv2d_default.c + * \version 2.0alpha + * \date 27.05.2024 + * \copyright Copyright (C) 2020-2024 Fraunhofer Institute for Microelectronic Circuits and Systems. + All rights reserved. + + AIfES is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ */ + +#include "basic/default/ailayer/ailayer_conv2d_default.h" + +ailayer_t *ailayer_conv2d_f32_default(ailayer_conv2d_f32_t *layer, ailayer_t *input_layer) +{ + layer->result_dtype = aif32; + layer->weights_dtype = aif32; + layer->bias_dtype = aif32; + + layer->conv = aimath_f32_default_conv2d_forward; + layer->conv_input_grad = aimath_f32_default_conv2d_input_grad; + layer->conv_weight_grad = aimath_f32_default_conv2d_weight_grad; + layer->conv_bias_grad = aimath_f32_default_conv_bias_grad; + + return ailayer_conv2d(layer, input_layer); +} diff --git a/src/basic/default/ailayer/ailayer_conv2d_default.h b/src/basic/default/ailayer/ailayer_conv2d_default.h new file mode 100644 index 0000000..c105406 --- /dev/null +++ b/src/basic/default/ailayer/ailayer_conv2d_default.h @@ -0,0 +1,42 @@ +/** + * \file basic/default/ailayer/ailayer_conv2d_default.h + * \internal + * \date 27.05.2024 + * \endinternal + * \version 2.0alpha + * \copyright Copyright (C) 2020-2024 Fraunhofer Institute for Microelectronic Circuits and Systems. + All rights reserved. + + AIfES is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ */ + +#ifndef AILAYER_CONV2D_DEFAULT +#define AILAYER_CONV2D_DEFAULT + +#include "basic/base/ailayer/ailayer_conv2d.h" +#include "basic/default/aimath/aimath_f32_default.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct ailayer_conv2d ailayer_conv2d_f32_t; + +ailayer_t *ailayer_conv2d_f32_default(ailayer_conv2d_f32_t *layer, ailayer_t *input_layer); + +#ifdef __cplusplus +} +#endif + +#endif // AILAYER_CONV2D_DEFAULT diff --git a/src/basic/default/aimath/aimath_f32_default.c b/src/basic/default/aimath/aimath_f32_default.c index 8103e56..48099a3 100644 --- a/src/basic/default/aimath/aimath_f32_default.c +++ b/src/basic/default/aimath/aimath_f32_default.c @@ -24,6 +24,7 @@ #include "basic/default/aimath/aimath_f32_default.h" #include +#include void aimath_f32_default_linear(const aitensor_t *a, const aitensor_t *b, const aitensor_t *c, aitensor_t *result) @@ -662,19 +663,421 @@ void aimath_f32_default_init_glorot_uniform(aitensor_t *tensor) */ void aimath_f32_default_init_he_uniform(aitensor_t *tensor) { - float fan_in, fan_avg; - if(tensor->dim == 2) - { - fan_in = tensor->shape[0]; - } - else if(tensor->dim == 4) - { - fan_in = tensor->shape[1] * tensor->shape[2] * tensor->shape[3]; // In channel * kernel_elems - } + float fan_in, fan_avg; + if(tensor->dim == 2) + { + fan_in = tensor->shape[0]; + } + else if(tensor->dim == 4) + { + fan_in = tensor->shape[1] * tensor->shape[2] * tensor->shape[3]; // In channel * kernel_elems + } - fan_avg = fan_in / 2.0f; - float r = sqrt(3.0f / fan_avg); - aimath_f32_default_tensor_init_uniform(tensor, -r, r); + fan_avg = fan_in / 2.0f; + float r = sqrt(3.0f / fan_avg); + aimath_f32_default_tensor_init_uniform(tensor, -r, r); +} + +void aimath_f32_default_conv_bias_grad(const aitensor_t *delta_out, aitensor_t *d_bias) +{ + float *grad_data = (float *) d_bias->data; + const float *delta_data = (const float *) delta_out->data; + + uint16_t batch = delta_out->shape[0]; + uint16_t channels = delta_out->shape[1]; + 
uint32_t spatial = 1; + uint8_t dim; + for(dim = 2; dim < delta_out->dim; dim++) + { + spatial *= delta_out->shape[dim]; + } + + uint32_t channel_stride = spatial; + uint32_t batch_stride = channels * spatial; + + for(uint16_t oc = 0; oc < channels; oc++) + { + float sum = 0.0f; + for(uint16_t n = 0; n < batch; n++) + { + uint32_t base = n * batch_stride + oc * channel_stride; + for(uint32_t idx = 0; idx < spatial; idx++) + { + sum += delta_data[base + idx]; + } + } + grad_data[oc] += sum; + } +} + +void aimath_f32_default_conv1d_forward(const aitensor_t *input, const aitensor_t *weights, const aitensor_t *bias, + uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups, + aitensor_t *output) +{ + const float *input_data = (const float *) input->data; + const float *weight_data = (const float *) weights->data; + const float *bias_data = (bias != 0 && bias->data != 0) ? (const float *) bias->data : 0; + float *out_data = (float *) output->data; + + uint16_t batch = input->shape[0]; + uint16_t in_channels = input->shape[1]; + uint16_t in_length = input->shape[2]; + uint16_t out_channels = weights->shape[0]; + uint16_t kernel = weights->shape[2]; + uint16_t out_length = output->shape[2]; + uint16_t channels_per_group = in_channels / groups; + uint16_t out_per_group = out_channels / groups; + +#ifdef SHAPE_CHECK + if(output->shape[1] != out_channels) + { + LOG_E("Conv1D forward: output channels mismatch.\n"); + return; + } + if(channels_per_group * groups != in_channels) + { + LOG_E("Conv1D forward: invalid group configuration.\n"); + return; + } + if(out_per_group * groups != out_channels) + { + LOG_E("Conv1D forward: invalid output group configuration.\n"); + return; + } +#endif + + for(uint16_t n = 0; n < batch; n++) + { + for(uint16_t g = 0; g < groups; g++) + { + for(uint16_t oc = 0; oc < out_per_group; oc++) + { + uint16_t oc_global = g * out_per_group + oc; + for(uint16_t out_pos = 0; out_pos < out_length; out_pos++) + { + float sum = bias_data ? 
bias_data[oc_global] : 0.0f; + for(uint16_t ic = 0; ic < channels_per_group; ic++) + { + uint16_t ic_global = g * channels_per_group + ic; + for(uint16_t k = 0; k < kernel; k++) + { + int32_t in_pos = (int32_t) out_pos * stride - (int32_t) padding + (int32_t) k * dilation; + if(in_pos < 0 || in_pos >= in_length) + { + continue; + } + + uint32_t input_index = ((uint32_t) n * in_channels + ic_global) * in_length + (uint32_t) in_pos; + uint32_t weight_index = ((uint32_t) oc_global * channels_per_group + ic) * kernel + k; + sum += input_data[input_index] * weight_data[weight_index]; + } + } + uint32_t out_index = ((uint32_t) n * out_channels + oc_global) * out_length + out_pos; + out_data[out_index] = sum; + } + } + } + } +} + +void aimath_f32_default_conv1d_input_grad(const aitensor_t *delta_out, const aitensor_t *weights, + uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups, + aitensor_t *delta_in) +{ + float *grad_input = (float *) delta_in->data; + const float *delta_data = (const float *) delta_out->data; + const float *weight_data = (const float *) weights->data; + + memset(grad_input, 0, aimath_sizeof_tensor_data(delta_in)); + + uint16_t batch = delta_out->shape[0]; + uint16_t out_channels = delta_out->shape[1]; + uint16_t out_length = delta_out->shape[2]; + uint16_t in_channels = delta_in->shape[1]; + uint16_t in_length = delta_in->shape[2]; + uint16_t kernel = weights->shape[2]; + uint16_t channels_per_group = in_channels / groups; + uint16_t out_per_group = out_channels / groups; + + for(uint16_t n = 0; n < batch; n++) + { + for(uint16_t g = 0; g < groups; g++) + { + for(uint16_t oc = 0; oc < out_per_group; oc++) + { + uint16_t oc_global = g * out_per_group + oc; + for(uint16_t out_pos = 0; out_pos < out_length; out_pos++) + { + float grad = delta_data[((uint32_t) n * out_channels + oc_global) * out_length + out_pos]; + for(uint16_t ic = 0; ic < channels_per_group; ic++) + { + uint16_t ic_global = g * channels_per_group + ic; + for(uint16_t 
k = 0; k < kernel; k++) + { + int32_t in_pos = (int32_t) out_pos * stride - (int32_t) padding + (int32_t) k * dilation; + if(in_pos < 0 || in_pos >= in_length) + { + continue; + } + + uint32_t input_index = ((uint32_t) n * in_channels + ic_global) * in_length + (uint32_t) in_pos; + uint32_t weight_index = ((uint32_t) oc_global * channels_per_group + ic) * kernel + k; + grad_input[input_index] += grad * weight_data[weight_index]; + } + } + } + } + } + } +} + +void aimath_f32_default_conv1d_weight_grad(const aitensor_t *delta_out, const aitensor_t *input, + uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups, + aitensor_t *d_weights) +{ + const float *delta_data = (const float *) delta_out->data; + const float *input_data = (const float *) input->data; + float *grad_data = (float *) d_weights->data; + + uint16_t batch = input->shape[0]; + uint16_t in_channels = input->shape[1]; + uint16_t in_length = input->shape[2]; + uint16_t out_channels = delta_out->shape[1]; + uint16_t out_length = delta_out->shape[2]; + uint16_t kernel = d_weights->shape[2]; + uint16_t channels_per_group = in_channels / groups; + uint16_t out_per_group = out_channels / groups; + + for(uint16_t g = 0; g < groups; g++) + { + for(uint16_t oc = 0; oc < out_per_group; oc++) + { + uint16_t oc_global = g * out_per_group + oc; + for(uint16_t ic = 0; ic < channels_per_group; ic++) + { + uint16_t ic_global = g * channels_per_group + ic; + for(uint16_t k = 0; k < kernel; k++) + { + uint32_t weight_index = ((uint32_t) oc_global * channels_per_group + ic) * kernel + k; + for(uint16_t n = 0; n < batch; n++) + { + for(uint16_t out_pos = 0; out_pos < out_length; out_pos++) + { + int32_t in_pos = (int32_t) out_pos * stride - (int32_t) padding + (int32_t) k * dilation; + if(in_pos < 0 || in_pos >= in_length) + { + continue; + } + + uint32_t input_index = ((uint32_t) n * in_channels + ic_global) * in_length + (uint32_t) in_pos; + uint32_t delta_index = ((uint32_t) n * out_channels + oc_global) * 
out_length + out_pos; + grad_data[weight_index] += input_data[input_index] * delta_data[delta_index]; + } + } + } + } + } + } +} + +void aimath_f32_default_conv2d_forward(const aitensor_t *input, const aitensor_t *weights, const aitensor_t *bias, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *output) +{ + const float *input_data = (const float *) input->data; + const float *weight_data = (const float *) weights->data; + const float *bias_data = (bias != 0 && bias->data != 0) ? (const float *) bias->data : 0; + float *out_data = (float *) output->data; + + uint16_t batch = input->shape[0]; + uint16_t in_channels = input->shape[1]; + uint16_t in_height = input->shape[2]; + uint16_t in_width = input->shape[3]; + uint16_t out_channels = weights->shape[0]; + uint16_t kernel_h = weights->shape[2]; + uint16_t kernel_w = weights->shape[3]; + uint16_t out_height = output->shape[2]; + uint16_t out_width = output->shape[3]; + uint16_t channels_per_group = in_channels / groups; + uint16_t out_per_group = out_channels / groups; + + for(uint16_t n = 0; n < batch; n++) + { + for(uint16_t g = 0; g < groups; g++) + { + for(uint16_t oc = 0; oc < out_per_group; oc++) + { + uint16_t oc_global = g * out_per_group + oc; + for(uint16_t oh = 0; oh < out_height; oh++) + { + for(uint16_t ow = 0; ow < out_width; ow++) + { + float sum = bias_data ? 
bias_data[oc_global] : 0.0f; + for(uint16_t ic = 0; ic < channels_per_group; ic++) + { + uint16_t ic_global = g * channels_per_group + ic; + for(uint16_t kh = 0; kh < kernel_h; kh++) + { + int32_t ih = (int32_t) oh * stride_height - (int32_t) padding_height + (int32_t) kh * dilation_height; + if(ih < 0 || ih >= in_height) + { + continue; + } + for(uint16_t kw = 0; kw < kernel_w; kw++) + { + int32_t iw = (int32_t) ow * stride_width - (int32_t) padding_width + (int32_t) kw * dilation_width; + if(iw < 0 || iw >= in_width) + { + continue; + } + + uint32_t input_index = (((uint32_t) n * in_channels + ic_global) * in_height + (uint32_t) ih) * in_width + (uint32_t) iw; + uint32_t weight_index = ((((uint32_t) oc_global * channels_per_group + ic) * kernel_h) + kh) * kernel_w + kw; + sum += input_data[input_index] * weight_data[weight_index]; + } + } + } + uint32_t out_index = (((uint32_t) n * out_channels + oc_global) * out_height + oh) * out_width + ow; + out_data[out_index] = sum; + } + } + } + } + } +} + +void aimath_f32_default_conv2d_input_grad(const aitensor_t *delta_out, const aitensor_t *weights, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *delta_in) +{ + float *grad_input = (float *) delta_in->data; + const float *delta_data = (const float *) delta_out->data; + const float *weight_data = (const float *) weights->data; + + memset(grad_input, 0, aimath_sizeof_tensor_data(delta_in)); + + uint16_t batch = delta_out->shape[0]; + uint16_t out_channels = delta_out->shape[1]; + uint16_t out_height = delta_out->shape[2]; + uint16_t out_width = delta_out->shape[3]; + uint16_t in_channels = delta_in->shape[1]; + uint16_t in_height = delta_in->shape[2]; + uint16_t in_width = delta_in->shape[3]; + uint16_t kernel_h = weights->shape[2]; + uint16_t kernel_w = weights->shape[3]; + uint16_t channels_per_group = in_channels / groups; + uint16_t 
out_per_group = out_channels / groups; + + for(uint16_t n = 0; n < batch; n++) + { + for(uint16_t g = 0; g < groups; g++) + { + for(uint16_t oc = 0; oc < out_per_group; oc++) + { + uint16_t oc_global = g * out_per_group + oc; + for(uint16_t oh = 0; oh < out_height; oh++) + { + for(uint16_t ow = 0; ow < out_width; ow++) + { + float grad = delta_data[(((uint32_t) n * out_channels + oc_global) * out_height + oh) * out_width + ow]; + for(uint16_t ic = 0; ic < channels_per_group; ic++) + { + uint16_t ic_global = g * channels_per_group + ic; + for(uint16_t kh = 0; kh < kernel_h; kh++) + { + int32_t ih = (int32_t) oh * stride_height - (int32_t) padding_height + (int32_t) kh * dilation_height; + if(ih < 0 || ih >= in_height) + { + continue; + } + for(uint16_t kw = 0; kw < kernel_w; kw++) + { + int32_t iw = (int32_t) ow * stride_width - (int32_t) padding_width + (int32_t) kw * dilation_width; + if(iw < 0 || iw >= in_width) + { + continue; + } + + uint32_t input_index = (((uint32_t) n * in_channels + ic_global) * in_height + (uint32_t) ih) * in_width + (uint32_t) iw; + uint32_t weight_index = ((((uint32_t) oc_global * channels_per_group + ic) * kernel_h) + kh) * kernel_w + kw; + grad_input[input_index] += grad * weight_data[weight_index]; + } + } + } + } + } + } + } + } +} + +void aimath_f32_default_conv2d_weight_grad(const aitensor_t *delta_out, const aitensor_t *input, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *d_weights) +{ + const float *delta_data = (const float *) delta_out->data; + const float *input_data = (const float *) input->data; + float *grad_data = (float *) d_weights->data; + + uint16_t batch = input->shape[0]; + uint16_t in_channels = input->shape[1]; + uint16_t in_height = input->shape[2]; + uint16_t in_width = input->shape[3]; + uint16_t out_channels = delta_out->shape[1]; + uint16_t out_height = delta_out->shape[2]; 
+ uint16_t out_width = delta_out->shape[3]; + uint16_t kernel_h = d_weights->shape[2]; + uint16_t kernel_w = d_weights->shape[3]; + uint16_t channels_per_group = in_channels / groups; + uint16_t out_per_group = out_channels / groups; + + for(uint16_t g = 0; g < groups; g++) + { + for(uint16_t oc = 0; oc < out_per_group; oc++) + { + uint16_t oc_global = g * out_per_group + oc; + for(uint16_t ic = 0; ic < channels_per_group; ic++) + { + uint16_t ic_global = g * channels_per_group + ic; + for(uint16_t kh = 0; kh < kernel_h; kh++) + { + for(uint16_t kw = 0; kw < kernel_w; kw++) + { + uint32_t weight_index = ((((uint32_t) oc_global * channels_per_group + ic) * kernel_h) + kh) * kernel_w + kw; + for(uint16_t n = 0; n < batch; n++) + { + for(uint16_t oh = 0; oh < out_height; oh++) + { + int32_t ih = (int32_t) oh * stride_height - (int32_t) padding_height + (int32_t) kh * dilation_height; + if(ih < 0 || ih >= in_height) + { + continue; + } + for(uint16_t ow = 0; ow < out_width; ow++) + { + int32_t iw = (int32_t) ow * stride_width - (int32_t) padding_width + (int32_t) kw * dilation_width; + if(iw < 0 || iw >= in_width) + { + continue; + } + + uint32_t input_index = (((uint32_t) n * in_channels + ic_global) * in_height + (uint32_t) ih) * in_width + (uint32_t) iw; + uint32_t delta_index = (((uint32_t) n * out_channels + oc_global) * out_height + oh) * out_width + ow; + grad_data[weight_index] += input_data[input_index] * delta_data[delta_index]; + } + } + } + } + } + } + } + } } //Info(?): http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.4508&rep=rep1&type=pdf diff --git a/src/basic/default/aimath/aimath_f32_default.h b/src/basic/default/aimath/aimath_f32_default.h index 9569fc2..28953f8 100644 --- a/src/basic/default/aimath/aimath_f32_default.h +++ b/src/basic/default/aimath/aimath_f32_default.h @@ -1309,5 +1309,33 @@ void getAbsTopKIndices(uint16_t* topK, const uint16_t k, const aitensor_t* tenso */ void getAbsTopKIndices_minHeap(uint16_t* topK, const uint16_t k, 
const aitensor_t* tensor, uint16_t row); +void aimath_f32_default_conv1d_forward(const aitensor_t *input, const aitensor_t *weights, const aitensor_t *bias, + uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups, + aitensor_t *output); +void aimath_f32_default_conv1d_input_grad(const aitensor_t *delta_out, const aitensor_t *weights, + uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups, + aitensor_t *delta_in); +void aimath_f32_default_conv1d_weight_grad(const aitensor_t *delta_out, const aitensor_t *input, + uint16_t stride, uint16_t padding, uint16_t dilation, uint16_t groups, + aitensor_t *d_weights); + +void aimath_f32_default_conv2d_forward(const aitensor_t *input, const aitensor_t *weights, const aitensor_t *bias, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *output); +void aimath_f32_default_conv2d_input_grad(const aitensor_t *delta_out, const aitensor_t *weights, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *delta_in); +void aimath_f32_default_conv2d_weight_grad(const aitensor_t *delta_out, const aitensor_t *input, + uint16_t stride_height, uint16_t stride_width, + uint16_t padding_height, uint16_t padding_width, + uint16_t dilation_height, uint16_t dilation_width, + uint16_t groups, aitensor_t *d_weights); + +void aimath_f32_default_conv_bias_grad(const aitensor_t *delta_out, aitensor_t *d_bias); + #endif // AIMATH_F32_DEFAULT